--- v10
+++ v21
@@ -1,17 +1,17 @@
-When shadow stack is introduced, [R/O + _PAGE_DIRTY_HW] PTE is reserved
-for shadow stack. Copy-on-write PTEs have [R/O + _PAGE_COW].
+When Shadow Stack is introduced, [R/O + _PAGE_DIRTY] PTE is reserved for
+shadow stack. Copy-on-write PTEs have [R/O + _PAGE_COW].
-When a PTE goes from [R/W + _PAGE_DIRTY_HW] to [R/O + _PAGE_COW], it could
+When a PTE goes from [R/W + _PAGE_DIRTY] to [R/O + _PAGE_COW], it could
become a transient shadow stack PTE in two cases:
The first case is that some processors can start a write but end up seeing
a read-only PTE by the time they get to the Dirty bit, creating a transient
shadow stack PTE. However, this will not occur on processors supporting
-shadow stack, therefore we don't need a TLB flush here.
+Shadow Stack, and a TLB flush is not necessary.
-The second case is that when the software, without atomic, tests & replaces
-_PAGE_DIRTY_HW with _PAGE_COW, a transient shadow stack PTE can exist.
-This is prevented with cmpxchg.
+The second case is that when _PAGE_DIRTY is replaced with _PAGE_COW non-
+atomically, a transient shadow stack PTE can be created as a result.
+Thus, prevent that with cmpxchg.
Dave Hansen, Jann Horn, Andy Lutomirski, and Peter Zijlstra provided many
insights to the issue. Jann Horn provided the cmpxchg solution.
@@ -19,53 +19,31 @@
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
---
-v10:
-- Replace bit shift with pte_wrprotect()/pmd_wrprotect(), which use bit
- test & shift.
-- Move READ_ONCE of old_pte into try_cmpxchg() loop.
-- Change static_cpu_has() to cpu_feature_enabled().
-
-v9:
-- Change compile-time conditionals to runtime checks.
-- Fix parameters of try_cmpxchg(): change pte_t/pmd_t to
- pte_t.pte/pmd_t.pmd.
-
-v4:
-- Implement try_cmpxchg().
-
- arch/x86/include/asm/pgtable.h | 52 ++++++++++++++++++++++++++++++++++
- 1 file changed, 52 insertions(+)
+ arch/x86/include/asm/pgtable.h | 36 ++++++++++++++++++++++++++++++++++
+ 1 file changed, 36 insertions(+)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
-index f4870cd040de..eaa38adb1038 100644
+index 7a7f6af01ab5..71cc4abec704 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
-@@ -1316,6 +1316,32 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
+@@ -1280,6 +1280,24 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
static inline void ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
+ /*
-+ * Some processors can start a write, but end up seeing a read-only
-+ * PTE by the time they get to the Dirty bit. In this case, they
-+ * will set the Dirty bit, leaving a read-only, Dirty PTE which
-+ * looks like a shadow stack PTE.
-+ *
-+ * However, this behavior has been improved and will not occur on
-+ * processors supporting shadow stack. Without this guarantee, a
-+ * transition to a non-present PTE and flush the TLB would be
-+ * needed.
-+ *
-+ * When changing a writable PTE to read-only and if the PTE has
-+ * _PAGE_DIRTY_HW set, move that bit to _PAGE_COW so that the
-+ * PTE is not a shadow stack PTE.
++ * If Shadow Stack is enabled, pte_wrprotect() moves _PAGE_DIRTY
++ * to _PAGE_COW (see comments at pte_wrprotect()).
++ * When a thread reads a RW=1, Dirty=0 PTE and before changing it
++ * to RW=0, Dirty=0, another thread could have written to the page
++ * and the PTE is RW=1, Dirty=1 now. Use try_cmpxchg() to detect
++ * PTE changes and update old_pte, then try again.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
+ pte_t old_pte, new_pte;
+
++ old_pte = READ_ONCE(*ptep);
+ do {
-+ old_pte = READ_ONCE(*ptep);
+ new_pte = pte_wrprotect(old_pte);
-+
+ } while (!try_cmpxchg(&ptep->pte, &old_pte.pte, new_pte.pte));
+
+ return;
@@ -73,32 +51,24 @@
clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
}
-@@ -1372,6 +1398,32 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
+@@ -1330,6 +1348,24 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pmd_t *pmdp)
{
+ /*
-+ * Some processors can start a write, but end up seeing a read-only
-+ * PMD by the time they get to the Dirty bit. In this case, they
-+ * will set the Dirty bit, leaving a read-only, Dirty PMD which
-+ * looks like a Shadow Stack PMD.
-+ *
-+ * However, this behavior has been improved and will not occur on
-+ * processors supporting Shadow Stack. Without this guarantee, a
-+ * transition to a non-present PMD and flush the TLB would be
-+ * needed.
-+ *
-+ * When changing a writable PMD to read-only and if the PMD has
-+ * _PAGE_DIRTY_HW set, we move that bit to _PAGE_COW so that the
-+ * PMD is not a shadow stack PMD.
++ * If Shadow Stack is enabled, pmd_wrprotect() moves _PAGE_DIRTY
++ * to _PAGE_COW (see comments at pmd_wrprotect()).
++ * When a thread reads a RW=1, Dirty=0 PMD and before changing it
++ * to RW=0, Dirty=0, another thread could have written to the page
++ * and the PMD is RW=1, Dirty=1 now. Use try_cmpxchg() to detect
++ * PMD changes and update old_pmd, then try again.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
+ pmd_t old_pmd, new_pmd;
+
++ old_pmd = READ_ONCE(*pmdp);
+ do {
-+ old_pmd = READ_ONCE(*pmdp);
+ new_pmd = pmd_wrprotect(old_pmd);
-+
+ } while (!try_cmpxchg((pmdval_t *)pmdp, (pmdval_t *)&old_pmd, pmd_val(new_pmd)));
+
+ return;