--- v4
+++ v5
@@ -1,335 +1,126 @@
-This patch adds basic shadow stack enabling/disabling routines.
-A task's shadow stack is allocated from memory with VM_SHSTK
-flag set and read-only protection. The shadow stack is
-allocated to a fixed size of RLIMIT_STACK.
+can_follow_write_pte()/pmd() check for a (RO & DIRTY) PTE/PMD to
+verify that an exclusive RO page still exists after COW is broken.
+
+A shadow stack PTE is RO & PAGE_DIRTY_SW when it is shared,
+otherwise RO & PAGE_DIRTY_HW.
+
+Introduce pte_exclusive() and pmd_exclusive() to also verify that
+a shadow stack PTE/PMD is exclusive.
+
+Also rename can_follow_write_pte/pmd() to can_follow_write() to
+make their meaning clear, i.e. "Can we write to the page?", not
+"Is the PTE writable?".
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
- arch/x86/include/asm/cet.h | 30 +++++++
- arch/x86/include/asm/disabled-features.h | 8 +-
- arch/x86/include/asm/msr-index.h | 14 +++
- arch/x86/include/asm/processor.h | 5 ++
- arch/x86/kernel/Makefile | 2 +
- arch/x86/kernel/cet.c | 109 +++++++++++++++++++++++
- arch/x86/kernel/cpu/common.c | 24 +++++
- arch/x86/kernel/process.c | 2 +
- fs/proc/task_mmu.c | 3 +
- 9 files changed, 196 insertions(+), 1 deletion(-)
- create mode 100644 arch/x86/include/asm/cet.h
- create mode 100644 arch/x86/kernel/cet.c
+ arch/x86/mm/pgtable.c | 18 ++++++++++++++++++
+ include/asm-generic/pgtable.h | 4 ++++
+ mm/gup.c | 8 +++++---
+ mm/huge_memory.c | 8 +++++---
+ 4 files changed, 32 insertions(+), 6 deletions(-)
-diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
-new file mode 100644
-index 000000000000..ad278c520414
---- /dev/null
-+++ b/arch/x86/include/asm/cet.h
-@@ -0,0 +1,30 @@
-+/* SPDX-License-Identifier: GPL-2.0 */
-+#ifndef _ASM_X86_CET_H
-+#define _ASM_X86_CET_H
+diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
+index 864954bda7fe..80876b2d03b7 100644
+--- a/arch/x86/mm/pgtable.c
++++ b/arch/x86/mm/pgtable.c
+@@ -903,4 +903,22 @@ inline bool arch_copy_pte_mapping(vm_flags_t vm_flags)
+ {
+ return (vm_flags & VM_SHSTK);
+ }
+
-+#ifndef __ASSEMBLY__
-+#include <linux/types.h>
++inline bool pte_exclusive(pte_t pte, struct vm_area_struct *vma)
++{
++ if (vma->vm_flags & VM_SHSTK)
++ return pte_dirty_hw(pte);
++ else
++ return pte_dirty(pte);
++}
+
-+struct task_struct;
-+/*
-+ * Per-thread CET status
-+ */
-+struct cet_status {
-+ unsigned long shstk_base;
-+ unsigned long shstk_size;
-+ unsigned int shstk_enabled:1;
-+};
-+
-+#ifdef CONFIG_X86_INTEL_CET
-+int cet_setup_shstk(void);
-+void cet_disable_shstk(void);
-+void cet_disable_free_shstk(struct task_struct *p);
-+#else
-+static inline int cet_setup_shstk(void) { return 0; }
-+static inline void cet_disable_shstk(void) {}
-+static inline void cet_disable_free_shstk(struct task_struct *p) {}
-+#endif
-+
-+#endif /* __ASSEMBLY__ */
-+
-+#endif /* _ASM_X86_CET_H */
-diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
-index 33833d1909af..3624a11e5ba6 100644
---- a/arch/x86/include/asm/disabled-features.h
-+++ b/arch/x86/include/asm/disabled-features.h
-@@ -56,6 +56,12 @@
- # define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
++#ifdef CONFIG_TRANSPARENT_HUGEPAGE
++inline bool pmd_exclusive(pmd_t pmd, struct vm_area_struct *vma)
++{
++ if (vma->vm_flags & VM_SHSTK)
++ return pmd_dirty_hw(pmd);
++ else
++ return pmd_dirty(pmd);
++}
++#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ #endif /* CONFIG_X86_INTEL_SHADOW_STACK_USER */
+diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
+index 7512e4dfd642..09881698a566 100644
+--- a/include/asm-generic/pgtable.h
++++ b/include/asm-generic/pgtable.h
+@@ -1131,10 +1131,14 @@ static inline bool arch_has_pfn_modify_check(void)
+ #define pte_set_vma_features(pte, vma) pte
+ #define pmd_set_vma_features(pmd, vma) pmd
+ #define arch_copy_pte_mapping(vma_flags) false
++#define pte_exclusive(pte, vma) pte_dirty(pte)
++#define pmd_exclusive(pmd, vma) pmd_dirty(pmd)
+ #else
+ pte_t pte_set_vma_features(pte_t pte, struct vm_area_struct *vma);
+ pmd_t pmd_set_vma_features(pmd_t pmd, struct vm_area_struct *vma);
+ bool arch_copy_pte_mapping(vm_flags_t vm_flags);
++bool pte_exclusive(pte_t pte, struct vm_area_struct *vma);
++bool pmd_exclusive(pmd_t pmd, struct vm_area_struct *vma);
#endif
-+#ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
-+#define DISABLE_SHSTK 0
-+#else
-+#define DISABLE_SHSTK (1<<(X86_FEATURE_SHSTK & 31))
-+#endif
-+
- /*
- * Make sure to add features to the correct mask
+ #endif /* _ASM_GENERIC_PGTABLE_H */
+diff --git a/mm/gup.c b/mm/gup.c
+index 1abc8b4afff6..03cb2e331f80 100644
+--- a/mm/gup.c
++++ b/mm/gup.c
+@@ -64,10 +64,12 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
+ * FOLL_FORCE can write to even unwritable pte's, but only
+ * after we've gone through a COW cycle and they are dirty.
*/
-@@ -75,7 +81,7 @@
- #define DISABLED_MASK13 0
- #define DISABLED_MASK14 0
- #define DISABLED_MASK15 0
--#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP)
-+#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP|DISABLE_SHSTK)
- #define DISABLED_MASK17 0
- #define DISABLED_MASK18 0
- #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
-diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
-index 4731f0cf97c5..e073801a44e0 100644
---- a/arch/x86/include/asm/msr-index.h
-+++ b/arch/x86/include/asm/msr-index.h
-@@ -777,4 +777,18 @@
- #define MSR_VM_IGNNE 0xc0010115
- #define MSR_VM_HSAVE_PA 0xc0010117
-
-+/* Control-flow Enforcement Technology MSRs */
-+#define MSR_IA32_U_CET 0x6a0 /* user mode cet setting */
-+#define MSR_IA32_S_CET 0x6a2 /* kernel mode cet setting */
-+#define MSR_IA32_PL0_SSP 0x6a4 /* kernel shstk pointer */
-+#define MSR_IA32_PL3_SSP 0x6a7 /* user shstk pointer */
-+#define MSR_IA32_INT_SSP_TAB 0x6a8 /* exception shstk table */
-+
-+/* MSR_IA32_U_CET and MSR_IA32_S_CET bits */
-+#define MSR_IA32_CET_SHSTK_EN 0x0000000000000001ULL
-+#define MSR_IA32_CET_WRSS_EN 0x0000000000000002ULL
-+#define MSR_IA32_CET_ENDBR_EN 0x0000000000000004ULL
-+#define MSR_IA32_CET_LEG_IW_EN 0x0000000000000008ULL
-+#define MSR_IA32_CET_NO_TRACK_EN 0x0000000000000010ULL
-+
- #endif /* _ASM_X86_MSR_INDEX_H */
-diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
-index d53c54b842da..63918cecf367 100644
---- a/arch/x86/include/asm/processor.h
-+++ b/arch/x86/include/asm/processor.h
-@@ -24,6 +24,7 @@ struct vm86;
- #include <asm/special_insns.h>
- #include <asm/fpu/types.h>
- #include <asm/unwind_hints.h>
-+#include <asm/cet.h>
-
- #include <linux/personality.h>
- #include <linux/cache.h>
-@@ -505,6 +506,10 @@ struct thread_struct {
- unsigned int sig_on_uaccess_err:1;
- unsigned int uaccess_err:1; /* uaccess failed */
-
-+#ifdef CONFIG_X86_INTEL_CET
-+ struct cet_status cet;
-+#endif
-+
- /* Floating point and extended processor state */
- struct fpu fpu;
- /*
-diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
-index 8824d01c0c35..fbb2d91fb756 100644
---- a/arch/x86/kernel/Makefile
-+++ b/arch/x86/kernel/Makefile
-@@ -139,6 +139,8 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
- obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
- obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
-
-+obj-$(CONFIG_X86_INTEL_CET) += cet.o
-+
- ###
- # 64 bit specific files
- ifeq ($(CONFIG_X86_64),y)
-diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
-new file mode 100644
-index 000000000000..ec256ae27a31
---- /dev/null
-+++ b/arch/x86/kernel/cet.c
-@@ -0,0 +1,109 @@
-+/* SPDX-License-Identifier: GPL-2.0 */
-+/*
-+ * cet.c - Control Flow Enforcement (CET)
-+ *
-+ * Copyright (c) 2018, Intel Corporation.
-+ * Yu-cheng Yu <yu-cheng.yu@intel.com>
-+ */
-+
-+#include <linux/types.h>
-+#include <linux/mm.h>
-+#include <linux/mman.h>
-+#include <linux/slab.h>
-+#include <linux/uaccess.h>
-+#include <linux/sched/signal.h>
-+#include <asm/msr.h>
-+#include <asm/user.h>
-+#include <asm/fpu/xstate.h>
-+#include <asm/fpu/types.h>
-+#include <asm/compat.h>
-+#include <asm/cet.h>
-+
-+static int set_shstk_ptr(unsigned long addr)
-+{
-+ u64 r;
-+
-+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
-+ return -1;
-+
-+ if ((addr >= TASK_SIZE_MAX) || (!IS_ALIGNED(addr, 4)))
-+ return -1;
-+
-+ rdmsrl(MSR_IA32_U_CET, r);
-+ wrmsrl(MSR_IA32_PL3_SSP, addr);
-+ wrmsrl(MSR_IA32_U_CET, r | MSR_IA32_CET_SHSTK_EN);
-+ return 0;
-+}
-+
-+static unsigned long get_shstk_addr(void)
-+{
-+ unsigned long ptr;
-+
-+ if (!current->thread.cet.shstk_enabled)
-+ return 0;
-+
-+ rdmsrl(MSR_IA32_PL3_SSP, ptr);
-+ return ptr;
-+}
-+
-+int cet_setup_shstk(void)
-+{
-+ unsigned long addr, size;
-+
-+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
-+ return -EOPNOTSUPP;
-+
-+ size = rlimit(RLIMIT_STACK);
-+ addr = do_mmap_locked(0, size, PROT_READ,
-+ MAP_ANONYMOUS | MAP_PRIVATE, VM_SHSTK);
-+
-+ /*
-+ * Return actual error from do_mmap().
-+ */
-+ if (addr >= TASK_SIZE_MAX)
-+ return addr;
-+
-+ set_shstk_ptr(addr + size - sizeof(u64));
-+ current->thread.cet.shstk_base = addr;
-+ current->thread.cet.shstk_size = size;
-+ current->thread.cet.shstk_enabled = 1;
-+ return 0;
-+}
-+
-+void cet_disable_shstk(void)
-+{
-+ u64 r;
-+
-+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
-+ return;
-+
-+ rdmsrl(MSR_IA32_U_CET, r);
-+ r &= ~(MSR_IA32_CET_SHSTK_EN);
-+ wrmsrl(MSR_IA32_U_CET, r);
-+ wrmsrl(MSR_IA32_PL3_SSP, 0);
-+ current->thread.cet.shstk_enabled = 0;
-+}
-+
-+void cet_disable_free_shstk(struct task_struct *tsk)
-+{
-+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK) ||
-+ !tsk->thread.cet.shstk_enabled)
-+ return;
-+
-+ if (tsk == current)
-+ cet_disable_shstk();
-+
-+ /*
-+ * Free only when tsk is current or shares mm
-+ * with current but has its own shstk.
-+ */
-+ if (tsk->mm && (tsk->mm == current->mm) &&
-+ (tsk->thread.cet.shstk_base)) {
-+ vm_munmap(tsk->thread.cet.shstk_base,
-+ tsk->thread.cet.shstk_size);
-+ tsk->thread.cet.shstk_base = 0;
-+ tsk->thread.cet.shstk_size = 0;
-+ }
-+
-+ tsk->thread.cet.shstk_enabled = 0;
-+}
-diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
-index 44c4ef3d989b..bffa9ef47832 100644
---- a/arch/x86/kernel/cpu/common.c
-+++ b/arch/x86/kernel/cpu/common.c
-@@ -411,6 +411,29 @@ static __init int setup_disable_pku(char *arg)
- __setup("nopku", setup_disable_pku);
- #endif /* CONFIG_X86_64 */
-
-+static __always_inline void setup_cet(struct cpuinfo_x86 *c)
-+{
-+ if (cpu_feature_enabled(X86_FEATURE_SHSTK))
-+ cr4_set_bits(X86_CR4_CET);
-+}
-+
-+#ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
-+static __init int setup_disable_shstk(char *s)
-+{
-+ /* require an exact match without trailing characters */
-+ if (strlen(s))
-+ return 0;
-+
-+ if (!boot_cpu_has(X86_FEATURE_SHSTK))
-+ return 1;
-+
-+ setup_clear_cpu_cap(X86_FEATURE_SHSTK);
-+ pr_info("x86: 'no_cet_shstk' specified, disabling Shadow Stack\n");
-+ return 1;
-+}
-+__setup("no_cet_shstk", setup_disable_shstk);
-+#endif
-+
- /*
- * Some CPU features depend on higher CPUID levels, which may not always
- * be available due to CPUID level capping or broken virtualization
-@@ -1376,6 +1399,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
- x86_init_rdrand(c);
- x86_init_cache_qos(c);
- setup_pku(c);
-+ setup_cet(c);
-
- /*
- * Clear/Set all flags overridden by options, need do it
-diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
-index c93fcfdf1673..4a776da4c28c 100644
---- a/arch/x86/kernel/process.c
-+++ b/arch/x86/kernel/process.c
-@@ -39,6 +39,7 @@
- #include <asm/desc.h>
- #include <asm/prctl.h>
- #include <asm/spec-ctrl.h>
-+#include <asm/cet.h>
-
- /*
- * per-CPU TSS segments. Threads are completely 'soft' on Linux,
-@@ -134,6 +135,7 @@ void flush_thread(void)
- flush_ptrace_hw_breakpoint(tsk);
- memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
-
-+ cet_disable_shstk();
- fpu__clear(&tsk->thread.fpu);
+-static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
++static inline bool can_follow_write(pte_t pte, unsigned int flags,
++ struct vm_area_struct *vma)
+ {
+ return pte_write(pte) ||
+- ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
++ ((flags & FOLL_FORCE) && (flags & FOLL_COW) &&
++ pte_exclusive(pte, vma));
}
-diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
-index 5ea1d64cb0b4..b20450dde5b7 100644
---- a/fs/proc/task_mmu.c
-+++ b/fs/proc/task_mmu.c
-@@ -652,6 +652,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
- [ilog2(VM_PKEY_BIT4)] = "",
- #endif
- #endif /* CONFIG_ARCH_HAS_PKEYS */
-+#ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
-+ [ilog2(VM_SHSTK)] = "ss"
-+#endif
- };
- size_t i;
+ static struct page *follow_page_pte(struct vm_area_struct *vma,
+@@ -105,7 +107,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
+ }
+ if ((flags & FOLL_NUMA) && pte_protnone(pte))
+ goto no_page;
+- if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
++ if ((flags & FOLL_WRITE) && !can_follow_write(pte, flags, vma)) {
+ pte_unmap_unlock(ptep, ptl);
+ return NULL;
+ }
+diff --git a/mm/huge_memory.c b/mm/huge_memory.c
+index 6e03e26c1cec..3b3e1026fb5b 100644
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -1387,10 +1387,12 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
+ * FOLL_FORCE can write to even unwritable pmd's, but only
+ * after we've gone through a COW cycle and they are dirty.
+ */
+-static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
++static inline bool can_follow_write(pmd_t pmd, unsigned int flags,
++ struct vm_area_struct *vma)
+ {
+ return pmd_write(pmd) ||
+- ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
++ ((flags & FOLL_FORCE) && (flags & FOLL_COW) &&
++ pmd_exclusive(pmd, vma));
+ }
+ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
+@@ -1403,7 +1405,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
+
+ assert_spin_locked(pmd_lockptr(mm, pmd));
+
+- if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
++ if (flags & FOLL_WRITE && !can_follow_write(*pmd, flags, vma))
+ goto out;
+
+ /* Avoid dumping huge zero page */
--
2.17.1