--- v9
+++ v8
@@ -1,236 +1,120 @@
-When operating with shadow stacks enabled, the kernel will automatically
-allocate shadow stacks for new threads, however in some cases userspace
-will need additional shadow stacks. The main example of this is the
-ucontext family of functions, which require userspace allocating and
-pivoting to userspace managed stacks.
+For the current shadow stack implementation, shadow stacks contents can't
+easily be provisioned with arbitrary data. This property helps apps
+protect themselves better, but also restricts any potential apps that may
+want to do exotic things at the expense of a little security.
-Unlike most other user memory permissions, shadow stacks need to be
-provisioned with special data in order to be useful. They need to be setup
-with a restore token so that userspace can pivot to them via the RSTORSSP
-instruction. But, the security design of shadow stacks is that they
-should not be written to except in limited circumstances. This presents a
-problem for userspace, as to how userspace can provision this special
-data, without allowing for the shadow stack to be generally writable.
+The x86 shadow stack feature introduces a new instruction, WRSS, which
+can be enabled to write directly to shadow stack memory from userspace.
+Allow it to get enabled via the prctl interface.
-Previously, a new PROT_SHADOW_STACK was attempted, which could be
-mprotect()ed from RW permissions after the data was provisioned. This was
-found to not be secure enough, as other threads could write to the
-shadow stack during the writable window.
+Only enable the userspace WRSS instruction, which allows writes to
+userspace shadow stacks from userspace. Do not allow it to be enabled
+independently of shadow stack, as HW does not support using WRSS when
+shadow stack is disabled.
-The kernel can use a special instruction, WRUSS, to write directly to
-userspace shadow stacks. So the solution can be that memory can be mapped
-as shadow stack permissions from the beginning (never generally writable
-in userspace), and the kernel itself can write the restore token.
-
-First, a new madvise() flag was explored, which could operate on the
-PROT_SHADOW_STACK memory. This had a couple of downsides:
-1. Extra checks were needed in mprotect() to prevent writable memory from
- ever becoming PROT_SHADOW_STACK.
-2. Extra checks/vma state were needed in the new madvise() to prevent
- restore tokens being written into the middle of pre-used shadow stacks.
- It is ideal to prevent restore tokens being added at arbitrary
- locations, so the check was to make sure the shadow stack had never been
- written to.
-3. It stood out from the rest of the madvise flags, as more of direct
- action than a hint at future desired behavior.
-
-So rather than repurpose two existing syscalls (mmap, madvise) that don't
-quite fit, just implement a new map_shadow_stack syscall to allow
-userspace to map and setup new shadow stacks in one step. While ucontext
-is the primary motivator, userspace may have other unforeseen reasons to
-setup its own shadow stacks using the WRSS instruction. Towards this
-provide a flag so that stacks can be optionally setup securely for the
-common case of ucontext without enabling WRSS. Or potentially have the
-kernel set up the shadow stack in some new way.
-
-The following example demonstrates how to create a new shadow stack with
-map_shadow_stack:
-void *shstk = map_shadow_stack(addr, stack_size, SHADOW_STACK_SET_TOKEN);
+From a fault handler perspective, WRSS will behave very similar to WRUSS,
+which is treated like a user access from a #PF err code perspective.
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
-Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
Tested-by: Pengfei Xu <pengfei.xu@intel.com>
Tested-by: John Allen <john.allen@amd.com>
Tested-by: Kees Cook <keescook@chromium.org>
---
- arch/x86/entry/syscalls/syscall_64.tbl | 1 +
- arch/x86/include/uapi/asm/mman.h | 3 ++
- arch/x86/kernel/shstk.c | 59 ++++++++++++++++++++++----
- include/linux/syscalls.h | 1 +
- include/uapi/asm-generic/unistd.h | 2 +-
- kernel/sys_ni.c | 1 +
- 6 files changed, 58 insertions(+), 9 deletions(-)
+v8:
+ - Update commit log verbiage (Boris)
+ - Drop set_clr_bits_msrl() (Boris)
+ - Fix comments wrss->WRSS (Boris)
-diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
-index c84d12608cd2..f65c671ce3b1 100644
---- a/arch/x86/entry/syscalls/syscall_64.tbl
-+++ b/arch/x86/entry/syscalls/syscall_64.tbl
-@@ -372,6 +372,7 @@
- 448 common process_mrelease sys_process_mrelease
- 449 common futex_waitv sys_futex_waitv
- 450 common set_mempolicy_home_node sys_set_mempolicy_home_node
-+451 64 map_shadow_stack sys_map_shadow_stack
+v6:
+ - Make set_clr_bits_msrl() avoid side effects in 'msr'
+
+v5:
+ - Switch to EOPNOTSUPP
+ - Move set_clr_bits_msrl() to patch where it is first used
+ - Commit log formatting
+
+v3:
+ - Make wrss_control() static
+ - Fix verbiage in commit log (Kees)
+---
+ arch/x86/include/uapi/asm/prctl.h | 1 +
+ arch/x86/kernel/shstk.c | 43 ++++++++++++++++++++++++++++++-
+ 2 files changed, 43 insertions(+), 1 deletion(-)
+
+diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
+index 7dfd9dc00509..e31495668056 100644
+--- a/arch/x86/include/uapi/asm/prctl.h
++++ b/arch/x86/include/uapi/asm/prctl.h
+@@ -28,5 +28,6 @@
- #
- # Due to a historical design error, certain syscalls are numbered differently
-diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
-index 5a0256e73f1e..8148bdddbd2c 100644
---- a/arch/x86/include/uapi/asm/mman.h
-+++ b/arch/x86/include/uapi/asm/mman.h
-@@ -13,6 +13,9 @@
- ((key) & 0x8 ? VM_PKEY_BIT3 : 0))
- #endif
+ /* ARCH_SHSTK_ features bits */
+ #define ARCH_SHSTK_SHSTK (1ULL << 0)
++#define ARCH_SHSTK_WRSS (1ULL << 1)
-+/* Flags for map_shadow_stack(2) */
-+#define SHADOW_STACK_SET_TOKEN (1ULL << 0) /* Set up a restore token in the shadow stack */
-+
- #include <asm-generic/mman.h>
-
- #endif /* _ASM_X86_MMAN_H */
+ #endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
-index 50733a510446..04c37b33a625 100644
+index 6d2531ce661c..01b45666f1b6 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
-@@ -17,6 +17,7 @@
- #include <linux/compat.h>
- #include <linux/sizes.h>
- #include <linux/user.h>
-+#include <linux/syscalls.h>
- #include <asm/msr.h>
- #include <asm/fpu/xstate.h>
- #include <asm/fpu/types.h>
-@@ -71,19 +72,31 @@ static int create_rstor_token(unsigned long ssp, unsigned long *token_addr)
- return 0;
+@@ -360,6 +360,47 @@ void shstk_free(struct task_struct *tsk)
+ unmap_shadow_stack(shstk->base, shstk->size);
}
--static unsigned long alloc_shstk(unsigned long size)
-+static unsigned long alloc_shstk(unsigned long addr, unsigned long size,
-+ unsigned long token_offset, bool set_res_tok)
- {
- int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G;
- struct mm_struct *mm = current->mm;
-- unsigned long addr, unused;
-+ unsigned long mapped_addr, unused;
-+
-+ if (addr)
-+ flags |= MAP_FIXED_NOREPLACE;
-
- mmap_write_lock(mm);
-- addr = do_mmap(NULL, 0, size, PROT_READ, flags,
-- VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL);
--
-+ mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags,
-+ VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL);
- mmap_write_unlock(mm);
-
-- return addr;
-+ if (!set_res_tok || IS_ERR_VALUE(mapped_addr))
-+ goto out;
-+
-+ if (create_rstor_token(mapped_addr + token_offset, NULL)) {
-+ vm_munmap(mapped_addr, size);
-+ return -EINVAL;
-+ }
-+
-+out:
-+ return mapped_addr;
- }
-
- static unsigned long adjust_shstk_size(unsigned long size)
-@@ -134,7 +147,7 @@ static int shstk_setup(void)
- return -EOPNOTSUPP;
-
- size = adjust_shstk_size(0);
-- addr = alloc_shstk(size);
-+ addr = alloc_shstk(0, size, 0, false);
- if (IS_ERR_VALUE(addr))
- return PTR_ERR((void *)addr);
-
-@@ -178,7 +191,7 @@ unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long cl
- return 0;
-
- size = adjust_shstk_size(stack_size);
-- addr = alloc_shstk(size);
-+ addr = alloc_shstk(0, size, 0, false);
- if (IS_ERR_VALUE(addr))
- return addr;
-
-@@ -398,6 +411,36 @@ static int shstk_disable(void)
- return 0;
- }
-
-+SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
++static int wrss_control(bool enable)
+{
-+ bool set_tok = flags & SHADOW_STACK_SET_TOKEN;
-+ unsigned long aligned_size;
++ u64 msrval;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ return -EOPNOTSUPP;
+
-+ if (flags & ~SHADOW_STACK_SET_TOKEN)
-+ return -EINVAL;
++ /*
++ * Only enable WRSS if shadow stack is enabled. If shadow stack is not
++ * enabled, WRSS will already be disabled, so don't bother clearing it
++ * when disabling.
++ */
++ if (!features_enabled(ARCH_SHSTK_SHSTK))
++ return -EPERM;
+
-+ /* If there isn't space for a token */
-+ if (set_tok && size < 8)
-+ return -ENOSPC;
++ /* Already enabled/disabled? */
++ if (features_enabled(ARCH_SHSTK_WRSS) == enable)
++ return 0;
+
-+ if (addr && addr < SZ_4G)
-+ return -ERANGE;
++ fpregs_lock_and_load();
++ rdmsrl(MSR_IA32_U_CET, msrval);
+
-+ /*
-+ * An overflow would result in attempting to write the restore token
-+ * to the wrong location. Not catastrophic, but just return the right
-+ * error code and block it.
-+ */
-+ aligned_size = PAGE_ALIGN(size);
-+ if (aligned_size < size)
-+ return -EOVERFLOW;
++ if (enable) {
++ features_set(ARCH_SHSTK_WRSS);
++ msrval |= CET_WRSS_EN;
++ } else {
++ features_clr(ARCH_SHSTK_WRSS);
++ if (!(msrval & CET_WRSS_EN))
++ goto unlock;
+
-+ return alloc_shstk(addr, aligned_size, size, set_tok);
++ msrval &= ~CET_WRSS_EN;
++ }
++
++ wrmsrl(MSR_IA32_U_CET, msrval);
++
++unlock:
++ fpregs_unlock();
++
++ return 0;
+}
+
- long shstk_prctl(struct task_struct *task, int option, unsigned long features)
+ static int shstk_disable(void)
{
- if (option == ARCH_SHSTK_LOCK) {
-diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
-index 33a0ee3bcb2e..392dc11e3556 100644
---- a/include/linux/syscalls.h
-+++ b/include/linux/syscalls.h
-@@ -1058,6 +1058,7 @@ asmlinkage long sys_memfd_secret(unsigned int flags);
- asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
- unsigned long home_node,
- unsigned long flags);
-+asmlinkage long sys_map_shadow_stack(unsigned long addr, unsigned long size, unsigned int flags);
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+@@ -376,7 +417,7 @@ static int shstk_disable(void)
+ fpregs_unlock();
- /*
- * Architecture-specific system calls
-diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
-index 45fa180cc56a..b12940ec5926 100644
---- a/include/uapi/asm-generic/unistd.h
-+++ b/include/uapi/asm-generic/unistd.h
-@@ -887,7 +887,7 @@ __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
- __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
+ shstk_free(current);
+- features_clr(ARCH_SHSTK_SHSTK);
++ features_clr(ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS);
- #undef __NR_syscalls
--#define __NR_syscalls 451
-+#define __NR_syscalls 452
-
- /*
- * 32 bit systems traditionally used different
-diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
-index 860b2dcf3ac4..cb9aebd34646 100644
---- a/kernel/sys_ni.c
-+++ b/kernel/sys_ni.c
-@@ -381,6 +381,7 @@ COND_SYSCALL(vm86old);
- COND_SYSCALL(modify_ldt);
- COND_SYSCALL(vm86);
- COND_SYSCALL(kexec_file_load);
-+COND_SYSCALL(map_shadow_stack);
-
- /* s390 */
- COND_SYSCALL(s390_pci_mmio_read);
+ return 0;
+ }
--
-2.34.1
+2.17.1