[PATCH v5 02/24] rseq: Introduce extensible rseq ABI
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: 2022-11-03 20:05:02
Also in:
lkml
Subsystem:
ptrace support, restartable sequences support, scheduler, the rest · Maintainers:
Oleg Nesterov, Mathieu Desnoyers, Peter Zijlstra, "Paul E. McKenney", Boqun Feng, Ingo Molnar, Juri Lelli, Vincent Guittot, Linus Torvalds
Introduce the extensible rseq ABI, where the feature size supported by the kernel and the required alignment are communicated to user-space through ELF auxiliary vectors. This allows user-space to call rseq registration with a rseq_len of either 32 bytes for the original struct rseq size (which includes padding), or larger. If rseq_len is larger than 32 bytes, then it must be large enough to contain the feature size communicated to user-space through ELF auxiliary vectors. Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> --- Changes since v4: - Accept original rseq alignment for original rseq size. --- include/linux/sched.h | 4 ++++ kernel/ptrace.c | 2 +- kernel/rseq.c | 37 ++++++++++++++++++++++++++++++------- 3 files changed, 35 insertions(+), 8 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 23de7fe86cc4..2a9e14e3e668 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h@@ -1305,6 +1305,7 @@ struct task_struct { #ifdef CONFIG_RSEQ struct rseq __user *rseq; + u32 rseq_len; u32 rseq_sig; /* * RmW on rseq_event_mask must be performed atomically
@@ -2355,10 +2356,12 @@ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { if (clone_flags & CLONE_VM) { t->rseq = NULL; + t->rseq_len = 0; t->rseq_sig = 0; t->rseq_event_mask = 0; } else { t->rseq = current->rseq; + t->rseq_len = current->rseq_len; t->rseq_sig = current->rseq_sig; t->rseq_event_mask = current->rseq_event_mask; }
@@ -2367,6 +2370,7 @@ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) static inline void rseq_execve(struct task_struct *t) { t->rseq = NULL; + t->rseq_len = 0; t->rseq_sig = 0; t->rseq_event_mask = 0; }
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 54482193e1ed..0786450074c1 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c@@ -813,7 +813,7 @@ static long ptrace_get_rseq_configuration(struct task_struct *task, { struct ptrace_rseq_configuration conf = { .rseq_abi_pointer = (u64)(uintptr_t)task->rseq, - .rseq_abi_size = sizeof(*task->rseq), + .rseq_abi_size = task->rseq_len, .signature = task->rseq_sig, .flags = 0, };
diff --git a/kernel/rseq.c b/kernel/rseq.c
index bda8175f8f99..c1058b3f10ac 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c@@ -18,6 +18,9 @@ #define CREATE_TRACE_POINTS #include <trace/events/rseq.h> +/* The original rseq structure size (including padding) is 32 bytes. */ +#define ORIG_RSEQ_SIZE 32 + #define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
@@ -87,10 +90,15 @@ static int rseq_update_cpu_id(struct task_struct *t) u32 cpu_id = raw_smp_processor_id(); struct rseq __user *rseq = t->rseq; - if (!user_write_access_begin(rseq, sizeof(*rseq))) + if (!user_write_access_begin(rseq, t->rseq_len)) goto efault; unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end); unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end); + /* + * Additional feature fields added after ORIG_RSEQ_SIZE + * need to be conditionally updated only if + * t->rseq_len != ORIG_RSEQ_SIZE. + */ user_write_access_end(); trace_rseq_update(t); return 0;
@@ -117,6 +125,11 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t) */ if (put_user(cpu_id, &t->rseq->cpu_id)) return -EFAULT; + /* + * Additional feature fields added after ORIG_RSEQ_SIZE + * need to be conditionally reset only if + * t->rseq_len != ORIG_RSEQ_SIZE. + */ return 0; }
@@ -329,7 +342,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, /* Unregister rseq for current thread. */ if (current->rseq != rseq || !current->rseq) return -EINVAL; - if (rseq_len != sizeof(*rseq)) + if (rseq_len != current->rseq_len) return -EINVAL; if (current->rseq_sig != sig) return -EPERM;
@@ -338,6 +351,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, return ret; current->rseq = NULL; current->rseq_sig = 0; + current->rseq_len = 0; return 0; }
@@ -350,7 +364,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, * the provided address differs from the prior * one. */ - if (current->rseq != rseq || rseq_len != sizeof(*rseq)) + if (current->rseq != rseq || rseq_len != current->rseq_len) return -EINVAL; if (current->rseq_sig != sig) return -EPERM;
@@ -359,15 +373,24 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, } /* - * If there was no rseq previously registered, - * ensure the provided rseq is properly aligned and valid. + * If there was no rseq previously registered, ensure the provided rseq + * is properly aligned, as communcated to user-space through the ELF + * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq + * size, the required alignment is the original struct rseq alignment. + * + * In order to be valid, rseq_len is either the original rseq size, or + * large enough to contain all supported fields, as communicated to + * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. */ - if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || - rseq_len != sizeof(*rseq)) + if (rseq_len < ORIG_RSEQ_SIZE || + (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) || + (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || + rseq_len < offsetof(struct rseq, end)))) return -EINVAL; if (!access_ok(rseq, rseq_len)) return -EFAULT; current->rseq = rseq; + current->rseq_len = rseq_len; current->rseq_sig = sig; /* * If rseq was previously inactive, and has just been
--
2.25.1