[PATCH v8] futex_waitv.2: new page
From: наб <hidden>
Date: 2026-02-14 19:30:46
On Sat, Feb 14, 2026 at 06:32:17PM +0100, Alejandro Colomar wrote:
quoted
diff --git u/man/man2/futex_waitv.2 p/man/man2/futex_waitv.2 new file mode 100644 index 000000000..a1eeb8ce8 --- /dev/null +++ p/man/man2/futex_waitv.2@@ -0,0 +1,421 @@[...]quoted
+.SH SYNOPSIS +.nf +.BR "#include <linux/futex.h>" " /* Definition of " FUTEX* " constants */"
Out of curiosity, why are some macros FUTEX2_* instead of FUTEX_*? (if you know)
They call the futex_*() syscalls "futex2", in contrast to the futex(FUTEX_*) family which is (retronymically) version 1 futex; futex2-specific macros start with FUTEX2_, the original futex macros started with just FUTEX_.
quoted
+.B EINVAL +.B FUTEX2_NUMA +was set in +.IR waiters []. flags , +and the NUMA word +(which is the same size as the futex word) +is too small to contain the index of the biggest NUMA domain +(for example, +.B FUTEX2_SIZE_U8 +and there are more than 255 NUMA domains).
Is it 255 or 256? I assume it's a 0-based index, so I'd expect there to fit 256 indices in a u8.
kernel/futex/futex.h:
int bits = 8 * futex_size(flags); // 8
u64 max = ~0ULL; // 0xFFFF`FFFF`FFFF`FFFF
max >>= 64 - bits; // 0xFF
if (nr_node_ids >= max)
return false;
which is first true when nr_node_ids is 0xFF,
so "FUTEX2_SIZE_U8 and at least 255", actually.
Also this variable is "/possible/ NUMA domains" apparently.
Scissor-patch below.
Best,
-- >8 --
From: =?UTF-8?q?=D0=BD=D0=B0=D0=B1?= <redacted>
Date: Tue, 10 Feb 2026 21:32:19 +0100
Subject: [PATCH v8] futex_waitv.2: new page
Signed-off-by: Ahelenia Ziemiańska <redacted>
---
Range-diff against v7:
1: da50b4733 ! 1: c63cce1ec futex_waitv.2: new page
@@ man/man2/futex_waitv.2 (new)
+.I n
+was not in the range
+.RB [ 1 ,
-+.I FUTEX_WAITV_MAX
-+(128)].
++.IR FUTEX_WAITV_MAX ].
+.TP
+.B EINVAL
+.I timeout
@@ man/man2/futex_waitv.2 (new)
+is too small to contain the index of the biggest NUMA domain
+(for example,
+.B FUTEX2_SIZE_U8
-+and there are more than 255 NUMA domains).
++and there are at least 255 possible NUMA domains).
+.TP
+.B EINVAL
+.B FUTEX2_NUMA
@@ man/man2/futex_waitv.2 (new)
+where
+.I .val
+and
-+.I *.uaddr
++.I .uaddr[]
+are 8, 16, or 64 bits are defined, but not implemented
+.RB ( EINVAL ).
+.SH HISTORY
@@ man/man2/futex_waitv.2 (new)
+.P
+.\" SRC BEGIN (futex_waitv.c)
+.EX
++#include <err.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <linux/futex.h>
@@ man/man2/futex_waitv.2 (new)
+ struct futex_waitv waiters[countof(futexes)] = {};
+ int i;
+\&
-+ getentropy(init, sizeof(init));
++ if(getentropy(init, sizeof(init)))
++ err(EXIT_FAILURE, "getentropy");
+ init[0] = init[1] = init[2];
+ for (i = 0; i < countof(futexes); ++i) {
+ printf("%" PRIu8 "\[rs]t", init[i]);
+ atomic_init(&futexes[i], init[i]);
-+ pthread_create(&(pthread_t){}, NULL, worker, &futexes[i]);
++ pthread_create(&(pthread_t) {}, NULL, worker, &futexes[i]);
+ }
-+ putchar('\[rs]n');
++ putchar(\[aq]\[rs]n\[aq]);
+\&
+ for (i = 0; i < countof(futexes); ++i) {
+ waiters[i].val = futexes[i];
-+ waiters[i].uaddr = (uintptr_t)&futexes[i];
++ waiters[i].uaddr = (uintptr_t) &futexes[i];
+ waiters[i].flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE;
+ }
+ for (;;) {
@@ man/man2/futex_waitv.2 (new)
+ timeout.tv_sec += 1;
+\&
+ woke = my_futex_waitv(waiters, countof(futexes), 0, &timeout, CLOCK_MONOTONIC);
-+ if (woke == -1 && (errno != EAGAIN && errno != EWOULDBLOCK))
++ if (woke == \-1 && (errno != EAGAIN && errno != EWOULDBLOCK))
+ break;
+\&
+ for (i = 0; i < countof(futexes); ++i) {
+ if (futexes[i] != waiters[i].val)
+ printf("%" PRIu32 "%s", futexes[i], i == woke ? "!" : "");
-+ putchar('\[rs]t');
++ putchar(\[aq]\[rs]t\[aq]);
+ }
-+ putchar('\[rs]n');
++ putchar(\[aq]\[rs]n\[aq]);
+\&
+ for (i = 0; i < countof(futexes); ++i)
+ waiters[i].val = futexes[i];
man/man2/futex_waitv.2 | 422 +++++++++++++++++++++++++++++++++++++++++
man/man7/futex.7 | 9 +-
2 files changed, 429 insertions(+), 2 deletions(-)
create mode 100644 man/man2/futex_waitv.2
diff --git u/man/man2/futex_waitv.2 p/man/man2/futex_waitv.2
new file mode 100644
index 000000000..6835434b4
--- /dev/null
+++ p/man/man2/futex_waitv.2@@ -0,0 +1,422 @@ +.\" Copyright, the authors of the Linux man-pages project +.\" +.\" SPDX-License-Identifier: MIT +.\" +.TH futex_waitv 2 (date) "Linux man-pages (unreleased)" +.SH NAME +futex_waitv \- wait for FUTEX_WAKE operation on multiple futexes +.SH LIBRARY +Standard C library +.RI ( libc ,\~ \-lc ) +.SH SYNOPSIS +.nf +.BR "#include <linux/futex.h>" " /* Definition of " FUTEX* " constants */" +.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */" +.B #include <unistd.h> +.B #include <time.h> +.P +.BR "long syscall(" "unsigned int n;" +.BI " SYS_futex_waitv, struct futex_waitv " waiters [ n ], +.BI " unsigned int " n ", unsigned int " flags , +.BI " const struct timespec *_Nullable " timeout ", clockid_t " clockid ");" +.fi +.P +.EX +.B "#include <linux/futex.h>" +.P +struct futex_waitv { + u64 val; /* Expected value at \f[I]uaddr\f[] */ + u64 uaddr; /* User address to wait on */ + u32 flags; /* Flags for this waiter */ + u32 __reserved; /* Align to u64 */ +}; +.EE +.SH DESCRIPTION +.\" This name is used internally in the kernel +Implements the FUTEX_WAIT_MULTIPLE operation, +analogous to a synchronous atomic parallel +.BR FUTEX_WAIT (2const) +or +.B FUTEX_WAIT_PRIVATE +on up to +.B FUTEX_WAITV_MAX +futex words. +For an overview of futexes, see +.BR futex (7); +for a description of the general interface, see +.BR futex (2); +for general minutiae of futex waiting, see the page above. +.P +This operation tests that the values at the +futex words pointed to by the addresses +.IR waiters []. uaddr +still contain respective expected values +.IR waiters []. val , +and if so, sleeps waiting for a +.BR FUTEX_WAKE (2const) +operation on any of the futex words, +and returns the index of +.I a +waiter whose futex was woken. +.P +If the thread starts to sleep, +it is considered a waiter on all given futex words. +If any of the futex values do not match their respective +.IR waiters []. 
val , +the call fails immediately with the error +.BR EAGAIN . +.P +If +.I timeout +is not NULL, +.I *timeout +specifies a deadline measured against clock +.IR clockid . +This deadline will be rounded up to the system clock granularity, +and is guaranteed not to expire early. +If +.I timeout +is NULL, +the call blocks indefinitely. +.P +Futex words to monitor are given by +.IR "struct futex_waitv" , +whose fields are analogous to +.BR FUTEX_WAIT (2const) +parameters, except +.I .__reserved +must be 0 +and +.I .flags +must contain one of +.BI FUTEX2_SIZE_ * +ORed with some of the flags below. +.TP +.B FUTEX2_SIZE_U32 +.I .val +and +.I .uaddr[] +are 32-bit unsigned integers. +.TP +.B FUTEX2_NUMA +The futex word is followed by another word of the same size +.RI ( .uaddr +points to +.IR uint N _t[2] +rather than +.IR uint N _t . +The word is given by +.IR .uaddr[1] ), +which can be either +.B FUTEX_NO_NODE +(all bits set) +or a NUMA node number. +.IP +If the NUMA word is +.BR FUTEX_NO_NODE , +the node number of the processor the syscall executes on is written to it. +(Except in an +.B EINVAL +or +.B EFAULT +condition, this happens to all waiters whose +.I .flags +have +.B FUTEX2_NUMA +set.) +.IP +Futexes are placed on the NUMA node given by the NUMA word. +Futexes without this flag are placed on a random node. 
+.\" commit cec199c5e39bde7191a08087cc3d002ccfab31ff +.\" Author: Peter Zijlstra <peterz@infradead.org> +.\" Date: Wed Apr 16 18:29:16 2025 +0200 +.\" +.\" futex: Implement FUTEX2_NUMA +.\" +.\" FUTEX2_MPOL is not documented or used anywhere; +.\" it's unclear to me what it does +.\" (defined in commit c042c505210dc3453f378df432c10fff3d471bc5 +.\" "futex: Implement FUTEX2_MPOL") +.TP +.B FUTEX2_PRIVATE +By default, the futex is shared +.RB "(like " FUTEX_WAIT (2const)), +and can be accessed by multiple processes; +this flag waits on a private futex word, +where all users must use the same virtual memory map +(like +.BR FUTEX_WAIT_PRIVATE ; +this most often means they are part of the same process). +Private futexes are faster than shared ones. +.P +Programs should assign to +.I .uaddr +by casting a pointer to +.BR uintptr_t . +.\" +.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.\" +.SH RETURN VALUE +Returns an index to an arbitrary entry in +.I waiters +corresponding to some woken-up futex. +This implies no information about other waiters. +.P +On error, +\-1 is returned, +and +.I errno +is set to indicate the error. +.\" +.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.\" +.SH ERRORS +.TP +.B EFAULT +.I waiters +points outside the accessible address space. +.TP +.B EFAULT +.I timeout +is not NULL and points outside the accessible address space. +.TP +.B EFAULT +Any +.IR waiters []. uaddr +field points outside the accessible address space. +.TP +.B EINVAL +Any +.IR waiters []. uaddr +field does not point to a valid object\[em]that is, +the address is not aligned appropriately for the specified +.BI FUTEX2_SIZE_ * . +.TP +.B EINVAL +.I flags +was not 0. +.TP +.B EINVAL +.I n +was not in the range +.RB [ 1 , +.IR FUTEX_WAITV_MAX ]. +.TP +.B EINVAL +.I timeout +was not NULL and +.I clockid +was not a valid clock +.RB ( CLOCK_MONOTONIC +or +.BR CLOCK_REALTIME ). 
+.TP +.B EINVAL +.I *timeout +is denormal (before epoch or +.I tv_nsec +more than 999\[aq]999\[aq]999). +.TP +.B EINVAL +Any +.IR waiters []. flags +field contains an unknown flag. +.TP +.B EINVAL +Any +.IR waiters []. flags +field is missing a +.BI FUTEX2_SIZE_ * +flag or has a size flag different than +.B FUTEX2_SIZE_U32 +set. +.TP +.B EINVAL +Any +.IR waiters []. __reserved +field is not 0. +.TP +.B EINVAL +Any +.IR waiters []. val +field has more bits set than permitted by the size flags. +.TP +.B EINVAL +.B FUTEX2_NUMA +was set in +.IR waiters []. flags , +and the NUMA word +(which is the same size as the futex word) +is too small to contain the index of the biggest NUMA domain +(for example, +.B FUTEX2_SIZE_U8 +and there are at least 255 possible NUMA domains). +.TP +.B EINVAL +.B FUTEX2_NUMA +was set in +.IR waiters []. flags , +and the NUMA word is larger than the maximum possible NUMA node and not +.BR FUTEX_NO_NODE . +.TP +.B ETIMEDOUT +.I timeout +was not NULL and no futex was woken before the timeout elapsed. +.TP +.BR EAGAIN " or " EWOULDBLOCK +The value pointed to by +.I .uaddr +was not equal to the expected value +.I .val +at the time of the call. +.TP +.B EINTR +The +operation was interrupted by a signal (see +.BR signal (7)). +.\" +.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.\" +.SH STANDARDS +Linux. +.SH NOTES +.BR FUTEX2_SIZE_U8 , +.BR FUTEX2_SIZE_U16 , +and +.B FUTEX2_SIZE_U64 +where +.I .val +and +.I .uaddr[] +are 8, 16, or 64 bits are defined, but not implemented +.RB ( EINVAL ). +.SH HISTORY +.\" commit bf69bad38cf63d980e8a603f8d1bd1f85b5ed3d9 +.\" Author: André Almeida <andrealmeid@igalia.com> +.\" Date: Thu Sep 23 14:11:05 2021 -0300 +.\" +.\" futex: Implement sys_futex_waitv() +Linux 5.16. +.SH EXAMPLES +The program below executes a linear-time operation on 10 threads, +displaying the results in real time, +waiting at most 1 second for each new result. 
+The first 3 threads operate on the same data (complete in the same time). +.B !\& +indicates the futex that woke up each +.BR futex_waitv (). +.in +4 +.EX +.RB $\~ ./futex_waitv +153 153 153 237 100 245 177 127 215 61 + 122! + 200! + 254! +306 306! + 306! + 354! + 430! + 474! + 490! +Connection timed out +.EE +.P +.\" SRC BEGIN (futex_waitv.c) +.EX +#include <err.h> +#include <errno.h> +#include <inttypes.h> +#include <linux/futex.h> +#include <pthread.h> +#include <stdatomic.h> +#include <stdcountof.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/syscall.h> +#include <time.h> +#include <unistd.h> +\& +static inline long +my_futex_wait_private(_Atomic uint32_t *uaddr, uint32_t val) +{ + return syscall(SYS_futex, uaddr, FUTEX_WAKE_PRIVATE, val); +} +\& +static inline long +my_futex_waitv(unsigned int n; + struct futex_waitv waiters[n], unsigned int n, + unsigned int flags, const struct timespec *timeout, + clockid_t clockid) +{ + return syscall(SYS_futex_waitv, waiters, n, flags, timeout, clockid); +} +\& +void * +worker(void *arg) +{ + _Atomic uint32_t *futex = arg; +\& + usleep(*futex * 10000); + *futex *= 2; + my_futex_wait_private(futex, 1); + return NULL; +} +\& +int +main(void) +{ + _Atomic uint32_t futexes[10]; + uint8_t init[countof(futexes)]; + struct futex_waitv waiters[countof(futexes)] = {}; + int i; +\& + if(getentropy(init, sizeof(init))) + err(EXIT_FAILURE, "getentropy"); + init[0] = init[1] = init[2]; + for (i = 0; i < countof(futexes); ++i) { + printf("%" PRIu8 "\[rs]t", init[i]); + atomic_init(&futexes[i], init[i]); + pthread_create(&(pthread_t) {}, NULL, worker, &futexes[i]); + } + putchar(\[aq]\[rs]n\[aq]); +\& + for (i = 0; i < countof(futexes); ++i) { + waiters[i].val = futexes[i]; + waiters[i].uaddr = (uintptr_t) &futexes[i]; + waiters[i].flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE; + } + for (;;) { + struct timespec timeout; + int woke; +\& + clock_gettime(CLOCK_MONOTONIC, &timeout); + 
timeout.tv_sec += 1; +\& + woke = my_futex_waitv(waiters, countof(futexes), 0, &timeout, CLOCK_MONOTONIC); + if (woke == \-1 && (errno != EAGAIN && errno != EWOULDBLOCK)) + break; +\& + for (i = 0; i < countof(futexes); ++i) { + if (futexes[i] != waiters[i].val) + printf("%" PRIu32 "%s", futexes[i], i == woke ? "!" : ""); + putchar(\[aq]\[rs]t\[aq]); + } + putchar(\[aq]\[rs]n\[aq]); +\& + for (i = 0; i < countof(futexes); ++i) + waiters[i].val = futexes[i]; + } + fprintf(stderr, "%s\[rs]n", strerror(errno)); +} +.EE +.\" SRC END +.SH SEE ALSO +.BR futex (2), +.BR FUTEX_WAIT (2const), +.BR FUTEX_WAKE (2const), +.BR futex (7) +.P +Kernel source file +.I Documentation/userspace-api/futex2.rst
diff --git u/man/man7/futex.7 p/man/man7/futex.7
index 51c5d5d9b..d271144ff 100644
--- u/man/man7/futex.7
+++ p/man/man7/futex.7@@ -45,7 +45,9 @@ .SS Semantics Any futex operation starts in user space, but it may be necessary to communicate with the kernel using the .BR futex (2) -system call. +or +.BR futex_waitv (2) +system calls. .P To "up" a futex, execute the proper assembler instructions that will cause the host CPU to atomically increment the integer.
@@ -72,7 +74,9 @@ .SS Semantics .P The .BR futex (2) -system call can optionally be passed a timeout specifying how long +and +.BR futex_waitv (2) +system calls can optionally be passed a timeout specifying how long the kernel should wait for the futex to be upped. In this case, semantics are more complex and the programmer is referred
@@ -107,6 +111,7 @@ .SH NOTES .SH SEE ALSO .BR clone (2), .BR futex (2), +.BR futex_waitv (2), .BR get_robust_list (2), .BR set_robust_list (2), .BR set_tid_address (2),
--
2.39.5
Attachments
- signature.asc [application/pgp-signature] 833 bytes