Thread (49 messages) 49 messages, 5 authors, 2026-02-18

[PATCH v8] futex_waitv.2: new page

From: наб <hidden>
Date: 2026-02-14 19:30:46

On Sat, Feb 14, 2026 at 06:32:17PM +0100, Alejandro Colomar wrote:
quoted
diff --git u/man/man2/futex_waitv.2 p/man/man2/futex_waitv.2
new file mode 100644
index 000000000..a1eeb8ce8
--- /dev/null
+++ p/man/man2/futex_waitv.2
@@ -0,0 +1,421 @@
[...]
quoted
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/futex.h>" "  /* Definition of " FUTEX* " constants */"
Out of curiosity, why are some macros FUTEX2_* instead of FUTEX_*?
(if you know)
They call the futex_*() syscalls "futex2", in contrast
to the futex(FUTEX_*) family which is (retronymically) version 1 futex;
futex2-specific macros start with FUTEX2_, the original futex macros
started with just FUTEX_.
quoted
+.B EINVAL
+.B FUTEX2_NUMA
+was set in
+.IR waiters []. flags ,
+and the NUMA word
+(which is the same size as the futex word)
+is too small to contain the index of the biggest NUMA domain
+(for example,
+.B FUTEX2_SIZE_U8
+and there are more than 255 NUMA domains).
Is it 255 or 256?  I assume it's a 0-based index, so I'd expect there to
fit 256 indices in a u8.
kernel/futex/futex.h:
	int bits = 8 * futex_size(flags);  // 8
	u64 max = ~0ULL;                   // 0xFFFF`FFFF`FFFF`FFFF
	
	max >>= 64 - bits;                 // 0xFF
	if (nr_node_ids >= max)
		return false;
which is first true when nr_node_ids is 0xFF,
so "FUTEX2_SIZE_U8 and at least 255", actually.

Also this variable is "/possible/ NUMA domains" apparently.

Scissor-patch below.

Best,
-- >8 --
From: =?UTF-8?q?=D0=BD=D0=B0=D0=B1?= <redacted>
Date: Tue, 10 Feb 2026 21:32:19 +0100
Subject: [PATCH v8] futex_waitv.2: new page

Signed-off-by: Ahelenia Ziemiańska <redacted>
---
Range-diff against v7:
1:  da50b4733 ! 1:  c63cce1ec futex_waitv.2: new page
    @@ man/man2/futex_waitv.2 (new)
     +.I n
     +was not in the range
     +.RB [ 1 ,
    -+.I FUTEX_WAITV_MAX
    -+(128)].
    ++.IR FUTEX_WAITV_MAX ].
     +.TP
     +.B EINVAL
     +.I timeout
    @@ man/man2/futex_waitv.2 (new)
     +is too small to contain the index of the biggest NUMA domain
     +(for example,
     +.B FUTEX2_SIZE_U8
    -+and there are more than 255 NUMA domains).
    ++and there are at least 255 possible NUMA domains).
     +.TP
     +.B EINVAL
     +.B FUTEX2_NUMA
    @@ man/man2/futex_waitv.2 (new)
     +where
     +.I .val
     +and
    -+.I *.uaddr
    ++.I .uaddr[]
     +are 8, 16, or 64 bits are defined, but not implemented
     +.RB ( EINVAL ).
     +.SH HISTORY
    @@ man/man2/futex_waitv.2 (new)
     +.P
     +.\" SRC BEGIN (futex_waitv.c)
     +.EX
    ++#include <err.h>
     +#include <errno.h>
     +#include <inttypes.h>
     +#include <linux/futex.h>
    @@ man/man2/futex_waitv.2 (new)
     +	struct futex_waitv waiters[countof(futexes)] = {};
     +	int  i;
     +\&
    -+	getentropy(init, sizeof(init));
    ++	if(getentropy(init, sizeof(init)))
    ++		err(EXIT_FAILURE, "getentropy");
     +	init[0] = init[1] = init[2];
     +	for (i = 0; i < countof(futexes); ++i) {
     +		printf("%" PRIu8 "\[rs]t", init[i]);
     +		atomic_init(&futexes[i], init[i]);
    -+		pthread_create(&(pthread_t){}, NULL, worker, &futexes[i]);
    ++		pthread_create(&(pthread_t) {}, NULL, worker, &futexes[i]);
     +	}
    -+	putchar('\[rs]n');
    ++	putchar(\[aq]\[rs]n\[aq]);
     +\&
     +	for (i = 0; i < countof(futexes); ++i) {
     +		waiters[i].val   = futexes[i];
    -+		waiters[i].uaddr = (uintptr_t)&futexes[i];
    ++		waiters[i].uaddr = (uintptr_t) &futexes[i];
     +		waiters[i].flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE;
     +	}
     +	for (;;) {
    @@ man/man2/futex_waitv.2 (new)
     +		timeout.tv_sec += 1;
     +\&
     +		woke = my_futex_waitv(waiters, countof(futexes), 0, &timeout, CLOCK_MONOTONIC);
    -+		if (woke == -1 && (errno != EAGAIN && errno != EWOULDBLOCK))
    ++		if (woke == \-1 && (errno != EAGAIN && errno != EWOULDBLOCK))
     +			break;
     +\&
     +		for (i = 0; i < countof(futexes); ++i) {
     +			if (futexes[i] != waiters[i].val)
     +				printf("%" PRIu32 "%s", futexes[i], i == woke ? "!" : "");
    -+			putchar('\[rs]t');
    ++			putchar(\[aq]\[rs]t\[aq]);
     +		}
    -+		putchar('\[rs]n');
    ++		putchar(\[aq]\[rs]n\[aq]);
     +\&
     +		for (i = 0; i < countof(futexes); ++i)
     +			waiters[i].val = futexes[i];

 man/man2/futex_waitv.2 | 422 +++++++++++++++++++++++++++++++++++++++++
 man/man7/futex.7       |   9 +-
 2 files changed, 429 insertions(+), 2 deletions(-)
 create mode 100644 man/man2/futex_waitv.2
diff --git u/man/man2/futex_waitv.2 p/man/man2/futex_waitv.2
new file mode 100644
index 000000000..6835434b4
--- /dev/null
+++ p/man/man2/futex_waitv.2
@@ -0,0 +1,422 @@
+.\" Copyright, the authors of the Linux man-pages project
+.\"
+.\" SPDX-License-Identifier: MIT
+.\"
+.TH futex_waitv 2 (date) "Linux man-pages (unreleased)"
+.SH NAME
+futex_waitv \- wait for FUTEX_WAKE operation on multiple futexes
+.SH LIBRARY
+Standard C library
+.RI ( libc ,\~ \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/futex.h>" "  /* Definition of " FUTEX* " constants */"
+.BR "#include <sys/syscall.h>" "  /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.B #include <time.h>
+.P
+.BR "long syscall(" "unsigned int n;"
+.BI "             SYS_futex_waitv, struct futex_waitv " waiters [ n ],
+.BI "             unsigned int " n ", unsigned int " flags ,
+.BI "             const struct timespec *_Nullable " timeout ", clockid_t " clockid ");"
+.fi
+.P
+.EX
+.B "#include <linux/futex.h>"
+.P
+struct futex_waitv {
+    u64 val;        /* Expected value at \f[I]uaddr\f[] */
+    u64 uaddr;      /* User address to wait on */
+    u32 flags;      /* Flags for this waiter */
+    u32 __reserved; /* Align to u64 */
+};
+.EE
+.SH DESCRIPTION
+.\" This name is used internally in the kernel
+Implements the FUTEX_WAIT_MULTIPLE operation,
+analogous to a synchronous atomic parallel
+.BR FUTEX_WAIT (2const)
+or
+.B FUTEX_WAIT_PRIVATE
+on up to
+.B FUTEX_WAITV_MAX
+futex words.
+For an overview of futexes, see
+.BR futex (7);
+for a description of the general interface, see
+.BR futex (2);
+for general minutiae of futex waiting, see the page above.
+.P
+This operation tests that the values at the
+futex words pointed to by the addresses
+.IR waiters []. uaddr
+still contain respective expected values
+.IR waiters []. val ,
+and if so, sleeps waiting for a
+.BR FUTEX_WAKE (2const)
+operation on any of the futex words,
+and returns the index of
+.I a
+waiter whose futex was woken.
+.P
+If the thread starts to sleep,
+it is considered a waiter on all given futex words.
+If any of the futex values do not match their respective
+.IR waiters []. val ,
+the call fails immediately with the error
+.BR EAGAIN .
+.P
+If
+.I timeout
+is not NULL,
+.I *timeout
+specifies a deadline measured against clock
+.IR clockid .
+This deadline will be rounded up to the system clock granularity,
+and is guaranteed not to expire early.
+If
+.I timeout
+is NULL,
+the call blocks indefinitely.
+.P
+Futex words to monitor are given by
+.IR "struct futex_waitv" ,
+whose fields are analogous to
+.BR FUTEX_WAIT (2const)
+parameters, except
+.I .__reserved
+must be 0
+and
+.I .flags
+must contain one of
+.BI FUTEX2_SIZE_ *
+ORed with some of the flags below.
+.TP
+.B FUTEX2_SIZE_U32
+.I .val
+and
+.I .uaddr[]
+are 32-bit unsigned integers.
+.TP
+.B FUTEX2_NUMA
+The futex word is followed by another word of the same size
+.RI ( .uaddr
+points to
+.IR uint N _t[2]
+rather than
+.IR uint N _t ;
+the word is given by
+.IR .uaddr[1] ),
+which can be either
+.B FUTEX_NO_NODE
+(all bits set)
+or a NUMA node number.
+.IP
+If the NUMA word is
+.BR FUTEX_NO_NODE ,
+the node number of the processor the syscall executes on is written to it.
+(Except in an
+.B EINVAL
+or
+.B EFAULT
+condition, this happens to all waiters whose
+.I .flags
+have
+.B FUTEX2_NUMA
+set.)
+.IP
+Futexes are placed on the NUMA node given by the NUMA word.
+Futexes without this flag are placed on a random node.
+.\" commit cec199c5e39bde7191a08087cc3d002ccfab31ff
+.\" Author: Peter Zijlstra <peterz@infradead.org>
+.\" Date:   Wed Apr 16 18:29:16 2025 +0200
+.\"
+.\"     futex: Implement FUTEX2_NUMA
+.\"
+.\" FUTEX2_MPOL is not documented or used anywhere;
+.\" it's unclear to me what it does
+.\" (defined in commit c042c505210dc3453f378df432c10fff3d471bc5
+.\"  "futex: Implement FUTEX2_MPOL")
+.TP
+.B FUTEX2_PRIVATE
+By default, the futex is shared
+.RB "(like " FUTEX_WAIT (2const)),
+and can be accessed by multiple processes;
+this flag waits on a private futex word,
+where all users must use the same virtual memory map
+(like
+.BR FUTEX_WAIT_PRIVATE ;
+this most often means they are part of the same process).
+Private futexes are faster than shared ones.
+.P
+Programs should assign to
+.I .uaddr
+by casting a pointer to
+.BR uintptr_t .
+.\"
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"
+.SH RETURN VALUE
+Returns an index to an arbitrary entry in
+.I waiters
+corresponding to some woken-up futex.
+This implies no information about other waiters.
+.P
+On error,
+\-1 is returned,
+and
+.I errno
+is set to indicate the error.
+.\"
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"
+.SH ERRORS
+.TP
+.B EFAULT
+.I waiters
+points outside the accessible address space.
+.TP
+.B EFAULT
+.I timeout
+is not NULL and points outside the accessible address space.
+.TP
+.B EFAULT
+Any
+.IR waiters []. uaddr
+field points outside the accessible address space.
+.TP
+.B EINVAL
+Any
+.IR waiters []. uaddr
+field does not point to a valid object\[em]that is,
+the address is not aligned appropriately for the specified
+.BI FUTEX2_SIZE_ * .
+.TP
+.B EINVAL
+.I flags
+was not 0.
+.TP
+.B EINVAL
+.I n
+was not in the range
+.RB [ 1 ,
+.IR FUTEX_WAITV_MAX ].
+.TP
+.B EINVAL
+.I timeout
+was not NULL and
+.I clockid
+was not a valid clock
+.RB ( CLOCK_MONOTONIC
+or
+.BR CLOCK_REALTIME ).
+.TP
+.B EINVAL
+.I *timeout
+is denormal (before epoch or
+.I tv_nsec
+more than 999\[aq]999\[aq]999).
+.TP
+.B EINVAL
+Any
+.IR waiters []. flags
+field contains an unknown flag.
+.TP
+.B EINVAL
+Any
+.IR waiters []. flags
+field is missing a
+.BI FUTEX2_SIZE_ *
+flag or has a size flag different than
+.B FUTEX2_SIZE_U32
+set.
+.TP
+.B EINVAL
+Any
+.IR waiters []. __reserved
+field is not 0.
+.TP
+.B EINVAL
+Any
+.IR waiters []. val
+field has more bits set than permitted by the size flags.
+.TP
+.B EINVAL
+.B FUTEX2_NUMA
+was set in
+.IR waiters []. flags ,
+and the NUMA word
+(which is the same size as the futex word)
+is too small to contain the index of the biggest NUMA domain
+(for example,
+.B FUTEX2_SIZE_U8
+and there are at least 255 possible NUMA domains).
+.TP
+.B EINVAL
+.B FUTEX2_NUMA
+was set in
+.IR waiters []. flags ,
+and the NUMA word is larger than the maximum possible NUMA node and not
+.BR FUTEX_NO_NODE .
+.TP
+.B ETIMEDOUT
+.I timeout
+was not NULL and no futex was woken before the timeout elapsed.
+.TP
+.BR EAGAIN " or " EWOULDBLOCK
+The value pointed to by
+.I .uaddr
+was not equal to the expected value
+.I .val
+at the time of the call.
+.TP
+.B EINTR
+The
+operation was interrupted by a signal (see
+.BR signal (7)).
+.\"
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"
+.SH STANDARDS
+Linux.
+.SH NOTES
+.BR FUTEX2_SIZE_U8 ,
+.BR FUTEX2_SIZE_U16 ,
+and
+.B FUTEX2_SIZE_U64
+(where
+.I .val
+and
+.I .uaddr[]
+are 8, 16, or 64 bits) are defined, but not implemented
+.RB ( EINVAL ).
+.SH HISTORY
+.\" commit bf69bad38cf63d980e8a603f8d1bd1f85b5ed3d9
+.\" Author: André Almeida <andrealmeid@igalia.com>
+.\" Date:   Thu Sep 23 14:11:05 2021 -0300
+.\"
+.\"     futex: Implement sys_futex_waitv()
+Linux 5.16.
+.SH EXAMPLES
+The program below executes a linear-time operation on 10 threads,
+displaying the results in real time,
+waiting at most 1 second for each new result.
+The first 3 threads operate on the same data (complete in the same time).
+.B !\&
+indicates the futex that woke up each
+.BR futex_waitv ().
+.in +4
+.EX
+.RB $\~ ./futex_waitv
+153	153	153	237	100	245	177	127	215	61
+									122!
+				200!
+							254!
+306	306!
+		306!
+						354!
+								430!
+			474!
+					490!
+Connection timed out
+.EE
+.P
+.\" SRC BEGIN (futex_waitv.c)
+.EX
+#include <err.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <linux/futex.h>
+#include <pthread.h>
+#include <stdatomic.h>
+#include <stdcountof.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/syscall.h>
+#include <time.h>
+#include <unistd.h>
+\&
+static inline long
+my_futex_wait_private(_Atomic uint32_t *uaddr, uint32_t val)
+{
+	return syscall(SYS_futex, uaddr, FUTEX_WAKE_PRIVATE, val);
+}
+\&
+static inline long
+my_futex_waitv(unsigned int n;
+               struct futex_waitv waiters[n], unsigned int n,
+               unsigned int flags, const struct timespec *timeout,
+               clockid_t clockid)
+{
+	return syscall(SYS_futex_waitv, waiters, n, flags, timeout, clockid);
+}
+\&
+void *
+worker(void *arg)
+{
+	_Atomic uint32_t  *futex = arg;
+\&
+	usleep(*futex * 10000);
+	*futex *= 2;
+	my_futex_wait_private(futex, 1);
+	return NULL;
+}
+\&
+int
+main(void)
+{
+	_Atomic uint32_t  futexes[10];
+	uint8_t  init[countof(futexes)];
+	struct futex_waitv waiters[countof(futexes)] = {};
+	int  i;
+\&
+	if(getentropy(init, sizeof(init)))
+		err(EXIT_FAILURE, "getentropy");
+	init[0] = init[1] = init[2];
+	for (i = 0; i < countof(futexes); ++i) {
+		printf("%" PRIu8 "\[rs]t", init[i]);
+		atomic_init(&futexes[i], init[i]);
+		pthread_create(&(pthread_t) {}, NULL, worker, &futexes[i]);
+	}
+	putchar(\[aq]\[rs]n\[aq]);
+\&
+	for (i = 0; i < countof(futexes); ++i) {
+		waiters[i].val   = futexes[i];
+		waiters[i].uaddr = (uintptr_t) &futexes[i];
+		waiters[i].flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE;
+	}
+	for (;;) {
+		struct timespec  timeout;
+		int  woke;
+\&
+		clock_gettime(CLOCK_MONOTONIC, &timeout);
+		timeout.tv_sec += 1;
+\&
+		woke = my_futex_waitv(waiters, countof(futexes), 0, &timeout, CLOCK_MONOTONIC);
+		if (woke == \-1 && (errno != EAGAIN && errno != EWOULDBLOCK))
+			break;
+\&
+		for (i = 0; i < countof(futexes); ++i) {
+			if (futexes[i] != waiters[i].val)
+				printf("%" PRIu32 "%s", futexes[i], i == woke ? "!" : "");
+			putchar(\[aq]\[rs]t\[aq]);
+		}
+		putchar(\[aq]\[rs]n\[aq]);
+\&
+		for (i = 0; i < countof(futexes); ++i)
+			waiters[i].val = futexes[i];
+	}
+	fprintf(stderr, "%s\[rs]n", strerror(errno));
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR futex (2),
+.BR FUTEX_WAIT (2const),
+.BR FUTEX_WAKE (2const),
+.BR futex (7)
+.P
+Kernel source file
+.I Documentation/userspace-api/futex2.rst
diff --git u/man/man7/futex.7 p/man/man7/futex.7
index 51c5d5d9b..d271144ff 100644
--- u/man/man7/futex.7
+++ p/man/man7/futex.7
@@ -45,7 +45,9 @@ .SS Semantics
 Any futex operation starts in user space,
 but it may be necessary to communicate with the kernel using the
 .BR futex (2)
-system call.
+or
+.BR futex_waitv (2)
+system calls.
 .P
 To "up" a futex, execute the proper assembler instructions that
 will cause the host CPU to atomically increment the integer.
@@ -72,7 +74,9 @@ .SS Semantics
 .P
 The
 .BR futex (2)
-system call can optionally be passed a timeout specifying how long
+and
+.BR futex_waitv (2)
+system calls can optionally be passed a timeout specifying how long
 the kernel should
 wait for the futex to be upped.
 In this case, semantics are more complex and the programmer is referred
@@ -107,6 +111,7 @@ .SH NOTES
 .SH SEE ALSO
 .BR clone (2),
 .BR futex (2),
+.BR futex_waitv (2),
 .BR get_robust_list (2),
 .BR set_robust_list (2),
 .BR set_tid_address (2),
-- 
2.39.5

Attachments

Keyboard shortcuts
h: back out one level
j: next message in thread
k: previous message in thread
l: drill in
Esc: close help / fold thread tree
?: toggle this help