[PATCH 2/3] vmsplice: make vmsplice a trivial wrapper for preadv2/pwritev2
From: Askar Safin <hidden>
Date: 2026-05-31 01:02:44
Also in:
linux-fsdevel, linux-mm, linux-patches, lkml, netdev
Subsystem:
abi/api, filesystems (vfs and infrastructure), networking [general], the rest · Maintainers:
Alexander Viro, Christian Brauner, "David S. Miller", Eric Dumazet, Jakub Kicinski, Paolo Abeni, Linus Torvalds
The behavior of vmsplice on a writable pipe is now equivalent to pwritev2. The behavior of vmsplice on a readable pipe was already nearly equivalent to preadv2, but I made this explicit, i.e. I made it obvious from the code that vmsplice is now equivalent to preadv2/pwritev2. I also moved vmsplice to fs/read_write.c, because it arguably belongs there now. Note that the behavior of SPLICE_F_NONBLOCK has changed slightly: previously vmsplice ignored whether the pipe was opened with O_NONBLOCK, and the mode of operation depended only on whether SPLICE_F_NONBLOCK was passed. Now the operation is non-blocking if O_NONBLOCK was passed when opening the pipe *or* SPLICE_F_NONBLOCK was passed to vmsplice. The previous behavior was arguably buggy, and the new behavior is arguably better. SPLICE_F_GIFT is now always ignored by all 3 syscalls: splice, tee and vmsplice. Signed-off-by: Askar Safin <redacted> --- fs/read_write.c | 23 +++++ fs/splice.c | 192 +-------------------------------------- include/linux/skbuff.h | 4 +- include/linux/splice.h | 2 +- include/linux/syscalls.h | 4 +- 5 files changed, 29 insertions(+), 196 deletions(-)
diff --git a/fs/read_write.c b/fs/read_write.c
index 50bff7edc91f..1e5444f4dab3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c@@ -1213,6 +1213,29 @@ SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, return do_pwritev(fd, vec, vlen, pos, flags); } +/* + * Legacy preadv2/pwritev2 wrapper. + */ +SYSCALL_DEFINE4(vmsplice, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen, unsigned int, flags) +{ + if (unlikely(flags & ~SPLICE_F_ALL)) + return -EINVAL; + + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EBADF; + + /* We do do_writev/do_readv, so it is okay to pass "false" here */ + if (!get_pipe_info(fd_file(f), /* for_splice = */ false)) + return -EBADF; + + if (fd_file(f)->f_mode & FMODE_WRITE) + return do_writev(fd, vec, vlen, (flags & SPLICE_F_NONBLOCK) ? RWF_NOWAIT : 0); + else + return do_readv(fd, vec, vlen, (flags & SPLICE_F_NONBLOCK) ? RWF_NOWAIT : 0); +} + /* * Various compat syscalls. Note that they all pretend to take a native * iovec - import_iovec will properly treat those as compat_iovecs based on
diff --git a/fs/splice.c b/fs/splice.c
index 59adbc2fa4d6..b1a4e3713bd6 100644
--- a/fs/splice.c
+++ b/fs/splice.c@@ -159,22 +159,6 @@ const struct pipe_buf_operations page_cache_pipe_buf_ops = { .get = generic_pipe_buf_get, }; -static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) - return false; - - buf->flags |= PIPE_BUF_FLAG_LRU; - return generic_pipe_buf_try_steal(pipe, buf); -} - -static const struct pipe_buf_operations user_page_pipe_buf_ops = { - .release = page_cache_pipe_buf_release, - .try_steal = user_page_pipe_buf_try_steal, - .get = generic_pipe_buf_get, -}; - static void wakeup_pipe_readers(struct pipe_inode_info *pipe) { smp_mb();
@@ -589,8 +573,7 @@ static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_des * Description: * This function does little more than loop over the pipe and call * @actor to do the actual moving of a single struct pipe_buffer to - * the desired destination. See pipe_to_file, pipe_to_sendmsg, or - * pipe_to_user. + * the desired destination. See pipe_to_file or pipe_to_sendmsg. * */ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
@@ -1440,179 +1423,6 @@ static ssize_t __do_splice(struct file *in, loff_t __user *off_in, return ret; } -static ssize_t iter_to_pipe(struct iov_iter *from, - struct pipe_inode_info *pipe, - unsigned int flags) -{ - struct pipe_buffer buf = { - .ops = &user_page_pipe_buf_ops, - .flags = flags - }; - size_t total = 0; - ssize_t ret = 0; - - while (iov_iter_count(from)) { - struct page *pages[16]; - ssize_t left; - size_t start; - int i, n; - - left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start); - if (left <= 0) { - ret = left; - break; - } - - n = DIV_ROUND_UP(left + start, PAGE_SIZE); - for (i = 0; i < n; i++) { - int size = umin(left, PAGE_SIZE - start); - - buf.page = pages[i]; - buf.offset = start; - buf.len = size; - ret = add_to_pipe(pipe, &buf); - if (unlikely(ret < 0)) { - iov_iter_revert(from, left); - // this one got dropped by add_to_pipe() - while (++i < n) - put_page(pages[i]); - goto out; - } - total += ret; - left -= size; - start = 0; - } - } -out: - return total ? total : ret; -} - -static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, - struct splice_desc *sd) -{ - int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data); - return n == sd->len ? n : -EFAULT; -} - -/* - * For lack of a better implementation, implement vmsplice() to userspace - * as a simple copy of the pipe's pages to the user iov. - */ -static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter, - unsigned int flags) -{ - struct pipe_inode_info *pipe = get_pipe_info(file, true); - struct splice_desc sd = { - .total_len = iov_iter_count(iter), - .flags = flags, - .u.data = iter - }; - ssize_t ret = 0; - - if (!pipe) - return -EBADF; - - pipe_clear_nowait(file); - - if (sd.total_len) { - pipe_lock(pipe); - ret = __splice_from_pipe(pipe, &sd, pipe_to_user); - pipe_unlock(pipe); - } - - if (ret > 0) - fsnotify_access(file); - - return ret; -} - -/* - * vmsplice splices a user address range into a pipe. 
It can be thought of - * as splice-from-memory, where the regular splice is splice-from-file (or - * to file). In both cases the output is a pipe, naturally. - */ -static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter, - unsigned int flags) -{ - struct pipe_inode_info *pipe; - ssize_t ret = 0; - unsigned buf_flag = 0; - - if (flags & SPLICE_F_GIFT) - buf_flag = PIPE_BUF_FLAG_GIFT; - - pipe = get_pipe_info(file, true); - if (!pipe) - return -EBADF; - - pipe_clear_nowait(file); - - pipe_lock(pipe); - ret = wait_for_space(pipe, flags); - if (!ret) - ret = iter_to_pipe(iter, pipe, buf_flag); - pipe_unlock(pipe); - if (ret > 0) { - wakeup_pipe_readers(pipe); - fsnotify_modify(file); - } - return ret; -} - -/* - * Note that vmsplice only really supports true splicing _from_ user memory - * to a pipe, not the other way around. Splicing from user memory is a simple - * operation that can be supported without any funky alignment restrictions - * or nasty vm tricks. We simply map in the user memory and fill them into - * a pipe. The reverse isn't quite as easy, though. There are two possible - * solutions for that: - * - * - memcpy() the data internally, at which point we might as well just - * do a regular read() on the buffer anyway. - * - Lots of nasty vm tricks, that are neither fast nor flexible (it - * has restriction limitations on both ends of the pipe). - * - * Currently we punt and implement it as a normal copy, see pipe_to_user(). 
- * - */ -SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, - unsigned long, nr_segs, unsigned int, flags) -{ - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - struct iov_iter iter; - ssize_t error; - int type; - - if (unlikely(flags & ~SPLICE_F_ALL)) - return -EINVAL; - - CLASS(fd, f)(fd); - if (fd_empty(f)) - return -EBADF; - if (fd_file(f)->f_mode & FMODE_WRITE) - type = ITER_SOURCE; - else if (fd_file(f)->f_mode & FMODE_READ) - type = ITER_DEST; - else - return -EBADF; - - error = import_iovec(type, uiov, nr_segs, - ARRAY_SIZE(iovstack), &iov, &iter); - if (error < 0) - return error; - - if (!iov_iter_count(&iter)) - error = 0; - else if (type == ITER_SOURCE) - error = vmsplice_to_pipe(fd_file(f), &iter, flags); - else - error = vmsplice_to_user(fd_file(f), &iter, flags); - - kfree(iov); - return error; -} - SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2bcf78a4de7b..2961fee3e5cc 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h@@ -505,7 +505,7 @@ enum { SKBFL_ZEROCOPY_ENABLE = BIT(0), /* This indicates at least one fragment might be overwritten - * (as in vmsplice(), sendfile() ...) + * (as in sendfile(), ...) * If we need to compute a TX checksum, we'll need to copy * all frags to avoid possible bad checksum */
@@ -4017,7 +4017,7 @@ static inline int skb_linearize(struct sk_buff *skb) * @skb: buffer to test * * Return: true if the skb has at least one frag that might be modified - * by an external entity (as in vmsplice()/sendfile()) + * by an external entity (as in sendfile()) */ static inline bool skb_has_shared_frag(const struct sk_buff *skb) {
diff --git a/include/linux/splice.h b/include/linux/splice.h
index 9dec4861d09f..fb4f035aae83 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h@@ -19,7 +19,7 @@ /* we may still block on the fd we splice */ /* from/to, of course */ #define SPLICE_F_MORE (0x04) /* expect more data */ -#define SPLICE_F_GIFT (0x08) /* pages passed in are a gift */ +#define SPLICE_F_GIFT (0x08) /* ignored */ #define SPLICE_F_ALL (SPLICE_F_MOVE|SPLICE_F_NONBLOCK|SPLICE_F_MORE|SPLICE_F_GIFT)
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index f5639d5ac331..a86a88207956 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h@@ -514,8 +514,8 @@ asmlinkage long sys_ppoll_time32(struct pollfd __user *, unsigned int, struct old_timespec32 __user *, const sigset_t __user *, size_t); asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask, size_t sizemask, int flags); -asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, - unsigned long nr_segs, unsigned int flags); +asmlinkage long sys_vmsplice(unsigned long fd, const struct iovec __user *vec, + unsigned long vlen, unsigned int flags); asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, int fd_out, loff_t __user *off_out, size_t len, unsigned int flags);
--
2.47.3