Thread (2 messages) 2 messages, 2 authors, 2014-10-23

Re: [PATCHv5 1/3] syscalls,x86: implement execveat() system call

From: David Drysdale <hidden>
Date: 2014-10-23 06:41:10
Also in: linux-api, lkml

Possibly related (same subject, not in this thread)

On Wed, Oct 22, 2014 at 7:07 PM, Eric W. Biederman
[off-list ref] wrote:
David Drysdale [off-list ref] writes:
quoted
Add a new execveat(2) system call. execveat() is to execve() as
openat() is to open(): it takes a file descriptor that refers to a
directory, and resolves the filename relative to that.

In addition, if the filename is empty and AT_EMPTY_PATH is specified,
execveat() executes the file to which the file descriptor refers. This
replicates the functionality of fexecve(), which is a system call in
other UNIXen, but in Linux glibc it depends on opening
"/proc/self/fd/<fd>" (and so relies on /proc being mounted).

The filename fed to the executed program as argv[0] (or the name of the
script fed to a script interpreter) will be of the form "/dev/fd/<fd>"
(for an empty filename) or "/dev/fd/<fd>/<filename>", effectively
reflecting how the executable was found.  This does however mean that
execution of a script in a /proc-less environment won't work.

Only x86-64, i386 and x32 ABIs are supported in this patch.

Based on patches by Meredydd Luff [off-list ref]

Signed-off-by: David Drysdale <redacted>
---
 arch/x86/ia32/audit.c             |   1 +
 arch/x86/ia32/ia32entry.S         |   1 +
 arch/x86/kernel/audit_64.c        |   1 +
 arch/x86/kernel/entry_64.S        |  28 ++++++++
 arch/x86/syscalls/syscall_32.tbl  |   1 +
 arch/x86/syscalls/syscall_64.tbl  |   2 +
 arch/x86/um/sys_call_table_64.c   |   1 +
 fs/exec.c                         | 130 ++++++++++++++++++++++++++++++++++----
 fs/namei.c                        |   2 +-
 include/linux/compat.h            |   3 +
 include/linux/fs.h                |   1 +
 include/linux/sched.h             |   4 ++
 include/linux/syscalls.h          |   4 ++
 include/uapi/asm-generic/unistd.h |   4 +-
 kernel/sys_ni.c                   |   3 +
 lib/audit.c                       |   3 +
 16 files changed, 173 insertions(+), 16 deletions(-)
diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c
index 5d7b381da692..2eccc8932ae6 100644
--- a/arch/x86/ia32/audit.c
+++ b/arch/x86/ia32/audit.c
@@ -35,6 +35,7 @@ int ia32_classify_syscall(unsigned syscall)
      case __NR_socketcall:
              return 4;
      case __NR_execve:
+     case __NR_execveat:
              return 5;
      default:
              return 1;
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 4299eb05023c..2516c09743e0 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -464,6 +464,7 @@ GLOBAL(\label)
      PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
      PTREGSCALL stub32_sigreturn, sys32_sigreturn
      PTREGSCALL stub32_execve, compat_sys_execve
+     PTREGSCALL stub32_execveat, compat_sys_execveat
      PTREGSCALL stub32_fork, sys_fork
      PTREGSCALL stub32_vfork, sys_vfork
diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c
index 06d3e5a14d9d..f3672508b249 100644
--- a/arch/x86/kernel/audit_64.c
+++ b/arch/x86/kernel/audit_64.c
@@ -50,6 +50,7 @@ int audit_classify_syscall(int abi, unsigned syscall)
      case __NR_openat:
              return 3;
      case __NR_execve:
+     case __NR_execveat:
              return 5;
      default:
              return 0;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 2fac1343a90b..00c4526e6ffe 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -665,6 +665,20 @@ ENTRY(stub_execve)
      CFI_ENDPROC
 END(stub_execve)

+ENTRY(stub_execveat)
+     CFI_STARTPROC
+     addq $8, %rsp
+     PARTIAL_FRAME 0
+     SAVE_REST
+     FIXUP_TOP_OF_STACK %r11
+     call sys_execveat
+     RESTORE_TOP_OF_STACK %r11
+     movq %rax,RAX(%rsp)
+     RESTORE_REST
+     jmp int_ret_from_sys_call
+     CFI_ENDPROC
+END(stub_execveat)
+
 /*
  * sigreturn is special because it needs to restore all registers on return.
  * This cannot be done with SYSRET, so use the IRET return path instead.
@@ -710,6 +724,20 @@ ENTRY(stub_x32_execve)
      CFI_ENDPROC
 END(stub_x32_execve)

+ENTRY(stub_x32_execveat)
+     CFI_STARTPROC
+     addq $8, %rsp
+     PARTIAL_FRAME 0
+     SAVE_REST
+     FIXUP_TOP_OF_STACK %r11
+     call compat_sys_execveat
+     RESTORE_TOP_OF_STACK %r11
+     movq %rax,RAX(%rsp)
+     RESTORE_REST
+     jmp int_ret_from_sys_call
+     CFI_ENDPROC
+END(stub_x32_execveat)
+
 #endif

 /*
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index 028b78168d85..2633e3195455 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -363,3 +363,4 @@
 354  i386    seccomp                 sys_seccomp
 355  i386    getrandom               sys_getrandom
 356  i386    memfd_create            sys_memfd_create
+357  i386    execveat                sys_execveat                    stub32_execveat
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 35dd922727b9..1af5badd159c 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -327,6 +327,7 @@
 318  common  getrandom               sys_getrandom
 319  common  memfd_create            sys_memfd_create
 320  common  kexec_file_load         sys_kexec_file_load
+321  64      execveat                stub_execveat

 #
 # x32-specific system call numbers start at 512 to avoid cache impact
@@ -365,3 +366,4 @@
 542  x32     getsockopt              compat_sys_getsockopt
 543  x32     io_setup                compat_sys_io_setup
 544  x32     io_submit               compat_sys_io_submit
+545  x32     execveat                stub_x32_execveat
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index f2f0723070ca..20c3649d0691 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -31,6 +31,7 @@
 #define stub_fork sys_fork
 #define stub_vfork sys_vfork
 #define stub_execve sys_execve
+#define stub_execveat sys_execveat
 #define stub_rt_sigreturn sys_rt_sigreturn

 #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
diff --git a/fs/exec.c b/fs/exec.c
index a2b42a98c743..92a6e14f096a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -747,7 +747,7 @@ EXPORT_SYMBOL(setup_arg_pages);

 #endif /* CONFIG_MMU */

-static struct file *do_open_exec(struct filename *name)
+static struct file *do_open_execat(int fd, struct filename *name, int flags)
 {
      struct file *file;
      int err;
@@ -757,10 +757,34 @@ static struct file *do_open_exec(struct filename *name)
              .intent = LOOKUP_OPEN,
              .lookup_flags = LOOKUP_FOLLOW,
      };
+     static const struct open_flags open_exec_nofollow_flags = {
+             .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
+             .acc_mode = MAY_EXEC | MAY_OPEN,
+             .intent = LOOKUP_OPEN,
+             .lookup_flags = 0,
+     };

-     file = do_filp_open(AT_FDCWD, name, &open_exec_flags);
-     if (IS_ERR(file))
-             goto out;
+     if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+             return ERR_PTR(-EINVAL);
+
+     if (name->name[0] != '\0') {
Is it really necessary to special-case AT_EMPTY_PATH here?  I would
have thought the existing logic in namei.c would have been fine,
assuming we passed LOOKUP_EMPTY.
Just using do_filp_open() throughout looks mostly plausible on a quick
experiment, but my initial version appears to make O_PATH fds unexpectedly
fexecve()-able (I'm glad I had a test case for that).

I'll look for a way around that, hopefully without an explicit special case.
quoted
+             const struct open_flags *oflags = ((flags & AT_SYMLINK_NOFOLLOW)
+                                                ? &open_exec_nofollow_flags
+                                                : &open_exec_flags);
+
+             file = do_filp_open(fd, name, oflags);
+             if (IS_ERR(file))
+                     goto out;
+     } else {
+             file = fget(fd);
+             if (!file)
+                     return ERR_PTR(-EBADF);
+
+             err = inode_permission(file->f_path.dentry->d_inode,
+                             open_exec_flags.acc_mode);
+             if (err)
+                     goto exit;
+     }

      err = -EACCES;
      if (!S_ISREG(file_inode(file)->i_mode))
@@ -769,12 +793,13 @@ static struct file *do_open_exec(struct filename *name)
      if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
              goto exit;

-     fsnotify_open(file);
-
      err = deny_write_access(file);
      if (err)
              goto exit;

+     if (name->name[0] != '\0')
+             fsnotify_open(file);
+
 out:
      return file;
@@ -786,7 +811,7 @@ exit:
 struct file *open_exec(const char *name)
 {
      struct filename tmp = { .name = name };
-     return do_open_exec(&tmp);
+     return do_open_execat(AT_FDCWD, &tmp, 0);
 }
 EXPORT_SYMBOL(open_exec);
@@ -1422,10 +1447,12 @@ static int exec_binprm(struct linux_binprm *bprm)
 /*
  * sys_execve() executes a new program.
  */
-static int do_execve_common(struct filename *filename,
-                             struct user_arg_ptr argv,
-                             struct user_arg_ptr envp)
+static int do_execveat_common(int fd, struct filename *filename,
+                           struct user_arg_ptr argv,
+                           struct user_arg_ptr envp,
+                           int flags)
 {
+     char *pathbuf = NULL;
      struct linux_binprm *bprm;
      struct file *file;
      struct files_struct *displaced;
@@ -1466,7 +1493,7 @@ static int do_execve_common(struct filename *filename,
      check_unsafe_exec(bprm);
      current->in_execve = 1;

-     file = do_open_exec(filename);
+     file = do_open_execat(fd, filename, flags);
      retval = PTR_ERR(file);
      if (IS_ERR(file))
              goto out_unmark;
@@ -1474,7 +1501,27 @@ static int do_execve_common(struct filename *filename,
      sched_exec();

      bprm->file = file;
-     bprm->filename = bprm->interp = filename->name;
+     if (fd == AT_FDCWD || filename->name[0] == '/') {
+             bprm->filename = filename->name;
+     } else {
+             /*
+              * Build a pathname that reflects how we got to the file,
+              * either "/dev/fd/<fd>" (for an empty filename) or
+              * "/dev/fd/<fd>/<filename>".
+              */
+             pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
+             if (!pathbuf) {
+                     retval = -ENOMEM;
+                     goto out_unmark;
+             }
+             bprm->filename = pathbuf;
+             if (filename->name[0] == '\0')
+                     sprintf(pathbuf, "/dev/fd/%d", fd);
+             else
+                     snprintf(pathbuf, PATH_MAX,
+                              "/dev/fd/%d/%s", fd, filename->name);
+     }
+     bprm->interp = bprm->filename;

      retval = bprm_mm_init(bprm);
      if (retval)
@@ -1532,6 +1579,7 @@ out_unmark:

 out_free:
      free_bprm(bprm);
+     kfree(pathbuf);

 out_files:
      if (displaced)
@@ -1547,7 +1595,18 @@ int do_execve(struct filename *filename,
 {
      struct user_arg_ptr argv = { .ptr.native = __argv };
      struct user_arg_ptr envp = { .ptr.native = __envp };
-     return do_execve_common(filename, argv, envp);
+     return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
+}
+
+int do_execveat(int fd, struct filename *filename,
+             const char __user *const __user *__argv,
+             const char __user *const __user *__envp,
+             int flags)
+{
+     struct user_arg_ptr argv = { .ptr.native = __argv };
+     struct user_arg_ptr envp = { .ptr.native = __envp };
+
+     return do_execveat_common(fd, filename, argv, envp, flags);
 }

 #ifdef CONFIG_COMPAT
@@ -1563,7 +1622,23 @@ static int compat_do_execve(struct filename *filename,
              .is_compat = true,
              .ptr.compat = __envp,
      };
-     return do_execve_common(filename, argv, envp);
+     return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
+}
+
+static int compat_do_execveat(int fd, struct filename *filename,
+                           const compat_uptr_t __user *__argv,
+                           const compat_uptr_t __user *__envp,
+                           int flags)
+{
+     struct user_arg_ptr argv = {
+             .is_compat = true,
+             .ptr.compat = __argv,
+     };
+     struct user_arg_ptr envp = {
+             .is_compat = true,
+             .ptr.compat = __envp,
+     };
+     return do_execveat_common(fd, filename, argv, envp, flags);
 }
 #endif
@@ -1603,6 +1678,20 @@ SYSCALL_DEFINE3(execve,
 {
      return do_execve(getname(filename), argv, envp);
 }
+
+SYSCALL_DEFINE5(execveat,
+             int, fd, const char __user *, filename,
+             const char __user *const __user *, argv,
+             const char __user *const __user *, envp,
+             int, flags)
+{
+     int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
+
+     return do_execveat(fd,
+                        getname_flags(filename, lookup_flags, NULL),
+                        argv, envp, flags);
+}
+
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
      const compat_uptr_t __user *, argv,
@@ -1610,4 +1699,17 @@ COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
 {
      return compat_do_execve(getname(filename), argv, envp);
 }
+
+COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
+                    const char __user *, filename,
+                    const compat_uptr_t __user *, argv,
+                    const compat_uptr_t __user *, envp,
+                    int,  flags)
+{
+     int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
+
+     return compat_do_execveat(fd,
+                               getname_flags(filename, lookup_flags, NULL),
+                               argv, envp, flags);
+}
 #endif
diff --git a/fs/namei.c b/fs/namei.c
index a7b05bf82d31..553c84d3e0cc 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -130,7 +130,7 @@ void final_putname(struct filename *name)

 #define EMBEDDED_NAME_MAX    (PATH_MAX - sizeof(struct filename))

-static struct filename *
+struct filename *
 getname_flags(const char __user *filename, int flags, int *empty)
 {
      struct filename *result, *err;
diff --git a/include/linux/compat.h b/include/linux/compat.h
index e6494261eaff..7450ca2ac1fc 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -357,6 +357,9 @@ asmlinkage long compat_sys_lseek(unsigned int, compat_off_t, unsigned int);

 asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv,
                   const compat_uptr_t __user *envp);
+asmlinkage long compat_sys_execveat(int dfd, const char __user *filename,
+                  const compat_uptr_t __user *argv,
+                  const compat_uptr_t __user *envp, int flags);

 asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
              compat_ulong_t __user *outp, compat_ulong_t __user *exp,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 94187721ad41..e9818574d738 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2060,6 +2060,7 @@ extern struct file *file_open_root(struct dentry *, struct vfsmount *,
 extern struct file * dentry_open(const struct path *, int, const struct cred *);
 extern int filp_close(struct file *, fl_owner_t id);

+extern struct filename *getname_flags(const char __user *, int, int *);
 extern struct filename *getname(const char __user *);
 extern struct filename *getname_kernel(const char *);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b867a4dab38a..33e056da7d33 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2430,6 +2430,10 @@ extern void do_group_exit(int);
 extern int do_execve(struct filename *,
                   const char __user * const __user *,
                   const char __user * const __user *);
+extern int do_execveat(int, struct filename *,
+                    const char __user * const __user *,
+                    const char __user * const __user *,
+                    int);
 extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
 struct task_struct *fork_idle(int);
 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 0f86d85a9ce4..df5422294deb 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -876,4 +876,8 @@ asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
 asmlinkage long sys_getrandom(char __user *buf, size_t count,
                            unsigned int flags);

+asmlinkage long sys_execveat(int dfd, const char __user *filename,
+                     const char __user *const __user *argv,
+                     const char __user *const __user *envp, int flags);
+
 #endif
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 11d11bc5c78f..feef07d29663 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -705,9 +705,11 @@ __SYSCALL(__NR_seccomp, sys_seccomp)
 __SYSCALL(__NR_getrandom, sys_getrandom)
 #define __NR_memfd_create 279
 __SYSCALL(__NR_memfd_create, sys_memfd_create)
+#define __NR_execveat 280
+__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)

 #undef __NR_syscalls
-#define __NR_syscalls 280
+#define __NR_syscalls 281

 /*
  * All syscalls below here should go away really,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 391d4ddb6f4b..efb06058ad3e 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -218,3 +218,6 @@ cond_syscall(sys_kcmp);

 /* operate on Secure Computing state */
 cond_syscall(sys_seccomp);
+
+/* execveat */
+cond_syscall(sys_execveat);
diff --git a/lib/audit.c b/lib/audit.c
index 1d726a22565b..b8fb5ee81e26 100644
--- a/lib/audit.c
+++ b/lib/audit.c
@@ -54,6 +54,9 @@ int audit_classify_syscall(int abi, unsigned syscall)
      case __NR_socketcall:
              return 4;
 #endif
+#ifdef __NR_execveat
+     case __NR_execveat:
+#endif
      case __NR_execve:
              return 5;
      default:
Keyboard shortcuts
h: back out one level
j: next message in thread
k: previous message in thread
l: drill in
Esc: close help / fold thread tree
?: toggle this help