Thread (65 messages) 65 messages, 9 authors, 2019-06-17

Re: [PATCH 3/7] vfs: Add a mount-notification facility

From: Jann Horn <jannh@google.com>
Date: 2019-05-28 20:06:57
Also in: keyrings, linux-api, linux-block, linux-fsdevel, lkml

On Tue, May 28, 2019 at 6:05 PM David Howells [off-list ref] wrote:
Add a mount notification facility whereby notifications about changes in
mount topology and configuration can be received.  Note that this only
covers vfsmount topology changes and not superblock events.  A separate
facility will be added for that.
[...]
quoted hunk ↗ jump to hunk
@@ -172,4 +167,18 @@ static inline void notify_mount(struct mount *changed,
                                u32 info_flags)
 {
        atomic_inc(&changed->mnt_notify_counter);
+
+#ifdef CONFIG_MOUNT_NOTIFICATIONS
+       {
+               struct mount_notification n = {
+                       .watch.type     = WATCH_TYPE_MOUNT_NOTIFY,
+                       .watch.subtype  = subtype,
+                       .watch.info     = info_flags | sizeof(n),
+                       .triggered_on   = changed->mnt_id,
+                       .changed_mount  = aux ? aux->mnt_id : 0,
+               };
+
+               post_mount_notification(changed, &n);
+       }
+#endif
 }
[...]
+void post_mount_notification(struct mount *changed,
+                            struct mount_notification *notify)
+{
+       const struct cred *cred = current_cred();
This current_cred() looks bogus to me. Can't mount topology changes
come from all sorts of places? For example, umount_mnt() from
umount_tree() from dissolve_on_fput() from __fput(), which could
happen pretty much anywhere depending on where the last reference gets
dropped?
+       struct path cursor;
+       struct mount *mnt;
+       unsigned seq;
+
+       seq = 0;
+       rcu_read_lock();
+restart:
+       cursor.mnt = &changed->mnt;
+       cursor.dentry = changed->mnt.mnt_root;
+       mnt = real_mount(cursor.mnt);
+       notify->watch.info &= ~WATCH_INFO_IN_SUBTREE;
+
+       read_seqbegin_or_lock(&rename_lock, &seq);
+       for (;;) {
+               if (mnt->mnt_watchers &&
+                   !hlist_empty(&mnt->mnt_watchers->watchers)) {
+                       if (cursor.dentry->d_flags & DCACHE_MOUNT_WATCH)
+                               post_watch_notification(mnt->mnt_watchers,
+                                                       &notify->watch, cred,
+                                                       (unsigned long)cursor.dentry);
+               } else {
+                       cursor.dentry = mnt->mnt.mnt_root;
+               }
+               notify->watch.info |= WATCH_INFO_IN_SUBTREE;
+
+               if (cursor.dentry == cursor.mnt->mnt_root ||
+                   IS_ROOT(cursor.dentry)) {
+                       struct mount *parent = READ_ONCE(mnt->mnt_parent);
+
+                       /* Escaped? */
+                       if (cursor.dentry != cursor.mnt->mnt_root)
+                               break;
+
+                       /* Global root? */
+                       if (mnt != parent) {
+                               cursor.dentry = READ_ONCE(mnt->mnt_mountpoint);
+                               mnt = parent;
+                               cursor.mnt = &mnt->mnt;
+                               continue;
+                       }
+                       break;
(nit: this would look clearer if you inverted the condition and wrote
it as "if (mnt == parent) break;", then you also wouldn't need that
"continue" or the braces)
+               }
+
+               cursor.dentry = cursor.dentry->d_parent;
+       }
+
+       if (need_seqretry(&rename_lock, seq)) {
+               seq = 1;
+               goto restart;
+       }
+
+       done_seqretry(&rename_lock, seq);
+       rcu_read_unlock();
+}
[...]
+SYSCALL_DEFINE5(mount_notify,
+               int, dfd,
+               const char __user *, filename,
+               unsigned int, at_flags,
+               int, watch_fd,
+               int, watch_id)
+{
+       struct watch_queue *wqueue;
+       struct watch_list *wlist = NULL;
+       struct watch *watch;
+       struct mount *m;
+       struct path path;
+       int ret;
+
+       if (watch_id < -1 || watch_id > 0xff)
+               return -EINVAL;
+
+       ret = user_path_at(dfd, filename, at_flags, &path);
The third argument of user_path_at() contains kernel-private lookup
flags, I'm pretty sure userspace isn't supposed to be able to control
these directly.
+       if (ret)
+               return ret;
+
+       wqueue = get_watch_queue(watch_fd);
+       if (IS_ERR(wqueue))
+               goto err_path;
+
+       m = real_mount(path.mnt);
+
+       if (watch_id >= 0) {
+               if (!m->mnt_watchers) {
+                       wlist = kzalloc(sizeof(*wlist), GFP_KERNEL);
+                       if (!wlist)
+                               goto err_wqueue;
+                       INIT_HLIST_HEAD(&wlist->watchers);
+                       spin_lock_init(&wlist->lock);
+                       wlist->release_watch = release_mount_watch;
+               }
+
+               watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+               if (!watch)
+                       goto err_wlist;
+
+               init_watch(watch, wqueue);
+               watch->id               = (unsigned long)path.dentry;
+               watch->private          = path.mnt;
+               watch->info_id          = (u32)watch_id << 24;
+
+               down_write(&m->mnt.mnt_sb->s_umount);
+               if (!m->mnt_watchers) {
+                       m->mnt_watchers = wlist;
+                       wlist = NULL;
+               }
+
+               ret = add_watch_to_object(watch, m->mnt_watchers);
+               if (ret == 0) {
+                       spin_lock(&path.dentry->d_lock);
+                       path.dentry->d_flags |= DCACHE_MOUNT_WATCH;
+                       spin_unlock(&path.dentry->d_lock);
+                       path_get(&path);
So... the watches on a mountpoint create references back to the
mountpoint? Is your plan that umount_tree() breaks the loop by getting
rid of the watches?

If so: Is there anything that prevents installing new watches after
umount_tree()? Because I don't see anything.

It might make sense to redesign this stuff so that watches don't hold
references on the object being watched.
+               }
+               up_write(&m->mnt.mnt_sb->s_umount);
+               if (ret < 0)
+                       kfree(watch);
+       } else if (m->mnt_watchers) {
+               down_write(&m->mnt.mnt_sb->s_umount);
+               ret = remove_watch_from_object(m->mnt_watchers, wqueue,
+                                              (unsigned long)path.dentry,
+                                              false);
+               up_write(&m->mnt.mnt_sb->s_umount);
+       } else {
+               ret = -EBADSLT;
+       }
+
+err_wlist:
+       kfree(wlist);
+err_wqueue:
+       put_watch_queue(wqueue);
+err_path:
+       path_put(&path);
+       return ret;
+}
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help