Re: [PATCH 3/7] vfs: Add a mount-notification facility
From: Jann Horn <jannh@google.com>
Date: 2019-05-28 20:06:57
Also in:
keyrings, linux-api, linux-block, linux-fsdevel, lkml
On Tue, May 28, 2019 at 6:05 PM David Howells [off-list ref] wrote:
Add a mount notification facility whereby notifications about changes in mount topology and configuration can be received. Note that this only covers vfsmount topology changes and not superblock events. A separate facility will be added for that.
[...]
quoted hunk ↗ jump to hunk
@@ -172,4 +167,18 @@ static inline void notify_mount(struct mount *changed, u32 info_flags) { atomic_inc(&changed->mnt_notify_counter); + +#ifdef CONFIG_MOUNT_NOTIFICATIONS + { + struct mount_notification n = { + .watch.type = WATCH_TYPE_MOUNT_NOTIFY, + .watch.subtype = subtype, + .watch.info = info_flags | sizeof(n), + .triggered_on = changed->mnt_id, + .changed_mount = aux ? aux->mnt_id : 0, + }; + + post_mount_notification(changed, &n); + } +#endif }
[...]
+void post_mount_notification(struct mount *changed,
+ struct mount_notification *notify)
+{
+ const struct cred *cred = current_cred();This current_cred() looks bogus to me. Can't mount topology changes come from all sorts of places? For example, umount_mnt() from umount_tree() from dissolve_on_fput() from __fput(), which could happen pretty much anywhere depending on where the last reference gets dropped?
+ struct path cursor;
+ struct mount *mnt;
+ unsigned seq;
+
+ seq = 0;
+ rcu_read_lock();
+restart:
+ cursor.mnt = &changed->mnt;
+ cursor.dentry = changed->mnt.mnt_root;
+ mnt = real_mount(cursor.mnt);
+ notify->watch.info &= ~WATCH_INFO_IN_SUBTREE;
+
+ read_seqbegin_or_lock(&rename_lock, &seq);
+ for (;;) {
+ if (mnt->mnt_watchers &&
+ !hlist_empty(&mnt->mnt_watchers->watchers)) {
+ if (cursor.dentry->d_flags & DCACHE_MOUNT_WATCH)
+ post_watch_notification(mnt->mnt_watchers,
+ ¬ify->watch, cred,
+ (unsigned long)cursor.dentry);
+ } else {
+ cursor.dentry = mnt->mnt.mnt_root;
+ }
+ notify->watch.info |= WATCH_INFO_IN_SUBTREE;
+
+ if (cursor.dentry == cursor.mnt->mnt_root ||
+ IS_ROOT(cursor.dentry)) {
+ struct mount *parent = READ_ONCE(mnt->mnt_parent);
+
+ /* Escaped? */
+ if (cursor.dentry != cursor.mnt->mnt_root)
+ break;
+
+ /* Global root? */
+ if (mnt != parent) {
+ cursor.dentry = READ_ONCE(mnt->mnt_mountpoint);
+ mnt = parent;
+ cursor.mnt = &mnt->mnt;
+ continue;
+ }
+ break;(nit: this would look clearer if you inverted the condition and wrote it as "if (mnt == parent) break;", then you also wouldn't need that "continue" or the braces)
+ }
+
+ cursor.dentry = cursor.dentry->d_parent;
+ }
+
+ if (need_seqretry(&rename_lock, seq)) {
+ seq = 1;
+ goto restart;
+ }
+
+ done_seqretry(&rename_lock, seq);
+ rcu_read_unlock();
+}[...]
+SYSCALL_DEFINE5(mount_notify,
+ int, dfd,
+ const char __user *, filename,
+ unsigned int, at_flags,
+ int, watch_fd,
+ int, watch_id)
+{
+ struct watch_queue *wqueue;
+ struct watch_list *wlist = NULL;
+ struct watch *watch;
+ struct mount *m;
+ struct path path;
+ int ret;
+
+ if (watch_id < -1 || watch_id > 0xff)
+ return -EINVAL;
+
+ ret = user_path_at(dfd, filename, at_flags, &path);The third argument of user_path_at() contains kernel-private lookup flags, I'm pretty sure userspace isn't supposed to be able to control these directly.
+ if (ret)
+ return ret;
+
+ wqueue = get_watch_queue(watch_fd);
+ if (IS_ERR(wqueue))
+ goto err_path;
+
+ m = real_mount(path.mnt);
+
+ if (watch_id >= 0) {
+ if (!m->mnt_watchers) {
+ wlist = kzalloc(sizeof(*wlist), GFP_KERNEL);
+ if (!wlist)
+ goto err_wqueue;
+ INIT_HLIST_HEAD(&wlist->watchers);
+ spin_lock_init(&wlist->lock);
+ wlist->release_watch = release_mount_watch;
+ }
+
+ watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+ if (!watch)
+ goto err_wlist;
+
+ init_watch(watch, wqueue);
+ watch->id = (unsigned long)path.dentry;
+ watch->private = path.mnt;
+ watch->info_id = (u32)watch_id << 24;
+
+ down_write(&m->mnt.mnt_sb->s_umount);
+ if (!m->mnt_watchers) {
+ m->mnt_watchers = wlist;
+ wlist = NULL;
+ }
+
+ ret = add_watch_to_object(watch, m->mnt_watchers);
+ if (ret == 0) {
+ spin_lock(&path.dentry->d_lock);
+ path.dentry->d_flags |= DCACHE_MOUNT_WATCH;
+ spin_unlock(&path.dentry->d_lock);
+ path_get(&path);So... the watches on a mountpoint create references back to the mountpoint? Is your plan that umount_tree() breaks the loop by getting rid of the watches? If so: Is there anything that prevents installing new watches after umount_tree()? Because I don't see anything. It might make sense to redesign this stuff so that watches don't hold references on the object being watched.
+ }
+ up_write(&m->mnt.mnt_sb->s_umount);
+ if (ret < 0)
+ kfree(watch);
+ } else if (m->mnt_watchers) {
+ down_write(&m->mnt.mnt_sb->s_umount);
+ ret = remove_watch_from_object(m->mnt_watchers, wqueue,
+ (unsigned long)path.dentry,
+ false);
+ up_write(&m->mnt.mnt_sb->s_umount);
+ } else {
+ ret = -EBADSLT;
+ }
+
+err_wlist:
+ kfree(wlist);
+err_wqueue:
+ put_watch_queue(wqueue);
+err_path:
+ path_put(&path);
+ return ret;
+}