Re: [PATCH 4/7] vfs: Add superblock notifications
From: Jann Horn <jannh@google.com>
Date: 2019-05-28 20:28:17
Also in:
keyrings, linux-block, linux-fsdevel, linux-security-module, lkml
On Tue, May 28, 2019 at 6:05 PM David Howells [off-list ref] wrote:
Add a superblock event notification facility whereby notifications about superblock events, such as I/O errors (EIO), quota limits being hit (EDQUOT) and running out of space (ENOSPC) can be reported to a monitoring process asynchronously. Note that this does not cover vfsmount topology changes. mount_notify() is used for that.
[...]
+#ifdef CONFIG_SB_NOTIFICATIONS
+/*
+ * Post superblock notifications.
+ */
+void post_sb_notification(struct super_block *s, struct superblock_notification *n)
+{
+ post_watch_notification(s->s_watchers, &n->watch, current_cred(),
+ s->s_unique_id);
+}You're using current_cred() here? So the idea is that if some random process runs into a disk I/O error, the I/O error will come from that task's credentials? In general, you're not supposed to look at task credentials in ->read/->write handlers.
+static void release_sb_watch(struct watch_list *wlist, struct watch *watch)
+{
+ struct super_block *s = watch->private;
+
+ put_super(s);
+}
+
+/**
+ * sys_sb_notify - Watch for superblock events.
+ * @dfd: Base directory to pathwalk from or fd referring to superblock.
+ * @filename: Path to superblock to place the watch upon
+ * @at_flags: Pathwalk control flags
+ * @watch_fd: The watch queue to send notifications to.
+ * @watch_id: The watch ID to be placed in the notification (-1 to remove watch)
+ */
+SYSCALL_DEFINE5(sb_notify,
+ int, dfd,
+ const char __user *, filename,
+ unsigned int, at_flags,
+ int, watch_fd,
+ int, watch_id)
+{
+ struct watch_queue *wqueue;
+ struct super_block *s;
+ struct watch_list *wlist = NULL;
+ struct watch *watch;
+ struct path path;
+ int ret;
+
+ if (watch_id < -1 || watch_id > 0xff)
+ return -EINVAL;
+
+ ret = user_path_at(dfd, filename, at_flags, &path);As in the other patch, I don't think userspace is supposed to be able to supply user_path_at()'s third argument. It might make sense to require that the path points to the root inode of the superblock? That way you wouldn't be able to do this on a bind mount that exposes part of a shared filesystem to a container.
+ if (ret)
+ return ret;
+
+ wqueue = get_watch_queue(watch_fd);
+ if (IS_ERR(wqueue))
+ goto err_path;
+
+ s = path.dentry->d_sb;
+ if (watch_id >= 0) {
+ if (!s->s_watchers) {
+ wlist = kzalloc(sizeof(*wlist), GFP_KERNEL);
+ if (!wlist)
+ goto err_wqueue;
+ INIT_HLIST_HEAD(&wlist->watchers);
+ spin_lock_init(&wlist->lock);
+ wlist->release_watch = release_sb_watch;
+ }
+
+ watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+ if (!watch)
+ goto err_wlist;
+
+ init_watch(watch, wqueue);
+ watch->id = s->s_unique_id;
+ watch->private = s;
+ watch->info_id = (u32)watch_id << 24;
+
+ down_write(&s->s_umount);
+ ret = -EIO;
+ if (atomic_read(&s->s_active)) {
+ if (!s->s_watchers) {
+ s->s_watchers = wlist;
+ wlist = NULL;
+ }
+
+ ret = add_watch_to_object(watch, s->s_watchers);
+ if (ret == 0) {
+ spin_lock(&sb_lock);
+ s->s_count++;
+ spin_unlock(&sb_lock);Why do watches hold references on the superblock they're watching?
+ }
+ }
+ up_write(&s->s_umount);
+ if (ret < 0)
+ kfree(watch);
+ } else if (s->s_watchers) {This should probably have something like a READ_ONCE() for clarity?
+ down_write(&s->s_umount);
+ ret = remove_watch_from_object(s->s_watchers, wqueue,
+ s->s_unique_id, false);
+ up_write(&s->s_umount);
+ } else {
+ ret = -EBADSLT;
+ }
+
+err_wlist:
+ kfree(wlist);
+err_wqueue:
+ put_watch_queue(wqueue);
+err_path:
+ path_put(&path);
+ return ret;
+}
+#endif