Re: [syzbot] possible deadlock in blkdev_put (2)
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: 2021-11-29 14:16:23
Subsystem:
block layer, the rest · Maintainers:
Jens Axboe, Linus Torvalds
On 2021/11/29 19:36, Tetsuo Handa wrote:
On 2021/11/29 19:21, Christoph Hellwig wrote:quoted
On Sun, Nov 28, 2021 at 04:42:35PM +0900, Tetsuo Handa wrote:quoted
Is dropping disk->open_mutex inside lo_release() ( https://lkml.kernel.org/r/e4bdc6b1-701d-6cc1-5d42-65564d2aa089@I-love.SAKURA.ne.jp ) possible?I don't think we can drop open_mutex inside ->release. What is the problem with offloading the clearing to a different context than the one that calls ->release?Offloading to a WQ context? If the caller just want to call ioctl(LOOP_CTL_GET_FREE) followed by ioctl(LOOP_CONFIGURE), deferring __loop_clr_fd() would be fine. But the caller might want to unmount as soon as fput(filp) from __loop_clr_fd() completes. I think we need to wait for __loop_clr_fd() from lo_release() to complete.
Something like this?
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0a4416ef4fbf..2d54416abe24 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h@@ -1210,10 +1210,16 @@ struct block_device_operations { int (*get_unique_id)(struct gendisk *disk, u8 id[16], enum blk_unique_id id_type); struct module *owner; const struct pr_ops *pr_ops; + /* + * Special callback for waiting for completion of release callback + * without disk->open_mutex held. Used by loop driver. + */ + void (*wait_release)(struct gendisk *disk); + /* * Special callback for probing GPT entry at a given sector. * Needed by Android devices, used by GPT scanner and MMC blk * driver. */
diff --git a/block/bdev.c b/block/bdev.c
index ae063041f291..edadc3fc1df3 100644
--- a/block/bdev.c
+++ b/block/bdev.c@@ -952,10 +952,12 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) if (bdev_is_partition(bdev)) blkdev_put_part(bdev, mode); else blkdev_put_whole(bdev, mode); mutex_unlock(&disk->open_mutex); + if (bdev->bd_disk->fops->wait_release) + bdev->bd_disk->fops->wait_release(bdev->bd_disk); blkdev_put_no_open(bdev); } EXPORT_SYMBOL(blkdev_put);
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 56b9392737b2..858bb6b4ceea 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h@@ -55,10 +55,11 @@ struct loop_device { struct request_queue *lo_queue; struct blk_mq_tag_set tag_set; struct gendisk *lo_disk; struct mutex lo_mutex; bool idr_visible; + struct work_struct rundown_work; }; struct loop_cmd { struct list_head list_entry; bool use_aio; /* use AIO interface to handle I/O */
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 3dfb39d38235..8f1db8ca0eb8 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c@@ -1060,11 +1060,11 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); return error; } -static void __loop_clr_fd(struct loop_device *lo, bool release) +static void __loop_clr_fd(struct loop_device *lo) { struct file *filp; gfp_t gfp = lo->old_gfp_mask; struct loop_worker *pos, *worker;
@@ -1119,23 +1119,13 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); if (lo->lo_flags & LO_FLAGS_PARTSCAN) { int err; - /* - * open_mutex has been held already in release path, so don't - * acquire it if this function is called in such case. - * - * If the reread partition isn't from release path, lo_refcnt - * must be at least one and it can only become zero when the - * current holder is released. - */ - if (!release) - mutex_lock(&lo->lo_disk->open_mutex); + mutex_lock(&lo->lo_disk->open_mutex); err = bdev_disk_changed(lo->lo_disk, false); - if (!release) - mutex_unlock(&lo->lo_disk->open_mutex); + mutex_unlock(&lo->lo_disk->open_mutex); if (err) pr_warn("%s: partition scan of loop%d failed (rc=%d)\n", __func__, lo->lo_number, err); /* Device is gone, no point in returning error */ }
@@ -1186,11 +1176,11 @@ static int loop_clr_fd(struct loop_device *lo) return 0; } loop_update_state_locked(lo, Lo_rundown); mutex_unlock(&lo->lo_mutex); - __loop_clr_fd(lo, false); + __loop_clr_fd(lo); return 0; } static int loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
@@ -1694,10 +1684,17 @@ static int lo_open(struct block_device *bdev, fmode_t mode) atomic_inc(&lo->lo_refcnt); mutex_unlock(&lo->lo_mutex); return err; } +static void loop_rundown_workfn(struct work_struct *work) +{ + struct loop_device *lo = container_of(work, struct loop_device, rundown_work); + + __loop_clr_fd(lo); +} + static void lo_release(struct gendisk *disk, fmode_t mode) { struct loop_device *lo = disk->private_data; mutex_lock(&lo->lo_mutex);
@@ -1710,11 +1707,12 @@ static void lo_release(struct gendisk *disk, fmode_t mode) mutex_unlock(&lo->lo_mutex); /* * In autoclear mode, stop the loop thread * and remove configuration after last close. */ - __loop_clr_fd(lo, true); + INIT_WORK(&lo->rundown_work, loop_rundown_workfn); + queue_work(system_long_wq, &lo->rundown_work); return; } else if (lo->lo_state == Lo_bound) { /* * Otherwise keep thread (if running) and config, * but flush possible ongoing bios in thread.
@@ -1725,14 +1723,22 @@ static void lo_release(struct gendisk *disk, fmode_t mode) out_unlock: mutex_unlock(&lo->lo_mutex); } +static void lo_wait_release(struct gendisk *disk) +{ + struct loop_device *lo = disk->private_data; + + flush_workqueue(system_long_wq); +} + static const struct block_device_operations lo_fops = { .owner = THIS_MODULE, .open = lo_open, .release = lo_release, + .wait_release = lo_wait_release, .ioctl = lo_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = lo_compat_ioctl, #endif };