Thread (34 messages) 34 messages, 5 authors, 2019-02-21

Re: v4.20-rc6: Sporadic use-after-free in bt_iter()

From: Jens Axboe <axboe@kernel.dk>
Date: 2018-12-20 22:47:36
Subsystem: block layer, the rest · Maintainers: Jens Axboe, Linus Torvalds

On 12/20/18 3:33 PM, Jens Axboe wrote:
On 12/20/18 3:23 PM, Jens Axboe wrote:
quoted
On 12/20/18 3:19 PM, Bart Van Assche wrote:
quoted
On Thu, 2018-12-20 at 14:48 -0700, Jens Axboe wrote:
quoted
-void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
-		     unsigned int hctx_idx)
+static void blk_mq_rcu_free_pages(struct work_struct *work)
 {
+	struct blk_mq_tags *tags = container_of(to_rcu_work(work),
+						struct blk_mq_tags, rcu_work);
 	struct page *page;
 
+	while (!list_empty(&tags->page_list)) {
+		page = list_first_entry(&tags->page_list, struct page, lru);
+		list_del_init(&page->lru);
+		/*
+		 * Remove kmemleak object previously allocated in
+		 * blk_mq_init_rq_map().
+		 */
+		kmemleak_free(page_address(page));
+		__free_pages(page, page->private);
+	}
+}
+
+void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+		     unsigned int hctx_idx)
+{
 	if (tags->rqs && set->ops->exit_request) {
 		int i;
 
@@ -2038,16 +2061,9 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 		}
 	}
 
-	while (!list_empty(&tags->page_list)) {
-		page = list_first_entry(&tags->page_list, struct page, lru);
-		list_del_init(&page->lru);
-		/*
-		 * Remove kmemleak object previously allocated in
-		 * blk_mq_init_rq_map().
-		 */
-		kmemleak_free(page_address(page));
-		__free_pages(page, page->private);
-	}
+	/* Punt to RCU free, so we don't race with tag iteration */
+	INIT_RCU_WORK(&tags->rcu_work, blk_mq_rcu_free_pages);
+	queue_rcu_work(system_wq, &tags->rcu_work);
 }
This can only work correctly if blk_mq_rcu_free_pages() is called before
INIT_RCU_WORK() is called a second time for the same bkl_mq_tags structure
and if blk_mq_rcu_free_pages() is called before struct blk_mq_tags is freed.
What provides these guarantees? Did I perhaps miss something?
We don't call it twice. Protecting against that is outside the scope
of this function. But we call and clear form the regular shutdown path,
and the rest is error handling on setup.

But yes, we do need to ensure that 'tags' doesn't go away. Probably best
to rework it to shove it somewhere else for freeing for that case, and
leave the rest of the shutdown alone. I'll update the patch.
Here's a version that doesn't rely on tags, we just dynamically allocate
the structure. For the very odd case where we fail, we just punt to
an on-stack structure and use the big synchronize_rcu() hammer first.
Forgot to init the lists... This one I actually booted, works for me.
Outside of that, not tested, in terms of verifying the use-after-free is
gone for tag iteration.

diff --git a/block/blk-flush.c b/block/blk-flush.c
index a3fc7191c694..827e3d2180d8 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -491,12 +491,22 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
 	return NULL;
 }
 
+static void blk_fq_rcu_free(struct work_struct *work)
+{
+	struct blk_flush_queue *fq = container_of(to_rcu_work(work),
+							struct blk_flush_queue,
+							rcu_work);
+
+	kfree(fq->flush_rq);
+	kfree(fq);
+}
+
 void blk_free_flush_queue(struct blk_flush_queue *fq)
 {
 	/* bio based request queue hasn't flush queue */
 	if (!fq)
 		return;
 
-	kfree(fq->flush_rq);
-	kfree(fq);
+	INIT_RCU_WORK(&fq->rcu_work, blk_fq_rcu_free);
+	queue_rcu_work(system_wq, &fq->rcu_work);
 }
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 2089c6c62f44..c39b58391ae8 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -228,13 +228,15 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 
 	if (!reserved)
 		bitnr += tags->nr_reserved_tags;
-	rq = tags->rqs[bitnr];
+	if (tags->rqs[bitnr].queue != hctx->queue)
+		return true;
 
 	/*
 	 * We can hit rq == NULL here, because the tagging functions
 	 * test and set the bit before assigning ->rqs[].
 	 */
-	if (rq && rq->q == hctx->queue)
+	rq = tags->rqs[bitnr].rq;
+	if (rq)
 		return iter_data->fn(hctx, rq, iter_data->data, reserved);
 	return true;
 }
@@ -263,7 +265,9 @@ static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
 		.reserved = reserved,
 	};
 
+	rcu_read_lock();
 	sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
+	rcu_read_unlock();
 }
 
 struct bt_tags_iter_data {
@@ -287,7 +291,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 	 * We can hit rq == NULL here, because the tagging functions
 	 * test and set the bit before assining ->rqs[].
 	 */
-	rq = tags->rqs[bitnr];
+	rq = tags->rqs[bitnr].rq;
 	if (rq && blk_mq_request_started(rq))
 		return iter_data->fn(rq, iter_data->data, reserved);
 
@@ -317,8 +321,11 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
 		.reserved = reserved,
 	};
 
-	if (tags->rqs)
+	if (tags->rqs) {
+		rcu_read_lock();
 		sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data);
+		rcu_read_unlock();
+	}
 }
 
 /**
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 61deab0b5a5a..bb84d1f099c7 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -4,6 +4,11 @@
 
 #include "blk-mq.h"
 
+struct rq_tag_entry {
+	struct request_queue *queue;
+	struct request *rq;
+};
+
 /*
  * Tag address space map.
  */
@@ -16,7 +21,7 @@ struct blk_mq_tags {
 	struct sbitmap_queue bitmap_tags;
 	struct sbitmap_queue breserved_tags;
 
-	struct request **rqs;
+	struct rq_tag_entry *rqs;
 	struct request **static_rqs;
 	struct list_head page_list;
 };
@@ -78,7 +83,8 @@ static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx,
 		unsigned int tag, struct request *rq)
 {
-	hctx->tags->rqs[tag] = rq;
+	hctx->tags->rqs[tag].queue = hctx->queue;
+	hctx->tags->rqs[tag].rq = rq;
 }
 
 static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2de972857496..c17103c00362 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -298,13 +298,16 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 		rq->tag = -1;
 		rq->internal_tag = tag;
 	} else {
-		if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
+		struct blk_mq_hw_ctx *hctx = data->hctx;
+
+		if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
 			rq_flags = RQF_MQ_INFLIGHT;
-			atomic_inc(&data->hctx->nr_active);
+			atomic_inc(&hctx->nr_active);
 		}
 		rq->tag = tag;
 		rq->internal_tag = -1;
-		data->hctx->tags->rqs[rq->tag] = rq;
+		hctx->tags->rqs[rq->tag].queue = hctx->queue;
+		hctx->tags->rqs[rq->tag].rq = rq;
 	}
 
 	/* csd/requeue_work/fifo_time is initialized before use */
@@ -797,8 +800,8 @@ EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 {
 	if (tag < tags->nr_tags) {
-		prefetch(tags->rqs[tag]);
-		return tags->rqs[tag];
+		prefetch(tags->rqs[tag].rq);
+		return tags->rqs[tag].rq;
 	}
 
 	return NULL;
@@ -809,10 +812,11 @@ static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
 			       void *priv, bool reserved)
 {
 	/*
-	 * If we find a request that is inflight and the queue matches,
-	 * we know the queue is busy. Return false to stop the iteration.
+	 * We're only called here if the queue matches. If the rq state is
+	 * inflight, we know the queue is busy. Return false to stop the
+	 * iteration.
 	 */
-	if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) {
+	if (rq->state == MQ_RQ_IN_FLIGHT) {
 		bool *busy = priv;
 
 		*busy = true;
@@ -1049,11 +1053,14 @@ bool blk_mq_get_driver_tag(struct request *rq)
 	shared = blk_mq_tag_busy(data.hctx);
 	rq->tag = blk_mq_get_tag(&data);
 	if (rq->tag >= 0) {
+		struct blk_mq_hw_ctx *hctx = data.hctx;
+
 		if (shared) {
 			rq->rq_flags |= RQF_MQ_INFLIGHT;
 			atomic_inc(&data.hctx->nr_active);
 		}
-		data.hctx->tags->rqs[rq->tag] = rq;
+		hctx->tags->rqs[rq->tag].queue = hctx->queue;
+		hctx->tags->rqs[rq->tag].rq = rq;
 	}
 
 done:
@@ -2020,10 +2027,34 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	return cookie;
 }
 
+struct rq_page_list {
+	struct rcu_work rcu_work;
+	struct list_head list;
+};
+
+static void blk_mq_rcu_free_pages(struct work_struct *work)
+{
+	struct rq_page_list *rpl = container_of(to_rcu_work(work),
+						struct rq_page_list, rcu_work);
+	struct page *page;
+
+	while (!list_empty(&rpl->list)) {
+		page = list_first_entry(&rpl->list, struct page, lru);
+		list_del_init(&page->lru);
+		/*
+		 * Remove kmemleak object previously allocated in
+		 * blk_mq_init_rq_map().
+		 */
+		kmemleak_free(page_address(page));
+		__free_pages(page, page->private);
+	}
+	kfree(rpl);
+}
+
 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 		     unsigned int hctx_idx)
 {
-	struct page *page;
+	struct rq_page_list *rpl;
 
 	if (tags->rqs && set->ops->exit_request) {
 		int i;
@@ -2038,15 +2069,28 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 		}
 	}
 
-	while (!list_empty(&tags->page_list)) {
-		page = list_first_entry(&tags->page_list, struct page, lru);
-		list_del_init(&page->lru);
+	if (list_empty(&tags->page_list))
+		return;
+
+	rpl = kmalloc(sizeof(*rpl), GFP_NOIO);
+	if (rpl) {
+		INIT_LIST_HEAD(&rpl->list);
+		list_splice_init(&tags->page_list, &rpl->list);
+
+		/* Punt to RCU free, so we don't race with tag iteration */
+		INIT_RCU_WORK(&rpl->rcu_work, blk_mq_rcu_free_pages);
+		queue_rcu_work(system_wq, &rpl->rcu_work);
+	} else {
+		struct rq_page_list stack_rpl;
+
 		/*
-		 * Remove kmemleak object previously allocated in
-		 * blk_mq_init_rq_map().
+		 * Fail alloc, punt to on-stack, we just have to synchronize
+		 * RCU first to ensure readers are done.
 		 */
-		kmemleak_free(page_address(page));
-		__free_pages(page, page->private);
+		INIT_LIST_HEAD(&stack_rpl.list);
+		list_splice_init(&tags->page_list, &stack_rpl.list);
+		synchronize_rcu();
+		blk_mq_rcu_free_pages(&stack_rpl.rcu_work.work);
 	}
 }
 
@@ -2077,7 +2121,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
 	if (!tags)
 		return NULL;
 
-	tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
+	tags->rqs = kcalloc_node(nr_tags, sizeof(struct rq_tag_entry),
 				 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
 				 node);
 	if (!tags->rqs) {
diff --git a/block/blk.h b/block/blk.h
index 848278c52030..785207fc8a30 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -29,6 +29,8 @@ struct blk_flush_queue {
 	 */
 	struct request		*orig_rq;
 	spinlock_t		mq_flush_lock;
+
+	struct rcu_work		rcu_work;
 };
 
 extern struct kmem_cache *blk_requestq_cachep;
-- 
Jens Axboe
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help