Re: [PATCH 4/4] raid1: Rewrite the implementation of iobarrier.

From: NeilBrown <hidden>
Date: 2013-10-31 02:33:14

On Tue, 29 Oct 2013 09:30:14 +0800 majianpeng [off-list ref] wrote:


Nearly there!!  Just a few more details.  See below.

quoted hunk ↗ jump to hunk

Signed-off-by: Jianpeng Ma <redacted>
---
 drivers/md/raid1.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++------
 drivers/md/raid1.h |  14 ++++++
 2 files changed, 124 insertions(+), 12 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index b4a6dcd..5b311c0 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c

@@ -66,7 +66,8 @@
  */
 static int max_queued_requests = 1024;
 
-static void allow_barrier(struct r1conf *conf);
+static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
+				sector_t bi_sector);
 static void lower_barrier(struct r1conf *conf);
 
 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)

@@ -227,6 +228,8 @@ static void call_bio_endio(struct r1bio *r1_bio)
 	struct bio *bio = r1_bio->master_bio;
 	int done;
 	struct r1conf *conf = r1_bio->mddev->private;
+	sector_t start_next_window = r1_bio->start_next_window;
+	sector_t bi_sector = bio->bi_sector;

This should  be r1_bio->sector, not bio->bi_sector.
They are often the same but if multiple r1_bios are needed for some reason
(e.g. bad blocks) they may not be.

quoted hunk ↗ jump to hunk

 
 	if (bio->bi_phys_segments) {
 		unsigned long flags;

@@ -234,6 +237,11 @@ static void call_bio_endio(struct r1bio *r1_bio)
 		bio->bi_phys_segments--;
 		done = (bio->bi_phys_segments == 0);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
+		/*
+		 * make_request() might be waiting for
+		 * bi_phys_segments to decrease
+		 */
+		wake_up(&conf->wait_barrier);
 	} else
 		done = 1;
 
@@ -245,7 +253,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
 		 * Wake up any possible resync thread that waits for the device
 		 * to go idle.
 		 */
-		allow_barrier(conf);
+		allow_barrier(conf, start_next_window, bi_sector);
 	}
 }
 
@@ -827,10 +835,18 @@ static void raise_barrier(struct r1conf *conf)
 	/* block any new IO from starting */
 	conf->barrier++;
 
-	/* Now wait for all pending IO to complete */
+	/* For those conditions we must wait:
+	 * A:while the array is in frozen state
+	 * B:while barrier >= RESYNC_DEPTH, meaning resync reach
+	 *   the max count which allowed.
+	 * C:next_resync + RESYNC_SECTORS > start_next_window, meaning
+	 *   next resync will reach to window which normal bios are handling.
+	 */
 	wait_event_lock_irq(conf->wait_barrier,
 			    !conf->array_frozen &&
-			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
+			    conf->barrier < RESYNC_DEPTH &&
+			    (conf->start_next_window >=
+			     conf->next_resync + RESYNC_SECTORS),
 			    conf->resync_lock);

You've removed the test on conf->nr_pending here, which I think is correct.
It counts 'read' requests as well.  Testing start_next_window serves the same
purpose as it is increased whenever current_window_requests reaches zero.

However you having modified as similar test on nr_pending in wait_barrier().
That worries me a bit.  Should that be changed to a test on start_next_window
to match the above change?

quoted hunk ↗ jump to hunk

 
 	spin_unlock_irq(&conf->resync_lock);

@@ -846,10 +862,33 @@ static void lower_barrier(struct r1conf *conf)
 	wake_up(&conf->wait_barrier);
 }
 
-static void wait_barrier(struct r1conf *conf)
+static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
+{
+	bool wait = false;
+
+	if (conf->array_frozen || !bio)
+		wait = true;
+	else if (conf->barrier && bio_data_dir(bio) == WRITE) {
+		if (conf->next_resync < RESYNC_WINDOW_SECTORS)
+			wait = true;
+		else if ((conf->next_resync - RESYNC_WINDOW_SECTORS
+				>= bio_end_sector(bio)) ||
+			 (conf->next_resync + NEXT_NORMALIO_DISTANCE
+				<= bio->bi_sector))
+			wait = false;
+		else
+			wait = true;
+	}
+
+	return wait;
+}
+
+static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
 {
+	sector_t sector = 0;
+
 	spin_lock_irq(&conf->resync_lock);
-	if (conf->barrier) {
+	if (need_to_wait_for_sync(conf, bio)) {
 		conf->nr_waiting++;
 		/* Wait for the barrier to drop.
 		 * However if there are already pending
@@ -868,16 +907,57 @@ static void wait_barrier(struct r1conf *conf)
 				     !bio_list_empty(current->bio_list))),
 				    conf->resync_lock);
 		conf->nr_waiting--;
+	} else if (bio_data_dir(bio) == WRITE) {
+		if (conf->next_resync + NEXT_NORMALIO_DISTANCE
+				<= bio->bi_sector) {
+			if (conf->start_next_window == MaxSector)
+				conf->start_next_window =
+					conf->next_resync +
+						NEXT_NORMALIO_DISTANCE;
+
+			if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
+				<= bio->bi_sector)
+				conf->next_window_requests++;
+			else
+				conf->current_window_requests++;
+		}
+		if (bio->bi_sector >= conf->start_next_window)
+			sector = conf->start_next_window;

You aren't setting 'sector' if we needed to wait.  I don't think that is
correct, is it?

quoted hunk ↗ jump to hunk

 	}
+
 	conf->nr_pending++;
 	spin_unlock_irq(&conf->resync_lock);
+	return sector;
 }
 
-static void allow_barrier(struct r1conf *conf)
+static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
+				sector_t bi_sector)
 {
 	unsigned long flags;
+
 	spin_lock_irqsave(&conf->resync_lock, flags);
 	conf->nr_pending--;
+	if (start_next_window) {
+		if (start_next_window == conf->start_next_window) {
+			if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
+				<= bi_sector)
+				conf->next_window_requests--;
+			else
+				conf->current_window_requests--;
+		} else
+			conf->current_window_requests--;
+
+		if (!conf->current_window_requests) {
+			if (conf->next_window_requests) {
+				conf->current_window_requests =
+					conf->next_window_requests;
+				conf->next_window_requests = 0;
+				conf->start_next_window +=
+					NEXT_NORMALIO_DISTANCE;
+			} else
+				conf->start_next_window = MaxSector;
+		}
+	}
 	spin_unlock_irqrestore(&conf->resync_lock, flags);
 	wake_up(&conf->wait_barrier);
 }

@@ -1012,6 +1092,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	int first_clone;
 	int sectors_handled;
 	int max_sectors;
+	sector_t start_next_window;
 
 	/*
 	 * Register the new request and wait if the reconstruction
@@ -1041,7 +1122,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 		finish_wait(&conf->wait_barrier, &w);
 	}
 
-	wait_barrier(conf);
+	start_next_window = wait_barrier(conf, bio);
 
 	bitmap = mddev->bitmap;
 
@@ -1162,6 +1243,7 @@ read_again:
 
 	disks = conf->raid_disks * 2;
  retry_write:
+	r1_bio->start_next_window = start_next_window;
 	blocked_rdev = NULL;
 	rcu_read_lock();
 	max_sectors = r1_bio->sectors;
@@ -1230,14 +1312,24 @@ read_again:
 	if (unlikely(blocked_rdev)) {
 		/* Wait for this device to become unblocked */
 		int j;
+		sector_t old = start_next_window;
 
 		for (j = 0; j < i; j++)
 			if (r1_bio->bios[j])
 				rdev_dec_pending(conf->mirrors[j].rdev, mddev);
 		r1_bio->state = 0;
-		allow_barrier(conf);
+		allow_barrier(conf, start_next_window, bio->bi_sector);

I think this should be r1_bio->sector, not bio->bi_sector, for the same
reason as earlier.

quoted hunk ↗ jump to hunk

 		md_wait_for_blocked_rdev(blocked_rdev, mddev);
-		wait_barrier(conf);
+		start_next_window = wait_barrier(conf, bio);
+		/*
+		 * We must make sure the multi r1bios of bio have
+		 * the same value of bi_phys_segments
+		 */
+		if (bio->bi_phys_segments && old &&
+			old != start_next_window)
+			/*wait the former r1bio(s) completed*/
+			wait_event(conf->wait_barrier,
+					bio->bi_phys_segments == 1);
 		goto retry_write;
 	}

@@ -1437,11 +1529,14 @@ static void print_conf(struct r1conf *conf)
 
 static void close_sync(struct r1conf *conf)
 {
-	wait_barrier(conf);
-	allow_barrier(conf);
+	wait_barrier(conf, NULL);
+	allow_barrier(conf, 0, 0);
 
 	mempool_destroy(conf->r1buf_pool);
 	conf->r1buf_pool = NULL;
+
+	conf->next_resync = 0;
+	conf->start_next_window = MaxSector;
 }
 
 static int raid1_spare_active(struct mddev *mddev)

@@ -2713,6 +2808,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	conf->pending_count = 0;
 	conf->recovery_disabled = mddev->recovery_disabled - 1;
 
+	conf->start_next_window = MaxSector;
+	conf->current_window_requests = conf->next_window_requests = 0;
+
 	err = -EIO;
 	for (i = 0; i < conf->raid_disks * 2; i++) {

diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 331a98a..07425a1 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h

@@ -41,6 +41,19 @@ struct r1conf {
 	 */
 	sector_t		next_resync;
 
+	/*when raid1 start resync,we divide raid into four partitions
+	 * |---------|--------------|---------------------|-------------|
+	 *        next_resync   start_next_window        Pc
+	 * Now start_next_window = next_resync + NEXT_NORMALIO_DISTANCE
+	 * Pc = start_next_window + NEXT_NORMALIO_DISTANCE
+	 * current_window_requests means the count of normalIO between
+	 * start_next_window and Pc.
+	 * next_window_requests means the count of nornalIO after Pc.
+	 * */
+	sector_t		start_next_window;
+	int			current_window_requests;
+	int			next_window_requests;
+
 	spinlock_t		device_lock;
 
 	/* list of 'struct r1bio' that need to be processed by raid1d,

@@ -112,6 +125,7 @@ struct r1bio {
 						 * in this BehindIO request
 						 */
 	sector_t		sector;
+	sector_t		start_next_window;
 	int			sectors;
 	unsigned long		state;
 	struct mddev		*mddev;


Thanks,
NeilBrown

Attachments

signature.asc [application/pgp-signature] 828 bytes

`h`	back out one level
`j`	next message in thread
`k`	previous message in thread
`l`	drill in
`Esc`	close help / fold thread tree
`?`	toggle this help