From 9092c02d943515b3c9ffd5d0003527f8cc1dd77b Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Thu, 2 May 2013 14:19:24 -0500
Subject: DM RAID: Add ability to restore transiently failed devices on resume

DM RAID: Add ability to restore transiently failed devices on resume

This patch adds code to the resume function to check over the devices
in the RAID array.  If any are found to be marked as failed and their
superblocks can be read, an attempt is made to reintegrate them into
the array.  This allows the user to refresh the array with a simple
suspend and resume of the array - rather than having to load a
completely new table, allocate and initialize all the structures and
throw away the old instantiation.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'drivers/md/raid10.c')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6ddae2501b9a..3c6b193cefd5 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1819,15 +1819,17 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			set_bit(Replacement, &rdev->flags);
 			rdev->raid_disk = mirror;
 			err = 0;
-			disk_stack_limits(mddev->gendisk, rdev->bdev,
-					  rdev->data_offset << 9);
+			if (mddev->gendisk)
+				disk_stack_limits(mddev->gendisk, rdev->bdev,
+						  rdev->data_offset << 9);
 			conf->fullsync = 1;
 			rcu_assign_pointer(p->replacement, rdev);
 			break;
 		}
 
-		disk_stack_limits(mddev->gendisk, rdev->bdev,
-				  rdev->data_offset << 9);
+		if (mddev->gendisk)
+			disk_stack_limits(mddev->gendisk, rdev->bdev,
+					  rdev->data_offset << 9);
 
 		p->head_position = 0;
 		p->recovery_disabled = mddev->recovery_disabled - 1;
-- 
cgit v1.2.3


From 635f6416a2e983adac8ccf90bffbeed0c1a76454 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 11 Jun 2013 14:57:09 +1000
Subject: md/raid10: locking changes for 'enough()'.

As 'enough' accesses conf->prev and conf->geo, which can change
spontanously, it should guard against changes.
This can be done with device_lock as start_reshape holds device_lock
while updating 'geo' and end_reshape holds it while updating 'prev'.

So 'error' needs to hold 'device_lock'.

On the other hand, raid10_end_read_request knows which of the two it
really wants to access, and as it is an active request on that one,
the value cannot change underneath it.

So change _enough to take flag rather than a pointer, pass the
appropriate flag from raid10_end_read_request(), and remove the locking.

All other calls to 'enough' are made with reconfig_mutex held, so
neither 'prev' nor 'geo' can change.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 45 +++++++++++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 16 deletions(-)

(limited to 'drivers/md/raid10.c')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3c6b193cefd5..5169ed2a9156 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -97,7 +97,7 @@ static int max_queued_requests = 1024;
 
 static void allow_barrier(struct r10conf *conf);
 static void lower_barrier(struct r10conf *conf);
-static int enough(struct r10conf *conf, int ignore);
+static int _enough(struct r10conf *conf, int previous, int ignore);
 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 				int *skipped);
 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
@@ -392,11 +392,9 @@ static void raid10_end_read_request(struct bio *bio, int error)
 		 * than fail the last device.  Here we redefine
 		 * "uptodate" to mean "Don't want to retry"
 		 */
-		unsigned long flags;
-		spin_lock_irqsave(&conf->device_lock, flags);
-		if (!enough(conf, rdev->raid_disk))
+		if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
+			     rdev->raid_disk))
 			uptodate = 1;
-		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 	if (uptodate) {
 		raid_end_bio_io(r10_bio);
@@ -1632,9 +1630,17 @@ static void status(struct seq_file *seq, struct mddev *mddev)
  * Don't consider the device numbered 'ignore'
  * as we might be about to remove it.
  */
-static int _enough(struct r10conf *conf, struct geom *geo, int ignore)
+static int _enough(struct r10conf *conf, int previous, int ignore)
 {
 	int first = 0;
+	int disks, ncopies;
+	if (previous) {
+		disks = conf->prev.raid_disks;
+		ncopies = conf->prev.near_copies;
+	} else {
+		disks = conf->geo.raid_disks;
+		ncopies = conf->geo.near_copies;
+	}
 
 	do {
 		int n = conf->copies;
@@ -1644,25 +1650,31 @@ static int _enough(struct r10conf *conf, struct geom *geo, int ignore)
 			if (conf->mirrors[this].rdev &&
 			    this != ignore)
 				cnt++;
-			this = (this+1) % geo->raid_disks;
+			this = (this+1) % disks;
 		}
 		if (cnt == 0)
 			return 0;
-		first = (first + geo->near_copies) % geo->raid_disks;
+		first = (first + ncopies) % disks;
 	} while (first != 0);
 	return 1;
 }
 
 static int enough(struct r10conf *conf, int ignore)
 {
-	return _enough(conf, &conf->geo, ignore) &&
-		_enough(conf, &conf->prev, ignore);
+	/* when calling 'enough', both 'prev' and 'geo' must
+	 * be stable.
+	 * This is ensured if ->reconfig_mutex or ->device_lock
+	 * is held.
+	 */
+	return _enough(conf, 0, ignore) &&
+		_enough(conf, 1, ignore);
 }
 
 static void error(struct mddev *mddev, struct md_rdev *rdev)
 {
 	char b[BDEVNAME_SIZE];
 	struct r10conf *conf = mddev->private;
+	unsigned long flags;
 
 	/*
 	 * If it is not operational, then we have already marked it as dead
@@ -1670,18 +1682,18 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
 	 * next level up know.
 	 * else mark the drive as failed
 	 */
+	spin_lock_irqsave(&conf->device_lock, flags);
 	if (test_bit(In_sync, &rdev->flags)
-	    && !enough(conf, rdev->raid_disk))
+	    && !enough(conf, rdev->raid_disk)) {
 		/*
 		 * Don't fail the drive, just return an IO error.
 		 */
+		spin_unlock_irqrestore(&conf->device_lock, flags);
 		return;
+	}
 	if (test_and_clear_bit(In_sync, &rdev->flags)) {
-		unsigned long flags;
-		spin_lock_irqsave(&conf->device_lock, flags);
 		mddev->degraded++;
-		spin_unlock_irqrestore(&conf->device_lock, flags);
-		/*
+			/*
 		 * if recovery is running, make sure it aborts.
 		 */
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
@@ -1689,6 +1701,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
 	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	spin_unlock_irqrestore(&conf->device_lock, flags);
 	printk(KERN_ALERT
 	       "md/raid10:%s: Disk failure on %s, disabling device.\n"
 	       "md/raid10:%s: Operation continuing on %d devices.\n",
@@ -1791,7 +1804,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		 * very different from resync
 		 */
 		return -EBUSY;
-	if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1))
+	if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
 		return -EINVAL;
 
 	if (rdev->raid_disk >= 0)
-- 
cgit v1.2.3


From 725d6e579f06360744fc101b1af2f82d8aa282f1 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 11 Jun 2013 15:08:03 +1000
Subject: md/raid10: check In_sync flag in 'enough()'.

It isn't really enough to check that the rdev is present, we need to
also be sure that the device is still In_sync.

Doing this requires using rcu_dereference to access the rdev, and
holding the rcu_read_lock() to ensure the rdev doesn't disappear while
we look at it.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'drivers/md/raid10.c')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 5169ed2a9156..aa8ba0760cac 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1633,6 +1633,7 @@ static void status(struct seq_file *seq, struct mddev *mddev)
 static int _enough(struct r10conf *conf, int previous, int ignore)
 {
 	int first = 0;
+	int has_enough = 0;
 	int disks, ncopies;
 	if (previous) {
 		disks = conf->prev.raid_disks;
@@ -1642,21 +1643,27 @@ static int _enough(struct r10conf *conf, int previous, int ignore)
 		ncopies = conf->geo.near_copies;
 	}
 
+	rcu_read_lock();
 	do {
 		int n = conf->copies;
 		int cnt = 0;
 		int this = first;
 		while (n--) {
-			if (conf->mirrors[this].rdev &&
-			    this != ignore)
+			struct md_rdev *rdev;
+			if (this != ignore &&
+			    (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
+			    test_bit(In_sync, &rdev->flags))
 				cnt++;
 			this = (this+1) % disks;
 		}
 		if (cnt == 0)
-			return 0;
+			goto out;
 		first = (first + ncopies) % disks;
 	} while (first != 0);
-	return 1;
+	has_enough = 1;
+out:
+	rcu_read_unlock();
+	return has_enough;
 }
 
 static int enough(struct r10conf *conf, int ignore)
-- 
cgit v1.2.3


From 78eaa0d4cbcdb345992fa3dd22b3bcbb473cc064 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 2 Jul 2013 15:58:05 +1000
Subject: md/raid10: fix two bugs affecting RAID10 reshape.

1/ If a RAID10 is being reshaped to a fewer number of devices
 and is stopped while this is ongoing, then when the array is
 reassembled the 'mirrors' array will be allocated too small.
 This will lead to an access error or memory corruption.

2/ A sanity test for a reshaping RAID10 array is restarted
 is slightly incorrect.

Due to the first bug, this is suitable for any -stable
kernel since 3.5 where this code was introduced.

Cc: stable@vger.kernel.org (v3.5+)
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/md/raid10.c')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index aa8ba0760cac..3480bf7c20d4 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3554,7 +3554,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 
 	/* FIXME calc properly */
 	conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
-							    max(0,mddev->delta_disks)),
+							    max(0,-mddev->delta_disks)),
 				GFP_KERNEL);
 	if (!conf->mirrors)
 		goto out;
@@ -3713,7 +3713,7 @@ static int run(struct mddev *mddev)
 		    conf->geo.far_offset == 0)
 			goto out_free_conf;
 		if (conf->prev.far_copies != 1 &&
-		    conf->geo.far_offset == 0)
+		    conf->prev.far_offset == 0)
 			goto out_free_conf;
 	}
 
-- 
cgit v1.2.3


From 1376512065b23f39d5f9a160948f313397dde972 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 4 Jul 2013 16:41:53 +1000
Subject: md/raid10: fix bug which causes all RAID10 reshapes to move no data.

The recent comment:
commit 7e83ccbecd608b971f340e951c9e84cd0343002f
    md/raid10: Allow skipping recovery when clean arrays are assembled

Causes raid10 to skip a recovery in certain cases where it is safe to
do so.  Unfortunately it also causes a reshape to be skipped which is
never safe.  The result is that an attempt to reshape a RAID10 will
appear to complete instantly, but no data will have been moves so the
array will now contain garbage.
(If nothing is written, you can recovery by simple performing the
reverse reshape which will also complete instantly).

Bug was introduced in 3.10, so this is suitable for 3.10-stable.

Cc: stable@vger.kernel.org (3.10)
Cc: Martin Wilck <mwilck@arcor.de>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'drivers/md/raid10.c')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3480bf7c20d4..cd066b63bdaf 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2931,14 +2931,13 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 	 */
 	if (mddev->bitmap == NULL &&
 	    mddev->recovery_cp == MaxSector &&
+	    mddev->reshape_position == MaxSector &&
+	    !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
 	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
+	    !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 	    conf->fullsync == 0) {
 		*skipped = 1;
-		max_sector = mddev->dev_sectors;
-		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
-		    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
-			max_sector = mddev->resync_max_sectors;
-		return max_sector - sector_nr;
+		return mddev->dev_sectors - sector_nr;
 	}
 
  skipped:
-- 
cgit v1.2.3


From 7bb23c4934059c64cbee2e41d5d24ce122285176 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 16 Jul 2013 16:50:47 +1000
Subject: md/raid10: fix two problems with RAID10 resync.

1/ When an different between blocks is found, data is copied from
   one bio to the other.  However bv_len is used as the length to
   copy and this could be zero.  So use r10_bio->sectors to calculate
   length instead.
   Using bv_len was probably always a bit dubious, but the introduction
   of bio_advance made it much more likely to be a problem.

2/ When preparing some blocks for sync, we don't set BIO_UPTODATE
   except on bios that we schedule for a read.  This ensures that
   missing/failed devices don't confuse the loop at the top of
   sync_request write.
   Commit 8be185f2c9d54d6 "raid10: Use bio_reset()"
   removed a loop which set BIO_UPTDATE on all appropriate bios.
   So we need to re-add that flag.

These bugs were introduced in 3.10, so this patch is suitable for
3.10-stable, and can remove a potential for data corruption.

Cc: stable@vger.kernel.org (3.10)
Reported-by: Brassow Jonathan <jbrassow@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'drivers/md/raid10.c')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index cd066b63bdaf..957a719e8c2f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2097,11 +2097,17 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 			 * both 'first' and 'i', so we just compare them.
 			 * All vec entries are PAGE_SIZE;
 			 */
-			for (j = 0; j < vcnt; j++)
+			int sectors = r10_bio->sectors;
+			for (j = 0; j < vcnt; j++) {
+				int len = PAGE_SIZE;
+				if (sectors < (len / 512))
+					len = sectors * 512;
 				if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
 					   page_address(tbio->bi_io_vec[j].bv_page),
-					   fbio->bi_io_vec[j].bv_len))
+					   len))
 					break;
+				sectors -= len/512;
+			}
 			if (j == vcnt)
 				continue;
 			atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
@@ -3407,6 +3413,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 
 		if (bio->bi_end_io == end_sync_read) {
 			md_sync_acct(bio->bi_bdev, nr_sectors);
+			set_bit(BIO_UPTODATE, &bio->bi_flags);
 			generic_make_request(bio);
 		}
 	}
-- 
cgit v1.2.3


From 0eb25bb027a100f5a9df8991f2f628e7d851bc1e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 24 Jul 2013 15:37:42 +1000
Subject: md/raid10: remove use-after-free bug.

We always need to be careful when calling generic_make_request, as it
can start a chain of events which might free something that we are
using.

Here is one place I wasn't careful enough.  If the wbio2 is not in
use, then it might get freed at the first generic_make_request call.
So perform all necessary tests first.

This bug was introduced in 3.3-rc3 (24afd80d99) and can cause an
oops, so fix is suitable for any -stable since then.

Cc: stable@vger.kernel.org (3.3+)
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'drivers/md/raid10.c')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 957a719e8c2f..df7b0a06b0ea 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2290,12 +2290,18 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 	d = r10_bio->devs[1].devnum;
 	wbio = r10_bio->devs[1].bio;
 	wbio2 = r10_bio->devs[1].repl_bio;
+	/* Need to test wbio2->bi_end_io before we call
+	 * generic_make_request as if the former is NULL,
+	 * the latter is free to free wbio2.
+	 */
+	if (wbio2 && !wbio2->bi_end_io)
+		wbio2 = NULL;
 	if (wbio->bi_end_io) {
 		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
 		md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
 		generic_make_request(wbio);
 	}
-	if (wbio2 && wbio2->bi_end_io) {
+	if (wbio2) {
 		atomic_inc(&conf->mirrors[d].replacement->nr_pending);
 		md_sync_acct(conf->mirrors[d].replacement->bdev,
 			     bio_sectors(wbio2));
-- 
cgit v1.2.3