From 0232605d987d8230b254aa139805bbb56a7ca30c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 3 Jul 2012 15:56:52 +1000 Subject: md: make 'name' arg to md_register_thread non-optional. Having the 'name' arg optional and defaulting to the current personality name is no necessary and leads to errors, as when changing the level of an array we can end up using the name of the old level instead of the new one. So make it non-optional and always explicitly pass the name of the level that the array will be. Reported-by: majianpeng Signed-off-by: NeilBrown --- drivers/md/raid1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a9c7981ddd24..39b2a8aa3b23 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2621,7 +2621,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) goto abort; } err = -ENOMEM; - conf->thread = md_register_thread(raid1d, mddev, NULL); + conf->thread = md_register_thread(raid1d, mddev, "raid1"); if (!conf->thread) { printk(KERN_ERR "md/raid1:%s: couldn't allocate thread\n", -- cgit v1.2.3 From 32644afd8975d19174bcb9ba34687c32dd810a09 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 3 Jul 2012 15:58:42 +1000 Subject: md/raid1: fix bug in read_balance introduced by hot-replace When we added hot_replace we doubled the number of devices that could be in a RAID1 array. So we doubled how far read_balance would search. Unfortunately we didn't double the point at which it looped back to the beginning - so it effectively loops over all non-replacement disks twice. This doesn't cause bad behaviour, but it pointless and means we never read from replacement devices. Signed-off-by: NeilBrown --- drivers/md/raid1.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 39b2a8aa3b23..34b4665cb0b6 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -517,8 +517,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect int bad_sectors; int disk = start_disk + i; - if (disk >= conf->raid_disks) - disk -= conf->raid_disks; + if (disk >= conf->raid_disks * 2) + disk -= conf->raid_disks * 2; rdev = rcu_dereference(conf->mirrors[disk].rdev); if (r1_bio->bios[disk] == IO_BLOCKED -- cgit v1.2.3 From b357f04a67c2aeee828b240863cd3f21d6cb3179 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 3 Jul 2012 17:45:31 +1000 Subject: md: fix up plugging (again). The value returned by "mddev_check_plug" is only valid until the next 'schedule' as that will unplug things. This could happen at any call to mempool_alloc. So just calling mddev_check_plug at the start doesn't really make sense. So call it just before, or just after, queuing things for the thread. As the action that happens at unplug is to wake the thread, this makes lots of sense. If we cannot add a plug (which requires a small GFP_ATOMIC alloc) we wake thread immediately. RAID5 is a bit different. Requests are queued for the thread and the thread is woken by release_stripe. So we don't need to wake the thread on failure. However the thread doesn't perform certain actions when there is any active plug, so it is important to install a plug before waking the thread. So for RAID5 we install the plug *before* queuing the request and waking the thread. Without this patch it is possible for raid1 or raid10 to queue a request without then waking the thread, resulting in the array locking up. Also change raid10 to only flush_pending_write when there are not active plugs, just like raid1. This patch is suitable for 3.0 or later. I plan to submit it to -stable, but I'll like to let it spend a few weeks in mainline first to be sure it is completely safe. Signed-off-by: NeilBrown --- drivers/md/raid1.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 34b4665cb0b6..8c2754f835ef 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -883,7 +883,6 @@ static void make_request(struct mddev *mddev, struct bio * bio) const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); struct md_rdev *blocked_rdev; - int plugged; int first_clone; int sectors_handled; int max_sectors; @@ -1034,7 +1033,6 @@ read_again: * the bad blocks. Each set of writes gets it's own r1bio * with a set of bios attached. */ - plugged = mddev_check_plugged(mddev); disks = conf->raid_disks * 2; retry_write: @@ -1191,6 +1189,8 @@ read_again: bio_list_add(&conf->pending_bio_list, mbio); conf->pending_count++; spin_unlock_irqrestore(&conf->device_lock, flags); + if (!mddev_check_plugged(mddev)) + md_wakeup_thread(mddev->thread); } /* Mustn't call r1_bio_write_done before this next test, * as it could result in the bio being freed. @@ -1213,9 +1213,6 @@ read_again: /* In case raid1d snuck in to freeze_array */ wake_up(&conf->wait_barrier); - - if (do_sync || !bitmap || !plugged) - md_wakeup_thread(mddev->thread); } static void status(struct seq_file *seq, struct mddev *mddev) -- cgit v1.2.3 From 2d4f4f3384d4ef4f7c571448e803a1ce721113d5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 Jul 2012 11:34:13 +1000 Subject: md/raid1: fix use-after-free bug in RAID1 data-check code. This bug has been present ever since data-check was introduce in 2.6.16. However it would only fire if a data-check were done on a degraded array, which was only possible if the array has 3 or more devices. This is certainly possible, but is quite uncommon. Since hot-replace was added in 3.3 it can happen more often as the same condition can arise if not all possible replacements are present. The problem is that as soon as we submit the last read request, the 'r1_bio' structure could be freed at any time, so we really should stop looking at it. If the last device is being read from we will stop looking at it. However if the last device is not due to be read from, we will still check the bio pointer in the r1_bio, but the r1_bio might already be free. So use the read_targets counter to make sure we stop looking for bios to submit as soon as we have submitted them all. This fix is suitable for any -stable kernel since 2.6.16. Cc: stable@vger.kernel.org Reported-by: Arnold Schulz Signed-off-by: NeilBrown --- drivers/md/raid1.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 8c2754f835ef..240ff3125040 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2485,9 +2485,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { atomic_set(&r1_bio->remaining, read_targets); - for (i = 0; i < conf->raid_disks * 2; i++) { + for (i = 0; i < conf->raid_disks * 2 && read_targets; i++) { bio = r1_bio->bios[i]; if (bio->bi_end_io == end_sync_read) { + read_targets--; md_sync_acct(bio->bi_bdev, nr_sectors); generic_make_request(bio); } -- cgit v1.2.3 From 58e94ae18478c08229626daece2fc108a4a23261 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 19 Jul 2012 15:59:18 +1000 Subject: md/raid1: close some possible races on write errors during resync commit 4367af556133723d0f443e14ca8170d9447317cb md/raid1: clear bad-block record when write succeeds. Added a 'reschedule_retry' call possibility at the end of end_sync_write, but didn't add matching code at the end of sync_request_write. So if the writes complete very quickly, or scheduling makes it seem that way, then we can miss rescheduling the request and the resync could hang. Also commit 73d5c38a9536142e062c35997b044e89166e063b md: avoid races when stopping resync. Fix a race condition in this same code in end_sync_write but didn't make the change in sync_request_write. This patch updates sync_request_write to fix both of those. Patch is suitable for 3.1 and later kernels. Reported-by: Alexander Lyakas Original-version-by: Alexander Lyakas Cc: stable@vger.kernel.org Signed-off-by: NeilBrown --- drivers/md/raid1.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 240ff3125040..cacd008d6864 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1818,8 +1818,14 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) if (atomic_dec_and_test(&r1_bio->remaining)) { /* if we're here, all write(s) have completed, so clean up */ - md_done_sync(mddev, r1_bio->sectors, 1); - put_buf(r1_bio); + int s = r1_bio->sectors; + if (test_bit(R1BIO_MadeGood, &r1_bio->state) || + test_bit(R1BIO_WriteError, &r1_bio->state)) + reschedule_retry(r1_bio); + else { + put_buf(r1_bio); + md_done_sync(mddev, s, 1); + } } } -- cgit v1.2.3