summaryrefslogtreecommitdiff
path: root/drivers/md/md.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r--drivers/md/md.c89
1 files changed, 69 insertions, 20 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0c2801d77090..846bdee4daa0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -528,6 +528,9 @@ static void md_end_flush(struct bio *bio)
rdev_dec_pending(rdev, mddev);
if (atomic_dec_and_test(&mddev->flush_pending)) {
+ /* The pair is percpu_ref_get() from md_flush_request() */
+ percpu_ref_put(&mddev->active_io);
+
/* The pre-request flush has finished */
queue_work(md_wq, &mddev->flush_work);
}
@@ -547,12 +550,8 @@ static void submit_flushes(struct work_struct *ws)
rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Faulty, &rdev->flags)) {
- /* Take two references, one is dropped
- * when request finishes, one after
- * we reclaim rcu_read_lock
- */
struct bio *bi;
- atomic_inc(&rdev->nr_pending);
+
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
bi = bio_alloc_bioset(rdev->bdev, 0,
@@ -563,11 +562,14 @@ static void submit_flushes(struct work_struct *ws)
atomic_inc(&mddev->flush_pending);
submit_bio(bi);
rcu_read_lock();
- rdev_dec_pending(rdev, mddev);
}
rcu_read_unlock();
- if (atomic_dec_and_test(&mddev->flush_pending))
+ if (atomic_dec_and_test(&mddev->flush_pending)) {
+ /* The pair is percpu_ref_get() from md_flush_request() */
+ percpu_ref_put(&mddev->active_io);
+
queue_work(md_wq, &mddev->flush_work);
+ }
}
static void md_submit_flush_data(struct work_struct *ws)
@@ -616,6 +618,18 @@ bool md_flush_request(struct mddev *mddev, struct bio *bio)
/* new request after previous flush is completed */
if (ktime_after(req_start, mddev->prev_flush_start)) {
WARN_ON(mddev->flush_bio);
+ /*
+ * Grab a reference to make sure mddev_suspend() will wait for
+ * this flush to be done.
+ *
+ * md_flush_reqeust() is called under md_handle_request() and
+ * 'active_io' is already grabbed, hence percpu_ref_is_zero()
+ * won't pass, percpu_ref_tryget_live() can't be used because
+ * percpu_ref_kill() can be called by mddev_suspend()
+ * concurrently.
+ */
+ WARN_ON(percpu_ref_is_zero(&mddev->active_io));
+ percpu_ref_get(&mddev->active_io);
mddev->flush_bio = bio;
bio = NULL;
}
@@ -953,9 +967,10 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
return;
bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev,
- 1,
- REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA,
- GFP_NOIO, &mddev->sync_set);
+ 1,
+ REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META
+ | REQ_PREFLUSH | REQ_FUA,
+ GFP_NOIO, &mddev->sync_set);
atomic_inc(&rdev->nr_pending);
@@ -1135,6 +1150,7 @@ struct super_type {
struct md_rdev *refdev,
int minor_version);
int (*validate_super)(struct mddev *mddev,
+ struct md_rdev *freshest,
struct md_rdev *rdev);
void (*sync_super)(struct mddev *mddev,
struct md_rdev *rdev);
@@ -1272,8 +1288,9 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
/*
* validate_super for 0.90.0
+ * note: we are not using "freshest" for 0.9 superblock
*/
-static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
+static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
{
mdp_disk_t *desc;
mdp_super_t *sb = page_address(rdev->sb_page);
@@ -1785,7 +1802,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
return ret;
}
-static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
+static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
{
struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
__u64 ev1 = le64_to_cpu(sb->events);
@@ -1881,13 +1898,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
}
} else if (mddev->pers == NULL) {
/* Insist of good event counter while assembling, except for
- * spares (which don't need an event count) */
- ++ev1;
+ * spares (which don't need an event count).
+ * Similar to mdadm, we allow event counter difference of 1
+ * from the freshest device.
+ */
if (rdev->desc_nr >= 0 &&
rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
(le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
- if (ev1 < mddev->events)
+ if (ev1 + 1 < mddev->events)
return -EINVAL;
} else if (mddev->bitmap) {
/* If adding to array with a bitmap, then we can accept an
@@ -1908,8 +1927,38 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
role = MD_DISK_ROLE_SPARE;
rdev->desc_nr = -1;
- } else
+ } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) {
+ /*
+ * If we are assembling, and our event counter is smaller than the
+ * highest event counter, we cannot trust our superblock about the role.
+ * It could happen that our rdev was marked as Faulty, and all other
+ * superblocks were updated with +1 event counter.
+ * Then, before the next superblock update, which typically happens when
+ * remove_and_add_spares() removes the device from the array, there was
+ * a crash or reboot.
+ * If we allow current rdev without consulting the freshest superblock,
+ * we could cause data corruption.
+ * Note that in this case our event counter is smaller by 1 than the
+ * highest, otherwise, this rdev would not be allowed into array;
+ * both kernel and mdadm allow event counter difference of 1.
+ */
+ struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page);
+ u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev);
+
+ if (rdev->desc_nr >= freshest_max_dev) {
+ /* this is unexpected, better not proceed */
+ pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n",
+ mdname(mddev), rdev->bdev, rdev->desc_nr,
+ freshest->bdev, freshest_max_dev);
+ return -EUCLEAN;
+ }
+
+ role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]);
+ pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n",
+ mdname(mddev), rdev->bdev, role, role, freshest->bdev);
+ } else {
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+ }
switch(role) {
case MD_DISK_ROLE_SPARE: /* spare */
break;
@@ -2851,7 +2900,7 @@ static int add_bound_rdev(struct md_rdev *rdev)
* and should be added immediately.
*/
super_types[mddev->major_version].
- validate_super(mddev, rdev);
+ validate_super(mddev, NULL/*freshest*/, rdev);
if (add_journal)
mddev_suspend(mddev);
err = mddev->pers->hot_add_disk(mddev, rdev);
@@ -3765,7 +3814,7 @@ static int analyze_sbs(struct mddev *mddev)
}
super_types[mddev->major_version].
- validate_super(mddev, freshest);
+ validate_super(mddev, NULL/*freshest*/, freshest);
i = 0;
rdev_for_each_safe(rdev, tmp, mddev) {
@@ -3780,7 +3829,7 @@ static int analyze_sbs(struct mddev *mddev)
}
if (rdev != freshest) {
if (super_types[mddev->major_version].
- validate_super(mddev, rdev)) {
+ validate_super(mddev, freshest, rdev)) {
pr_warn("md: kicking non-fresh %pg from array!\n",
rdev->bdev);
md_kick_rdev_from_array(rdev);
@@ -6794,7 +6843,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
rdev->saved_raid_disk = rdev->raid_disk;
} else
super_types[mddev->major_version].
- validate_super(mddev, rdev);
+ validate_super(mddev, NULL/*freshest*/, rdev);
if ((info->state & (1<<MD_DISK_SYNC)) &&
rdev->raid_disk != info->raid_disk) {
/* This was a hot-add request, but events doesn't