diff options
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- | drivers/md/raid5.c | 150 |
1 files changed, 129 insertions, 21 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 81abefc172d9..e84204eb12df 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2947,6 +2947,7 @@ static void handle_stripe5(struct stripe_head *sh) struct r5dev *dev; mdk_rdev_t *blocked_rdev = NULL; int prexor; + int dec_preread_active = 0; memset(&s, 0, sizeof(s)); pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " @@ -3096,12 +3097,8 @@ static void handle_stripe5(struct stripe_head *sh) set_bit(STRIPE_INSYNC, &sh->state); } } - if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - atomic_dec(&conf->preread_active_stripes); - if (atomic_read(&conf->preread_active_stripes) < - IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - } + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + dec_preread_active = 1; } /* Now to consider new write requests and what else, if anything @@ -3208,6 +3205,16 @@ static void handle_stripe5(struct stripe_head *sh) ops_run_io(sh, &s); + if (dec_preread_active) { + /* We delay this until after ops_run_io so that if make_request + * is waiting on a barrier, it won't continue until the writes + * have actually been submitted. + */ + atomic_dec(&conf->preread_active_stripes); + if (atomic_read(&conf->preread_active_stripes) < + IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + } return_io(return_bi); } @@ -3221,6 +3228,7 @@ static void handle_stripe6(struct stripe_head *sh) struct r6_state r6s; struct r5dev *dev, *pdev, *qdev; mdk_rdev_t *blocked_rdev = NULL; + int dec_preread_active = 0; pr_debug("handling stripe %llu, state=%#lx cnt=%d, " "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", @@ -3358,7 +3366,6 @@ static void handle_stripe6(struct stripe_head *sh) * completed */ if (sh->reconstruct_state == reconstruct_state_drain_result) { - int qd_idx = sh->qd_idx; sh->reconstruct_state = reconstruct_state_idle; /* All the 'written' buffers and the parity blocks are ready to @@ -3380,12 +3387,8 @@ static void handle_stripe6(struct stripe_head *sh) set_bit(STRIPE_INSYNC, &sh->state); } } - if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - atomic_dec(&conf->preread_active_stripes); - if (atomic_read(&conf->preread_active_stripes) < - IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - } + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + dec_preread_active = 1; } /* Now to consider new write requests and what else, if anything @@ -3494,6 +3497,18 @@ static void handle_stripe6(struct stripe_head *sh) ops_run_io(sh, &s); + + if (dec_preread_active) { + /* We delay this until after ops_run_io so that if make_request + * is waiting on a barrier, it won't continue until the writes + * have actually been submitted. + */ + atomic_dec(&conf->preread_active_stripes); + if (atomic_read(&conf->preread_active_stripes) < + IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + } + return_io(return_bi); } @@ -3741,7 +3756,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) { mddev_t *mddev = q->queuedata; raid5_conf_t *conf = mddev->private; - unsigned int dd_idx; + int dd_idx; struct bio* align_bi; mdk_rdev_t *rdev; @@ -3866,7 +3881,13 @@ static int make_request(struct request_queue *q, struct bio * bi) int cpu, remaining; if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) { - bio_endio(bi, -EOPNOTSUPP); + /* Drain all pending writes. We only really need + * to ensure they have been submitted, but this is + * easier. + */ + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + md_barrier_request(mddev, bi); return 0; } @@ -3990,6 +4011,9 @@ static int make_request(struct request_queue *q, struct bio * bi) finish_wait(&conf->wait_for_overlap, &w); set_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); + if (mddev->barrier && + !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + atomic_inc(&conf->preread_active_stripes); release_stripe(sh); } else { /* cannot get stripe for read-ahead, just give-up */ @@ -4009,6 +4033,14 @@ static int make_request(struct request_queue *q, struct bio * bi) bio_endio(bi, 0); } + + if (mddev->barrier) { + /* We need to wait for the stripes to all be handled. + * So: wait for preread_active_stripes to drop to 0. + */ + wait_event(mddev->thread->wqueue, + atomic_read(&conf->preread_active_stripes) == 0); + } return 0; } @@ -4049,6 +4081,8 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped sector_nr = conf->reshape_progress; sector_div(sector_nr, new_data_disks); if (sector_nr) { + mddev->curr_resync_completed = sector_nr; + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); *skipped = 1; return sector_nr; } @@ -4821,11 +4855,40 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) return ERR_PTR(-ENOMEM); } + +static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) +{ + switch (algo) { + case ALGORITHM_PARITY_0: + if (raid_disk < max_degraded) + return 1; + break; + case ALGORITHM_PARITY_N: + if (raid_disk >= raid_disks - max_degraded) + return 1; + break; + case ALGORITHM_PARITY_0_6: + if (raid_disk == 0 || + raid_disk == raid_disks - 1) + return 1; + break; + case ALGORITHM_LEFT_ASYMMETRIC_6: + case ALGORITHM_RIGHT_ASYMMETRIC_6: + case ALGORITHM_LEFT_SYMMETRIC_6: + case ALGORITHM_RIGHT_SYMMETRIC_6: + if (raid_disk == raid_disks - 1) + return 1; + } + return 0; +} + static int run(mddev_t *mddev) { raid5_conf_t *conf; int working_disks = 0, chunk_size; + int dirty_parity_disks = 0; mdk_rdev_t *rdev; + sector_t reshape_offset = 0; if (mddev->recovery_cp != MaxSector) printk(KERN_NOTICE "raid5: %s is not clean" @@ -4859,6 +4922,7 @@ static int run(mddev_t *mddev) "on a stripe boundary\n"); return -EINVAL; } + reshape_offset = here_new * mddev->new_chunk_sectors; /* here_new is the stripe we will write to */ here_old = mddev->reshape_position; sector_div(here_old, mddev->chunk_sectors * @@ -4914,10 +4978,51 @@ static int run(mddev_t *mddev) /* * 0 for a fully functional array, 1 or 2 for a degraded array. */ - list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk >= 0 && - test_bit(In_sync, &rdev->flags)) + list_for_each_entry(rdev, &mddev->disks, same_set) { + if (rdev->raid_disk < 0) + continue; + if (test_bit(In_sync, &rdev->flags)) working_disks++; + /* This disc is not fully in-sync. However if it + * just stored parity (beyond the recovery_offset), + * when we don't need to be concerned about the + * array being dirty. + * When reshape goes 'backwards', we never have + * partially completed devices, so we only need + * to worry about reshape going forwards. + */ + /* Hack because v0.91 doesn't store recovery_offset properly. */ + if (mddev->major_version == 0 && + mddev->minor_version > 90) + rdev->recovery_offset = reshape_offset; + + printk("%d: w=%d pa=%d pr=%d m=%d a=%d r=%d op1=%d op2=%d\n", + rdev->raid_disk, working_disks, conf->prev_algo, + conf->previous_raid_disks, conf->max_degraded, + conf->algorithm, conf->raid_disks, + only_parity(rdev->raid_disk, + conf->prev_algo, + conf->previous_raid_disks, + conf->max_degraded), + only_parity(rdev->raid_disk, + conf->algorithm, + conf->raid_disks, + conf->max_degraded)); + if (rdev->recovery_offset < reshape_offset) { + /* We need to check old and new layout */ + if (!only_parity(rdev->raid_disk, + conf->algorithm, + conf->raid_disks, + conf->max_degraded)) + continue; + } + if (!only_parity(rdev->raid_disk, + conf->prev_algo, + conf->previous_raid_disks, + conf->max_degraded)) + continue; + dirty_parity_disks++; + } mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) - working_disks); @@ -4933,7 +5038,7 @@ static int run(mddev_t *mddev) mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); mddev->resync_max_sectors = mddev->dev_sectors; - if (mddev->degraded > 0 && + if (mddev->degraded > dirty_parity_disks && mddev->recovery_cp != MaxSector) { if (mddev->ok_start_degraded) printk(KERN_WARNING @@ -5359,9 +5464,11 @@ static int raid5_start_reshape(mddev_t *mddev) !test_bit(Faulty, &rdev->flags)) { if (raid5_add_disk(mddev, rdev) == 0) { char nm[20]; - set_bit(In_sync, &rdev->flags); + if (rdev->raid_disk >= conf->previous_raid_disks) + set_bit(In_sync, &rdev->flags); + else + rdev->recovery_offset = 0; added_devices++; - rdev->recovery_offset = 0; sprintf(nm, "rd%d", rdev->raid_disk); if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) @@ -5785,6 +5892,7 @@ static void raid5_exit(void) module_init(raid5_init); module_exit(raid5_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD"); MODULE_ALIAS("md-personality-4"); /* RAID5 */ MODULE_ALIAS("md-raid5"); MODULE_ALIAS("md-raid4"); |