From ac5e7113e74872928844d00085bd47c988f12728 Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Mon, 3 Aug 2009 10:59:47 +1000 Subject: md: Push down data integrity code to personalities. This patch replaces md_integrity_check() by two new public functions: md_integrity_register() and md_integrity_add_rdev() which are both personality-independent. md_integrity_register() is called from the ->run and ->hot_remove methods of all personalities that support data integrity. The function iterates over the component devices of the array and determines if all active devices are integrity capable and if their profiles match. If this is the case, the common profile is registered for the mddev via blk_integrity_register(). The second new function, md_integrity_add_rdev() is called from the ->hot_add_disk methods, i.e. whenever a new device is being added to a raid array. If the new device does not support data integrity, or has a profile different from the one already registered, data integrity for the mddev is disabled. For raid0 and linear, only the call to md_integrity_register() from the ->run method is necessary. Signed-off-by: Andre Noll Signed-off-by: NeilBrown --- drivers/md/md.c | 94 +++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 64 insertions(+), 30 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index d4351ff0849f..180949e94a7b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1487,37 +1487,76 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) static LIST_HEAD(pending_raid_disks); -static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev) +/* + * Try to register data integrity profile for an mddev + * + * This is called when an array is started and after a disk has been kicked + * from the array. It only succeeds if all working and active component devices + * are integrity capable with matching profiles. + */ +int md_integrity_register(mddev_t *mddev) +{ + mdk_rdev_t *rdev, *reference = NULL; + + if (list_empty(&mddev->disks)) + return 0; /* nothing to do */ + if (blk_get_integrity(mddev->gendisk)) + return 0; /* already registered */ + list_for_each_entry(rdev, &mddev->disks, same_set) { + /* skip spares and non-functional disks */ + if (test_bit(Faulty, &rdev->flags)) + continue; + if (rdev->raid_disk < 0) + continue; + /* + * If at least one rdev is not integrity capable, we can not + * enable data integrity for the md device. + */ + if (!bdev_get_integrity(rdev->bdev)) + return -EINVAL; + if (!reference) { + /* Use the first rdev as the reference */ + reference = rdev; + continue; + } + /* does this rdev's profile match the reference profile? */ + if (blk_integrity_compare(reference->bdev->bd_disk, + rdev->bdev->bd_disk) < 0) + return -EINVAL; + } + /* + * All component devices are integrity capable and have matching + * profiles, register the common profile for the md device. + */ + if (blk_integrity_register(mddev->gendisk, + bdev_get_integrity(reference->bdev)) != 0) { + printk(KERN_ERR "md: failed to register integrity for %s\n", + mdname(mddev)); + return -EINVAL; + } + printk(KERN_NOTICE "md: data integrity on %s enabled\n", + mdname(mddev)); + return 0; +} +EXPORT_SYMBOL(md_integrity_register); + +/* Disable data integrity if non-capable/non-matching disk is being added */ +void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev) { - struct mdk_personality *pers = mddev->pers; - struct gendisk *disk = mddev->gendisk; struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); - struct blk_integrity *bi_mddev = blk_get_integrity(disk); + struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk); - /* Data integrity passthrough not supported on RAID 4, 5 and 6 */ - if (pers && pers->level >= 4 && pers->level <= 6) + if (!bi_mddev) /* nothing to do */ return; - - /* If rdev is integrity capable, register profile for mddev */ - if (!bi_mddev && bi_rdev) { - if (blk_integrity_register(disk, bi_rdev)) - printk(KERN_ERR "%s: %s Could not register integrity!\n", - __func__, disk->disk_name); - else - printk(KERN_NOTICE "Enabling data integrity on %s\n", - disk->disk_name); + if (rdev->raid_disk < 0) /* skip spares */ return; - } - - /* Check that mddev and rdev have matching profiles */ - if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) { - printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__, - disk->disk_name, rdev->bdev->bd_disk->disk_name); - printk(KERN_NOTICE "Disabling data integrity on %s\n", - disk->disk_name); - blk_integrity_unregister(disk); - } + if (bi_rdev && blk_integrity_compare(mddev->gendisk, + rdev->bdev->bd_disk) >= 0) + return; + printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev)); + blk_integrity_unregister(mddev->gendisk); } +EXPORT_SYMBOL(md_integrity_add_rdev); static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) { @@ -1591,7 +1630,6 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) /* May as well allow recovery to be retried once */ mddev->recovery_disabled = 0; - md_integrity_check(rdev, mddev); return 0; fail: @@ -4048,10 +4086,6 @@ static int do_md_run(mddev_t * mddev) } strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); - if (pers->level >= 4 && pers->level <= 6) - /* Cannot support integrity (yet) */ - blk_integrity_unregister(mddev->gendisk); - if (mddev->reshape_position != MaxSector && pers->start_reshape == NULL) { /* This personality cannot handle reshaping... */ -- cgit v1.2.3 From 3a981b03f38dc3b8a69b77cbc679e66c1318a44a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 3 Aug 2009 10:59:55 +1000 Subject: md: when a level change reduces the number of devices, remove the excess. When an array is changed from RAID6 to RAID5, fewer drives are needed. So any device that is made superfluous by the level conversion must be marked as not-active. For the RAID6->RAID5 conversion, this will be a drive which only has 'Q' blocks on it. Cc: stable@kernel.org Signed-off-by: NeilBrown --- drivers/md/md.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 180949e94a7b..c194955aecae 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2695,6 +2695,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) ssize_t rv = len; struct mdk_personality *pers; void *priv; + mdk_rdev_t *rdev; if (mddev->pers == NULL) { if (len == 0) @@ -2774,6 +2775,12 @@ level_store(mddev_t *mddev, const char *buf, size_t len) mddev_suspend(mddev); mddev->pers->stop(mddev); module_put(mddev->pers->owner); + /* Invalidate devices that are now superfluous */ + list_for_each_entry(rdev, &mddev->disks, same_set) + if (rdev->raid_disk >= mddev->raid_disks) { + rdev->raid_disk = -1; + clear_bit(In_sync, &rdev->flags); + } mddev->pers = pers; mddev->private = priv; strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); -- cgit v1.2.3 From 3673f305faf1bc66ead751344f8262ace851ff44 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 3 Aug 2009 10:59:56 +1000 Subject: md: avoid array overflow with bad v1.x metadata We trust the 'desc_nr' field in v1.x metadata enough to use it as an index in an array. This isn't really safe. So range-check the value first. Signed-off-by: NeilBrown --- drivers/md/md.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index c194955aecae..249b2896d4ea 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1308,7 +1308,12 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) } if (mddev->level != LEVEL_MULTIPATH) { int role; - role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + if (rdev->desc_nr < 0 || + rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { + role = 0xffff; + rdev->desc_nr = -1; + } else + role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); switch(role) { case 0xffff: /* spare */ break; -- cgit v1.2.3 From 70471dafe3390243c598a3165dfb86b8b8b3f4fe Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 3 Aug 2009 10:59:57 +1000 Subject: md: Handle growth of v1.x metadata correctly. The v1.x metadata does not have a fixed size and can grow when devices are added. If it grows enough to require an extra sector of storage, we need to update the 'sb_size' to match. Without this, md can write out an incomplete superblock with a bad checksum, which will be rejected when trying to re-assemble the array. Cc: stable@kernel.org Signed-off-by: NeilBrown --- drivers/md/md.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 249b2896d4ea..52c988b072d0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1399,8 +1399,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) if (rdev2->desc_nr+1 > max_dev) max_dev = rdev2->desc_nr+1; - if (max_dev > le32_to_cpu(sb->max_dev)) + if (max_dev > le32_to_cpu(sb->max_dev)) { + int bmask; sb->max_dev = cpu_to_le32(max_dev); + rdev->sb_size = max_dev * 2 + 256; + bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; + if (rdev->sb_size & bmask) + rdev->sb_size = (rdev->sb_size | bmask) + 1; + } for (i=0; idev_roles[i] = cpu_to_le16(0xfffe); -- cgit v1.2.3 From 449aad3e25358812c43afc60918c5ad3819488e7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 3 Aug 2009 10:59:58 +1000 Subject: md: Use revalidate_disk to effect changes in size of device. As revalidate_disk calls check_disk_size_change, it will cause any capacity change of a gendisk to be propagated to the blockdev inode. So use that instead of mucking about with locks and i_size_write. Also add a call to revalidate_disk in do_md_run and a few other places where the gendisk capacity is changed. Signed-off-by: NeilBrown --- drivers/md/md.c | 28 +++++----------------------- 1 file changed, 5 insertions(+), 23 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 52c988b072d0..5b98bea4ff9b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3741,17 +3741,8 @@ array_size_store(mddev_t *mddev, const char *buf, size_t len) mddev->array_sectors = sectors; set_capacity(mddev->gendisk, mddev->array_sectors); - if (mddev->pers) { - struct block_device *bdev = bdget_disk(mddev->gendisk, 0); - - if (bdev) { - mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, - (loff_t)mddev->array_sectors << 9); - mutex_unlock(&bdev->bd_inode->i_mutex); - bdput(bdev); - } - } + if (mddev->pers) + revalidate_disk(mddev->gendisk); return len; } @@ -4241,6 +4232,7 @@ static int do_md_run(mddev_t * mddev) md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ + revalidate_disk(mddev->gendisk); mddev->changed = 1; md_new_event(mddev); sysfs_notify_dirent(mddev->sysfs_state); @@ -5139,18 +5131,8 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) return -ENOSPC; } rv = mddev->pers->resize(mddev, num_sectors); - if (!rv) { - struct block_device *bdev; - - bdev = bdget_disk(mddev->gendisk, 0); - if (bdev) { - mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, - (loff_t)mddev->array_sectors << 9); - mutex_unlock(&bdev->bd_inode->i_mutex); - bdput(bdev); - } - } + if (!rv) + revalidate_disk(mddev->gendisk); return rv; } -- cgit v1.2.3