From 7eaceaccab5f40bbfda044629a6298616aeaed50 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Thu, 10 Mar 2011 08:52:07 +0100
Subject: block: remove per-queue plugging

Code has been converted over to the new explicit on-stack plugging,
and delay users have been converted to use the new API for that.
So lets kill off the old plugging along with aops->sync_page().

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/md/bitmap.c    |  3 +-
 drivers/md/dm-crypt.c  |  9 +-----
 drivers/md/dm-kcopyd.c | 52 +++---------------------------
 drivers/md/dm-raid.c   |  2 +-
 drivers/md/dm-raid1.c  |  2 --
 drivers/md/dm-table.c  | 24 --------------
 drivers/md/dm.c        | 33 +++----------------
 drivers/md/linear.c    | 17 ----------
 drivers/md/md.c        |  7 ----
 drivers/md/multipath.c | 31 ------------------
 drivers/md/raid0.c     | 16 ----------
 drivers/md/raid1.c     | 83 ++++++++++-------------------------------------
 drivers/md/raid10.c    | 87 +++++++++++---------------------------------------
 drivers/md/raid5.c     | 62 +++++------------------------------
 drivers/md/raid5.h     |  2 +-
 15 files changed, 58 insertions(+), 372 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 9a35320fb59f..54bfc274b39a 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1339,8 +1339,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
 			prepare_to_wait(&bitmap->overflow_wait, &__wait,
 					TASK_UNINTERRUPTIBLE);
 			spin_unlock_irq(&bitmap->lock);
-			md_unplug(bitmap->mddev);
-			schedule();
+			io_schedule();
 			finish_wait(&bitmap->overflow_wait, &__wait);
 			continue;
 		}
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 4e054bd91664..2c62c1169f78 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -991,11 +991,6 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
 	clone->bi_destructor = dm_crypt_bio_destructor;
 }
 
-static void kcryptd_unplug(struct crypt_config *cc)
-{
-	blk_unplug(bdev_get_queue(cc->dev->bdev));
-}
-
 static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
 {
 	struct crypt_config *cc = io->target->private;
@@ -1008,10 +1003,8 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
 	 * one in order to decrypt the whole bio data *afterwards*.
 	 */
 	clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs);
-	if (!clone) {
-		kcryptd_unplug(cc);
+	if (!clone)
 		return 1;
-	}
 
 	crypt_inc_pending(io);
 
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 924f5f0084c2..400cf35094a4 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -37,13 +37,6 @@ struct dm_kcopyd_client {
 	unsigned int nr_pages;
 	unsigned int nr_free_pages;
 
-	/*
-	 * Block devices to unplug.
-	 * Non-NULL pointer means that a block device has some pending requests
-	 * and needs to be unplugged.
-	 */
-	struct block_device *unplug[2];
-
 	struct dm_io_client *io_client;
 
 	wait_queue_head_t destroyq;
@@ -315,31 +308,6 @@ static int run_complete_job(struct kcopyd_job *job)
 	return 0;
 }
 
-/*
- * Unplug the block device at the specified index.
- */
-static void unplug(struct dm_kcopyd_client *kc, int rw)
-{
-	if (kc->unplug[rw] != NULL) {
-		blk_unplug(bdev_get_queue(kc->unplug[rw]));
-		kc->unplug[rw] = NULL;
-	}
-}
-
-/*
- * Prepare block device unplug. If there's another device
- * to be unplugged at the same array index, we unplug that
- * device first.
- */
-static void prepare_unplug(struct dm_kcopyd_client *kc, int rw,
-			   struct block_device *bdev)
-{
-	if (likely(kc->unplug[rw] == bdev))
-		return;
-	unplug(kc, rw);
-	kc->unplug[rw] = bdev;
-}
-
 static void complete_io(unsigned long error, void *context)
 {
 	struct kcopyd_job *job = (struct kcopyd_job *) context;
@@ -386,15 +354,12 @@ static int run_io_job(struct kcopyd_job *job)
 		.client = job->kc->io_client,
 	};
 
-	if (job->rw == READ) {
+	if (job->rw == READ)
 		r = dm_io(&io_req, 1, &job->source, NULL);
-		prepare_unplug(job->kc, READ, job->source.bdev);
-	} else {
+	else {
 		if (job->num_dests > 1)
 			io_req.bi_rw |= REQ_UNPLUG;
 		r = dm_io(&io_req, job->num_dests, job->dests, NULL);
-		if (!(io_req.bi_rw & REQ_UNPLUG))
-			prepare_unplug(job->kc, WRITE, job->dests[0].bdev);
 	}
 
 	return r;
@@ -466,6 +431,7 @@ static void do_work(struct work_struct *work)
 {
 	struct dm_kcopyd_client *kc = container_of(work,
 					struct dm_kcopyd_client, kcopyd_work);
+	struct blk_plug plug;
 
 	/*
 	 * The order that these are called is *very* important.
@@ -473,18 +439,12 @@ static void do_work(struct work_struct *work)
 	 * Pages jobs when successful will jump onto the io jobs
 	 * list.  io jobs call wake when they complete and it all
 	 * starts again.
-	 *
-	 * Note that io_jobs add block devices to the unplug array,
-	 * this array is cleared with "unplug" calls. It is thus
-	 * forbidden to run complete_jobs after io_jobs and before
-	 * unplug because the block device could be destroyed in
-	 * job completion callback.
 	 */
+	blk_start_plug(&plug);
 	process_jobs(&kc->complete_jobs, kc, run_complete_job);
 	process_jobs(&kc->pages_jobs, kc, run_pages_job);
 	process_jobs(&kc->io_jobs, kc, run_io_job);
-	unplug(kc, READ);
-	unplug(kc, WRITE);
+	blk_finish_plug(&plug);
 }
 
 /*
@@ -665,8 +625,6 @@ int dm_kcopyd_client_create(unsigned int nr_pages,
 	INIT_LIST_HEAD(&kc->io_jobs);
 	INIT_LIST_HEAD(&kc->pages_jobs);
 
-	memset(kc->unplug, 0, sizeof(kc->unplug));
-
 	kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
 	if (!kc->job_pool)
 		goto bad_slab;
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index b9e1e15ef11c..5ef136cdba91 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -394,7 +394,7 @@ static void raid_unplug(struct dm_target_callbacks *cb)
 {
 	struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
 
-	md_raid5_unplug_device(rs->md.private);
+	md_raid5_kick_device(rs->md.private);
 }
 
 /*
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index dee326775c60..976ad4688afc 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -842,8 +842,6 @@ static void do_mirror(struct work_struct *work)
 	do_reads(ms, &reads);
 	do_writes(ms, &writes);
 	do_failures(ms, &failures);
-
-	dm_table_unplug_all(ms->ti->table);
 }
 
 /*-----------------------------------------------------------------
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 38e4eb1bb965..f50a7b952257 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1275,29 +1275,6 @@ int dm_table_any_busy_target(struct dm_table *t)
 	return 0;
 }
 
-void dm_table_unplug_all(struct dm_table *t)
-{
-	struct dm_dev_internal *dd;
-	struct list_head *devices = dm_table_get_devices(t);
-	struct dm_target_callbacks *cb;
-
-	list_for_each_entry(dd, devices, list) {
-		struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev);
-		char b[BDEVNAME_SIZE];
-
-		if (likely(q))
-			blk_unplug(q);
-		else
-			DMWARN_LIMIT("%s: Cannot unplug nonexistent device %s",
-				     dm_device_name(t->md),
-				     bdevname(dd->dm_dev.bdev, b));
-	}
-
-	list_for_each_entry(cb, &t->target_callbacks, list)
-		if (cb->unplug_fn)
-			cb->unplug_fn(cb);
-}
-
 struct mapped_device *dm_table_get_md(struct dm_table *t)
 {
 	return t->md;
@@ -1345,4 +1322,3 @@ EXPORT_SYMBOL(dm_table_get_mode);
 EXPORT_SYMBOL(dm_table_get_md);
 EXPORT_SYMBOL(dm_table_put);
 EXPORT_SYMBOL(dm_table_get);
-EXPORT_SYMBOL(dm_table_unplug_all);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index eaa3af0e0632..d22b9905c168 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -807,8 +807,6 @@ void dm_requeue_unmapped_request(struct request *clone)
 	dm_unprep_request(rq);
 
 	spin_lock_irqsave(q->queue_lock, flags);
-	if (elv_queue_empty(q))
-		blk_plug_device(q);
 	blk_requeue_request(q, rq);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 
@@ -1613,10 +1611,10 @@ static void dm_request_fn(struct request_queue *q)
 	 * number of in-flight I/Os after the queue is stopped in
 	 * dm_suspend().
 	 */
-	while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
+	while (!blk_queue_stopped(q)) {
 		rq = blk_peek_request(q);
 		if (!rq)
-			goto plug_and_out;
+			goto delay_and_out;
 
 		/* always use block 0 to find the target for flushes for now */
 		pos = 0;
@@ -1627,7 +1625,7 @@ static void dm_request_fn(struct request_queue *q)
 		BUG_ON(!dm_target_is_valid(ti));
 
 		if (ti->type->busy && ti->type->busy(ti))
-			goto plug_and_out;
+			goto delay_and_out;
 
 		blk_start_request(rq);
 		clone = rq->special;
@@ -1647,11 +1645,8 @@ requeued:
 	BUG_ON(!irqs_disabled());
 	spin_lock(q->queue_lock);
 
-plug_and_out:
-	if (!elv_queue_empty(q))
-		/* Some requests still remain, retry later */
-		blk_plug_device(q);
-
+delay_and_out:
+	blk_delay_queue(q, HZ / 10);
 out:
 	dm_table_put(map);
 
@@ -1680,20 +1675,6 @@ static int dm_lld_busy(struct request_queue *q)
 	return r;
 }
 
-static void dm_unplug_all(struct request_queue *q)
-{
-	struct mapped_device *md = q->queuedata;
-	struct dm_table *map = dm_get_live_table(md);
-
-	if (map) {
-		if (dm_request_based(md))
-			generic_unplug_device(q);
-
-		dm_table_unplug_all(map);
-		dm_table_put(map);
-	}
-}
-
 static int dm_any_congested(void *congested_data, int bdi_bits)
 {
 	int r = bdi_bits;
@@ -1817,7 +1798,6 @@ static void dm_init_md_queue(struct mapped_device *md)
 	md->queue->backing_dev_info.congested_data = md;
 	blk_queue_make_request(md->queue, dm_request);
 	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
-	md->queue->unplug_fn = dm_unplug_all;
 	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
 	blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
 }
@@ -2263,8 +2243,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 	int r = 0;
 	DECLARE_WAITQUEUE(wait, current);
 
-	dm_unplug_all(md->queue);
-
 	add_wait_queue(&md->wait, &wait);
 
 	while (1) {
@@ -2539,7 +2517,6 @@ int dm_resume(struct mapped_device *md)
 
 	clear_bit(DMF_SUSPENDED, &md->flags);
 
-	dm_table_unplug_all(map);
 	r = 0;
 out:
 	dm_table_put(map);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 8a2f767f26d8..38861b5b9d90 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -87,22 +87,6 @@ static int linear_mergeable_bvec(struct request_queue *q,
 	return maxsectors << 9;
 }
 
-static void linear_unplug(struct request_queue *q)
-{
-	mddev_t *mddev = q->queuedata;
-	linear_conf_t *conf;
-	int i;
-
-	rcu_read_lock();
-	conf = rcu_dereference(mddev->private);
-
-	for (i=0; i < mddev->raid_disks; i++) {
-		struct request_queue *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev);
-		blk_unplug(r_queue);
-	}
-	rcu_read_unlock();
-}
-
 static int linear_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
@@ -225,7 +209,6 @@ static int linear_run (mddev_t *mddev)
 	md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
 
 	blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
-	mddev->queue->unplug_fn = linear_unplug;
 	mddev->queue->backing_dev_info.congested_fn = linear_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
 	md_integrity_register(mddev);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0cc30ecda4c1..ca0d79c264b9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4812,7 +4812,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		__md_stop_writes(mddev);
 		md_stop(mddev);
 		mddev->queue->merge_bvec_fn = NULL;
-		mddev->queue->unplug_fn = NULL;
 		mddev->queue->backing_dev_info.congested_fn = NULL;
 
 		/* tell userspace to handle 'inactive' */
@@ -6669,8 +6668,6 @@ EXPORT_SYMBOL_GPL(md_allow_write);
 
 void md_unplug(mddev_t *mddev)
 {
-	if (mddev->queue)
-		blk_unplug(mddev->queue);
 	if (mddev->plug)
 		mddev->plug->unplug_fn(mddev->plug);
 }
@@ -6853,7 +6850,6 @@ void md_do_sync(mddev_t *mddev)
 		     >= mddev->resync_max - mddev->curr_resync_completed
 			    )) {
 			/* time to update curr_resync_completed */
-			md_unplug(mddev);
 			wait_event(mddev->recovery_wait,
 				   atomic_read(&mddev->recovery_active) == 0);
 			mddev->curr_resync_completed = j;
@@ -6929,7 +6925,6 @@ void md_do_sync(mddev_t *mddev)
 		 * about not overloading the IO subsystem. (things like an
 		 * e2fsck being done on the RAID array should execute fast)
 		 */
-		md_unplug(mddev);
 		cond_resched();
 
 		currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
@@ -6948,8 +6943,6 @@ void md_do_sync(mddev_t *mddev)
 	 * this also signals 'finished resyncing' to md_stop
 	 */
  out:
-	md_unplug(mddev);
-
 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
 
 	/* tell personality that we are finished */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 6d7ddf32ef2e..1cc8ed44e4ad 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -106,36 +106,6 @@ static void multipath_end_request(struct bio *bio, int error)
 	rdev_dec_pending(rdev, conf->mddev);
 }
 
-static void unplug_slaves(mddev_t *mddev)
-{
-	multipath_conf_t *conf = mddev->private;
-	int i;
-
-	rcu_read_lock();
-	for (i=0; i<mddev->raid_disks; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
-		if (rdev && !test_bit(Faulty, &rdev->flags)
-		    && atomic_read(&rdev->nr_pending)) {
-			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
-			atomic_inc(&rdev->nr_pending);
-			rcu_read_unlock();
-
-			blk_unplug(r_queue);
-
-			rdev_dec_pending(rdev, mddev);
-			rcu_read_lock();
-		}
-	}
-	rcu_read_unlock();
-}
-
-static void multipath_unplug(struct request_queue *q)
-{
-	unplug_slaves(q->queuedata);
-}
-
-
 static int multipath_make_request(mddev_t *mddev, struct bio * bio)
 {
 	multipath_conf_t *conf = mddev->private;
@@ -518,7 +488,6 @@ static int multipath_run (mddev_t *mddev)
 	 */
 	md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));
 
-	mddev->queue->unplug_fn = multipath_unplug;
 	mddev->queue->backing_dev_info.congested_fn = multipath_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
 	md_integrity_register(mddev);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 637a96855edb..6338c0fe6208 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -25,21 +25,6 @@
 #include "raid0.h"
 #include "raid5.h"
 
-static void raid0_unplug(struct request_queue *q)
-{
-	mddev_t *mddev = q->queuedata;
-	raid0_conf_t *conf = mddev->private;
-	mdk_rdev_t **devlist = conf->devlist;
-	int raid_disks = conf->strip_zone[0].nb_dev;
-	int i;
-
-	for (i=0; i < raid_disks; i++) {
-		struct request_queue *r_queue = bdev_get_queue(devlist[i]->bdev);
-
-		blk_unplug(r_queue);
-	}
-}
-
 static int raid0_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
@@ -272,7 +257,6 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
 		       mdname(mddev),
 		       (unsigned long long)smallest->sectors);
 	}
-	mddev->queue->unplug_fn = raid0_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid0_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a23ffa397ba9..b67d822d57ae 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -52,23 +52,16 @@
 #define	NR_RAID1_BIOS 256
 
 
-static void unplug_slaves(mddev_t *mddev);
-
 static void allow_barrier(conf_t *conf);
 static void lower_barrier(conf_t *conf);
 
 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct pool_info *pi = data;
-	r1bio_t *r1_bio;
 	int size = offsetof(r1bio_t, bios[pi->raid_disks]);
 
 	/* allocate a r1bio with room for raid_disks entries in the bios array */
-	r1_bio = kzalloc(size, gfp_flags);
-	if (!r1_bio && pi->mddev)
-		unplug_slaves(pi->mddev);
-
-	return r1_bio;
+	return kzalloc(size, gfp_flags);
 }
 
 static void r1bio_pool_free(void *r1_bio, void *data)
@@ -91,10 +84,8 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 	int i, j;
 
 	r1_bio = r1bio_pool_alloc(gfp_flags, pi);
-	if (!r1_bio) {
-		unplug_slaves(pi->mddev);
+	if (!r1_bio)
 		return NULL;
-	}
 
 	/*
 	 * Allocate bios : 1 for reading, n-1 for writing
@@ -520,37 +511,6 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 	return new_disk;
 }
 
-static void unplug_slaves(mddev_t *mddev)
-{
-	conf_t *conf = mddev->private;
-	int i;
-
-	rcu_read_lock();
-	for (i=0; i<mddev->raid_disks; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
-		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
-			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
-			atomic_inc(&rdev->nr_pending);
-			rcu_read_unlock();
-
-			blk_unplug(r_queue);
-
-			rdev_dec_pending(rdev, mddev);
-			rcu_read_lock();
-		}
-	}
-	rcu_read_unlock();
-}
-
-static void raid1_unplug(struct request_queue *q)
-{
-	mddev_t *mddev = q->queuedata;
-
-	unplug_slaves(mddev);
-	md_wakeup_thread(mddev->thread);
-}
-
 static int raid1_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
@@ -580,20 +540,16 @@ static int raid1_congested(void *data, int bits)
 }
 
 
-static int flush_pending_writes(conf_t *conf)
+static void flush_pending_writes(conf_t *conf)
 {
 	/* Any writes that have been queued but are awaiting
 	 * bitmap updates get flushed here.
-	 * We return 1 if any requests were actually submitted.
 	 */
-	int rv = 0;
-
 	spin_lock_irq(&conf->device_lock);
 
 	if (conf->pending_bio_list.head) {
 		struct bio *bio;
 		bio = bio_list_get(&conf->pending_bio_list);
-		blk_remove_plug(conf->mddev->queue);
 		spin_unlock_irq(&conf->device_lock);
 		/* flush any pending bitmap writes to
 		 * disk before proceeding w/ I/O */
@@ -605,10 +561,14 @@ static int flush_pending_writes(conf_t *conf)
 			generic_make_request(bio);
 			bio = next;
 		}
-		rv = 1;
 	} else
 		spin_unlock_irq(&conf->device_lock);
-	return rv;
+}
+
+static void md_kick_device(mddev_t *mddev)
+{
+	blk_flush_plug(current);
+	md_wakeup_thread(mddev->thread);
 }
 
 /* Barriers....
@@ -640,8 +600,7 @@ static void raise_barrier(conf_t *conf)
 
 	/* Wait until no block IO is waiting */
 	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
-			    conf->resync_lock,
-			    raid1_unplug(conf->mddev->queue));
+			    conf->resync_lock, md_kick_device(conf->mddev));
 
 	/* block any new IO from starting */
 	conf->barrier++;
@@ -649,8 +608,7 @@ static void raise_barrier(conf_t *conf)
 	/* Now wait for all pending IO to complete */
 	wait_event_lock_irq(conf->wait_barrier,
 			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock,
-			    raid1_unplug(conf->mddev->queue));
+			    conf->resync_lock, md_kick_device(conf->mddev));
 
 	spin_unlock_irq(&conf->resync_lock);
 }
@@ -672,7 +630,7 @@ static void wait_barrier(conf_t *conf)
 		conf->nr_waiting++;
 		wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
 				    conf->resync_lock,
-				    raid1_unplug(conf->mddev->queue));
+				    md_kick_device(conf->mddev));
 		conf->nr_waiting--;
 	}
 	conf->nr_pending++;
@@ -709,7 +667,7 @@ static void freeze_array(conf_t *conf)
 			    conf->nr_pending == conf->nr_queued+1,
 			    conf->resync_lock,
 			    ({ flush_pending_writes(conf);
-			       raid1_unplug(conf->mddev->queue); }));
+			       md_kick_device(conf->mddev); }));
 	spin_unlock_irq(&conf->resync_lock);
 }
 static void unfreeze_array(conf_t *conf)
@@ -959,7 +917,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		atomic_inc(&r1_bio->remaining);
 		spin_lock_irqsave(&conf->device_lock, flags);
 		bio_list_add(&conf->pending_bio_list, mbio);
-		blk_plug_device(mddev->queue);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 	r1_bio_write_done(r1_bio, bio->bi_vcnt, behind_pages, behind_pages != NULL);
@@ -968,7 +925,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	/* In case raid1d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
 
-	if (do_sync)
+	if (do_sync || !bitmap)
 		md_wakeup_thread(mddev->thread);
 
 	return 0;
@@ -1558,7 +1515,6 @@ static void raid1d(mddev_t *mddev)
 	unsigned long flags;
 	conf_t *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
-	int unplug=0;
 	mdk_rdev_t *rdev;
 
 	md_check_recovery(mddev);
@@ -1566,7 +1522,7 @@ static void raid1d(mddev_t *mddev)
 	for (;;) {
 		char b[BDEVNAME_SIZE];
 
-		unplug += flush_pending_writes(conf);
+		flush_pending_writes(conf);
 
 		spin_lock_irqsave(&conf->device_lock, flags);
 		if (list_empty(head)) {
@@ -1580,10 +1536,9 @@ static void raid1d(mddev_t *mddev)
 
 		mddev = r1_bio->mddev;
 		conf = mddev->private;
-		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
+		if (test_bit(R1BIO_IsSync, &r1_bio->state))
 			sync_request_write(mddev, r1_bio);
-			unplug = 1;
-		} else {
+		else {
 			int disk;
 
 			/* we got a read error. Maybe the drive is bad.  Maybe just
@@ -1633,14 +1588,11 @@ static void raid1d(mddev_t *mddev)
 				bio->bi_end_io = raid1_end_read_request;
 				bio->bi_rw = READ | do_sync;
 				bio->bi_private = r1_bio;
-				unplug = 1;
 				generic_make_request(bio);
 			}
 		}
 		cond_resched();
 	}
-	if (unplug)
-		unplug_slaves(mddev);
 }
 
 
@@ -2064,7 +2016,6 @@ static int run(mddev_t *mddev)
 
 	md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
 
-	mddev->queue->unplug_fn = raid1_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid1_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
 	md_integrity_register(mddev);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3b607b28741b..e79f1c5bf71b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -57,23 +57,16 @@
  */
 #define	NR_RAID10_BIOS 256
 
-static void unplug_slaves(mddev_t *mddev);
-
 static void allow_barrier(conf_t *conf);
 static void lower_barrier(conf_t *conf);
 
 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	conf_t *conf = data;
-	r10bio_t *r10_bio;
 	int size = offsetof(struct r10bio_s, devs[conf->copies]);
 
 	/* allocate a r10bio with room for raid_disks entries in the bios array */
-	r10_bio = kzalloc(size, gfp_flags);
-	if (!r10_bio && conf->mddev)
-		unplug_slaves(conf->mddev);
-
-	return r10_bio;
+	return kzalloc(size, gfp_flags);
 }
 
 static void r10bio_pool_free(void *r10_bio, void *data)
@@ -106,10 +99,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 	int nalloc;
 
 	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
-	if (!r10_bio) {
-		unplug_slaves(conf->mddev);
+	if (!r10_bio)
 		return NULL;
-	}
 
 	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
 		nalloc = conf->copies; /* resync */
@@ -597,37 +588,6 @@ rb_out:
 	return disk;
 }
 
-static void unplug_slaves(mddev_t *mddev)
-{
-	conf_t *conf = mddev->private;
-	int i;
-
-	rcu_read_lock();
-	for (i=0; i < conf->raid_disks; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
-		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
-			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
-			atomic_inc(&rdev->nr_pending);
-			rcu_read_unlock();
-
-			blk_unplug(r_queue);
-
-			rdev_dec_pending(rdev, mddev);
-			rcu_read_lock();
-		}
-	}
-	rcu_read_unlock();
-}
-
-static void raid10_unplug(struct request_queue *q)
-{
-	mddev_t *mddev = q->queuedata;
-
-	unplug_slaves(q->queuedata);
-	md_wakeup_thread(mddev->thread);
-}
-
 static int raid10_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
@@ -649,20 +609,16 @@ static int raid10_congested(void *data, int bits)
 	return ret;
 }
 
-static int flush_pending_writes(conf_t *conf)
+static void flush_pending_writes(conf_t *conf)
 {
 	/* Any writes that have been queued but are awaiting
 	 * bitmap updates get flushed here.
-	 * We return 1 if any requests were actually submitted.
 	 */
-	int rv = 0;
-
 	spin_lock_irq(&conf->device_lock);
 
 	if (conf->pending_bio_list.head) {
 		struct bio *bio;
 		bio = bio_list_get(&conf->pending_bio_list);
-		blk_remove_plug(conf->mddev->queue);
 		spin_unlock_irq(&conf->device_lock);
 		/* flush any pending bitmap writes to disk
 		 * before proceeding w/ I/O */
@@ -674,11 +630,16 @@ static int flush_pending_writes(conf_t *conf)
 			generic_make_request(bio);
 			bio = next;
 		}
-		rv = 1;
 	} else
 		spin_unlock_irq(&conf->device_lock);
-	return rv;
 }
+
+static void md_kick_device(mddev_t *mddev)
+{
+	blk_flush_plug(current);
+	md_wakeup_thread(mddev->thread);
+}
+
 /* Barriers....
  * Sometimes we need to suspend IO while we do something else,
  * either some resync/recovery, or reconfigure the array.
@@ -708,8 +669,7 @@ static void raise_barrier(conf_t *conf, int force)
 
 	/* Wait until no block IO is waiting (unless 'force') */
 	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
-			    conf->resync_lock,
-			    raid10_unplug(conf->mddev->queue));
+			    conf->resync_lock, md_kick_device(conf->mddev));
 
 	/* block any new IO from starting */
 	conf->barrier++;
@@ -717,8 +677,7 @@ static void raise_barrier(conf_t *conf, int force)
 	/* No wait for all pending IO to complete */
 	wait_event_lock_irq(conf->wait_barrier,
 			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock,
-			    raid10_unplug(conf->mddev->queue));
+			    conf->resync_lock, md_kick_device(conf->mddev));
 
 	spin_unlock_irq(&conf->resync_lock);
 }
@@ -739,7 +698,7 @@ static void wait_barrier(conf_t *conf)
 		conf->nr_waiting++;
 		wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
 				    conf->resync_lock,
-				    raid10_unplug(conf->mddev->queue));
+				    md_kick_device(conf->mddev));
 		conf->nr_waiting--;
 	}
 	conf->nr_pending++;
@@ -776,7 +735,7 @@ static void freeze_array(conf_t *conf)
 			    conf->nr_pending == conf->nr_queued+1,
 			    conf->resync_lock,
 			    ({ flush_pending_writes(conf);
-			       raid10_unplug(conf->mddev->queue); }));
+			       md_kick_device(conf->mddev); }));
 	spin_unlock_irq(&conf->resync_lock);
 }
 
@@ -971,7 +930,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		atomic_inc(&r10_bio->remaining);
 		spin_lock_irqsave(&conf->device_lock, flags);
 		bio_list_add(&conf->pending_bio_list, mbio);
-		blk_plug_device(mddev->queue);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 
@@ -988,7 +946,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	/* In case raid10d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
 
-	if (do_sync)
+	if (do_sync || !mddev->bitmap)
 		md_wakeup_thread(mddev->thread);
 
 	return 0;
@@ -1681,7 +1639,6 @@ static void raid10d(mddev_t *mddev)
 	unsigned long flags;
 	conf_t *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
-	int unplug=0;
 	mdk_rdev_t *rdev;
 
 	md_check_recovery(mddev);
@@ -1689,7 +1646,7 @@ static void raid10d(mddev_t *mddev)
 	for (;;) {
 		char b[BDEVNAME_SIZE];
 
-		unplug += flush_pending_writes(conf);
+		flush_pending_writes(conf);
 
 		spin_lock_irqsave(&conf->device_lock, flags);
 		if (list_empty(head)) {
@@ -1703,13 +1660,11 @@ static void raid10d(mddev_t *mddev)
 
 		mddev = r10_bio->mddev;
 		conf = mddev->private;
-		if (test_bit(R10BIO_IsSync, &r10_bio->state)) {
+		if (test_bit(R10BIO_IsSync, &r10_bio->state))
 			sync_request_write(mddev, r10_bio);
-			unplug = 1;
-		} else 	if (test_bit(R10BIO_IsRecover, &r10_bio->state)) {
+		else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
 			recovery_request_write(mddev, r10_bio);
-			unplug = 1;
-		} else {
+		else {
 			int mirror;
 			/* we got a read error. Maybe the drive is bad.  Maybe just
 			 * the block and we can fix it.
@@ -1756,14 +1711,11 @@ static void raid10d(mddev_t *mddev)
 				bio->bi_rw = READ | do_sync;
 				bio->bi_private = r10_bio;
 				bio->bi_end_io = raid10_end_read_request;
-				unplug = 1;
 				generic_make_request(bio);
 			}
 		}
 		cond_resched();
 	}
-	if (unplug)
-		unplug_slaves(mddev);
 }
 
 
@@ -2376,7 +2328,6 @@ static int run(mddev_t *mddev)
 	md_set_array_sectors(mddev, size);
 	mddev->resync_max_sectors = size;
 
-	mddev->queue->unplug_fn = raid10_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid10_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 702812824195..e867ee42b152 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -433,8 +433,6 @@ static int has_failed(raid5_conf_t *conf)
 	return 0;
 }
 
-static void unplug_slaves(mddev_t *mddev);
-
 static struct stripe_head *
 get_active_stripe(raid5_conf_t *conf, sector_t sector,
 		  int previous, int noblock, int noquiesce)
@@ -463,8 +461,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector,
 						     < (conf->max_nr_stripes *3/4)
 						     || !conf->inactive_blocked),
 						    conf->device_lock,
-						    md_raid5_unplug_device(conf)
-					);
+						    md_raid5_kick_device(conf));
 				conf->inactive_blocked = 0;
 			} else
 				init_stripe(sh, sector, previous);
@@ -1473,8 +1470,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 		wait_event_lock_irq(conf->wait_for_stripe,
 				    !list_empty(&conf->inactive_list),
 				    conf->device_lock,
-				    unplug_slaves(conf->mddev)
-			);
+				    blk_flush_plug(current));
 		osh = get_free_stripe(conf);
 		spin_unlock_irq(&conf->device_lock);
 		atomic_set(&nsh->count, 1);
@@ -3645,58 +3641,19 @@ static void activate_bit_delay(raid5_conf_t *conf)
 	}
 }
 
-static void unplug_slaves(mddev_t *mddev)
+void md_raid5_kick_device(raid5_conf_t *conf)
 {
-	raid5_conf_t *conf = mddev->private;
-	int i;
-	int devs = max(conf->raid_disks, conf->previous_raid_disks);
-
-	rcu_read_lock();
-	for (i = 0; i < devs; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
-		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
-			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
-			atomic_inc(&rdev->nr_pending);
-			rcu_read_unlock();
-
-			blk_unplug(r_queue);
-
-			rdev_dec_pending(rdev, mddev);
-			rcu_read_lock();
-		}
-	}
-	rcu_read_unlock();
-}
-
-void md_raid5_unplug_device(raid5_conf_t *conf)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&conf->device_lock, flags);
-
-	if (plugger_remove_plug(&conf->plug)) {
-		conf->seq_flush++;
-		raid5_activate_delayed(conf);
-	}
+	blk_flush_plug(current);
+	raid5_activate_delayed(conf);
 	md_wakeup_thread(conf->mddev->thread);
-
-	spin_unlock_irqrestore(&conf->device_lock, flags);
-
-	unplug_slaves(conf->mddev);
 }
-EXPORT_SYMBOL_GPL(md_raid5_unplug_device);
+EXPORT_SYMBOL_GPL(md_raid5_kick_device);
 
 static void raid5_unplug(struct plug_handle *plug)
 {
 	raid5_conf_t *conf = container_of(plug, raid5_conf_t, plug);
-	md_raid5_unplug_device(conf);
-}
 
-static void raid5_unplug_queue(struct request_queue *q)
-{
-	mddev_t *mddev = q->queuedata;
-	md_raid5_unplug_device(mddev->private);
+	md_raid5_kick_device(conf);
 }
 
 int md_raid5_congested(mddev_t *mddev, int bits)
@@ -4100,7 +4057,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 				 * add failed due to overlap.  Flush everything
 				 * and wait a while
 				 */
-				md_raid5_unplug_device(conf);
+				md_raid5_kick_device(conf);
 				release_stripe(sh);
 				schedule();
 				goto retry;
@@ -4365,7 +4322,6 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 
 	if (sector_nr >= max_sector) {
 		/* just being told to finish up .. nothing much to do */
-		unplug_slaves(mddev);
 
 		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
 			end_reshape(conf);
@@ -4569,7 +4525,6 @@ static void raid5d(mddev_t *mddev)
 	spin_unlock_irq(&conf->device_lock);
 
 	async_tx_issue_pending_all();
-	unplug_slaves(mddev);
 
 	pr_debug("--- raid5d inactive\n");
 }
@@ -5205,7 +5160,6 @@ static int run(mddev_t *mddev)
 		mddev->queue->backing_dev_info.congested_data = mddev;
 		mddev->queue->backing_dev_info.congested_fn = raid5_congested;
 		mddev->queue->queue_lock = &conf->device_lock;
-		mddev->queue->unplug_fn = raid5_unplug_queue;
 
 		chunk_size = mddev->chunk_sectors << 9;
 		blk_queue_io_min(mddev->queue, chunk_size);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 2ace0582b409..8d563a4f022a 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -503,6 +503,6 @@ static inline int algorithm_is_DDF(int layout)
 }
 
 extern int md_raid5_congested(mddev_t *mddev, int bits);
-extern void md_raid5_unplug_device(raid5_conf_t *conf);
+extern void md_raid5_kick_device(raid5_conf_t *conf);
 extern int raid5_set_cache_size(mddev_t *mddev, int size);
 #endif
-- 
cgit v1.2.3


From 721a9602e6607417c6bc15b18e97a2f35266c690 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Wed, 9 Mar 2011 11:56:30 +0100
Subject: block: kill off REQ_UNPLUG

With the plugging now being explicitly controlled by the
submitter, callers need not pass down unplugging hints
to the block layer. If they want to unplug, it's because they
manually plugged on their own - in which case, they should just
unplug at will.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/md/bitmap.c    | 2 +-
 drivers/md/dm-io.c     | 2 +-
 drivers/md/dm-kcopyd.c | 5 +----
 drivers/md/md.c        | 5 ++---
 4 files changed, 5 insertions(+), 9 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 54bfc274b39a..ca203cb23f3c 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -347,7 +347,7 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait)
 			atomic_inc(&bitmap->pending_writes);
 			set_buffer_locked(bh);
 			set_buffer_mapped(bh);
-			submit_bh(WRITE | REQ_UNPLUG | REQ_SYNC, bh);
+			submit_bh(WRITE | REQ_SYNC, bh);
 			bh = bh->b_this_page;
 		}
 
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 136d4f71a116..76a5af00a26b 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -352,7 +352,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
 	BUG_ON(num_regions > DM_IO_MAX_REGIONS);
 
 	if (sync)
-		rw |= REQ_SYNC | REQ_UNPLUG;
+		rw |= REQ_SYNC;
 
 	/*
 	 * For multiple regions we need to be careful to rewind
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 400cf35094a4..1bb73a13ca40 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -356,11 +356,8 @@ static int run_io_job(struct kcopyd_job *job)
 
 	if (job->rw == READ)
 		r = dm_io(&io_req, 1, &job->source, NULL);
-	else {
-		if (job->num_dests > 1)
-			io_req.bi_rw |= REQ_UNPLUG;
+	else
 		r = dm_io(&io_req, job->num_dests, job->dests, NULL);
-	}
 
 	return r;
 }
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ca0d79c264b9..28f9c1ee4e3a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -777,8 +777,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 	bio->bi_end_io = super_written;
 
 	atomic_inc(&mddev->pending_writes);
-	submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA,
-		   bio);
+	submit_bio(REQ_WRITE | REQ_SYNC | REQ_FLUSH | REQ_FUA, bio);
 }
 
 void md_super_wait(mddev_t *mddev)
@@ -806,7 +805,7 @@ int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
 	struct completion event;
 	int ret;
 
-	rw |= REQ_SYNC | REQ_UNPLUG;
+	rw |= REQ_SYNC;
 
 	bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
 		rdev->meta_bdev : rdev->bdev;
-- 
cgit v1.2.3


From a91a2785b200864aef2270ed6a3babac7a253a20 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Thu, 17 Mar 2011 11:11:05 +0100
Subject: block: Require subsystems to explicitly allocate bio_set integrity
 mempool

MD and DM create a new bio_set for every metadevice. Each bio_set has an
integrity mempool attached regardless of whether the metadevice is
capable of passing integrity metadata. This is a waste of memory.

Instead we defer the allocation decision to MD and DM since we know at
metadevice creation time whether integrity passthrough is needed or not.

Automatic integrity mempool allocation can then be removed from
bioset_create() and we make an explicit integrity allocation for the
fs_bio_set.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Reported-by: Zdenek Kabelac <zkabelac@redhat.com>
Acked-by: Mike Snitzer <snizer@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/md/dm-table.c  |  7 +++++--
 drivers/md/dm.c        | 12 +++++++++---
 drivers/md/dm.h        |  2 +-
 drivers/md/linear.c    |  3 +--
 drivers/md/md.c        |  8 ++++++--
 drivers/md/multipath.c |  7 +++++--
 drivers/md/raid0.c     |  3 +--
 drivers/md/raid1.c     |  5 ++---
 drivers/md/raid10.c    |  7 +++++--
 9 files changed, 35 insertions(+), 19 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index f50a7b952257..416d4e258df6 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -55,6 +55,7 @@ struct dm_table {
 	struct dm_target *targets;
 
 	unsigned discards_supported:1;
+	unsigned integrity_supported:1;
 
 	/*
 	 * Indicates the rw permissions for the new logical
@@ -859,7 +860,7 @@ int dm_table_alloc_md_mempools(struct dm_table *t)
 		return -EINVAL;
 	}
 
-	t->mempools = dm_alloc_md_mempools(type);
+	t->mempools = dm_alloc_md_mempools(type, t->integrity_supported);
 	if (!t->mempools)
 		return -ENOMEM;
 
@@ -935,8 +936,10 @@ static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device
 	struct dm_dev_internal *dd;
 
 	list_for_each_entry(dd, devices, list)
-		if (bdev_get_integrity(dd->dm_dev.bdev))
+		if (bdev_get_integrity(dd->dm_dev.bdev)) {
+			t->integrity_supported = 1;
 			return blk_integrity_register(dm_disk(md), NULL);
+		}
 
 	return 0;
 }
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index d22b9905c168..88820704a191 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2620,9 +2620,10 @@ int dm_noflush_suspending(struct dm_target *ti)
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
+struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
 {
 	struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
+	unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS;
 
 	if (!pools)
 		return NULL;
@@ -2639,13 +2640,18 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
 	if (!pools->tio_pool)
 		goto free_io_pool_and_out;
 
-	pools->bs = (type == DM_TYPE_BIO_BASED) ?
-		    bioset_create(16, 0) : bioset_create(MIN_IOS, 0);
+	pools->bs = bioset_create(pool_size, 0);
 	if (!pools->bs)
 		goto free_tio_pool_and_out;
 
+	if (integrity && bioset_integrity_create(pools->bs, pool_size))
+		goto free_bioset_and_out;
+
 	return pools;
 
+free_bioset_and_out:
+	bioset_free(pools->bs);
+
 free_tio_pool_and_out:
 	mempool_destroy(pools->tio_pool);
 
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 0c2dd5f4af76..1aaf16746da8 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -149,7 +149,7 @@ void dm_kcopyd_exit(void);
 /*
  * Mempool operations
  */
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type);
+struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity);
 void dm_free_md_mempools(struct dm_md_mempools *pools);
 
 #endif
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 338804f8fb3b..abfb59a61ede 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -210,8 +210,7 @@ static int linear_run (mddev_t *mddev)
 	blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
 	mddev->queue->backing_dev_info.congested_fn = linear_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
-	md_integrity_register(mddev);
-	return 0;
+	return md_integrity_register(mddev);
 }
 
 static void free_conf(struct rcu_head *head)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 86ba66c0b28a..1876761f1828 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1803,8 +1803,12 @@ int md_integrity_register(mddev_t *mddev)
 			mdname(mddev));
 		return -EINVAL;
 	}
-	printk(KERN_NOTICE "md: data integrity on %s enabled\n",
-		mdname(mddev));
+	printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev));
+	if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
+		printk(KERN_ERR "md: failed to create integrity pool for %s\n",
+		       mdname(mddev));
+		return -EINVAL;
+	}
 	return 0;
 }
 EXPORT_SYMBOL(md_integrity_register);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 5e694b151c30..c35890990985 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -315,7 +315,7 @@ static int multipath_remove_disk(mddev_t *mddev, int number)
 			p->rdev = rdev;
 			goto abort;
 		}
-		md_integrity_register(mddev);
+		err = md_integrity_register(mddev);
 	}
 abort:
 
@@ -489,7 +489,10 @@ static int multipath_run (mddev_t *mddev)
 
 	mddev->queue->backing_dev_info.congested_fn = multipath_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
-	md_integrity_register(mddev);
+
+	if (md_integrity_register(mddev))
+		goto out_free_conf;
+
 	return 0;
 
 out_free_conf:
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 95916fd6394a..e86bf3682e1e 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -379,8 +379,7 @@ static int raid0_run(mddev_t *mddev)
 
 	blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
 	dump_zones(mddev);
-	md_integrity_register(mddev);
-	return 0;
+	return md_integrity_register(mddev);
 }
 
 static int raid0_stop(mddev_t *mddev)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 8f34ad5c478b..c2a21ae56d97 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1132,7 +1132,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
 			p->rdev = rdev;
 			goto abort;
 		}
-		md_integrity_register(mddev);
+		err = md_integrity_register(mddev);
 	}
 abort:
 
@@ -2017,8 +2017,7 @@ static int run(mddev_t *mddev)
 
 	mddev->queue->backing_dev_info.congested_fn = raid1_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
-	md_integrity_register(mddev);
-	return 0;
+	return md_integrity_register(mddev);
 }
 
 static int stop(mddev_t *mddev)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c0d0f5f7e407..f7b62370b374 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1188,7 +1188,7 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
 			p->rdev = rdev;
 			goto abort;
 		}
-		md_integrity_register(mddev);
+		err = md_integrity_register(mddev);
 	}
 abort:
 
@@ -2343,7 +2343,10 @@ static int run(mddev_t *mddev)
 
 	if (conf->near_copies < conf->raid_disks)
 		blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
-	md_integrity_register(mddev);
+
+	if (md_integrity_register(mddev))
+		goto out_free_conf;
+
 	return 0;
 
 out_free_conf:
-- 
cgit v1.2.3


From 1e9bb8808ac11094d711d20d580e7b45a4992d0c Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Tue, 22 Mar 2011 08:35:35 +0100
Subject: block: fix non-atomic access to genhd inflight structures

After the stack plugging introduction, these are called lockless.
Ensure that the counters are updated atomically.

Signed-off-by: Shaohua Li<shaohua.li@intel.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/md/dm.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 88820704a191..0cf68b478878 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -477,7 +477,8 @@ static void start_io_acct(struct dm_io *io)
 	cpu = part_stat_lock();
 	part_round_stats(cpu, &dm_disk(md)->part0);
 	part_stat_unlock();
-	dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]);
+	atomic_set(&dm_disk(md)->part0.in_flight[rw],
+		atomic_inc_return(&md->pending[rw]));
 }
 
 static void end_io_acct(struct dm_io *io)
@@ -497,8 +498,8 @@ static void end_io_acct(struct dm_io *io)
 	 * After this is decremented the bio must not be touched if it is
 	 * a flush.
 	 */
-	dm_disk(md)->part0.in_flight[rw] = pending =
-		atomic_dec_return(&md->pending[rw]);
+	pending = atomic_dec_return(&md->pending[rw]);
+	atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
 	pending += atomic_read(&md->pending[rw^0x1]);
 
 	/* nudge anyone waiting on suspend queue */
-- 
cgit v1.2.3


From 6b33aff368def952be78102c0935ebd219f9a748 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 23 Mar 2011 16:42:13 -0700
Subject: md: use little-endian bitops

As a preparation for removing ext2 non-atomic bit operations from
asm/bitops.h.  This converts ext2 non-atomic bit operations to
little-endian bit operations.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Acked-by: NeilBrown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/md/bitmap.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 9a35320fb59f..a2ce0b2da281 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -854,7 +854,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 		if (bitmap->flags & BITMAP_HOSTENDIAN)
 			set_bit(bit, kaddr);
 		else
-			ext2_set_bit(bit, kaddr);
+			__test_and_set_bit_le(bit, kaddr);
 		kunmap_atomic(kaddr, KM_USER0);
 		PRINTK("set file bit %lu page %lu\n", bit, page->index);
 	}
@@ -1050,7 +1050,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 		if (bitmap->flags & BITMAP_HOSTENDIAN)
 			b = test_bit(bit, paddr);
 		else
-			b = ext2_test_bit(bit, paddr);
+			b = test_bit_le(bit, paddr);
 		kunmap_atomic(paddr, KM_USER0);
 		if (b) {
 			/* if the disk bit is set, set the memory bit */
@@ -1226,7 +1226,7 @@ void bitmap_daemon_work(mddev_t *mddev)
 						clear_bit(file_page_offset(bitmap, j),
 							  paddr);
 					else
-						ext2_clear_bit(file_page_offset(bitmap, j),
+						__test_and_clear_bit_le(file_page_offset(bitmap, j),
 							       paddr);
 					kunmap_atomic(paddr, KM_USER0);
 				} else
-- 
cgit v1.2.3


From bb5cda3d706f44e5696533c9a7353c458f2871e0 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 23 Mar 2011 16:42:13 -0700
Subject: dm: use little-endian bitops

As a preparation for removing ext2 non-atomic bit operations from
asm/bitops.h.  This converts ext2 non-atomic bit operations to
little-endian bit operations.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Alasdair Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/md/dm-log.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 6951536ea29c..57968eb382c1 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -251,20 +251,20 @@ struct log_c {
  */
 static inline int log_test_bit(uint32_t *bs, unsigned bit)
 {
-	return ext2_test_bit(bit, (unsigned long *) bs) ? 1 : 0;
+	return test_bit_le(bit, (unsigned long *) bs) ? 1 : 0;
 }
 
 static inline void log_set_bit(struct log_c *l,
 			       uint32_t *bs, unsigned bit)
 {
-	ext2_set_bit(bit, (unsigned long *) bs);
+	__test_and_set_bit_le(bit, (unsigned long *) bs);
 	l->touched_cleaned = 1;
 }
 
 static inline void log_clear_bit(struct log_c *l,
 				 uint32_t *bs, unsigned bit)
 {
-	ext2_clear_bit(bit, (unsigned long *) bs);
+	__test_and_clear_bit_le(bit, (unsigned long *) bs);
 	l->touched_dirtied = 1;
 }
 
@@ -740,7 +740,7 @@ static int core_get_resync_work(struct dm_dirty_log *log, region_t *region)
 		return 0;
 
 	do {
-		*region = ext2_find_next_zero_bit(
+		*region = find_next_zero_bit_le(
 					     (unsigned long *) lc->sync_bits,
 					     lc->region_count,
 					     lc->sync_search);
-- 
cgit v1.2.3


From 024d37e95ec4a7ccc256973ab2feab01f4fbdd2d Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Thu, 24 Mar 2011 13:52:14 +0000
Subject: dm: fix opening log and cow devices for read only tables

If a table is read-only, also open any log and cow devices it uses read-only.

Previously, even read-only devices were opened read-write internally.
After patch 75f1dc0d076d1c1168f2115f1941ea627d38bd5a
  block: check bdev_read_only() from blkdev_get()
was applied, loading such tables began to fail.  The patch
was reverted by e51900f7d38cbcfb481d84567fd92540e7e1d23a
  block: revert block_dev read-only check
but this patch fixes this part of the code to work with the original patch.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log.c  | 2 +-
 drivers/md/dm-snap.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 6951536ea29c..8e8a868ca857 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -543,7 +543,7 @@ static int disk_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 		return -EINVAL;
 	}
 
-	r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &dev);
+	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev);
 	if (r)
 		return r;
 
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index fdde53cd12b7..a2d330942cb2 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1080,7 +1080,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	argv++;
 	argc--;
 
-	r = dm_get_device(ti, cow_path, FMODE_READ | FMODE_WRITE, &s->cow);
+	r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow);
 	if (r) {
 		ti->error = "Cannot get COW device";
 		goto bad_cow;
-- 
cgit v1.2.3


From 3407ef5262b55ca5d7139d2b555ef792fe531eec Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 24 Mar 2011 13:54:24 +0000
Subject: dm: add flakey target

This target is the same as the linear target except that it returns I/O
errors periodically.  It's been found useful in simulating failing
devices for testing purposes.

I needed a dm target to do some failure testing on btrfs's raid code, and
Mike pointed me at this.

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/Kconfig     |   6 ++
 drivers/md/Makefile    |   1 +
 drivers/md/dm-flakey.c | 212 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 219 insertions(+)
 create mode 100644 drivers/md/dm-flakey.c

(limited to 'drivers/md')

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 98d9ec85e0eb..8420129fc5ee 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -327,4 +327,10 @@ config DM_UEVENT
 	---help---
 	Generate udev events for DM events.
 
+config DM_FLAKEY
+       tristate "Flakey target (EXPERIMENTAL)"
+       depends on BLK_DEV_DM && EXPERIMENTAL
+       ---help---
+         A target that intermittently fails I/O for debugging purposes.
+
 endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index d0138606c2e8..448838b1f92a 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_BLK_DEV_MD)	+= md-mod.o
 obj-$(CONFIG_BLK_DEV_DM)	+= dm-mod.o
 obj-$(CONFIG_DM_CRYPT)		+= dm-crypt.o
 obj-$(CONFIG_DM_DELAY)		+= dm-delay.o
+obj-$(CONFIG_DM_FLAKEY)		+= dm-flakey.o
 obj-$(CONFIG_DM_MULTIPATH)	+= dm-multipath.o dm-round-robin.o
 obj-$(CONFIG_DM_MULTIPATH_QL)	+= dm-queue-length.o
 obj-$(CONFIG_DM_MULTIPATH_ST)	+= dm-service-time.o
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
new file mode 100644
index 000000000000..ea790623c30b
--- /dev/null
+++ b/drivers/md/dm-flakey.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright (C) 2003 Sistina Software (UK) Limited.
+ * Copyright (C) 2004, 2010 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/device-mapper.h>
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+
+#define DM_MSG_PREFIX "flakey"
+
+/*
+ * Flakey: Used for testing only, simulates intermittent,
+ * catastrophic device failure.
+ */
+struct flakey_c {
+	struct dm_dev *dev;
+	unsigned long start_time;
+	sector_t start;
+	unsigned up_interval;
+	unsigned down_interval;
+};
+
+/*
+ * Construct a flakey mapping: <dev_path> <offset> <up interval> <down interval>
+ */
+static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct flakey_c *fc;
+	unsigned long long tmp;
+
+	if (argc != 4) {
+		ti->error = "dm-flakey: Invalid argument count";
+		return -EINVAL;
+	}
+
+	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
+	if (!fc) {
+		ti->error = "dm-flakey: Cannot allocate linear context";
+		return -ENOMEM;
+	}
+	fc->start_time = jiffies;
+
+	if (sscanf(argv[1], "%llu", &tmp) != 1) {
+		ti->error = "dm-flakey: Invalid device sector";
+		goto bad;
+	}
+	fc->start = tmp;
+
+	if (sscanf(argv[2], "%u", &fc->up_interval) != 1) {
+		ti->error = "dm-flakey: Invalid up interval";
+		goto bad;
+	}
+
+	if (sscanf(argv[3], "%u", &fc->down_interval) != 1) {
+		ti->error = "dm-flakey: Invalid down interval";
+		goto bad;
+	}
+
+	if (!(fc->up_interval + fc->down_interval)) {
+		ti->error = "dm-flakey: Total (up + down) interval is zero";
+		goto bad;
+	}
+
+	if (fc->up_interval + fc->down_interval < fc->up_interval) {
+		ti->error = "dm-flakey: Interval overflow";
+		goto bad;
+	}
+
+	if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &fc->dev)) {
+		ti->error = "dm-flakey: Device lookup failed";
+		goto bad;
+	}
+
+	ti->num_flush_requests = 1;
+	ti->private = fc;
+	return 0;
+
+bad:
+	kfree(fc);
+	return -EINVAL;
+}
+
+static void flakey_dtr(struct dm_target *ti)
+{
+	struct flakey_c *fc = ti->private;
+
+	dm_put_device(ti, fc->dev);
+	kfree(fc);
+}
+
+static sector_t flakey_map_sector(struct dm_target *ti, sector_t bi_sector)
+{
+	struct flakey_c *fc = ti->private;
+
+	return fc->start + (bi_sector - ti->begin);
+}
+
+static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
+{
+	struct flakey_c *fc = ti->private;
+
+	bio->bi_bdev = fc->dev->bdev;
+	if (bio_sectors(bio))
+		bio->bi_sector = flakey_map_sector(ti, bio->bi_sector);
+}
+
+static int flakey_map(struct dm_target *ti, struct bio *bio,
+		      union map_info *map_context)
+{
+	struct flakey_c *fc = ti->private;
+	unsigned elapsed;
+
+	/* Are we alive ? */
+	elapsed = (jiffies - fc->start_time) / HZ;
+	if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval)
+		return -EIO;
+
+	flakey_map_bio(ti, bio);
+
+	return DM_MAPIO_REMAPPED;
+}
+
+static int flakey_status(struct dm_target *ti, status_type_t type,
+			 char *result, unsigned int maxlen)
+{
+	struct flakey_c *fc = ti->private;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		result[0] = '\0';
+		break;
+
+	case STATUSTYPE_TABLE:
+		snprintf(result, maxlen, "%s %llu %u %u", fc->dev->name,
+			 (unsigned long long)fc->start, fc->up_interval,
+			 fc->down_interval);
+		break;
+	}
+	return 0;
+}
+
+static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg)
+{
+	struct flakey_c *fc = ti->private;
+
+	return __blkdev_driver_ioctl(fc->dev->bdev, fc->dev->mode, cmd, arg);
+}
+
+static int flakey_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+			struct bio_vec *biovec, int max_size)
+{
+	struct flakey_c *fc = ti->private;
+	struct request_queue *q = bdev_get_queue(fc->dev->bdev);
+
+	if (!q->merge_bvec_fn)
+		return max_size;
+
+	bvm->bi_bdev = fc->dev->bdev;
+	bvm->bi_sector = flakey_map_sector(ti, bvm->bi_sector);
+
+	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
+{
+	struct flakey_c *fc = ti->private;
+
+	return fn(ti, fc->dev, fc->start, ti->len, data);
+}
+
+static struct target_type flakey_target = {
+	.name   = "flakey",
+	.version = {1, 1, 0},
+	.module = THIS_MODULE,
+	.ctr    = flakey_ctr,
+	.dtr    = flakey_dtr,
+	.map    = flakey_map,
+	.status = flakey_status,
+	.ioctl	= flakey_ioctl,
+	.merge	= flakey_merge,
+	.iterate_devices = flakey_iterate_devices,
+};
+
+static int __init dm_flakey_init(void)
+{
+	int r = dm_register_target(&flakey_target);
+
+	if (r < 0)
+		DMERR("register failed %d", r);
+
+	return r;
+}
+
+static void __exit dm_flakey_exit(void)
+{
+	dm_unregister_target(&flakey_target);
+}
+
+/* Module hooks */
+module_init(dm_flakey_init);
+module_exit(dm_flakey_exit);
+
+MODULE_DESCRIPTION(DM_NAME " flakey target");
+MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From de8be5ac70f50a2340f24fd769a1aafa5a51ae34 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Thu, 24 Mar 2011 13:54:27 +0000
Subject: dm crypt: wipe keys string immediately after key is set

Always wipe the original copy of the key after processing it
in crypt_set_key().

Signed-off-by: Milan Broz <mbroz@redhat.com>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 4e054bd91664..7dcae0d74db7 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1331,20 +1331,29 @@ static int crypt_setkey_allcpus(struct crypt_config *cc)
 
 static int crypt_set_key(struct crypt_config *cc, char *key)
 {
+	int r = -EINVAL;
+	int key_string_len = strlen(key);
+
 	/* The key size may not be changed. */
-	if (cc->key_size != (strlen(key) >> 1))
-		return -EINVAL;
+	if (cc->key_size != (key_string_len >> 1))
+		goto out;
 
 	/* Hyphen (which gives a key_size of zero) means there is no key. */
 	if (!cc->key_size && strcmp(key, "-"))
-		return -EINVAL;
+		goto out;
 
 	if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0)
-		return -EINVAL;
+		goto out;
 
 	set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
 
-	return crypt_setkey_allcpus(cc);
+	r = crypt_setkey_allcpus(cc);
+
+out:
+	/* Hex key string not needed after here, so wipe it. */
+	memset(key, '0', key_string_len);
+
+	return r;
 }
 
 static int crypt_wipe_key(struct crypt_config *cc)
-- 
cgit v1.2.3


From 6bb43b5d1f54fb44c0408d86d5e71e4405a3ebe1 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Thu, 24 Mar 2011 13:54:28 +0000
Subject: dm ioctl: prepare for crypt key wiping

Prepare code for implementing buffer wipe flag.
No functional change in this patch.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-ioctl.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 6d12775a1061..3a8f6a0e6eb5 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1501,11 +1501,6 @@ static int check_version(unsigned int cmd, struct dm_ioctl __user *user)
 	return r;
 }
 
-static void free_params(struct dm_ioctl *param)
-{
-	vfree(param);
-}
-
 static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param)
 {
 	struct dm_ioctl tmp, *dmi;
@@ -1520,13 +1515,15 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param)
 	if (!dmi)
 		return -ENOMEM;
 
-	if (copy_from_user(dmi, user, tmp.data_size)) {
-		vfree(dmi);
-		return -EFAULT;
-	}
+	if (copy_from_user(dmi, user, tmp.data_size))
+		goto bad;
 
 	*param = dmi;
 	return 0;
+
+bad:
+	vfree(dmi);
+	return -EFAULT;
 }
 
 static int validate_params(uint cmd, struct dm_ioctl *param)
@@ -1564,7 +1561,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
 	unsigned int cmd;
 	struct dm_ioctl *uninitialized_var(param);
 	ioctl_fn fn = NULL;
-	size_t param_size;
+	size_t input_param_size;
 
 	/* only root can play with this */
 	if (!capable(CAP_SYS_ADMIN))
@@ -1605,6 +1602,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
 	 * Copy the parameters into kernel space.
 	 */
 	r = copy_params(user, &param);
+	input_param_size = param->data_size;
 
 	current->flags &= ~PF_MEMALLOC;
 
@@ -1615,9 +1613,8 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
 	if (r)
 		goto out;
 
-	param_size = param->data_size;
 	param->data_size = sizeof(*param);
-	r = fn(param, param_size);
+	r = fn(param, input_param_size);
 
 	/*
 	 * Copy the results back to userland.
@@ -1625,8 +1622,8 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
 	if (!r && copy_to_user(user, param, param->data_size))
 		r = -EFAULT;
 
- out:
-	free_params(param);
+out:
+	vfree(param);
 	return r;
 }
 
-- 
cgit v1.2.3


From f868120549fc1664b2c451d4b9882a363928c698 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Thu, 24 Mar 2011 13:54:30 +0000
Subject: dm ioctl: add flag to wipe buffers for secure data

Add DM_SECURE_DATA_FLAG which userspace can use to ensure
that all buffers allocated for dm-ioctl are wiped
immediately after use.

The user buffer is wiped as well (we do not want to keep
and return sensitive data back to userspace if the flag is set).

Wiping is useful for cryptsetup to ensure that the key
is present in memory only in defined places and only
for the time needed.

(For crypt, key can be present in table during load or table
status, wait and message commands).

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-ioctl.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 3a8f6a0e6eb5..4cacdad2270a 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1504,6 +1504,7 @@ static int check_version(unsigned int cmd, struct dm_ioctl __user *user)
 static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param)
 {
 	struct dm_ioctl tmp, *dmi;
+	int secure_data;
 
 	if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data)))
 		return -EFAULT;
@@ -1511,17 +1512,28 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param)
 	if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data)))
 		return -EINVAL;
 
+	secure_data = tmp.flags & DM_SECURE_DATA_FLAG;
+
 	dmi = vmalloc(tmp.data_size);
-	if (!dmi)
+	if (!dmi) {
+		if (secure_data && clear_user(user, tmp.data_size))
+			return -EFAULT;
 		return -ENOMEM;
+	}
 
 	if (copy_from_user(dmi, user, tmp.data_size))
 		goto bad;
 
+	/* Wipe the user buffer so we do not return it to userspace */
+	if (secure_data && clear_user(user, tmp.data_size))
+		goto bad;
+
 	*param = dmi;
 	return 0;
 
 bad:
+	if (secure_data)
+		memset(dmi, 0, tmp.data_size);
 	vfree(dmi);
 	return -EFAULT;
 }
@@ -1531,6 +1543,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
 	/* Always clear this flag */
 	param->flags &= ~DM_BUFFER_FULL_FLAG;
 	param->flags &= ~DM_UEVENT_GENERATED_FLAG;
+	param->flags &= ~DM_SECURE_DATA_FLAG;
 
 	/* Ignores parameters */
 	if (cmd == DM_REMOVE_ALL_CMD ||
@@ -1558,6 +1571,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
 static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
 {
 	int r = 0;
+	int wipe_buffer;
 	unsigned int cmd;
 	struct dm_ioctl *uninitialized_var(param);
 	ioctl_fn fn = NULL;
@@ -1602,13 +1616,15 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
 	 * Copy the parameters into kernel space.
 	 */
 	r = copy_params(user, &param);
-	input_param_size = param->data_size;
 
 	current->flags &= ~PF_MEMALLOC;
 
 	if (r)
 		return r;
 
+	input_param_size = param->data_size;
+	wipe_buffer = param->flags & DM_SECURE_DATA_FLAG;
+
 	r = validate_params(cmd, param);
 	if (r)
 		goto out;
@@ -1623,6 +1639,9 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
 		r = -EFAULT;
 
 out:
+	if (wipe_buffer)
+		memset(param, 0, input_param_size);
+
 	vfree(param);
 	return r;
 }
-- 
cgit v1.2.3


From 19040c0bc8efcb767221d8ef7bb9c32ff0586179 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 24 Mar 2011 13:54:31 +0000
Subject: dm mpath: fail message ioctl if specified path is not valid

Fail the reinstate_path and fail_path message ioctl if the specified
path is not valid.

The message ioctl would succeed for the 'reinistate_path' and
'fail_path' messages even if action was not taken because the
specified device was not a valid path of the multipath device.

Before, when /dev/vdb is not a path of mpathb:
$ dmsetup message mpathb 0 reinstate_path /dev/vdb
$ echo $?
0

After:
$ dmsetup message mpathb 0 reinstate_path /dev/vdb
device-mapper: message ioctl failed: Invalid argument
Command failed
$ echo $?
1

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-mpath.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 4b0b63c290a6..52f0de394632 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1065,7 +1065,7 @@ out:
 static int action_dev(struct multipath *m, struct dm_dev *dev,
 		      action_fn action)
 {
-	int r = 0;
+	int r = -EINVAL;
 	struct pgpath *pgpath;
 	struct priority_group *pg;
 
@@ -1669,7 +1669,7 @@ out:
  *---------------------------------------------------------------*/
 static struct target_type multipath_target = {
 	.name = "multipath",
-	.version = {1, 2, 0},
+	.version = {1, 3, 0},
 	.module = THIS_MODULE,
 	.ctr = multipath_ctr,
 	.dtr = multipath_dtr,
-- 
cgit v1.2.3


From a490a07a67b7a37f588021410e06b721a752fc34 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 24 Mar 2011 13:54:33 +0000
Subject: dm mpath: allow table load with no priority groups

This patch adjusts the multipath target to allow a table with both 0
priority groups and 0 for the initial priority group number.

If any mpath device is held open when all paths in the last priority
group have failed, userspace multipathd will attempt to reload the
associated DM table to reflect the fact that the device no longer has
any priority groups.  But the reload attempt always failed because the
multipath target did not allow 0 priority groups.

All multipath target messages related to priority group (enable_group,
disable_group, switch_group) will handle a priority group of 0 (will
cause error).

When reloading a multipath table with 0 priority groups, userspace
multipathd must be updated to specify an initial priority group number
of 0 (rather than 1).

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: Babu Moger <babu.moger@lsi.com>
Acked-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-mpath.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 52f0de394632..a550a057d991 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -844,8 +844,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
 {
 	/* target parameters */
 	static struct param _params[] = {
-		{1, 1024, "invalid number of priority groups"},
-		{1, 1024, "invalid initial priority group number"},
+		{0, 1024, "invalid number of priority groups"},
+		{0, 1024, "invalid initial priority group number"},
 	};
 
 	int r;
@@ -879,6 +879,13 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
 	if (r)
 		goto bad;
 
+	if ((!m->nr_priority_groups && next_pg_num) ||
+	    (m->nr_priority_groups && !next_pg_num)) {
+		ti->error = "invalid initial priority group";
+		r = -EINVAL;
+		goto bad;
+	}
+
 	/* parse the priority groups */
 	while (as.argc) {
 		struct priority_group *pg;
@@ -1415,7 +1422,7 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
 	else if (m->current_pg)
 		pg_num = m->current_pg->pg_num;
 	else
-			pg_num = 1;
+		pg_num = (m->nr_priority_groups ? 1 : 0);
 
 	DMEMIT("%u ", pg_num);
 
-- 
cgit v1.2.3


From 29915202006c2e7bafe81348eb498ff9a724ac61 Mon Sep 17 00:00:00 2001
From: Mustafa Mesanovic <mume@linux.vnet.ibm.com>
Date: Thu, 24 Mar 2011 13:54:35 +0000
Subject: dm stripe: implement merge method

Implement a merge function in the striped target.

When the striped target's underlying devices provide a merge_bvec_fn
(like all DM devices do via dm_merge_bvec) it is important to call down
to them when building a biovec that doesn't span a stripe boundary.

Without the merge method, a striped DM device stacked on DM devices
causes bios with a single page to be submitted which results
in unnecessary overhead that hurts performance.

This change really helps filesystems (e.g. XFS and now ext4) which take
care to assemble larger bios.  By implementing stripe_merge(), DM and the
stripe target no longer undermine the filesystem's work by only allowing
a single page per bio.  Buffered IO sees the biggest improvement
(particularly uncached reads, buffered writes to a lesser degree).  This
is especially so for more capable "enterprise" storage LUNs.

The performance improvement has been measured to be ~12-35% -- when a
reasonable chunk_size is used (e.g. 64K) in conjunction with a stripe
count that is a power of 2.

In contrast, the performance penalty is ~5-7% for the pathological worst
case stripe configuration (small chunk_size with a stripe count that is
not a power of 2).  The reason for this is that stripe_map_sector() is
now called once for every call to dm_merge_bvec().  stripe_map_sector()
will use slower division if stripe count isn't a power of 2.

Signed-off-by: Mustafa Mesanovic <mume@linux.vnet.ibm.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-stripe.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index dddfa14f2982..3d80cf0c152d 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -396,9 +396,29 @@ static void stripe_io_hints(struct dm_target *ti,
 	blk_limits_io_opt(limits, chunk_size * sc->stripes);
 }
 
+static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+			struct bio_vec *biovec, int max_size)
+{
+	struct stripe_c *sc = ti->private;
+	sector_t bvm_sector = bvm->bi_sector;
+	uint32_t stripe;
+	struct request_queue *q;
+
+	stripe_map_sector(sc, bvm_sector, &stripe, &bvm_sector);
+
+	q = bdev_get_queue(sc->stripe[stripe].dev->bdev);
+	if (!q->merge_bvec_fn)
+		return max_size;
+
+	bvm->bi_bdev = sc->stripe[stripe].dev->bdev;
+	bvm->bi_sector = sc->stripe[stripe].physical_start + bvm_sector;
+
+	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
 static struct target_type stripe_target = {
 	.name   = "striped",
-	.version = {1, 3, 1},
+	.version = {1, 4, 0},
 	.module = THIS_MODULE,
 	.ctr    = stripe_ctr,
 	.dtr    = stripe_dtr,
@@ -407,6 +427,7 @@ static struct target_type stripe_target = {
 	.status = stripe_status,
 	.iterate_devices = stripe_iterate_devices,
 	.io_hints = stripe_io_hints,
+	.merge  = stripe_merge,
 };
 
 int __init dm_stripe_init(void)
-- 
cgit v1.2.3


From 89078d572eb9ce8d4c04264b8b0ba86de0d74c8f Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Mon, 28 Mar 2011 20:09:12 -0400
Subject: md: Fix integrity registration error when no devices are capable

We incorrectly returned -EINVAL when none of the devices in the array
had an integrity profile.  This in turn prevented mdadm from starting
the metadevice.  Fix this so we only return errors on mismatched
profiles and memory allocation failures.

Reported-by: Giacomo Catenazzi <cate@cateee.net>
Reported-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/md/md.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 06ecea751a39..8b66e04c2ea6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1777,12 +1777,6 @@ int md_integrity_register(mddev_t *mddev)
 			continue;
 		if (rdev->raid_disk < 0)
 			continue;
-		/*
-		 * If at least one rdev is not integrity capable, we can not
-		 * enable data integrity for the md device.
-		 */
-		if (!bdev_get_integrity(rdev->bdev))
-			return -EINVAL;
 		if (!reference) {
 			/* Use the first rdev as the reference */
 			reference = rdev;
@@ -1793,6 +1787,8 @@ int md_integrity_register(mddev_t *mddev)
 				rdev->bdev->bd_disk) < 0)
 			return -EINVAL;
 	}
+	if (!reference || !bdev_get_integrity(reference->bdev))
+		return 0;
 	/*
 	 * All component devices are integrity capable and have matching
 	 * profiles, register the common profile for the md device.
-- 
cgit v1.2.3


From 25985edcedea6396277003854657b5f3cb31a628 Mon Sep 17 00:00:00 2001
From: Lucas De Marchi <lucas.demarchi@profusion.mobi>
Date: Wed, 30 Mar 2011 22:57:33 -0300
Subject: Fix common misspellings

Fixes generated by 'codespell' and manually reviewed.

Signed-off-by: Lucas De Marchi <lucas.demarchi@profusion.mobi>
---
 drivers/md/bitmap.h         | 2 +-
 drivers/md/dm-region-hash.c | 2 +-
 drivers/md/faulty.c         | 2 +-
 drivers/md/md.c             | 2 +-
 drivers/md/md.h             | 2 +-
 drivers/md/raid10.c         | 6 +++---
 drivers/md/raid10.h         | 4 ++--
 7 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 931a7a7c3796..d0aeaf46d932 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -45,7 +45,7 @@
  *
  * The counter counts pending write requests, plus the on-disk bit.
  * When the counter is '1' and the resync bits are clear, the on-disk
- * bit can be cleared aswell, thus setting the counter to 0.
+ * bit can be cleared as well, thus setting the counter to 0.
  * When we set a bit, or in the counter (to start a write), if the fields is
  * 0, we first set the disk bit and set the counter to 1.
  *
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index dad011aed0c9..7771ed212182 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -419,7 +419,7 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
 	/*
 	 * Possible cases:
 	 *   1) DM_RH_DIRTY
-	 *   2) DM_RH_NOSYNC: was dirty, other preceeding writes failed
+	 *   2) DM_RH_NOSYNC: was dirty, other preceding writes failed
 	 *   3) DM_RH_RECOVERING: flushing pending writes
 	 * Either case, the region should have not been connected to list.
 	 */
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 339fdc670751..23078dabb6df 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -30,7 +30,7 @@
  *
  * Different modes can be active at a time, but only
  * one can be set at array creation.  Others can be added later.
- * A mode can be one-shot or recurrent with the recurrance being
+ * A mode can be one-shot or recurrent with the recurrence being
  * once in every N requests.
  * The bottom 5 bits of the "layout" indicate the mode.  The
  * remainder indicate a period, or 0 for one-shot.
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8b66e04c2ea6..b12b3776c0c0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6266,7 +6266,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
 	 * rt is a sector_t, so could be 32bit or 64bit.
 	 * So we divide before multiply in case it is 32bit and close
 	 * to the limit.
-	 * We scale the divisor (db) by 32 to avoid loosing precision
+	 * We scale the divisor (db) by 32 to avoid losing precision
 	 * near the end of resync when the number of remaining sectors
 	 * is close to 'db'.
 	 * We then divide rt by 32 after multiplying by db to compensate.
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 12215d437fcc..52b407369e13 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -94,7 +94,7 @@ struct mdk_rdev_s
 #define	In_sync		2		/* device is in_sync with rest of array */
 #define	WriteMostly	4		/* Avoid reading if at all possible */
 #define	AutoDetected	7		/* added by auto-detect */
-#define Blocked		8		/* An error occured on an externally
+#define Blocked		8		/* An error occurred on an externally
 					 * managed array, don't allow writes
 					 * until it is cleared */
 	wait_queue_head_t blocked_wait;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index f7b62370b374..2da83d566592 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -5,7 +5,7 @@
  *
  * RAID-10 support for md.
  *
- * Base on code in raid1.c.  See raid1.c for futher copyright information.
+ * Base on code in raid1.c.  See raid1.c for further copyright information.
  *
  *
  * This program is free software; you can redistribute it and/or modify
@@ -340,14 +340,14 @@ static void raid10_end_write_request(struct bio *bio, int error)
 
 /*
  * RAID10 layout manager
- * Aswell as the chunksize and raid_disks count, there are two
+ * As well as the chunksize and raid_disks count, there are two
  * parameters: near_copies and far_copies.
  * near_copies * far_copies must be <= raid_disks.
  * Normally one of these will be 1.
  * If both are 1, we get raid0.
  * If near_copies == raid_disks, we get raid1.
  *
- * Chunks are layed out in raid0 style with near_copies copies of the
+ * Chunks are laid out in raid0 style with near_copies copies of the
  * first chunk, followed by near_copies copies of the next chunk and
  * so on.
  * If far_copies > 1, then after 1/far_copies of the array has been assigned
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 2316ac2e8e21..944b1104d3b4 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -17,8 +17,8 @@ struct r10_private_data_s {
 	spinlock_t		device_lock;
 
 	/* geometry */
-	int			near_copies;  /* number of copies layed out raid0 style */
-	int 			far_copies;   /* number of copies layed out
+	int			near_copies;  /* number of copies laid out raid0 style */
+	int 			far_copies;   /* number of copies laid out
 					       * at large strides across drives
 					       */
 	int			far_offset;   /* far_copies are offset by 1 stripe
-- 
cgit v1.2.3


From a63a5cf84dac7a23a57c800eea5734701e7d3c04 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 1 Apr 2011 21:02:31 +0200
Subject: dm: improve block integrity support

The current block integrity (DIF/DIX) support in DM is verifying that
all devices' integrity profiles match during DM device resume (which
is past the point of no return).  To some degree that is unavoidable
(stacked DM devices force this late checking).  But for most DM
devices (which aren't stacking on other DM devices) the ideal time to
verify all integrity profiles match is during table load.

Introduce the notion of an "initialized" integrity profile: a profile
that was blk_integrity_register()'d with a non-NULL 'blk_integrity'
template.  Add blk_integrity_is_initialized() to allow checking if a
profile was initialized.

Update DM integrity support to:
- check all devices with _initialized_ integrity profiles match
  during table load; uninitialized profiles (e.g. for underlying DM
  device(s) of a stacked DM device) are ignored.
- disallow a table load that would result in an integrity profile that
  conflicts with a DM device's existing (in-use) integrity profile
- avoid clearing an existing integrity profile
- validate all integrity profiles match during resume; but if they
  don't all we can do is report the mismatch (during resume we're past
  the point of no return)

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/md/dm-table.c | 114 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 80 insertions(+), 34 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 416d4e258df6..cb8380c9767f 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -926,21 +926,81 @@ static int dm_table_build_index(struct dm_table *t)
 	return r;
 }
 
+/*
+ * Get a disk whose integrity profile reflects the table's profile.
+ * If %match_all is true, all devices' profiles must match.
+ * If %match_all is false, all devices must at least have an
+ * allocated integrity profile; but uninitialized is ok.
+ * Returns NULL if integrity support was inconsistent or unavailable.
+ */
+static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t,
+						    bool match_all)
+{
+	struct list_head *devices = dm_table_get_devices(t);
+	struct dm_dev_internal *dd = NULL;
+	struct gendisk *prev_disk = NULL, *template_disk = NULL;
+
+	list_for_each_entry(dd, devices, list) {
+		template_disk = dd->dm_dev.bdev->bd_disk;
+		if (!blk_get_integrity(template_disk))
+			goto no_integrity;
+		if (!match_all && !blk_integrity_is_initialized(template_disk))
+			continue; /* skip uninitialized profiles */
+		else if (prev_disk &&
+			 blk_integrity_compare(prev_disk, template_disk) < 0)
+			goto no_integrity;
+		prev_disk = template_disk;
+	}
+
+	return template_disk;
+
+no_integrity:
+	if (prev_disk)
+		DMWARN("%s: integrity not set: %s and %s profile mismatch",
+		       dm_device_name(t->md),
+		       prev_disk->disk_name,
+		       template_disk->disk_name);
+	return NULL;
+}
+
 /*
  * Register the mapped device for blk_integrity support if
- * the underlying devices support it.
+ * the underlying devices have an integrity profile.  But all devices
+ * may not have matching profiles (checking all devices isn't reliable
+ * during table load because this table may use other DM device(s) which
+ * must be resumed before they will have an initialized integity profile).
+ * Stacked DM devices force a 2 stage integrity profile validation:
+ * 1 - during load, validate all initialized integrity profiles match
+ * 2 - during resume, validate all integrity profiles match
  */
 static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md)
 {
-	struct list_head *devices = dm_table_get_devices(t);
-	struct dm_dev_internal *dd;
+	struct gendisk *template_disk = NULL;
 
-	list_for_each_entry(dd, devices, list)
-		if (bdev_get_integrity(dd->dm_dev.bdev)) {
-			t->integrity_supported = 1;
-			return blk_integrity_register(dm_disk(md), NULL);
-		}
+	template_disk = dm_table_get_integrity_disk(t, false);
+	if (!template_disk)
+		return 0;
 
+	if (!blk_integrity_is_initialized(dm_disk(md))) {
+		t->integrity_supported = 1;
+		return blk_integrity_register(dm_disk(md), NULL);
+	}
+
+	/*
+	 * If DM device already has an initalized integrity
+	 * profile the new profile should not conflict.
+	 */
+	if (blk_integrity_is_initialized(template_disk) &&
+	    blk_integrity_compare(dm_disk(md), template_disk) < 0) {
+		DMWARN("%s: conflict with existing integrity profile: "
+		       "%s profile mismatch",
+		       dm_device_name(t->md),
+		       template_disk->disk_name);
+		return 1;
+	}
+
+	/* Preserve existing initialized integrity profile */
+	t->integrity_supported = 1;
 	return 0;
 }
 
@@ -1094,41 +1154,27 @@ combine_limits:
 
 /*
  * Set the integrity profile for this device if all devices used have
- * matching profiles.
+ * matching profiles.  We're quite deep in the resume path but still
+ * don't know if all devices (particularly DM devices this device
+ * may be stacked on) have matching profiles.  Even if the profiles
+ * don't match we have no way to fail (to resume) at this point.
  */
 static void dm_table_set_integrity(struct dm_table *t)
 {
-	struct list_head *devices = dm_table_get_devices(t);
-	struct dm_dev_internal *prev = NULL, *dd = NULL;
+	struct gendisk *template_disk = NULL;
 
 	if (!blk_get_integrity(dm_disk(t->md)))
 		return;
 
-	list_for_each_entry(dd, devices, list) {
-		if (prev &&
-		    blk_integrity_compare(prev->dm_dev.bdev->bd_disk,
-					  dd->dm_dev.bdev->bd_disk) < 0) {
-			DMWARN("%s: integrity not set: %s and %s mismatch",
-			       dm_device_name(t->md),
-			       prev->dm_dev.bdev->bd_disk->disk_name,
-			       dd->dm_dev.bdev->bd_disk->disk_name);
-			goto no_integrity;
-		}
-		prev = dd;
+	template_disk = dm_table_get_integrity_disk(t, true);
+	if (!template_disk &&
+	    blk_integrity_is_initialized(dm_disk(t->md))) {
+		DMWARN("%s: device no longer has a valid integrity profile",
+		       dm_device_name(t->md));
+		return;
 	}
-
-	if (!prev || !bdev_get_integrity(prev->dm_dev.bdev))
-		goto no_integrity;
-
 	blk_integrity_register(dm_disk(t->md),
-			       bdev_get_integrity(prev->dm_dev.bdev));
-
-	return;
-
-no_integrity:
-	blk_integrity_register(dm_disk(t->md), NULL);
-
-	return;
+			       blk_get_integrity(template_disk));
 }
 
 void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
-- 
cgit v1.2.3


From e1dfa0a29737142c32f00a3bac0f609dc85b4a82 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 18 Apr 2011 18:25:41 +1000
Subject: md: use new plugging interface for RAID IO.

md/raid submits a lot of IO from the various raid threads.
So adding start/finish plug calls to those so that some
plugging happens.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid1.c  | 5 ++++-
 drivers/md/raid10.c | 3 +++
 drivers/md/raid5.c  | 3 +++
 3 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c2a21ae56d97..5449cd5e3141 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1516,9 +1516,11 @@ static void raid1d(mddev_t *mddev)
 	conf_t *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
 	mdk_rdev_t *rdev;
+	struct blk_plug plug;
 
 	md_check_recovery(mddev);
-	
+
+	blk_start_plug(&plug);
 	for (;;) {
 		char b[BDEVNAME_SIZE];
 
@@ -1593,6 +1595,7 @@ static void raid1d(mddev_t *mddev)
 		}
 		cond_resched();
 	}
+	blk_finish_plug(&plug);
 }
 
 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 2da83d566592..c8e5dac5d695 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1640,9 +1640,11 @@ static void raid10d(mddev_t *mddev)
 	conf_t *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
 	mdk_rdev_t *rdev;
+	struct blk_plug plug;
 
 	md_check_recovery(mddev);
 
+	blk_start_plug(&plug);
 	for (;;) {
 		char b[BDEVNAME_SIZE];
 
@@ -1716,6 +1718,7 @@ static void raid10d(mddev_t *mddev)
 		}
 		cond_resched();
 	}
+	blk_finish_plug(&plug);
 }
 
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e867ee42b152..ce6960b1c682 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4478,11 +4478,13 @@ static void raid5d(mddev_t *mddev)
 	struct stripe_head *sh;
 	raid5_conf_t *conf = mddev->private;
 	int handled;
+	struct blk_plug plug;
 
 	pr_debug("+++ raid5d active\n");
 
 	md_check_recovery(mddev);
 
+	blk_start_plug(&plug);
 	handled = 0;
 	spin_lock_irq(&conf->device_lock);
 	while (1) {
@@ -4525,6 +4527,7 @@ static void raid5d(mddev_t *mddev)
 	spin_unlock_irq(&conf->device_lock);
 
 	async_tx_issue_pending_all();
+	blk_finish_plug(&plug);
 
 	pr_debug("--- raid5d inactive\n");
 }
-- 
cgit v1.2.3


From af1db72d8b340f97ad12b60175afdef43e6f0e60 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 18 Apr 2011 18:25:41 +1000
Subject: md/dm - remove remains of plug_fn callback.

Now that unplugging is done differently, the unplug_fn callback is
never called, so it can be completely discarded.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/dm-raid.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 5ef136cdba91..e5d8904fc8f6 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -390,13 +390,6 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
 	return md_raid5_congested(&rs->md, bits);
 }
 
-static void raid_unplug(struct dm_target_callbacks *cb)
-{
-	struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
-
-	md_raid5_kick_device(rs->md.private);
-}
-
 /*
  * Construct a RAID4/5/6 mapping:
  * Args:
@@ -487,7 +480,6 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	}
 
 	rs->callbacks.congested_fn = raid_is_congested;
-	rs->callbacks.unplug_fn = raid_unplug;
 	dm_table_add_target_callbacks(ti->table, &rs->callbacks);
 
 	return 0;
-- 
cgit v1.2.3


From 482c083492ddaa32ef5864bae3d143dc8bcdf7d1 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 18 Apr 2011 18:25:42 +1000
Subject: md - remove old plugging code.

md has some plugging infrastructure for RAID5 to use because the
normal plugging infrastructure required a 'request_queue', and when
called from dm, RAID5 doesn't have one of those available.

This relied on the ->unplug_fn callback which doesn't exist any more.

So remove all of that code, both in md and raid5.  Subsequent patches
with restore the plugging functionality.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c    | 51 ---------------------------------------------------
 drivers/md/md.h    | 22 ----------------------
 drivers/md/raid5.c | 37 ++++++++-----------------------------
 drivers/md/raid5.h |  2 --
 4 files changed, 8 insertions(+), 104 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index b12b3776c0c0..fb11170c717e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -445,50 +445,6 @@ void md_flush_request(mddev_t *mddev, struct bio *bio)
 }
 EXPORT_SYMBOL(md_flush_request);
 
-/* Support for plugging.
- * This mirrors the plugging support in request_queue, but does not
- * require having a whole queue
- */
-static void plugger_work(struct work_struct *work)
-{
-	struct plug_handle *plug =
-		container_of(work, struct plug_handle, unplug_work);
-	plug->unplug_fn(plug);
-}
-static void plugger_timeout(unsigned long data)
-{
-	struct plug_handle *plug = (void *)data;
-	kblockd_schedule_work(NULL, &plug->unplug_work);
-}
-void plugger_init(struct plug_handle *plug,
-		  void (*unplug_fn)(struct plug_handle *))
-{
-	plug->unplug_flag = 0;
-	plug->unplug_fn = unplug_fn;
-	init_timer(&plug->unplug_timer);
-	plug->unplug_timer.function = plugger_timeout;
-	plug->unplug_timer.data = (unsigned long)plug;
-	INIT_WORK(&plug->unplug_work, plugger_work);
-}
-EXPORT_SYMBOL_GPL(plugger_init);
-
-void plugger_set_plug(struct plug_handle *plug)
-{
-	if (!test_and_set_bit(PLUGGED_FLAG, &plug->unplug_flag))
-		mod_timer(&plug->unplug_timer, jiffies + msecs_to_jiffies(3)+1);
-}
-EXPORT_SYMBOL_GPL(plugger_set_plug);
-
-int plugger_remove_plug(struct plug_handle *plug)
-{
-	if (test_and_clear_bit(PLUGGED_FLAG, &plug->unplug_flag)) {
-		del_timer(&plug->unplug_timer);
-		return 1;
-	} else
-		return 0;
-}
-EXPORT_SYMBOL_GPL(plugger_remove_plug);
-
 
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
@@ -4723,7 +4679,6 @@ static void md_clean(mddev_t *mddev)
 	mddev->bitmap_info.chunksize = 0;
 	mddev->bitmap_info.daemon_sleep = 0;
 	mddev->bitmap_info.max_write_behind = 0;
-	mddev->plug = NULL;
 }
 
 static void __md_stop_writes(mddev_t *mddev)
@@ -6688,12 +6643,6 @@ int md_allow_write(mddev_t *mddev)
 }
 EXPORT_SYMBOL_GPL(md_allow_write);
 
-void md_unplug(mddev_t *mddev)
-{
-	if (mddev->plug)
-		mddev->plug->unplug_fn(mddev->plug);
-}
-
 #define SYNC_MARKS	10
 #define	SYNC_MARK_STEP	(3*HZ)
 void md_do_sync(mddev_t *mddev)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 52b407369e13..fad90228672f 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -29,26 +29,6 @@
 typedef struct mddev_s mddev_t;
 typedef struct mdk_rdev_s mdk_rdev_t;
 
-/* generic plugging support - like that provided with request_queue,
- * but does not require a request_queue
- */
-struct plug_handle {
-	void			(*unplug_fn)(struct plug_handle *);
-	struct timer_list	unplug_timer;
-	struct work_struct	unplug_work;
-	unsigned long		unplug_flag;
-};
-#define	PLUGGED_FLAG 1
-void plugger_init(struct plug_handle *plug,
-		  void (*unplug_fn)(struct plug_handle *));
-void plugger_set_plug(struct plug_handle *plug);
-int plugger_remove_plug(struct plug_handle *plug);
-static inline void plugger_flush(struct plug_handle *plug)
-{
-	del_timer_sync(&plug->unplug_timer);
-	cancel_work_sync(&plug->unplug_work);
-}
-
 /*
  * MD's 'extended' device
  */
@@ -336,7 +316,6 @@ struct mddev_s
 	struct list_head		all_mddevs;
 
 	struct attribute_group		*to_remove;
-	struct plug_handle		*plug; /* if used by personality */
 
 	struct bio_set			*bio_set;
 
@@ -516,7 +495,6 @@ extern int md_integrity_register(mddev_t *mddev);
 extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
 extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
 extern void restore_bitmap_write_access(struct file *file);
-extern void md_unplug(mddev_t *mddev);
 
 extern void mddev_init(mddev_t *mddev);
 extern int md_run(mddev_t *mddev);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ce6960b1c682..a1755a6a3e5f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -199,14 +199,12 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
 		BUG_ON(!list_empty(&sh->lru));
 		BUG_ON(atomic_read(&conf->active_stripes)==0);
 		if (test_bit(STRIPE_HANDLE, &sh->state)) {
-			if (test_bit(STRIPE_DELAYED, &sh->state)) {
+			if (test_bit(STRIPE_DELAYED, &sh->state))
 				list_add_tail(&sh->lru, &conf->delayed_list);
-				plugger_set_plug(&conf->plug);
-			} else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
-				   sh->bm_seq - conf->seq_write > 0) {
+			else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+				   sh->bm_seq - conf->seq_write > 0)
 				list_add_tail(&sh->lru, &conf->bitmap_list);
-				plugger_set_plug(&conf->plug);
-			} else {
+			else {
 				clear_bit(STRIPE_BIT_DELAY, &sh->state);
 				list_add_tail(&sh->lru, &conf->handle_list);
 			}
@@ -461,7 +459,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector,
 						     < (conf->max_nr_stripes *3/4)
 						     || !conf->inactive_blocked),
 						    conf->device_lock,
-						    md_raid5_kick_device(conf));
+						    md_wakeup_thread(conf->mddev->thread));
 				conf->inactive_blocked = 0;
 			} else
 				init_stripe(sh, sector, previous);
@@ -1470,7 +1468,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 		wait_event_lock_irq(conf->wait_for_stripe,
 				    !list_empty(&conf->inactive_list),
 				    conf->device_lock,
-				    blk_flush_plug(current));
+				    );
 		osh = get_free_stripe(conf);
 		spin_unlock_irq(&conf->device_lock);
 		atomic_set(&nsh->count, 1);
@@ -3623,8 +3621,7 @@ static void raid5_activate_delayed(raid5_conf_t *conf)
 				atomic_inc(&conf->preread_active_stripes);
 			list_add_tail(&sh->lru, &conf->hold_list);
 		}
-	} else
-		plugger_set_plug(&conf->plug);
+	}
 }
 
 static void activate_bit_delay(raid5_conf_t *conf)
@@ -3641,21 +3638,6 @@ static void activate_bit_delay(raid5_conf_t *conf)
 	}
 }
 
-void md_raid5_kick_device(raid5_conf_t *conf)
-{
-	blk_flush_plug(current);
-	raid5_activate_delayed(conf);
-	md_wakeup_thread(conf->mddev->thread);
-}
-EXPORT_SYMBOL_GPL(md_raid5_kick_device);
-
-static void raid5_unplug(struct plug_handle *plug)
-{
-	raid5_conf_t *conf = container_of(plug, raid5_conf_t, plug);
-
-	md_raid5_kick_device(conf);
-}
-
 int md_raid5_congested(mddev_t *mddev, int bits)
 {
 	raid5_conf_t *conf = mddev->private;
@@ -4057,7 +4039,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 				 * add failed due to overlap.  Flush everything
 				 * and wait a while
 				 */
-				md_raid5_kick_device(conf);
+				md_wakeup_thread(mddev->thread);
 				release_stripe(sh);
 				schedule();
 				goto retry;
@@ -5144,8 +5126,6 @@ static int run(mddev_t *mddev)
 		       mdname(mddev));
 	md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
 
-	plugger_init(&conf->plug, raid5_unplug);
-	mddev->plug = &conf->plug;
 	if (mddev->queue) {
 		int chunk_size;
 		/* read-ahead size must cover two whole stripes, which
@@ -5195,7 +5175,6 @@ static int stop(mddev_t *mddev)
 	mddev->thread = NULL;
 	if (mddev->queue)
 		mddev->queue->backing_dev_info.congested_fn = NULL;
-	plugger_flush(&conf->plug); /* the unplug fn references 'conf'*/
 	free_conf(conf);
 	mddev->private = NULL;
 	mddev->to_remove = &raid5_attrs_group;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 8d563a4f022a..3ca77a2613ba 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -400,8 +400,6 @@ struct raid5_private_data {
 					    * Cleared when a sync completes.
 					    */
 
-	struct plug_handle	plug;
-
 	/* per cpu variables */
 	struct raid5_percpu {
 		struct page	*spare_page; /* Used when checking P/Q in raid6 */
-- 
cgit v1.2.3


From 97658cdd3af7d01461874c93b89afa4a2465e7c6 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 18 Apr 2011 18:25:42 +1000
Subject: md: provide generic support for handling unplug callbacks.

When an md device adds a request to a queue, it can call
mddev_check_plugged.
If this succeeds then we know that the md thread will be woken up
shortly, and ->plug_cnt will be non-zero until then, so some
processing can be delayed.

If it fails, then no unplug callback is expected and the make_request
function needs to do whatever is required to make the request happen.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/md.h |  4 ++++
 2 files changed, 60 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index fb11170c717e..6e853c61d87e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -445,6 +445,61 @@ void md_flush_request(mddev_t *mddev, struct bio *bio)
 }
 EXPORT_SYMBOL(md_flush_request);
 
+/* Support for plugging.
+ * This mirrors the plugging support in request_queue, but does not
+ * require having a whole queue or request structures.
+ * We allocate an md_plug_cb for each md device and each thread it gets
+ * plugged on.  This links tot the private plug_handle structure in the
+ * personality data where we keep a count of the number of outstanding
+ * plugs so other code can see if a plug is active.
+ */
+struct md_plug_cb {
+	struct blk_plug_cb cb;
+	mddev_t *mddev;
+};
+
+static void plugger_unplug(struct blk_plug_cb *cb)
+{
+	struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb);
+	if (atomic_dec_and_test(&mdcb->mddev->plug_cnt))
+		md_wakeup_thread(mdcb->mddev->thread);
+	kfree(mdcb);
+}
+
+/* Check that an unplug wakeup will come shortly.
+ * If not, wakeup the md thread immediately
+ */
+int mddev_check_plugged(mddev_t *mddev)
+{
+	struct blk_plug *plug = current->plug;
+	struct md_plug_cb *mdcb;
+
+	if (!plug)
+		return 0;
+
+	list_for_each_entry(mdcb, &plug->cb_list, cb.list) {
+		if (mdcb->cb.callback == plugger_unplug &&
+		    mdcb->mddev == mddev) {
+			/* Already on the list, move to top */
+			if (mdcb != list_first_entry(&plug->cb_list,
+						    struct md_plug_cb,
+						    cb.list))
+				list_move(&mdcb->cb.list, &plug->cb_list);
+			return 1;
+		}
+	}
+	/* Not currently on the callback list */
+	mdcb = kmalloc(sizeof(*mdcb), GFP_ATOMIC);
+	if (!mdcb)
+		return 0;
+
+	mdcb->mddev = mddev;
+	mdcb->cb.callback = plugger_unplug;
+	atomic_inc(&mddev->plug_cnt);
+	list_add(&mdcb->cb.list, &plug->cb_list);
+	return 1;
+}
+EXPORT_SYMBOL_GPL(mddev_check_plugged);
 
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
@@ -494,6 +549,7 @@ void mddev_init(mddev_t *mddev)
 	atomic_set(&mddev->active, 1);
 	atomic_set(&mddev->openers, 0);
 	atomic_set(&mddev->active_io, 0);
+	atomic_set(&mddev->plug_cnt, 0);
 	spin_lock_init(&mddev->write_lock);
 	atomic_set(&mddev->flush_pending, 0);
 	init_waitqueue_head(&mddev->sb_wait);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index fad90228672f..0b1fd3f1d85b 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -179,6 +179,9 @@ struct mddev_s
 	int				delta_disks, new_level, new_layout;
 	int				new_chunk_sectors;
 
+	atomic_t			plug_cnt;	/* If device is expecting
+							 * more bios soon.
+							 */
 	struct mdk_thread_s		*thread;	/* management thread */
 	struct mdk_thread_s		*sync_thread;	/* doing resync or reconstruct */
 	sector_t			curr_resync;	/* last block scheduled */
@@ -508,4 +511,5 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
 				   mddev_t *mddev);
 extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 				   mddev_t *mddev);
+extern int mddev_check_plugged(mddev_t *mddev);
 #endif /* _MD_MD_H */
-- 
cgit v1.2.3


From 7c13edc87510f665da3094174e1fd633e06649f4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 18 Apr 2011 18:25:43 +1000
Subject: md: incorporate new plugging into raid5.

In raid5 plugging is used for 2 things:
 1/ collecting writes that require a bitmap update
 2/ collecting writes in the hope that we can create full
    stripes - or at least more-full.

We now release these different sets of stripes when plug_cnt
is zero.

Also in make_request, we call mddev_check_plug to hopefully increase
plug_cnt, and wake up the thread at the end if plugging wasn't
achieved for some reason.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a1755a6a3e5f..f301e6ae220c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -27,12 +27,12 @@
  *
  * We group bitmap updates into batches.  Each batch has a number.
  * We may write out several batches at once, but that isn't very important.
- * conf->bm_write is the number of the last batch successfully written.
- * conf->bm_flush is the number of the last batch that was closed to
+ * conf->seq_write is the number of the last batch successfully written.
+ * conf->seq_flush is the number of the last batch that was closed to
  *    new additions.
  * When we discover that we will need to write to any block in a stripe
  * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
- * the number of the batch it will be in. This is bm_flush+1.
+ * the number of the batch it will be in. This is seq_flush+1.
  * When we are ready to do a write, if that batch hasn't been written yet,
  *   we plug the array and queue the stripe for later.
  * When an unplug happens, we increment bm_flush, thus closing the current
@@ -459,7 +459,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector,
 						     < (conf->max_nr_stripes *3/4)
 						     || !conf->inactive_blocked),
 						    conf->device_lock,
-						    md_wakeup_thread(conf->mddev->thread));
+						    );
 				conf->inactive_blocked = 0;
 			} else
 				init_stripe(sh, sector, previous);
@@ -3927,6 +3927,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 	struct stripe_head *sh;
 	const int rw = bio_data_dir(bi);
 	int remaining;
+	int plugged;
 
 	if (unlikely(bi->bi_rw & REQ_FLUSH)) {
 		md_flush_request(mddev, bi);
@@ -3945,6 +3946,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 	bi->bi_next = NULL;
 	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
 
+	plugged = mddev_check_plugged(mddev);
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
 		DEFINE_WAIT(w);
 		int disks, data_disks;
@@ -4059,6 +4061,9 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 		}
 			
 	}
+	if (!plugged)
+		md_wakeup_thread(mddev->thread);
+
 	spin_lock_irq(&conf->device_lock);
 	remaining = raid5_dec_bi_phys_segments(bi);
 	spin_unlock_irq(&conf->device_lock);
@@ -4472,14 +4477,18 @@ static void raid5d(mddev_t *mddev)
 	while (1) {
 		struct bio *bio;
 
-		if (conf->seq_flush != conf->seq_write) {
-			int seq = conf->seq_flush;
+		if (atomic_read(&mddev->plug_cnt) == 0 &&
+		    !list_empty(&conf->bitmap_list)) {
+			/* Now is a good time to flush some bitmap updates */
+			conf->seq_flush++;
 			spin_unlock_irq(&conf->device_lock);
 			bitmap_unplug(mddev->bitmap);
 			spin_lock_irq(&conf->device_lock);
-			conf->seq_write = seq;
+			conf->seq_write = conf->seq_flush;
 			activate_bit_delay(conf);
 		}
+		if (atomic_read(&mddev->plug_cnt) == 0)
+			raid5_activate_delayed(conf);
 
 		while ((bio = remove_bio_from_retry(conf))) {
 			int ok;
-- 
cgit v1.2.3


From c3b328ac846bcf6b9a62c5563380a81ab723006d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 18 Apr 2011 18:25:43 +1000
Subject: md: fix up raid1/raid10 unplugging.

We just need to make sure that an unplug event wakes up the md
thread, which is exactly what mddev_check_plugged does.

Also remove some plug-related code that is no longer needed.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid1.c  | 24 ++++++++++--------------
 drivers/md/raid10.c | 24 ++++++++++--------------
 2 files changed, 20 insertions(+), 28 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 5449cd5e3141..2b7a7ff401dc 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -565,12 +565,6 @@ static void flush_pending_writes(conf_t *conf)
 		spin_unlock_irq(&conf->device_lock);
 }
 
-static void md_kick_device(mddev_t *mddev)
-{
-	blk_flush_plug(current);
-	md_wakeup_thread(mddev->thread);
-}
-
 /* Barriers....
  * Sometimes we need to suspend IO while we do something else,
  * either some resync/recovery, or reconfigure the array.
@@ -600,7 +594,7 @@ static void raise_barrier(conf_t *conf)
 
 	/* Wait until no block IO is waiting */
 	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
-			    conf->resync_lock, md_kick_device(conf->mddev));
+			    conf->resync_lock, );
 
 	/* block any new IO from starting */
 	conf->barrier++;
@@ -608,7 +602,7 @@ static void raise_barrier(conf_t *conf)
 	/* Now wait for all pending IO to complete */
 	wait_event_lock_irq(conf->wait_barrier,
 			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock, md_kick_device(conf->mddev));
+			    conf->resync_lock, );
 
 	spin_unlock_irq(&conf->resync_lock);
 }
@@ -630,7 +624,7 @@ static void wait_barrier(conf_t *conf)
 		conf->nr_waiting++;
 		wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
 				    conf->resync_lock,
-				    md_kick_device(conf->mddev));
+				    );
 		conf->nr_waiting--;
 	}
 	conf->nr_pending++;
@@ -666,8 +660,7 @@ static void freeze_array(conf_t *conf)
 	wait_event_lock_irq(conf->wait_barrier,
 			    conf->nr_pending == conf->nr_queued+1,
 			    conf->resync_lock,
-			    ({ flush_pending_writes(conf);
-			       md_kick_device(conf->mddev); }));
+			    flush_pending_writes(conf));
 	spin_unlock_irq(&conf->resync_lock);
 }
 static void unfreeze_array(conf_t *conf)
@@ -729,6 +722,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
 	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
 	mdk_rdev_t *blocked_rdev;
+	int plugged;
 
 	/*
 	 * Register the new request and wait if the reconstruction
@@ -820,6 +814,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	 * inc refcount on their rdev.  Record them by setting
 	 * bios[x] to bio
 	 */
+	plugged = mddev_check_plugged(mddev);
+
 	disks = conf->raid_disks;
  retry_write:
 	blocked_rdev = NULL;
@@ -925,7 +921,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	/* In case raid1d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
 
-	if (do_sync || !bitmap)
+	if (do_sync || !bitmap || !plugged)
 		md_wakeup_thread(mddev->thread);
 
 	return 0;
@@ -1524,7 +1520,8 @@ static void raid1d(mddev_t *mddev)
 	for (;;) {
 		char b[BDEVNAME_SIZE];
 
-		flush_pending_writes(conf);
+		if (atomic_read(&mddev->plug_cnt) == 0)
+			flush_pending_writes(conf);
 
 		spin_lock_irqsave(&conf->device_lock, flags);
 		if (list_empty(head)) {
@@ -2042,7 +2039,6 @@ static int stop(mddev_t *mddev)
 
 	md_unregister_thread(mddev->thread);
 	mddev->thread = NULL;
-	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	if (conf->r1bio_pool)
 		mempool_destroy(conf->r1bio_pool);
 	kfree(conf->mirrors);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c8e5dac5d695..8e9462626ec5 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -634,12 +634,6 @@ static void flush_pending_writes(conf_t *conf)
 		spin_unlock_irq(&conf->device_lock);
 }
 
-static void md_kick_device(mddev_t *mddev)
-{
-	blk_flush_plug(current);
-	md_wakeup_thread(mddev->thread);
-}
-
 /* Barriers....
  * Sometimes we need to suspend IO while we do something else,
  * either some resync/recovery, or reconfigure the array.
@@ -669,15 +663,15 @@ static void raise_barrier(conf_t *conf, int force)
 
 	/* Wait until no block IO is waiting (unless 'force') */
 	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
-			    conf->resync_lock, md_kick_device(conf->mddev));
+			    conf->resync_lock, );
 
 	/* block any new IO from starting */
 	conf->barrier++;
 
-	/* No wait for all pending IO to complete */
+	/* Now wait for all pending IO to complete */
 	wait_event_lock_irq(conf->wait_barrier,
 			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock, md_kick_device(conf->mddev));
+			    conf->resync_lock, );
 
 	spin_unlock_irq(&conf->resync_lock);
 }
@@ -698,7 +692,7 @@ static void wait_barrier(conf_t *conf)
 		conf->nr_waiting++;
 		wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
 				    conf->resync_lock,
-				    md_kick_device(conf->mddev));
+				    );
 		conf->nr_waiting--;
 	}
 	conf->nr_pending++;
@@ -734,8 +728,8 @@ static void freeze_array(conf_t *conf)
 	wait_event_lock_irq(conf->wait_barrier,
 			    conf->nr_pending == conf->nr_queued+1,
 			    conf->resync_lock,
-			    ({ flush_pending_writes(conf);
-			       md_kick_device(conf->mddev); }));
+			    flush_pending_writes(conf));
+
 	spin_unlock_irq(&conf->resync_lock);
 }
 
@@ -762,6 +756,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
 	unsigned long flags;
 	mdk_rdev_t *blocked_rdev;
+	int plugged;
 
 	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
 		md_flush_request(mddev, bio);
@@ -870,6 +865,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	 * inc refcount on their rdev.  Record them by setting
 	 * bios[x] to bio
 	 */
+	plugged = mddev_check_plugged(mddev);
+
 	raid10_find_phys(conf, r10_bio);
  retry_write:
 	blocked_rdev = NULL;
@@ -946,9 +943,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	/* In case raid10d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
 
-	if (do_sync || !mddev->bitmap)
+	if (do_sync || !mddev->bitmap || !plugged)
 		md_wakeup_thread(mddev->thread);
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From 2b7da309ffe602d222558cee4d7e407b96e34b3a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 20 Apr 2011 15:38:07 +1000
Subject: md/raid5: remove setting of ->queue_lock

We previously needed to set ->queue_lock to match the raid5
device_lock so we could safely use queue_flag_* operations (e.g. for
plugging). which test the ->queue_lock is in fact locked.

However that need has completely gone away and is unlikely to come
back to remove this now-pointless setting.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f301e6ae220c..179bdfc4d03a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5151,7 +5151,6 @@ static int run(mddev_t *mddev)
 
 		mddev->queue->backing_dev_info.congested_data = mddev;
 		mddev->queue->backing_dev_info.congested_fn = raid5_congested;
-		mddev->queue->queue_lock = &conf->device_lock;
 
 		chunk_size = mddev->chunk_sectors << 9;
 		blk_queue_io_min(mddev->queue, chunk_size);
-- 
cgit v1.2.3


From 3b71bd9337b404baab5c894e066be6b6bf51b1c3 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 20 Apr 2011 15:38:18 +1000
Subject: md: Fix dev_sectors on takeover from raid0 to raid4/5

A raid0 array doesn't set 'dev_sectors' as each device might
contribute a different number of sectors.
So when converting to a RAID4 or RAID5 we need to set dev_sectors
as they need the number.
We have already verified that in fact all devices do contribute
the same number of sectors, so use that number.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 179bdfc4d03a..fd500112f13e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5678,6 +5678,7 @@ static void raid5_quiesce(mddev_t *mddev, int state)
 static void *raid45_takeover_raid0(mddev_t *mddev, int level)
 {
 	struct raid0_private_data *raid0_priv = mddev->private;
+	unsigned long long sectors;
 
 	/* for raid0 takeover only one zone is supported */
 	if (raid0_priv->nr_strip_zones > 1) {
@@ -5686,6 +5687,9 @@ static void *raid45_takeover_raid0(mddev_t *mddev, int level)
 		return ERR_PTR(-EINVAL);
 	}
 
+	sectors = raid0_priv->strip_zone[0].zone_end;
+	sector_div(sectors, raid0_priv->strip_zone[0].nb_dev);
+	mddev->dev_sectors = sectors;
 	mddev->new_level = level;
 	mddev->new_layout = ALGORITHM_PARITY_N;
 	mddev->new_chunk_sectors = mddev->chunk_sectors;
-- 
cgit v1.2.3


From fee68723cf6ae00082f70f3eff17fceab2a4f7d7 Mon Sep 17 00:00:00 2001
From: Krzysztof Wojcik <krzysztof.wojcik@intel.com>
Date: Wed, 20 Apr 2011 15:39:53 +1000
Subject: md: Cleanup after raid45->raid0 takeover

Problem:
After raid4->raid0 takeover operation, another takeover operation
(e.g raid0->raid10) results "kernel oops".
Root cause:
Variables 'degraded' in mddev structure is not cleared
on raid45->raid0 takeover.

This patch reset this variable.

Signed-off-by: Krzysztof Wojcik <krzysztof.wojcik@intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 6e853c61d87e..7d6f7f18a920 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3170,6 +3170,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 	mddev->layout = mddev->new_layout;
 	mddev->chunk_sectors = mddev->new_chunk_sectors;
 	mddev->delta_disks = 0;
+	mddev->degraded = 0;
 	if (mddev->pers->sync_request == NULL) {
 		/* this is now an array without redundancy, so
 		 * it must always be in_sync
-- 
cgit v1.2.3


From d76c8420c3cf8e468901b0bd58306637335c98ea Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Thu, 21 Apr 2011 09:07:26 -0700
Subject: raid5: fix build error, sector_t usage

Change <sectors> from unsigned long long to sector_t.
This matches its source field.

  ERROR: "__udivdi3" [drivers/md/raid456.ko] undefined!

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/md/raid5.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index fd500112f13e..49bf5f891435 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5678,7 +5678,7 @@ static void raid5_quiesce(mddev_t *mddev, int state)
 static void *raid45_takeover_raid0(mddev_t *mddev, int level)
 {
 	struct raid0_private_data *raid0_priv = mddev->private;
-	unsigned long long sectors;
+	sector_t sectors;
 
 	/* for raid0 takeover only one zone is supported */
 	if (raid0_priv->nr_strip_zones > 1) {
-- 
cgit v1.2.3