7 files changed, 217 insertions, 208 deletions
diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt
index 05e2822a80b3..3d6951d63489 100644
--- a/Documentation/block/bfq-iosched.txt
+++ b/Documentation/block/bfq-iosched.txt
@@ -16,14 +16,16 @@ throughput. So, when needed for achieving a lower latency, BFQ builds
 schedules that may lead to a lower throughput. If your main or only
 goal, for a given device, is to achieve the maximum-possible
 throughput at all times, then do switch off all low-latency heuristics
-for that device, by setting low_latency to 0. Full details in Section 3.
+for that device, by setting low_latency to 0. See Section 3 for
+details on how to configure BFQ for the desired tradeoff between
+latency and throughput, or on how to maximize throughput.
 
 On average CPUs, the current version of BFQ can handle devices
 performing at most ~30K IOPS; at most ~50 KIOPS on faster CPUs. As a
 reference, 30-50 KIOPS correspond to very high bandwidths with
 sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and
-to 120-200 MB/s with 4KB random I/O. BFQ has not yet been tested on
-multi-queue devices.
+to 120-200 MB/s with 4KB random I/O. BFQ is currently being tested on
+multi-queue devices too.
 
 The table of contents follow. Impatients can just jump to Section 3.
 
@@ -33,7 +35,7 @@ CONTENTS
  1-1 Personal systems
  1-2 Server systems
 2. How does BFQ work?
-3. What are BFQ's tunable?
+3. What are BFQ's tunables and how to properly configure BFQ?
 4. BFQ group scheduling
  4-1 Service guarantees provided
  4-2 Interface
@@ -145,19 +147,28 @@ plus a lot of code, are borrowed from CFQ.
       contrast, BFQ may idle the device for a short time interval,
       giving the process the chance to go on being served if it issues
       a new request in time. Device idling typically boosts the
-      throughput on rotational devices, if processes do synchronous
-      and sequential I/O. In addition, under BFQ, device idling is
-      also instrumental in guaranteeing the desired throughput
-      fraction to processes issuing sync requests (see the description
-      of the slice_idle tunable in this document, or [1, 2], for more
-      details).
+      throughput on rotational devices and on non-queueing flash-based
+      devices, if processes do synchronous and sequential I/O. In
+      addition, under BFQ, device idling is also instrumental in
+      guaranteeing the desired throughput fraction to processes
+      issuing sync requests (see the description of the slice_idle
+      tunable in this document, or [1, 2], for more details).
 
       - With respect to idling for service guarantees, if several
 	processes are competing for the device at the same time, but
-	all processes (and groups, after the following commit) have
-	the same weight, then BFQ guarantees the expected throughput
-	distribution without ever idling the device. Throughput is
-	thus as high as possible in this common scenario.
+	all processes and groups have the same weight, then BFQ
+	guarantees the expected throughput distribution without ever
+	idling the device. Throughput is thus as high as possible in
+	this common scenario.
+
+     - On flash-based storage with internal queueing of commands
+       (typically NCQ), device idling happens to be always detrimental
+       for throughput. So, with these devices, BFQ performs idling
+       only when strictly needed for service guarantees, i.e., for
+       guaranteeing low latency or fairness. In these cases, overall
+       throughput may be sub-optimal. No solution currently exists to
+       provide both strong service guarantees and optimal throughput
+       on devices with internal queueing.
 
   - If low-latency mode is enabled (default configuration), BFQ
     executes some special heuristics to detect interactive and soft
@@ -191,10 +202,7 @@ plus a lot of code, are borrowed from CFQ.
   - Queues are scheduled according to a variant of WF2Q+, named
     B-WF2Q+, and implemented using an augmented rb-tree to preserve an
     O(log N) overall complexity.  See [2] for more details. B-WF2Q+ is
-    also ready for hierarchical scheduling. However, for a cleaner
-    logical breakdown, the code that enables and completes
-    hierarchical support is provided in the next commit, which focuses
-    exactly on this feature.
+    also ready for hierarchical scheduling, details in Section 4.
 
   - B-WF2Q+ guarantees a tight deviation with respect to an ideal,
     perfectly fair, and smooth service. In particular, B-WF2Q+
@@ -249,13 +257,24 @@ plus a lot of code, are borrowed from CFQ.
   the Idle class, to prevent it from starving.
 
 
-3. What are BFQ's tunable?
-==========================
+3. What are BFQ's tunables and how to properly configure BFQ?
+=============================================================
+
+Most BFQ tunables affect service guarantees (basically latency and
+fairness) and throughput. For full details on how to choose the
+desired tradeoff between service guarantees and throughput, see the
+parameters slice_idle, strict_guarantees and low_latency. For details
+on how to maximise throughput, see slice_idle, timeout_sync and
+max_budget. The other performance-related parameters have been
+inherited from, and have been preserved mostly for compatibility with
+CFQ. So far, no performance improvement has been reported after
+changing the latter parameters in BFQ.
 
-The tunables back_seek-max, back_seek_penalty, fifo_expire_async and
-fifo_expire_sync below are the same as in CFQ. Their description is
-just copied from that for CFQ. Some considerations in the description
-of slice_idle are copied from CFQ too.
+In particular, the tunables back_seek-max, back_seek_penalty,
+fifo_expire_async and fifo_expire_sync below are the same as in
+CFQ. Their description is just copied from that for CFQ. Some
+considerations in the description of slice_idle are copied from CFQ
+too.
 
 per-process ioprio and weight
 -----------------------------
@@ -285,15 +304,17 @@ number of seeks and see improved throughput.
 
 Setting slice_idle to 0 will remove all the idling on queues and one
 should see an overall improved throughput on faster storage devices
-like multiple SATA/SAS disks in hardware RAID configuration.
+like multiple SATA/SAS disks in hardware RAID configuration, as well
+as flash-based storage with internal command queueing (and
+parallelism).
 
 So depending on storage and workload, it might be useful to set
 slice_idle=0.  In general for SATA/SAS disks and software RAID of
 SATA/SAS disks keeping slice_idle enabled should be useful. For any
 configurations where there are multiple spindles behind single LUN
-(Host based hardware RAID controller or for storage arrays), setting
-slice_idle=0 might end up in better throughput and acceptable
-latencies.
+(Host based hardware RAID controller or for storage arrays), or with
+flash-based fast storage, setting slice_idle=0 might end up in better
+throughput and acceptable latencies.
 
 Idling is however necessary to have service guarantees enforced in
 case of differentiated weights or differentiated I/O-request lengths.
@@ -312,13 +333,14 @@ There is an important flipside for idling: apart from the above cases
 where it is beneficial also for throughput, idling can severely impact
 throughput. One important case is random workload. Because of this
 issue, BFQ tends to avoid idling as much as possible, when it is not
-beneficial also for throughput. As a consequence of this behavior, and
-of further issues described for the strict_guarantees tunable,
-short-term service guarantees may be occasionally violated. And, in
-some cases, these guarantees may be more important than guaranteeing
-maximum throughput. For example, in video playing/streaming, a very
-low drop rate may be more important than maximum throughput. In these
-cases, consider setting the strict_guarantees parameter.
+beneficial also for throughput (as detailed in Section 2). As a
+consequence of this behavior, and of further issues described for the
+strict_guarantees tunable, short-term service guarantees may be
+occasionally violated. And, in some cases, these guarantees may be
+more important than guaranteeing maximum throughput. For example, in
+video playing/streaming, a very low drop rate may be more important
+than maximum throughput. In these cases, consider setting the
+strict_guarantees parameter.
 
 strict_guarantees
 -----------------
@@ -420,6 +442,13 @@ The default value is 0, which enables auto-tuning: BFQ sets max_budget
 to the maximum number of sectors that can be served during
 timeout_sync, according to the estimated peak rate.
 
+For specific devices, some users have occasionally reported to have
+reached a higher throughput by setting max_budget explicitly, i.e., by
+setting max_budget to a higher value than 0. In particular, they have
+set max_budget to higher values than those to which BFQ would have set
+it with auto-tuning. An alternative way to achieve this goal is to
+just increase the value of timeout_sync, leaving max_budget equal to 0.
+
 weights
 -------
 
@@ -427,51 +456,6 @@ Read-only parameter, used to show the weights of the currently active
 BFQ queues.
 
 
-wr_ tunables
-------------
-
-BFQ exports a few parameters to control/tune the behavior of
-low-latency heuristics.
-
-wr_coeff
-
-Factor by which the weight of a weight-raised queue is multiplied. If
-the queue is deemed soft real-time, then the weight is further
-multiplied by an additional, constant factor.
-
-wr_max_time
-
-Maximum duration of a weight-raising period for an interactive task
-(ms). If set to zero (default value), then this value is computed
-automatically, as a function of the peak rate of the device. In any
-case, when the value of this parameter is read, it always reports the
-current duration, regardless of whether it has been set manually or
-computed automatically.
-
-wr_max_softrt_rate
-
-Maximum service rate below which a queue is deemed to be associated
-with a soft real-time application, and is then weight-raised
-accordingly (sectors/sec).
-
-wr_min_idle_time
-
-Minimum idle period after which interactive weight-raising may be
-reactivated for a queue (in ms).
-
-wr_rt_max_time
-
-Maximum weight-raising duration for soft real-time queues (in ms). The
-start time from which this duration is considered is automatically
-moved forward if the queue is detected to be still soft real-time
-before the current soft real-time weight-raising period finishes.
-
-wr_min_inter_arr_async
-
-Minimum period between I/O request arrivals after which weight-raising
-may be reactivated for an already busy async queue (in ms).
-
-
 4. Group scheduling with BFQ
 ============================
 
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 79484469c2f7..8f3d22330d64 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -720,7 +720,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
 		entity->budget = new_budget;
 		bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
 					 new_budget);
-		bfq_requeue_bfqq(bfqd, bfqq);
+		bfq_requeue_bfqq(bfqd, bfqq, false);
 	}
 }
 
@@ -2563,7 +2563,7 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 
 		bfq_del_bfqq_busy(bfqd, bfqq, true);
 	} else {
-		bfq_requeue_bfqq(bfqd, bfqq);
+		bfq_requeue_bfqq(bfqd, bfqq, true);
 		/*
 		 * Resort priority tree of potential close cooperators.
 		 */
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index cc4ea8574483..ac0809c72c98 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -817,7 +817,6 @@ extern const int bfq_timeout;
 struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync);
 void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync);
 struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
-void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
 void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
 void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
 			  struct rb_root *root);
@@ -917,7 +916,8 @@ void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd);
 void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 			 bool ins_into_idle_tree, bool expiration);
 void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
-void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+		      bool expiration);
 void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		       bool expiration);
 void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq);
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index 911aa7431dbe..414ba686a847 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -44,7 +44,8 @@ static unsigned int bfq_class_idx(struct bfq_entity *entity)
 		BFQ_DEFAULT_GRP_CLASS - 1;
 }
 
-static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);
+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
+						 bool expiration);
 
 static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
 
@@ -54,6 +55,8 @@ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
  * @new_entity: if not NULL, pointer to the entity whose activation,
  *		requeueing or repositionig triggered the invocation of
  *		this function.
+ * @expiration: id true, this function is being invoked after the
+ *             expiration of the in-service entity
  *
  * This function is called to update sd->next_in_service, which, in
  * its turn, may change as a consequence of the insertion or
@@ -72,19 +75,20 @@ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
  * entity.
  */
 static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
-				       struct bfq_entity *new_entity)
+				       struct bfq_entity *new_entity,
+				       bool expiration)
 {
 	struct bfq_entity *next_in_service = sd->next_in_service;
 	bool parent_sched_may_change = false;
+	bool change_without_lookup = false;
 
 	/*
 	 * If this update is triggered by the activation, requeueing
 	 * or repositiong of an entity that does not coincide with
 	 * sd->next_in_service, then a full lookup in the active tree
 	 * can be avoided. In fact, it is enough to check whether the
-	 * just-modified entity has a higher priority than
-	 * sd->next_in_service, or, even if it has the same priority
-	 * as sd->next_in_service, is eligible and has a lower virtual
+	 * just-modified entity has the same priority as
+	 * sd->next_in_service, is eligible and has a lower virtual
 	 * finish time than sd->next_in_service. If this compound
 	 * condition holds, then the new entity becomes the new
 	 * next_in_service. Otherwise no change is needed.
@@ -96,13 +100,12 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
 		 * set to true, and left as true if
 		 * sd->next_in_service is NULL.
 		 */
-		bool replace_next = true;
+		change_without_lookup = true;
 
 		/*
 		 * If there is already a next_in_service candidate
-		 * entity, then compare class priorities or timestamps
-		 * to decide whether to replace sd->service_tree with
-		 * new_entity.
+		 * entity, then compare timestamps to decide whether
+		 * to replace sd->service_tree with new_entity.
 		 */
 		if (next_in_service) {
 			unsigned int new_entity_class_idx =
@@ -110,32 +113,26 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
 			struct bfq_service_tree *st =
 				sd->service_tree + new_entity_class_idx;
 
-			/*
-			 * For efficiency, evaluate the most likely
-			 * sub-condition first.
-			 */
-			replace_next =
+			change_without_lookup =
 				(new_entity_class_idx ==
 				 bfq_class_idx(next_in_service)
 				 &&
 				 !bfq_gt(new_entity->start, st->vtime)
 				 &&
 				 bfq_gt(next_in_service->finish,
-					new_entity->finish))
-				||
-				new_entity_class_idx <
-				bfq_class_idx(next_in_service);
+					new_entity->finish));
 		}
 
-		if (replace_next)
+		if (change_without_lookup)
 			next_in_service = new_entity;
-	} else /* invoked because of a deactivation: lookup needed */
-		next_in_service = bfq_lookup_next_entity(sd);
+	}
+
+	if (!change_without_lookup) /* lookup needed */
+		next_in_service = bfq_lookup_next_entity(sd, expiration);
 
-	if (next_in_service) {
+	if (next_in_service)
 		parent_sched_may_change = !sd->next_in_service ||
 			bfq_update_parent_budget(next_in_service);
-	}
 
 	sd->next_in_service = next_in_service;
 
@@ -1127,10 +1124,12 @@ static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
  * @requeue: true if this is a requeue, which implies that bfqq is
  *	     being expired; thus ALL its ancestors stop being served and must
  *	     therefore be requeued
+ * @expiration: true if this function is being invoked in the expiration path
+ *             of the in-service queue
  */
 static void bfq_activate_requeue_entity(struct bfq_entity *entity,
 					bool non_blocking_wait_rq,
-					bool requeue)
+					bool requeue, bool expiration)
 {
 	struct bfq_sched_data *sd;
 
@@ -1138,7 +1137,8 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity,
 		sd = entity->sched_data;
 		__bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);
 
-		if (!bfq_update_next_in_service(sd, entity) && !requeue)
+		if (!bfq_update_next_in_service(sd, entity, expiration) &&
+		    !requeue)
 			break;
 	}
 }
@@ -1194,6 +1194,8 @@ bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree)
  * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
  * @entity: the entity to deactivate.
  * @ins_into_idle_tree: true if the entity can be put into the idle tree
+ * @expiration: true if this function is being invoked in the expiration path
+ *             of the in-service queue
  */
 static void bfq_deactivate_entity(struct bfq_entity *entity,
 				  bool ins_into_idle_tree,
@@ -1222,7 +1224,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity,
 			 * then, since entity has just been
 			 * deactivated, a new one must be found.
 			 */
-			bfq_update_next_in_service(sd, NULL);
+			bfq_update_next_in_service(sd, NULL, expiration);
 
 		if (sd->next_in_service || sd->in_service_entity) {
 			/*
@@ -1281,7 +1283,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity,
 		__bfq_requeue_entity(entity);
 
 		sd = entity->sched_data;
-		if (!bfq_update_next_in_service(sd, entity) &&
+		if (!bfq_update_next_in_service(sd, entity, expiration) &&
 		    !expiration)
 			/*
 			 * next_in_service unchanged or not causing
@@ -1416,12 +1418,14 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service)
 /**
  * bfq_lookup_next_entity - return the first eligible entity in @sd.
  * @sd: the sched_data.
+ * @expiration: true if we are on the expiration path of the in-service queue
  *
  * This function is invoked when there has been a change in the trees
- * for sd, and we need know what is the new next entity after this
- * change.
+ * for sd, and we need to know what is the new next entity to serve
+ * after this change.
  */
-static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
+						 bool expiration)
 {
 	struct bfq_service_tree *st = sd->service_tree;
 	struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
@@ -1448,8 +1452,24 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
 	 * class, unless the idle class needs to be served.
 	 */
 	for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
+		/*
+		 * If expiration is true, then bfq_lookup_next_entity
+		 * is being invoked as a part of the expiration path
+		 * of the in-service queue. In this case, even if
+		 * sd->in_service_entity is not NULL,
+		 * sd->in_service_entiy at this point is actually not
+		 * in service any more, and, if needed, has already
+		 * been properly queued or requeued into the right
+		 * tree. The reason why sd->in_service_entity is still
+		 * not NULL here, even if expiration is true, is that
+		 * sd->in_service_entiy is reset as a last step in the
+		 * expiration path. So, if expiration is true, tell
+		 * __bfq_lookup_next_entity that there is no
+		 * sd->in_service_entity.
+		 */
 		entity = __bfq_lookup_next_entity(st + class_idx,
-						  sd->in_service_entity);
+						  sd->in_service_entity &&
+						  !expiration);
 
 		if (entity)
 			break;
@@ -1562,7 +1582,7 @@ struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
 	for_each_entity(entity) {
 		struct bfq_sched_data *sd = entity->sched_data;
 
-		if (!bfq_update_next_in_service(sd, NULL))
+		if (!bfq_update_next_in_service(sd, NULL, false))
 			break;
 	}
 
@@ -1610,16 +1630,17 @@ void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 	struct bfq_entity *entity = &bfqq->entity;
 
 	bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
-				    false);
+				    false, false);
 	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
 }
 
-void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+		      bool expiration)
 {
 	struct bfq_entity *entity = &bfqq->entity;
 
 	bfq_activate_requeue_entity(entity, false,
-				    bfqq == bfqd->in_service_queue);
+				    bfqq == bfqd->in_service_queue, expiration);
 }
 
 /*
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 407cb172d6e3..3a35121f8a6f 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -213,10 +213,13 @@ static void __loop_update_dio(struct loop_device *lo, bool dio)
 	 */
 	blk_mq_freeze_queue(lo->lo_queue);
 	lo->use_dio = use_dio;
-	if (use_dio)
+	if (use_dio) {
+		queue_flag_clear_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
 		lo->lo_flags |= LO_FLAGS_DIRECT_IO;
-	else
+	} else {
+		queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
 		lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
+	}
 	blk_mq_unfreeze_queue(lo->lo_queue);
 }
 
@@ -464,6 +467,8 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
 {
 	struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
 
+	kfree(cmd->bvec);
+	cmd->bvec = NULL;
 	cmd->ret = ret;
 	blk_mq_complete_request(cmd->rq);
 }
@@ -473,22 +478,50 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 {
 	struct iov_iter iter;
 	struct bio_vec *bvec;
-	struct bio *bio = cmd->rq->bio;
+	struct request *rq = cmd->rq;
+	struct bio *bio = rq->bio;
 	struct file *file = lo->lo_backing_file;
+	unsigned int offset;
+	int segments = 0;
 	int ret;
 
-	/* nomerge for loop request queue */
-	WARN_ON(cmd->rq->bio != cmd->rq->biotail);
+	if (rq->bio != rq->biotail) {
+		struct req_iterator iter;
+		struct bio_vec tmp;
+
+		__rq_for_each_bio(bio, rq)
+			segments += bio_segments(bio);
+		bvec = kmalloc(sizeof(struct bio_vec) * segments, GFP_NOIO);
+		if (!bvec)
+			return -EIO;
+		cmd->bvec = bvec;
+
+		/*
+		 * The bios of the request may be started from the middle of
+		 * the 'bvec' because of bio splitting, so we can't directly
+		 * copy bio->bi_iov_vec to new bvec. The rq_for_each_segment
+		 * API will take care of all details for us.
+		 */
+		rq_for_each_segment(tmp, rq, iter) {
+			*bvec = tmp;
+			bvec++;
+		}
+		bvec = cmd->bvec;
+		offset = 0;
+	} else {
+		/*
+		 * Same here, this bio may be started from the middle of the
+		 * 'bvec' because of bio splitting, so offset from the bvec
+		 * must be passed to iov iterator
+		 */
+		offset = bio->bi_iter.bi_bvec_done;
+		bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+		segments = bio_segments(bio);
+	}
 
-	bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
 	iov_iter_bvec(&iter, ITER_BVEC | rw, bvec,
-		      bio_segments(bio), blk_rq_bytes(cmd->rq));
-	/*
-	 * This bio may be started from the middle of the 'bvec'
-	 * because of bio splitting, so offset from the bvec must
-	 * be passed to iov iterator
-	 */
-	iter.iov_offset = bio->bi_iter.bi_bvec_done;
+		      segments, blk_rq_bytes(rq));
+	iter.iov_offset = offset;
 
 	cmd->iocb.ki_pos = pos;
 	cmd->iocb.ki_filp = file;
@@ -546,74 +579,12 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
 	}
 }
 
-struct switch_request {
-	struct file *file;
-	struct completion wait;
-};
-
 static inline void loop_update_dio(struct loop_device *lo)
 {
 	__loop_update_dio(lo, io_is_direct(lo->lo_backing_file) |
 			lo->use_dio);
 }
 
-/*
- * Do the actual switch; called from the BIO completion routine
- */
-static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
-{
-	struct file *file = p->file;
-	struct file *old_file = lo->lo_backing_file;
-	struct address_space *mapping;
-
-	/* if no new file, only flush of queued bios requested */
-	if (!file)
-		return;
-
-	mapping = file->f_mapping;
-	mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
-	lo->lo_backing_file = file;
-	lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ?
-		mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
-	lo->old_gfp_mask = mapping_gfp_mask(mapping);
-	mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
-	loop_update_dio(lo);
-}
-
-/*
- * loop_switch performs the hard work of switching a backing store.
- * First it needs to flush existing IO, it does this by sending a magic
- * BIO down the pipe. The completion of this BIO does the actual switch.
- */
-static int loop_switch(struct loop_device *lo, struct file *file)
-{
-	struct switch_request w;
-
-	w.file = file;
-
-	/* freeze queue and wait for completion of scheduled requests */
-	blk_mq_freeze_queue(lo->lo_queue);
-
-	/* do the switch action */
-	do_loop_switch(lo, &w);
-
-	/* unfreeze */
-	blk_mq_unfreeze_queue(lo->lo_queue);
-
-	return 0;
-}
-
-/*
- * Helper to flush the IOs in loop, but keeping loop thread running
- */
-static int loop_flush(struct loop_device *lo)
-{
-	/* loop not yet configured, no running thread, nothing to flush */
-	if (lo->lo_state != Lo_bound)
-		return 0;
-	return loop_switch(lo, NULL);
-}
-
 static void loop_reread_partitions(struct loop_device *lo,
 				   struct block_device *bdev)
 {
@@ -678,9 +649,14 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
 		goto out_putf;
 
 	/* and ... switch */
-	error = loop_switch(lo, file);
-	if (error)
-		goto out_putf;
+	blk_mq_freeze_queue(lo->lo_queue);
+	mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
+	lo->lo_backing_file = file;
+	lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping);
+	mapping_set_gfp_mask(file->f_mapping,
+			     lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
+	loop_update_dio(lo);
+	blk_mq_unfreeze_queue(lo->lo_queue);
 
 	fput(old_file);
 	if (lo->lo_flags & LO_FLAGS_PARTSCAN)
@@ -867,7 +843,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	struct file	*file, *f;
 	struct inode	*inode;
 	struct address_space *mapping;
-	unsigned lo_blocksize;
 	int		lo_flags = 0;
 	int		error;
 	loff_t		size;
@@ -911,9 +886,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	    !file->f_op->write_iter)
 		lo_flags |= LO_FLAGS_READ_ONLY;
 
-	lo_blocksize = S_ISBLK(inode->i_mode) ?
-		inode->i_bdev->bd_block_size : PAGE_SIZE;
-
 	error = -EFBIG;
 	size = get_loop_size(lo, file);
 	if ((loff_t)(sector_t)size != size)
@@ -927,7 +899,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
 
 	lo->use_dio = false;
-	lo->lo_blocksize = lo_blocksize;
 	lo->lo_device = bdev;
 	lo->lo_flags = lo_flags;
 	lo->lo_backing_file = file;
@@ -947,7 +918,8 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	/* let user-space know about the new size */
 	kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
 
-	set_blocksize(bdev, lo_blocksize);
+	set_blocksize(bdev, S_ISBLK(inode->i_mode) ?
+		      block_size(inode->i_bdev) : PAGE_SIZE);
 
 	lo->lo_state = Lo_bound;
 	if (part_shift)
@@ -1053,6 +1025,7 @@ static int loop_clr_fd(struct loop_device *lo)
 	memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
 	memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
 	memset(lo->lo_file_name, 0, LO_NAME_SIZE);
+	blk_queue_logical_block_size(lo->lo_queue, 512);
 	if (bdev) {
 		bdput(bdev);
 		invalidate_bdev(bdev);
@@ -1336,6 +1309,24 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)
 	return error;
 }
 
+static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
+{
+	if (lo->lo_state != Lo_bound)
+		return -ENXIO;
+
+	if (arg < 512 || arg > PAGE_SIZE || !is_power_of_2(arg))
+		return -EINVAL;
+
+	blk_mq_freeze_queue(lo->lo_queue);
+
+	blk_queue_logical_block_size(lo->lo_queue, arg);
+	loop_update_dio(lo);
+
+	blk_mq_unfreeze_queue(lo->lo_queue);
+
+	return 0;
+}
+
 static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 	unsigned int cmd, unsigned long arg)
 {
@@ -1384,6 +1375,11 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
 			err = loop_set_dio(lo, arg);
 		break;
+	case LOOP_SET_BLOCK_SIZE:
+		err = -EPERM;
+		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
+			err = loop_set_block_size(lo, arg);
+		break;
 	default:
 		err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
 	}
@@ -1583,12 +1579,13 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 		err = loop_clr_fd(lo);
 		if (!err)
 			return;
-	} else {
+	} else if (lo->lo_state == Lo_bound) {
 		/*
 		 * Otherwise keep thread (if running) and config,
 		 * but flush possible ongoing bios in thread.
 		 */
-		loop_flush(lo);
+		blk_mq_freeze_queue(lo->lo_queue);
+		blk_mq_unfreeze_queue(lo->lo_queue);
 	}
 
 	mutex_unlock(&lo->lo_ctl_mutex);
@@ -1770,9 +1767,15 @@ static int loop_add(struct loop_device **l, int i)
 	}
 	lo->lo_queue->queuedata = lo;
 
+	blk_queue_physical_block_size(lo->lo_queue, PAGE_SIZE);
+
+	blk_queue_max_hw_sectors(lo->lo_queue, BLK_DEF_MAX_SECTORS);
+
 	/*
-	 * It doesn't make sense to enable merge because the I/O
-	 * submitted to backing file is handled page by page.
+	 * By default, we do buffer IO, so it doesn't make sense to enable
+	 * merge because the I/O submitted to backing file is handled page by
+	 * page. For directio mode, merge does help to dispatch bigger request
+	 * to underlayer disk. We will enable merge once directio is enabled.
 	 */
 	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
 
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index fecd3f97ef8c..43d20d37b79a 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -48,7 +48,6 @@ struct loop_device {
 
 	struct file *	lo_backing_file;
 	struct block_device *lo_device;
-	unsigned	lo_blocksize;
 	void		*key_data; 
 
 	gfp_t		old_gfp_mask;
@@ -72,6 +71,7 @@ struct loop_cmd {
 	bool use_aio;           /* use AIO interface to handle I/O */
 	long ret;
 	struct kiocb iocb;
+	struct bio_vec *bvec;
 };
 
 /* Support for loadable transfer modules */
diff --git a/include/uapi/linux/loop.h b/include/uapi/linux/loop.h
index c8125ec1f4f2..23158dbe2424 100644
--- a/include/uapi/linux/loop.h
+++ b/include/uapi/linux/loop.h
@@ -88,6 +88,7 @@ struct loop_info64 {
 #define LOOP_CHANGE_FD		0x4C06
 #define LOOP_SET_CAPACITY	0x4C07
 #define LOOP_SET_DIRECT_IO	0x4C08
+#define LOOP_SET_BLOCK_SIZE	0x4C09
 
 /* /dev/loop-control interface */
 #define LOOP_CTL_ADD		0x4C80