From ae1b1539622fb46e51b4d13b3f9e5f4c713f86ae Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Jan 2011 12:43:54 +0100 Subject: block: reimplement FLUSH/FUA to support merge The current FLUSH/FUA support has evolved from the implementation which had to perform queue draining. As such, sequencing is done queue-wide one flush request after another. However, with the draining requirement gone, there's no reason to keep the queue-wide sequential approach. This patch reimplements FLUSH/FUA support such that each FLUSH/FUA request is sequenced individually. The actual FLUSH execution is double buffered and whenever a request wants to execute one for either PRE or POSTFLUSH, it queues on the pending queue. Once certain conditions are met, a flush request is issued and on its completion all pending requests proceed to the next sequence. This allows arbitrary merging of different type of flushes. How they are merged can be primarily controlled and tuned by adjusting the above said 'conditions' used to determine when to issue the next flush. This is inspired by Darrick's patches to merge multiple zero-data flushes which helps workloads with highly concurrent fsync requests. * As flush requests are never put on the IO scheduler, request fields used for flush share space with rq->rb_node. rq->completion_data is moved out of the union. This increases the request size by one pointer. As rq->elevator_private* are used only by the iosched too, it is possible to reduce the request size further. However, to do that, we need to modify request allocation path such that iosched data is not allocated for flush requests. * FLUSH/FUA processing happens on insertion now instead of dispatch. - Comments updated as per Vivek and Mike. Signed-off-by: Tejun Heo Cc: "Darrick J. Wong" Cc: Shaohua Li Cc: Christoph Hellwig Cc: Vivek Goyal Cc: Mike Snitzer Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 36ab42c9bb99..6d7e9afd08c3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -99,13 +99,18 @@ struct request { /* * The rb_node is only used inside the io scheduler, requests * are pruned when moved to the dispatch queue. So let the - * completion_data share space with the rb_node. + * flush fields share space with the rb_node. */ union { struct rb_node rb_node; /* sort/lookup */ - void *completion_data; + struct { + unsigned int seq; + struct list_head list; + } flush; }; + void *completion_data; + /* * Three pointers are available for the IO schedulers, if they need * more they have to dynamically allocate it. @@ -362,11 +367,12 @@ struct request_queue * for flush operations */ unsigned int flush_flags; - unsigned int flush_seq; - int flush_err; + unsigned int flush_pending_idx:1; + unsigned int flush_running_idx:1; + unsigned long flush_pending_since; + struct list_head flush_queue[2]; + struct list_head flush_data_in_flight; struct request flush_rq; - struct request *orig_flush_rq; - struct list_head pending_flushes; struct mutex sysfs_lock; -- cgit v1.2.3 From c186794dbb466b45cf40f942f2d09d6d5b4b0e42 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 11 Feb 2011 11:08:00 +0100 Subject: block: share request flush fields with elevator_private Flush requests are never put on the IO scheduler. Convert request structure's elevator_private* into an array and have the flush fields share a union with it. Reclaim the space lost in 'struct request' by moving 'completion_data' back in the union with 'rb_node'. Signed-off-by: Mike Snitzer Acked-by: Vivek Goyal Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6d7e9afd08c3..12bb426949e9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -99,25 +99,26 @@ struct request { /* * The rb_node is only used inside the io scheduler, requests * are pruned when moved to the dispatch queue. So let the - * flush fields share space with the rb_node. + * completion_data share space with the rb_node. */ union { struct rb_node rb_node; /* sort/lookup */ - struct { - unsigned int seq; - struct list_head list; - } flush; + void *completion_data; }; - void *completion_data; - /* * Three pointers are available for the IO schedulers, if they need - * more they have to dynamically allocate it. + * more they have to dynamically allocate it. Flush requests are + * never put on the IO scheduler. So let the flush fields share + * space with the three elevator_private pointers. */ - void *elevator_private; - void *elevator_private2; - void *elevator_private3; + union { + void *elevator_private[3]; + struct { + unsigned int seq; + struct list_head list; + } flush; + }; struct gendisk *rq_disk; unsigned long start_time; -- cgit v1.2.3 From da527770007fce8e4541947d47918248286da875 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 Mar 2011 19:05:33 -0500 Subject: block: Move blk_throtl_exit() call to blk_cleanup_queue() Move blk_throtl_exit() in blk_cleanup_queue() as blk_throtl_exit() is written in such a way that it needs queue lock. In blk_release_queue() there is no gurantee that ->queue_lock is still around. Initially blk_throtl_exit() was in blk_cleanup_queue() but Ingo reported one problem. https://lkml.org/lkml/2010/10/23/86 And a quick fix moved blk_throtl_exit() to blk_release_queue(). commit 7ad58c028652753814054f4e3ac58f925e7343f4 Author: Jens Axboe Date: Sat Oct 23 20:40:26 2010 +0200 block: fix use-after-free bug in blk throttle code This patch reverts above change and does not try to shutdown the throtl work in blk_sync_queue(). By avoiding call to throtl_shutdown_timer_wq() from blk_sync_queue(), we should also avoid the problem reported by Ingo. blk_sync_queue() seems to be used only by md driver and it seems to be using it to make sure q->unplug_fn is not called as md registers its own unplug functions and it is about to free up the data structures used by unplug_fn(). Block throttle does not call back into unplug_fn() or into md. So there is no need to cancel blk throttle work. In fact I think cancelling block throttle work is bad because it might happen that some bios are throttled and scheduled to be dispatched later with the help of pending work and if work is cancelled, these bios might never be dispatched. Block layer also uses blk_sync_queue() during blk_cleanup_queue() and blk_release_queue() time. That should be safe as we are also calling blk_throtl_exit() which should make sure all the throttling related data structures are cleaned up. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e3ee74fc5903..23fb92506c31 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1144,7 +1144,6 @@ extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); extern int blk_throtl_bio(struct request_queue *q, struct bio **bio); extern void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay); -extern void throtl_shutdown_timer_wq(struct request_queue *q); #else /* CONFIG_BLK_DEV_THROTTLING */ static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio) { @@ -1154,7 +1153,6 @@ static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio) static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline int blk_throtl_exit(struct request_queue *q) { return 0; } static inline void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay) {} -static inline void throtl_shutdown_timer_wq(struct request_queue *q) {} #endif /* CONFIG_BLK_DEV_THROTTLING */ #define MODULE_ALIAS_BLOCKDEV(major,minor) \ -- cgit v1.2.3 From 3cca6dc1c81e2407928dc4c6105252146fd3924f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 2 Mar 2011 11:08:00 -0500 Subject: block: add API for delaying work/request_fn a little bit Currently we use plugging for that, but as plugging is going away, we need an alternative mechanism. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e3ee74fc5903..f55b2a8b6610 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -300,6 +300,11 @@ struct request_queue unsigned long unplug_delay; /* After this many jiffies */ struct work_struct unplug_work; + /* + * Delayed queue handling + */ + struct delayed_work delay_work; + struct backing_dev_info backing_dev_info; /* @@ -677,6 +682,7 @@ extern int blk_insert_cloned_request(struct request_queue *q, extern void blk_plug_device(struct request_queue *); extern void blk_plug_device_unlocked(struct request_queue *); extern int blk_remove_plug(struct request_queue *); +extern void blk_delay_queue(struct request_queue *, unsigned long); extern void blk_recount_segments(struct request_queue *, struct bio *); extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, unsigned int, void __user *); -- cgit v1.2.3 From 73c101011926c5832e6e141682180c4debe2cf45 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 8 Mar 2011 13:19:51 +0100 Subject: block: initial patch for on-stack per-task plugging This patch adds support for creating a queuing context outside of the queue itself. This enables us to batch up pieces of IO before grabbing the block device queue lock and submitting them to the IO scheduler. The context is created on the stack of the process and assigned in the task structure, so that we can auto-unplug it if we hit a schedule event. The current queue plugging happens implicitly if IO is submitted to an empty device, yet callers have to remember to unplug that IO when they are going to wait for it. This is an ugly API and has caused bugs in the past. Additionally, it requires hacks in the vm (->sync_page() callback) to handle that logic. By switching to an explicit plugging scheme we make the API a lot nicer and can get rid of the ->sync_page() hack in the vm. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f55b2a8b6610..5873037eeb91 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -871,6 +871,31 @@ struct request_queue *blk_alloc_queue(gfp_t); struct request_queue *blk_alloc_queue_node(gfp_t, int); extern void blk_put_queue(struct request_queue *); +struct blk_plug { + unsigned long magic; + struct list_head list; + unsigned int should_sort; +}; + +extern void blk_start_plug(struct blk_plug *); +extern void blk_finish_plug(struct blk_plug *); +extern void __blk_flush_plug(struct task_struct *, struct blk_plug *); + +static inline void blk_flush_plug(struct task_struct *tsk) +{ + struct blk_plug *plug = tsk->plug; + + if (unlikely(plug)) + __blk_flush_plug(tsk, plug); +} + +static inline bool blk_needs_flush_plug(struct task_struct *tsk) +{ + struct blk_plug *plug = tsk->plug; + + return plug && !list_empty(&plug->list); +} + /* * tag stuff */ @@ -1294,6 +1319,23 @@ static inline long nr_blockdev_pages(void) return 0; } +static inline void blk_start_plug(struct list_head *list) +{ +} + +static inline void blk_finish_plug(struct list_head *list) +{ +} + +static inline void blk_flush_plug(struct task_struct *tsk) +{ +} + +static inline bool blk_needs_flush_plug(struct task_struct *tsk) +{ + return false; +} + #endif /* CONFIG_BLOCK */ #endif -- cgit v1.2.3 From 7eaceaccab5f40bbfda044629a6298616aeaed50 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Mar 2011 08:52:07 +0100 Subject: block: remove per-queue plugging Code has been converted over to the new explicit on-stack plugging, and delay users have been converted to use the new API for that. So lets kill off the old plugging along with aops->sync_page(). Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5873037eeb91..64ab2a1bb167 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -196,7 +196,6 @@ typedef void (request_fn_proc) (struct request_queue *q); typedef int (make_request_fn) (struct request_queue *q, struct bio *bio); typedef int (prep_rq_fn) (struct request_queue *, struct request *); typedef void (unprep_rq_fn) (struct request_queue *, struct request *); -typedef void (unplug_fn) (struct request_queue *); struct bio_vec; struct bvec_merge_data { @@ -279,7 +278,6 @@ struct request_queue make_request_fn *make_request_fn; prep_rq_fn *prep_rq_fn; unprep_rq_fn *unprep_rq_fn; - unplug_fn *unplug_fn; merge_bvec_fn *merge_bvec_fn; softirq_done_fn *softirq_done_fn; rq_timed_out_fn *rq_timed_out_fn; @@ -292,14 +290,6 @@ struct request_queue sector_t end_sector; struct request *boundary_rq; - /* - * Auto-unplugging state - */ - struct timer_list unplug_timer; - int unplug_thresh; /* After this many requests */ - unsigned long unplug_delay; /* After this many jiffies */ - struct work_struct unplug_work; - /* * Delayed queue handling */ @@ -399,14 +389,13 @@ struct request_queue #define QUEUE_FLAG_ASYNCFULL 4 /* write queue has been filled */ #define QUEUE_FLAG_DEAD 5 /* queue being torn down */ #define QUEUE_FLAG_REENTER 6 /* Re-entrancy avoidance */ -#define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */ -#define QUEUE_FLAG_ELVSWITCH 8 /* don't use elevator, just do FIFO */ -#define QUEUE_FLAG_BIDI 9 /* queue supports bidi requests */ -#define QUEUE_FLAG_NOMERGES 10 /* disable merge attempts */ -#define QUEUE_FLAG_SAME_COMP 11 /* force complete on same CPU */ -#define QUEUE_FLAG_FAIL_IO 12 /* fake timeout */ -#define QUEUE_FLAG_STACKABLE 13 /* supports request stacking */ -#define QUEUE_FLAG_NONROT 14 /* non-rotational device (SSD) */ +#define QUEUE_FLAG_ELVSWITCH 7 /* don't use elevator, just do FIFO */ +#define QUEUE_FLAG_BIDI 8 /* queue supports bidi requests */ +#define QUEUE_FLAG_NOMERGES 9 /* disable merge attempts */ +#define QUEUE_FLAG_SAME_COMP 10 /* force complete on same CPU */ +#define QUEUE_FLAG_FAIL_IO 11 /* fake timeout */ +#define QUEUE_FLAG_STACKABLE 12 /* supports request stacking */ +#define QUEUE_FLAG_NONROT 13 /* non-rotational device (SSD) */ #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ #define QUEUE_FLAG_IO_STAT 15 /* do IO stats */ #define QUEUE_FLAG_DISCARD 16 /* supports DISCARD */ @@ -484,7 +473,6 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) __clear_bit(flag, &q->queue_flags); } -#define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags) #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) @@ -679,9 +667,6 @@ extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, extern void blk_rq_unprep_clone(struct request *rq); extern int blk_insert_cloned_request(struct request_queue *q, struct request *rq); -extern void blk_plug_device(struct request_queue *); -extern void blk_plug_device_unlocked(struct request_queue *); -extern int blk_remove_plug(struct request_queue *); extern void blk_delay_queue(struct request_queue *, unsigned long); extern void blk_recount_segments(struct request_queue *, struct bio *); extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, @@ -726,7 +711,6 @@ extern int blk_execute_rq(struct request_queue *, struct gendisk *, struct request *, int); extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); -extern void blk_unplug(struct request_queue *q); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { @@ -863,7 +847,6 @@ extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bd extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); extern void blk_dump_rq_flags(struct request *, char *); -extern void generic_unplug_device(struct request_queue *); extern long nr_blockdev_pages(void); int blk_get_queue(struct request_queue *); -- cgit v1.2.3 From 1f940bdfc0d03265d178d9dfd840d854819f797d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 11 Mar 2011 20:17:08 +0100 Subject: block: fixup plugging stubs for !CONFIG_BLOCK They used an older prototype, fix it up. Reported-by: Randy Dunlap Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 91fa428fa2c1..16a902f099ac 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1297,15 +1297,18 @@ static inline long nr_blockdev_pages(void) return 0; } -static inline void blk_start_plug(struct list_head *list) +struct blk_plug { +}; + +static inline void blk_start_plug(struct blk_plug *plug) { } -static inline void blk_finish_plug(struct list_head *list) +static inline void blk_finish_plug(struct blk_plug *plug) { } -static inline void blk_flush_plug(struct task_struct *tsk) +static inline void blk_flush_plug(struct task_struct *task) { } -- cgit v1.2.3