Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bcache/bcache.h     |  20
-rw-r--r--  drivers/md/bcache/btree.c      |   5
-rw-r--r--  drivers/md/bcache/btree.h      |  18
-rw-r--r--  drivers/md/bcache/debug.c      |   3
-rw-r--r--  drivers/md/bcache/journal.c    |   2
-rw-r--r--  drivers/md/bcache/request.c    |   6
-rw-r--r--  drivers/md/bcache/super.c      |  48
-rw-r--r--  drivers/md/bcache/sysfs.c      |  61
-rw-r--r--  drivers/md/bcache/writeback.c  |  30
-rw-r--r--  drivers/md/bcache/writeback.h  |  12
-rw-r--r--  drivers/md/dm-cache-metadata.c |   4
-rw-r--r--  drivers/md/dm-core.h           |   5
-rw-r--r--  drivers/md/dm-crypt.c          |   2
-rw-r--r--  drivers/md/dm-integrity.c      |   2
-rw-r--r--  drivers/md/dm-rq.c             |   7
-rw-r--r--  drivers/md/dm-table.c          |   4
-rw-r--r--  drivers/md/dm-thin.c           |  72
-rw-r--r--  drivers/md/dm-zoned-target.c   | 122
-rw-r--r--  drivers/md/dm.c                |  81
-rw-r--r--  drivers/md/md.c                |   7
-rw-r--r--  drivers/md/raid0.c             |   2
21 files changed, 314 insertions(+), 199 deletions(-)
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index b61b83bbcfff..fdf75352e16a 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -627,6 +627,20 @@ struct cache_set {
 	struct bkey		gc_done;
 
 	/*
+	 * For automatic garbage collection after writeback completes, this
+	 * variable is used as bit fields:
+	 * - 0000 0001b (BCH_ENABLE_AUTO_GC): enable gc after writeback
+	 * - 0000 0010b (BCH_DO_AUTO_GC): do gc after writeback
+	 * This is an optimization for subsequent write requests after
+	 * writeback finishes, but the read hit rate may drop because clean
+	 * data in the cache is discarded. It won't be enabled unless the
+	 * user explicitly sets it via sysfs.
+	 */
+#define BCH_ENABLE_AUTO_GC	1
+#define BCH_DO_AUTO_GC		2
+	uint8_t			gc_after_writeback;
+
+	/*
 	 * The allocation code needs gc_mark in struct bucket to be correct, but
 	 * it's not while a gc is in progress. Protected by bucket_lock.
 	 */
@@ -658,7 +672,11 @@ struct cache_set {
 
 	/*
 	 * A btree node on disk could have too many bsets for an iterator to fit
-	 * on the stack - have to dynamically allocate them
+	 * on the stack - have to dynamically allocate them.
+	 * bch_cache_set_alloc() makes sure the pool can allocate iterators
+	 * with enough room to host
+	 * (sb.bucket_size / sb.block_size)
+	 * btree_iter_sets, which is more than the static MAX_BSETS.
 	 */
 	mempool_t		fill_iter;
 
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 3f4211b5cd33..23cb1dc7296b 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -207,6 +207,11 @@ void bch_btree_node_read_done(struct btree *b)
 	struct bset *i = btree_bset_first(b);
 	struct btree_iter *iter;
 
+	/*
+	 * c->fill_iter can allocate an iterator with more memory space
+	 * than the static MAX_BSETS.
+	 * See the comment around cache_set->fill_iter.
+	 */
 	iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
 	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
 	iter->used = 0;
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index a68d6c55783b..d1c72ef64edf 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -266,6 +266,24 @@ static inline void wake_up_gc(struct cache_set *c)
 	wake_up(&c->gc_wait);
 }
 
+static inline void force_wake_up_gc(struct cache_set *c)
+{
+	/*
+	 * The garbage collection thread only works when sectors_to_gc < 0;
+	 * calling wake_up_gc() won't start the gc thread if sectors_to_gc
+	 * is not a negative value.
+	 * Therefore sectors_to_gc is set to -1 here, before waking up the
+	 * gc thread by calling wake_up_gc(). Then gc_should_run() will get
+	 * a chance to permit the gc thread to run. "Get a chance" means
+	 * that before entering gc_should_run() there is still a possibility
+	 * that c->sectors_to_gc gets set to some other positive value, so
+	 * this routine doesn't guarantee that the gc thread will be woken
+	 * up to run.
+	 */
+	atomic_set(&c->sectors_to_gc, -1);
+	wake_up_gc(c);
+}
+
 #define MAP_DONE	0
 #define MAP_CONTINUE	1
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 8f448b9c96a1..8b123be05254 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -249,8 +249,7 @@ void bch_debug_init_cache_set(struct cache_set *c)
 
 void bch_debug_exit(void)
 {
-	if (!IS_ERR_OR_NULL(bcache_debug))
-		debugfs_remove_recursive(bcache_debug);
+	debugfs_remove_recursive(bcache_debug);
 }
 
 void __init bch_debug_init(void)
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 522c7426f3a0..b2fd412715b1 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -663,7 +663,7 @@ static void journal_write_unlocked(struct closure *cl)
 				REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
 		bch_bio_map(bio, w->data);
 
-		trace_bcache_journal_write(bio);
+		trace_bcache_journal_write(bio, w->data->keys);
 		bio_list_add(&list, bio);
 
 		SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 3bf35914bb57..15070412a32e 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -311,11 +311,11 @@ err:
 * data is written it calls bch_journal, and after the keys have been added to
 * the next journal write they're inserted into the btree.
 *
- * It inserts the data in s->cache_bio; bi_sector is used for the key offset,
+ * It inserts the data in op->bio; bi_sector is used for the key offset,
 * and op->inode is used for the key inode.
 *
- * If s->bypass is true, instead of inserting the data it invalidates the
- * region of the cache represented by s->cache_bio and op->inode.
+ * If op->bypass is true, instead of inserting the data it invalidates the
+ * region of the cache represented by op->bio and op->inode.
*/ void bch_data_insert(struct closure *cl) { diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 7bbd670a5a84..4dee119c3664 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -25,8 +25,8 @@ #include <linux/reboot.h> #include <linux/sysfs.h> -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>"); +unsigned int bch_cutoff_writeback; +unsigned int bch_cutoff_writeback_sync; static const char bcache_magic[] = { 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca, @@ -1510,8 +1510,7 @@ static void cache_set_free(struct closure *cl) struct cache *ca; unsigned int i; - if (!IS_ERR_OR_NULL(c->debug)) - debugfs_remove(c->debug); + debugfs_remove(c->debug); bch_open_buckets_free(c); bch_btree_cache_free(c); @@ -2424,6 +2423,32 @@ static void bcache_exit(void) mutex_destroy(&bch_register_lock); } +/* Check and fixup module parameters */ +static void check_module_parameters(void) +{ + if (bch_cutoff_writeback_sync == 0) + bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC; + else if (bch_cutoff_writeback_sync > CUTOFF_WRITEBACK_SYNC_MAX) { + pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u", + bch_cutoff_writeback_sync, CUTOFF_WRITEBACK_SYNC_MAX); + bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC_MAX; + } + + if (bch_cutoff_writeback == 0) + bch_cutoff_writeback = CUTOFF_WRITEBACK; + else if (bch_cutoff_writeback > CUTOFF_WRITEBACK_MAX) { + pr_warn("set bch_cutoff_writeback (%u) to max value %u", + bch_cutoff_writeback, CUTOFF_WRITEBACK_MAX); + bch_cutoff_writeback = CUTOFF_WRITEBACK_MAX; + } + + if (bch_cutoff_writeback > bch_cutoff_writeback_sync) { + pr_warn("set bch_cutoff_writeback (%u) to %u", + bch_cutoff_writeback, bch_cutoff_writeback_sync); + bch_cutoff_writeback = bch_cutoff_writeback_sync; + } +} + static int __init bcache_init(void) { static const struct attribute *files[] = { @@ -2432,6 +2457,8 @@ static int __init bcache_init(void) NULL }; + check_module_parameters(); + mutex_init(&bch_register_lock); init_waitqueue_head(&unregister_wait); register_reboot_notifier(&reboot); @@ -2468,5 +2495,18 @@ err: return -ENOMEM; } +/* + * Module hooks + */ module_exit(bcache_exit); module_init(bcache_init); + +module_param(bch_cutoff_writeback, uint, 0); +MODULE_PARM_DESC(bch_cutoff_writeback, "threshold to cutoff writeback"); + +module_param(bch_cutoff_writeback_sync, uint, 0); +MODULE_PARM_DESC(bch_cutoff_writeback_sync, "hard threshold to cutoff writeback"); + +MODULE_DESCRIPTION("Bcache: a Linux block layer cache"); +MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 26f035a0c5b9..557a8a3270a1 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -16,7 +16,7 @@ #include <linux/sort.h> #include <linux/sched/clock.h> -/* Default is -1; we skip past it for struct cached_dev's cache mode */ +/* Default is 0 ("writethrough") */ static const char * const bch_cache_modes[] = { "writethrough", "writeback", @@ -25,7 +25,7 @@ static const char * const bch_cache_modes[] = { NULL }; -/* Default is -1; we skip past it for stop_when_cache_set_failed */ +/* Default is 0 ("auto") */ static const char * const bch_stop_on_failure_modes[] = { "auto", "always", @@ -88,6 +88,8 @@ read_attribute(writeback_keys_done); read_attribute(writeback_keys_failed); read_attribute(io_errors); read_attribute(congested); +read_attribute(cutoff_writeback); +read_attribute(cutoff_writeback_sync); 
rw_attribute(congested_read_threshold_us); rw_attribute(congested_write_threshold_us); @@ -128,6 +130,7 @@ rw_attribute(expensive_debug_checks); rw_attribute(cache_replacement_policy); rw_attribute(btree_shrinker_disabled); rw_attribute(copy_gc_enabled); +rw_attribute(gc_after_writeback); rw_attribute(size); static ssize_t bch_snprint_string_list(char *buf, @@ -264,7 +267,8 @@ STORE(__cached_dev) d_strtoul(writeback_running); d_strtoul(writeback_delay); - sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40); + sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, + 0, bch_cutoff_writeback); if (attr == &sysfs_writeback_rate) { ssize_t ret; @@ -384,8 +388,25 @@ STORE(bch_cached_dev) mutex_lock(&bch_register_lock); size = __cached_dev_store(kobj, attr, buf, size); - if (attr == &sysfs_writeback_running) - bch_writeback_queue(dc); + if (attr == &sysfs_writeback_running) { + /* dc->writeback_running changed in __cached_dev_store() */ + if (IS_ERR_OR_NULL(dc->writeback_thread)) { + /* + * reject setting it to 1 via sysfs if writeback + * kthread is not created yet. + */ + if (dc->writeback_running) { + dc->writeback_running = false; + pr_err("%s: failed to run non-existent writeback thread", + dc->disk.disk->disk_name); + } + } else + /* + * writeback kthread will check if dc->writeback_running + * is true or false. + */ + bch_writeback_queue(dc); + } if (attr == &sysfs_writeback_percent) if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) @@ -668,6 +689,9 @@ SHOW(__bch_cache_set) sysfs_print(congested_write_threshold_us, c->congested_write_threshold_us); + sysfs_print(cutoff_writeback, bch_cutoff_writeback); + sysfs_print(cutoff_writeback_sync, bch_cutoff_writeback_sync); + sysfs_print(active_journal_entries, fifo_used(&c->journal.pin)); sysfs_printf(verify, "%i", c->verify); sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled); @@ -676,6 +700,7 @@ SHOW(__bch_cache_set) sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + sysfs_printf(gc_after_writeback, "%i", c->gc_after_writeback); sysfs_printf(io_disable, "%i", test_bit(CACHE_SET_IO_DISABLE, &c->flags)); @@ -725,21 +750,8 @@ STORE(__bch_cache_set) bch_cache_accounting_clear(&c->accounting); } - if (attr == &sysfs_trigger_gc) { - /* - * Garbage collection thread only works when sectors_to_gc < 0, - * when users write to sysfs entry trigger_gc, most of time - * they want to forcibly triger gargage collection. Here -1 is - * set to c->sectors_to_gc, to make gc_should_run() give a - * chance to permit gc thread to run. "give a chance" means - * before going into gc_should_run(), there is still chance - * that c->sectors_to_gc being set to other positive value. So - * writing sysfs entry trigger_gc won't always make sure gc - * thread takes effect. - */ - atomic_set(&c->sectors_to_gc, -1); - wake_up_gc(c); - } + if (attr == &sysfs_trigger_gc) + force_wake_up_gc(c); if (attr == &sysfs_prune_cache) { struct shrink_control sc; @@ -789,6 +801,12 @@ STORE(__bch_cache_set) sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite); sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled); sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled); + /* + * write gc_after_writeback here may overwrite an already set + * BCH_DO_AUTO_GC, it doesn't matter because this flag will be + * set in next chance. 
+	 */
+	sysfs_strtoul_clamp(gc_after_writeback, c->gc_after_writeback, 0, 1);
 
 	return size;
 }
@@ -869,7 +887,10 @@ static struct attribute *bch_cache_set_internal_files[] = {
 	&sysfs_gc_always_rewrite,
 	&sysfs_btree_shrinker_disabled,
 	&sysfs_copy_gc_enabled,
+	&sysfs_gc_after_writeback,
 	&sysfs_io_disable,
+	&sysfs_cutoff_writeback,
+	&sysfs_cutoff_writeback_sync,
 	NULL
 };
 KTYPE(bch_cache_set_internal);
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 08c3a9f9676c..73f0efac2b9f 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -17,6 +17,15 @@
 #include <linux/sched/clock.h>
 #include <trace/events/bcache.h>
 
+static void update_gc_after_writeback(struct cache_set *c)
+{
+	if (c->gc_after_writeback != (BCH_ENABLE_AUTO_GC) ||
+	    c->gc_stats.in_use < BCH_AUTO_GC_DIRTY_THRESHOLD)
+		return;
+
+	c->gc_after_writeback |= BCH_DO_AUTO_GC;
+}
+
 /* Rate limiting */
 static uint64_t __calc_target_rate(struct cached_dev *dc)
 {
@@ -191,6 +200,7 @@ static void update_writeback_rate(struct work_struct *work)
 		if (!set_at_max_writeback_rate(c, dc)) {
 			down_read(&dc->writeback_lock);
 			__update_writeback_rate(dc);
+			update_gc_after_writeback(c);
 			up_read(&dc->writeback_lock);
 		}
 	}
@@ -689,6 +699,23 @@ static int bch_writeback_thread(void *arg)
 			up_write(&dc->writeback_lock);
 			break;
 		}
+
+		/*
+		 * When the dirty data ratio is high (e.g. 50%+), there
+		 * may be heavy bucket fragmentation after writeback
+		 * finishes, which hurts subsequent write performance.
+		 * Users who really care about write performance may
+		 * set BCH_ENABLE_AUTO_GC via sysfs; then, when
+		 * BCH_DO_AUTO_GC is set, the garbage collection thread
+		 * is woken up here. After the moving gc is done, the
+		 * shrunk btree and discarded free bucket space on the
+		 * SSD may help subsequent write requests.
+ */ + if (c->gc_after_writeback == + (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) { + c->gc_after_writeback &= ~BCH_DO_AUTO_GC; + force_wake_up_gc(c); + } } up_write(&dc->writeback_lock); @@ -777,7 +804,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) bch_keybuf_init(&dc->writeback_keys); dc->writeback_metadata = true; - dc->writeback_running = true; + dc->writeback_running = false; dc->writeback_percent = 10; dc->writeback_delay = 30; atomic_long_set(&dc->writeback_rate.rate, 1024); @@ -805,6 +832,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) cached_dev_put(dc); return PTR_ERR(dc->writeback_thread); } + dc->writeback_running = true; WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); schedule_delayed_work(&dc->writeback_rate_update, diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index d2b9fdbc8994..6a743d3bb338 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -5,12 +5,17 @@ #define CUTOFF_WRITEBACK 40 #define CUTOFF_WRITEBACK_SYNC 70 +#define CUTOFF_WRITEBACK_MAX 70 +#define CUTOFF_WRITEBACK_SYNC_MAX 90 + #define MAX_WRITEBACKS_IN_PASS 5 #define MAX_WRITESIZE_IN_PASS 5000 /* *512b */ #define WRITEBACK_RATE_UPDATE_SECS_MAX 60 #define WRITEBACK_RATE_UPDATE_SECS_DEFAULT 5 +#define BCH_AUTO_GC_DIRTY_THRESHOLD 50 + /* * 14 (16384ths) is chosen here as something that each backing device * should be a reasonable fraction of the share, and not to blow up @@ -53,6 +58,9 @@ static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc, } } +extern unsigned int bch_cutoff_writeback; +extern unsigned int bch_cutoff_writeback_sync; + static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, unsigned int cache_mode, bool would_skip) { @@ -60,7 +68,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, if (cache_mode != CACHE_MODE_WRITEBACK || test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || - in_use > CUTOFF_WRITEBACK_SYNC) + in_use > bch_cutoff_writeback_sync) return false; if (dc->partial_stripes_expensive && @@ -73,7 +81,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, return (op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO) || - in_use <= CUTOFF_WRITEBACK); + in_use <= bch_cutoff_writeback); } static inline void bch_writeback_queue(struct cached_dev *dc) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 5936de71883f..6fc93834da44 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -930,6 +930,10 @@ static int blocks_are_clean_separate_dirty(struct dm_cache_metadata *cmd, bool dirty_flag; *result = true; + if (from_cblock(cmd->cache_blocks) == 0) + /* Nothing to do */ + return 0; + r = dm_bitset_cursor_begin(&cmd->dirty_info, cmd->dirty_root, from_cblock(cmd->cache_blocks), &cmd->dirty_cursor); if (r) { diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 224d44503a06..95c6d86ab5e8 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -65,7 +65,6 @@ struct mapped_device { */ struct work_struct work; wait_queue_head_t wait; - atomic_t pending[2]; spinlock_t deferred_lock; struct bio_list deferred; @@ -107,9 +106,6 @@ struct mapped_device { struct block_device *bdev; - /* zero-length flush that will be cloned and submitted to targets */ - struct bio flush_bio; - struct dm_stats stats; /* for blk-mq request-based DM support */ @@ -119,7 +115,6 @@ struct mapped_device { struct srcu_struct io_barrier; }; -int md_in_flight(struct 
mapped_device *md); void disable_write_same(struct mapped_device *md); void disable_write_zeroes(struct mapped_device *md); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index b8eec515a003..a7195eb5b8d8 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -377,7 +377,7 @@ static struct crypto_cipher *alloc_essiv_cipher(struct crypt_config *cc, int err; /* Setup the essiv_tfm with the given salt */ - essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); + essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, 0); if (IS_ERR(essiv_tfm)) { ti->error = "Error allocating crypto tfm for ESSIV"; return essiv_tfm; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index bb3096bf2cc6..d4ad0bfee251 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -2804,7 +2804,7 @@ static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error, int r; if (a->alg_string) { - *hash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ASYNC); + *hash = crypto_alloc_shash(a->alg_string, 0, 0); if (IS_ERR(*hash)) { *error = error_alg; r = PTR_ERR(*hash); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 7cd36e4d1310..4e06be4f0a62 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -43,7 +43,7 @@ static unsigned dm_get_blk_mq_queue_depth(void) int dm_request_based(struct mapped_device *md) { - return queue_is_rq_based(md->queue); + return queue_is_mq(md->queue); } void dm_start_queue(struct request_queue *q) @@ -130,10 +130,8 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig) */ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) { - atomic_dec(&md->pending[rw]); - /* nudge anyone waiting on suspend queue */ - if (!md_in_flight(md)) + if (unlikely(waitqueue_active(&md->wait))) wake_up(&md->wait); /* @@ -436,7 +434,6 @@ ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md, static void dm_start_request(struct mapped_device *md, struct request *orig) { blk_mq_start_request(orig); - atomic_inc(&md->pending[rq_data_dir(orig)]); if (unlikely(dm_stats_used(&md->stats))) { struct dm_rq_target_io *tio = tio_from_request(orig); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 9038c302d5c2..844f7d0f2ef8 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -919,12 +919,12 @@ static int device_is_rq_based(struct dm_target *ti, struct dm_dev *dev, struct request_queue *q = bdev_get_queue(dev->bdev); struct verify_rq_based_data *v = data; - if (q->mq_ops) + if (queue_is_mq(q)) v->mq_count++; else v->sq_count++; - return queue_is_rq_based(q); + return queue_is_mq(q); } static int dm_table_determine_type(struct dm_table *t) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 0bd8d498b3b9..dadd9696340c 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -195,7 +195,7 @@ static void throttle_unlock(struct throttle *t) struct dm_thin_new_mapping; /* - * The pool runs in 4 modes. Ordered in degraded order for comparisons. + * The pool runs in various modes. Ordered in degraded order for comparisons. 
*/ enum pool_mode { PM_WRITE, /* metadata may be changed */ @@ -282,9 +282,38 @@ struct pool { mempool_t mapping_pool; }; -static enum pool_mode get_pool_mode(struct pool *pool); static void metadata_operation_failed(struct pool *pool, const char *op, int r); +static enum pool_mode get_pool_mode(struct pool *pool) +{ + return pool->pf.mode; +} + +static void notify_of_pool_mode_change(struct pool *pool) +{ + const char *descs[] = { + "write", + "out-of-data-space", + "read-only", + "read-only", + "fail" + }; + const char *extra_desc = NULL; + enum pool_mode mode = get_pool_mode(pool); + + if (mode == PM_OUT_OF_DATA_SPACE) { + if (!pool->pf.error_if_no_space) + extra_desc = " (queue IO)"; + else + extra_desc = " (error IO)"; + } + + dm_table_event(pool->ti->table); + DMINFO("%s: switching pool to %s%s mode", + dm_device_name(pool->pool_md), + descs[(int)mode], extra_desc ? : ""); +} + /* * Target context for a pool. */ @@ -2351,8 +2380,6 @@ static void do_waker(struct work_struct *ws) queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD); } -static void notify_of_pool_mode_change_to_oods(struct pool *pool); - /* * We're holding onto IO to allow userland time to react. After the * timeout either the pool will have been resized (and thus back in @@ -2365,7 +2392,7 @@ static void do_no_space_timeout(struct work_struct *ws) if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) { pool->pf.error_if_no_space = true; - notify_of_pool_mode_change_to_oods(pool); + notify_of_pool_mode_change(pool); error_retry_list_with_code(pool, BLK_STS_NOSPC); } } @@ -2433,26 +2460,6 @@ static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *)) /*----------------------------------------------------------------*/ -static enum pool_mode get_pool_mode(struct pool *pool) -{ - return pool->pf.mode; -} - -static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode) -{ - dm_table_event(pool->ti->table); - DMINFO("%s: switching pool to %s mode", - dm_device_name(pool->pool_md), new_mode); -} - -static void notify_of_pool_mode_change_to_oods(struct pool *pool) -{ - if (!pool->pf.error_if_no_space) - notify_of_pool_mode_change(pool, "out-of-data-space (queue IO)"); - else - notify_of_pool_mode_change(pool, "out-of-data-space (error IO)"); -} - static bool passdown_enabled(struct pool_c *pt) { return pt->adjusted_pf.discard_passdown; @@ -2501,8 +2508,6 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) switch (new_mode) { case PM_FAIL: - if (old_mode != new_mode) - notify_of_pool_mode_change(pool, "failure"); dm_pool_metadata_read_only(pool->pmd); pool->process_bio = process_bio_fail; pool->process_discard = process_bio_fail; @@ -2516,8 +2521,6 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) case PM_OUT_OF_METADATA_SPACE: case PM_READ_ONLY: - if (!is_read_only_pool_mode(old_mode)) - notify_of_pool_mode_change(pool, "read-only"); dm_pool_metadata_read_only(pool->pmd); pool->process_bio = process_bio_read_only; pool->process_discard = process_bio_success; @@ -2538,8 +2541,6 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) * alarming rate. Adjust your low water mark if you're * frequently seeing this mode. 
*/ - if (old_mode != new_mode) - notify_of_pool_mode_change_to_oods(pool); pool->out_of_data_space = true; pool->process_bio = process_bio_read_only; pool->process_discard = process_discard_bio; @@ -2552,8 +2553,6 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) break; case PM_WRITE: - if (old_mode != new_mode) - notify_of_pool_mode_change(pool, "write"); if (old_mode == PM_OUT_OF_DATA_SPACE) cancel_delayed_work_sync(&pool->no_space_timeout); pool->out_of_data_space = false; @@ -2573,6 +2572,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) * doesn't cause an unexpected mode transition on resume. */ pt->adjusted_pf.mode = new_mode; + + if (old_mode != new_mode) + notify_of_pool_mode_change(pool); } static void abort_transaction(struct pool *pool) @@ -4023,7 +4025,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 20, 0}, + .version = {1, 21, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -4397,7 +4399,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type thin_target = { .name = "thin", - .version = {1, 20, 0}, + .version = {1, 21, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 981154e59461..6af5babe6837 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -20,7 +20,6 @@ struct dmz_bioctx { struct dm_zone *zone; struct bio *bio; refcount_t ref; - blk_status_t status; }; /* @@ -78,65 +77,66 @@ static inline void dmz_bio_endio(struct bio *bio, blk_status_t status) { struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); - if (bioctx->status == BLK_STS_OK && status != BLK_STS_OK) - bioctx->status = status; - bio_endio(bio); + if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK) + bio->bi_status = status; + + if (refcount_dec_and_test(&bioctx->ref)) { + struct dm_zone *zone = bioctx->zone; + + if (zone) { + if (bio->bi_status != BLK_STS_OK && + bio_op(bio) == REQ_OP_WRITE && + dmz_is_seq(zone)) + set_bit(DMZ_SEQ_WRITE_ERR, &zone->flags); + dmz_deactivate_zone(zone); + } + bio_endio(bio); + } } /* - * Partial clone read BIO completion callback. This terminates the + * Completion callback for an internally cloned target BIO. This terminates the * target BIO when there are no more references to its context. */ -static void dmz_read_bio_end_io(struct bio *bio) +static void dmz_clone_endio(struct bio *clone) { - struct dmz_bioctx *bioctx = bio->bi_private; - blk_status_t status = bio->bi_status; + struct dmz_bioctx *bioctx = clone->bi_private; + blk_status_t status = clone->bi_status; - bio_put(bio); + bio_put(clone); dmz_bio_endio(bioctx->bio, status); } /* - * Issue a BIO to a zone. The BIO may only partially process the + * Issue a clone of a target BIO. The clone may only partially process the * original target BIO. 
*/ -static int dmz_submit_read_bio(struct dmz_target *dmz, struct dm_zone *zone, - struct bio *bio, sector_t chunk_block, - unsigned int nr_blocks) +static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone, + struct bio *bio, sector_t chunk_block, + unsigned int nr_blocks) { struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); - sector_t sector; struct bio *clone; - /* BIO remap sector */ - sector = dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block); - - /* If the read is not partial, there is no need to clone the BIO */ - if (nr_blocks == dmz_bio_blocks(bio)) { - /* Setup and submit the BIO */ - bio->bi_iter.bi_sector = sector; - refcount_inc(&bioctx->ref); - generic_make_request(bio); - return 0; - } - - /* Partial BIO: we need to clone the BIO */ clone = bio_clone_fast(bio, GFP_NOIO, &dmz->bio_set); if (!clone) return -ENOMEM; - /* Setup the clone */ - clone->bi_iter.bi_sector = sector; + bio_set_dev(clone, dmz->dev->bdev); + clone->bi_iter.bi_sector = + dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block); clone->bi_iter.bi_size = dmz_blk2sect(nr_blocks) << SECTOR_SHIFT; - clone->bi_end_io = dmz_read_bio_end_io; + clone->bi_end_io = dmz_clone_endio; clone->bi_private = bioctx; bio_advance(bio, clone->bi_iter.bi_size); - /* Submit the clone */ refcount_inc(&bioctx->ref); generic_make_request(clone); + if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone)) + zone->wp_block += nr_blocks; + return 0; } @@ -214,7 +214,7 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone, if (nr_blocks) { /* Valid blocks found: read them */ nr_blocks = min_t(unsigned int, nr_blocks, end_block - chunk_block); - ret = dmz_submit_read_bio(dmz, rzone, bio, chunk_block, nr_blocks); + ret = dmz_submit_bio(dmz, rzone, bio, chunk_block, nr_blocks); if (ret) return ret; chunk_block += nr_blocks; @@ -229,25 +229,6 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone, } /* - * Issue a write BIO to a zone. - */ -static void dmz_submit_write_bio(struct dmz_target *dmz, struct dm_zone *zone, - struct bio *bio, sector_t chunk_block, - unsigned int nr_blocks) -{ - struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); - - /* Setup and submit the BIO */ - bio_set_dev(bio, dmz->dev->bdev); - bio->bi_iter.bi_sector = dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block); - refcount_inc(&bioctx->ref); - generic_make_request(bio); - - if (dmz_is_seq(zone)) - zone->wp_block += nr_blocks; -} - -/* * Write blocks directly in a data zone, at the write pointer. * If a buffer zone is assigned, invalidate the blocks written * in place. 
@@ -265,7 +246,9 @@ static int dmz_handle_direct_write(struct dmz_target *dmz, return -EROFS; /* Submit write */ - dmz_submit_write_bio(dmz, zone, bio, chunk_block, nr_blocks); + ret = dmz_submit_bio(dmz, zone, bio, chunk_block, nr_blocks); + if (ret) + return ret; /* * Validate the blocks in the data zone and invalidate @@ -301,7 +284,9 @@ static int dmz_handle_buffered_write(struct dmz_target *dmz, return -EROFS; /* Submit write */ - dmz_submit_write_bio(dmz, bzone, bio, chunk_block, nr_blocks); + ret = dmz_submit_bio(dmz, bzone, bio, chunk_block, nr_blocks); + if (ret) + return ret; /* * Validate the blocks in the buffer zone @@ -600,7 +585,6 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) bioctx->zone = NULL; bioctx->bio = bio; refcount_set(&bioctx->ref, 1); - bioctx->status = BLK_STS_OK; /* Set the BIO pending in the flush list */ if (!nr_sectors && bio_op(bio) == REQ_OP_WRITE) { @@ -624,35 +608,6 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) } /* - * Completed target BIO processing. - */ -static int dmz_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error) -{ - struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); - - if (bioctx->status == BLK_STS_OK && *error) - bioctx->status = *error; - - if (!refcount_dec_and_test(&bioctx->ref)) - return DM_ENDIO_INCOMPLETE; - - /* Done */ - bio->bi_status = bioctx->status; - - if (bioctx->zone) { - struct dm_zone *zone = bioctx->zone; - - if (*error && bio_op(bio) == REQ_OP_WRITE) { - if (dmz_is_seq(zone)) - set_bit(DMZ_SEQ_WRITE_ERR, &zone->flags); - } - dmz_deactivate_zone(zone); - } - - return DM_ENDIO_DONE; -} - -/* * Get zoned device information. */ static int dmz_get_zoned_device(struct dm_target *ti, char *path) @@ -946,7 +901,6 @@ static struct target_type dmz_type = { .ctr = dmz_ctr, .dtr = dmz_dtr, .map = dmz_map, - .end_io = dmz_end_io, .io_hints = dmz_io_hints, .prepare_ioctl = dmz_prepare_ioctl, .postsuspend = dmz_suspend, diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c510179a7f84..a4a06982ed91 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -646,26 +646,38 @@ static void free_tio(struct dm_target_io *tio) bio_put(&tio->clone); } -int md_in_flight(struct mapped_device *md) +static bool md_in_flight_bios(struct mapped_device *md) { - return atomic_read(&md->pending[READ]) + - atomic_read(&md->pending[WRITE]); + int cpu; + struct hd_struct *part = &dm_disk(md)->part0; + long sum = 0; + + for_each_possible_cpu(cpu) { + sum += part_stat_local_read_cpu(part, in_flight[0], cpu); + sum += part_stat_local_read_cpu(part, in_flight[1], cpu); + } + + return sum != 0; +} + +static bool md_in_flight(struct mapped_device *md) +{ + if (queue_is_mq(md->queue)) + return blk_mq_queue_inflight(md->queue); + else + return md_in_flight_bios(md); } static void start_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; struct bio *bio = io->orig_bio; - int rw = bio_data_dir(bio); io->start_time = jiffies; generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio), &dm_disk(md)->part0); - atomic_set(&dm_disk(md)->part0.in_flight[rw], - atomic_inc_return(&md->pending[rw])); - if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), bio->bi_iter.bi_sector, bio_sectors(bio), @@ -677,8 +689,6 @@ static void end_io_acct(struct dm_io *io) struct mapped_device *md = io->md; struct bio *bio = io->orig_bio; unsigned long duration = jiffies - io->start_time; - int pending; - int rw = bio_data_dir(bio); generic_end_io_acct(md->queue, 
bio_op(bio), &dm_disk(md)->part0, io->start_time); @@ -688,16 +698,8 @@ static void end_io_acct(struct dm_io *io) bio->bi_iter.bi_sector, bio_sectors(bio), true, duration, &io->stats_aux); - /* - * After this is decremented the bio must not be touched if it is - * a flush. - */ - pending = atomic_dec_return(&md->pending[rw]); - atomic_set(&dm_disk(md)->part0.in_flight[rw], pending); - pending += atomic_read(&md->pending[rw^0x1]); - /* nudge anyone waiting on suspend queue */ - if (!pending) + if (unlikely(waitqueue_active(&md->wait))) wake_up(&md->wait); } @@ -1417,10 +1419,21 @@ static int __send_empty_flush(struct clone_info *ci) unsigned target_nr = 0; struct dm_target *ti; + /* + * Empty flush uses a statically initialized bio, as the base for + * cloning. However, blkg association requires that a bdev is + * associated with a gendisk, which doesn't happen until the bdev is + * opened. So, blkg association is done at issue time of the flush + * rather than when the device is created in alloc_dev(). + */ + bio_set_dev(ci->bio, ci->io->md->bdev); + BUG_ON(bio_has_data(ci->bio)); while ((ti = dm_table_get_target(ci->map, target_nr++))) __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); + bio_disassociate_blkg(ci->bio); + return 0; } @@ -1593,10 +1606,21 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, return ret; } + blk_queue_split(md->queue, &bio); + init_clone_info(&ci, md, map, bio); if (bio->bi_opf & REQ_PREFLUSH) { - ci.bio = &ci.io->md->flush_bio; + struct bio flush_bio; + + /* + * Use an on-stack bio for this, it's safe since we don't + * need to reference it after submit. It's just used as + * the basis for the clone(s). + */ + bio_init(&flush_bio, NULL, 0); + flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; + ci.bio = &flush_bio; ci.sector_count = 0; error = __send_empty_flush(&ci); /* dec_pending submits any data associated with flush */ @@ -1652,7 +1676,16 @@ static blk_qc_t __process_bio(struct mapped_device *md, init_clone_info(&ci, md, map, bio); if (bio->bi_opf & REQ_PREFLUSH) { - ci.bio = &ci.io->md->flush_bio; + struct bio flush_bio; + + /* + * Use an on-stack bio for this, it's safe since we don't + * need to reference it after submit. It's just used as + * the basis for the clone(s). 
+ */ + bio_init(&flush_bio, NULL, 0); + flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; + ci.bio = &flush_bio; ci.sector_count = 0; error = __send_empty_flush(&ci); /* dec_pending submits any data associated with flush */ @@ -1896,7 +1929,7 @@ static struct mapped_device *alloc_dev(int minor) INIT_LIST_HEAD(&md->table_devices); spin_lock_init(&md->uevent_lock); - md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id, NULL); + md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id); if (!md->queue) goto bad; md->queue->queuedata = md; @@ -1906,8 +1939,6 @@ static struct mapped_device *alloc_dev(int minor) if (!md->disk) goto bad; - atomic_set(&md->pending[0], 0); - atomic_set(&md->pending[1], 0); init_waitqueue_head(&md->wait); INIT_WORK(&md->work, dm_wq_work); init_waitqueue_head(&md->eventq); @@ -1938,10 +1969,6 @@ static struct mapped_device *alloc_dev(int minor) if (!md->bdev) goto bad; - bio_init(&md->flush_bio, NULL, 0); - bio_set_dev(&md->flush_bio, md->bdev); - md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; - dm_stats_init(&md->stats); /* Populate the mapping, nobody knows we exist yet */ diff --git a/drivers/md/md.c b/drivers/md/md.c index fc488cb30a94..9a0a1e0934d5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -334,7 +334,6 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) const int sgrp = op_stat_group(bio_op(bio)); struct mddev *mddev = q->queuedata; unsigned int sectors; - int cpu; blk_queue_split(q, &bio); @@ -359,9 +358,9 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) md_handle_request(mddev, bio); - cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors); + part_stat_lock(); + part_stat_inc(&mddev->gendisk->part0, ios[sgrp]); + part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors); part_stat_unlock(); return BLK_QC_T_NONE; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ac1cffd2a09b..f3fb5bb8c82a 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -542,7 +542,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) !discard_bio) continue; bio_chain(discard_bio, bio); - bio_clone_blkcg_association(discard_bio, bio); + bio_clone_blkg_association(discard_bio, bio); if (mddev->gendisk) trace_block_bio_remap(bdev_get_queue(rdev->bdev), discard_bio, disk_devt(mddev->gendisk), |