From 53b381b3abeb86f12787a6c40fee9b2f71edc23b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 29 Jan 2013 18:40:14 -0500 Subject: Btrfs: RAID5 and RAID6 This builds on David Woodhouse's original Btrfs raid5/6 implementation. The code has changed quite a bit; blame Chris Mason for any bugs. Read/modify/write is done after the higher levels of the filesystem have prepared a given bio. This means the higher layers are not responsible for building full stripes, and they don't need to query for the topology of the extents that may get allocated during delayed allocation runs. It also means different files can easily share the same stripe. But, it does expose us to incorrect parity if we crash or lose power while doing a read/modify/write cycle. This will be addressed in a later commit. Scrub is unable to repair crc errors on raid5/6 chunks. Discard does not work on raid5/6 (yet). The stripe size is fixed at 64KiB per disk. This will be tunable in a later commit. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 9 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 65f03670a952..e9fa7b4d18e3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -46,6 +46,7 @@ #include "check-integrity.h" #include "rcu-string.h" #include "dev-replace.h" +#include "raid56.h" #ifdef CONFIG_X86 #include <asm/cpufeature.h> @@ -639,8 +640,15 @@ err: btree_readahead_hook(root, eb, eb->start, ret); } - if (ret) + if (ret) { + /* + * our io error hook is going to dec the io pages + * again, we have to make sure it has something + * to decrement + */ + atomic_inc(&eb->io_pages); clear_extent_buffer_uptodate(eb); + } free_extent_buffer(eb); out: return ret; @@ -654,6 +662,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) eb = (struct extent_buffer *)page->private; set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); eb->read_mirror = failed_mirror; + atomic_dec(&eb->io_pages); if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) btree_readahead_hook(root, eb, eb->start, -EIO); return -EIO; /* we fixed nothing */ @@ -670,17 +679,23 @@ static void end_workqueue_bio(struct bio *bio, int err) end_io_wq->work.flags = 0; if (bio->bi_rw & REQ_WRITE) { - if (end_io_wq->metadata == 1) + if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) btrfs_queue_worker(&fs_info->endio_meta_write_workers, &end_io_wq->work); - else if (end_io_wq->metadata == 2) + else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) btrfs_queue_worker(&fs_info->endio_freespace_worker, &end_io_wq->work); + else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) + btrfs_queue_worker(&fs_info->endio_raid56_workers, + &end_io_wq->work); else btrfs_queue_worker(&fs_info->endio_write_workers, &end_io_wq->work); } else { - if (end_io_wq->metadata) + if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) + btrfs_queue_worker(&fs_info->endio_raid56_workers, + &end_io_wq->work); + else if (end_io_wq->metadata) btrfs_queue_worker(&fs_info->endio_meta_workers, &end_io_wq->work); else @@ -695,6 +710,7 @@ static void end_workqueue_bio(struct bio *bio, int err) * 0 - if data * 1 - if normal metadta * 2 - if writing to the free space cache area + * 3 - raid parity work */ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, int metadata) @@ -2165,6 +2181,12 @@ int open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->transaction_blocked_wait);
init_waitqueue_head(&fs_info->async_submit_wait); + ret = btrfs_alloc_stripe_hash_table(fs_info); + if (ret) { + err = -ENOMEM; + goto fail_alloc; + } + __setup_root(4096, 4096, 4096, 4096, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); @@ -2332,6 +2354,12 @@ int open_ctree(struct super_block *sb, btrfs_init_workers(&fs_info->endio_meta_write_workers, "endio-meta-write", fs_info->thread_pool_size, &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_raid56_workers, + "endio-raid56", fs_info->thread_pool_size, + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->rmw_workers, + "rmw", fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", fs_info->thread_pool_size, &fs_info->generic_worker); @@ -2350,6 +2378,8 @@ int open_ctree(struct super_block *sb, */ fs_info->endio_workers.idle_thresh = 4; fs_info->endio_meta_workers.idle_thresh = 4; + fs_info->endio_raid56_workers.idle_thresh = 4; + fs_info->rmw_workers.idle_thresh = 2; fs_info->endio_write_workers.idle_thresh = 2; fs_info->endio_meta_write_workers.idle_thresh = 2; @@ -2366,6 +2396,8 @@ int open_ctree(struct super_block *sb, ret |= btrfs_start_workers(&fs_info->fixup_workers); ret |= btrfs_start_workers(&fs_info->endio_workers); ret |= btrfs_start_workers(&fs_info->endio_meta_workers); + ret |= btrfs_start_workers(&fs_info->rmw_workers); + ret |= btrfs_start_workers(&fs_info->endio_raid56_workers); ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); ret |= btrfs_start_workers(&fs_info->endio_write_workers); ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); @@ -2710,6 +2742,8 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->workers); btrfs_stop_workers(&fs_info->endio_workers); btrfs_stop_workers(&fs_info->endio_meta_workers); + btrfs_stop_workers(&fs_info->endio_raid56_workers); + btrfs_stop_workers(&fs_info->rmw_workers); btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->endio_freespace_worker); @@ -2728,6 +2762,7 @@ fail_bdi: fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: + btrfs_free_stripe_hash_table(fs_info); btrfs_close_devices(fs_info->fs_devices); return err; @@ -3076,11 +3111,16 @@ int btrfs_calc_num_tolerated_disk_barrier_failures( ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0))) num_tolerated_disk_barrier_failures = 0; - else if (num_tolerated_disk_barrier_failures > 1 - && - (flags & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10))) - num_tolerated_disk_barrier_failures = 1; + else if (num_tolerated_disk_barrier_failures > 1) { + if (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID10)) { + num_tolerated_disk_barrier_failures = 1; + } else if (flags & + BTRFS_BLOCK_GROUP_RAID6) { + num_tolerated_disk_barrier_failures = 2; + } + } } } up_read(&sinfo->groups_sem); @@ -3384,6 +3424,8 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->workers); btrfs_stop_workers(&fs_info->endio_workers); btrfs_stop_workers(&fs_info->endio_meta_workers); + btrfs_stop_workers(&fs_info->endio_raid56_workers); + btrfs_stop_workers(&fs_info->rmw_workers); btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->endio_freespace_worker); @@ -3404,6 +3446,8 @@ int close_ctree(struct btrfs_root *root) bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); +
btrfs_free_stripe_hash_table(fs_info); + return 0; } -- cgit v1.2.3 From 78a6184a3ff9041280ee56273c01e5679a831b39 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 21 Nov 2012 02:21:28 +0000 Subject: Btrfs: use slabs for delayed reference allocation The delayed reference allocation is in the fast path of the IO, so use slabs to improve the speed of the allocation. Besides that, it lets us check for leaked objects when the module is removed. Signed-off-by: Miao Xie --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 65f03670a952..4438aac4947f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3614,7 +3614,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, continue; } - kfree(head->extent_op); + btrfs_free_delayed_extent_op(head->extent_op); delayed_refs->num_heads--; if (list_empty(&head->cluster)) delayed_refs->num_heads_ready--; -- cgit v1.2.3 From 4eee4fa4f8ab8c2b2623c22be4c3cb91d525aa57 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 21 Dec 2012 09:17:45 +0000 Subject: Btrfs: use wrapper page_offset Use the page_offset wrapper to get the byte offset into the filesystem object for a page. Signed-off-by: Liu Bo Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4438aac4947f..a762f9137610 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -420,7 +420,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) { struct extent_io_tree *tree; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 found_start; struct extent_buffer *eb; -- cgit v1.2.3 From 2ab28f322f9896782da904f5942f3873432addc8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 12 Oct 2012 15:27:49 -0400 Subject: Btrfs: wait on ordered extents at the last possible moment Since we don't actually copy the extent information from the source tree in the fast case, we don't need to wait for the ordered io to be completed in order to fsync; we just need to wait for the io to be completed. So when we're logging our file, just attach all of the ordered extents to the log, and then when the log syncs just wait for IO_DONE on the ordered extents and then write the super.
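As a rough sketch of the waiting side (not the literal patch: it assumes the per-root logged_list[] arrays added in the diff below, plus a matching log_list member on struct btrfs_ordered_extent, the existing BTRFS_ORDERED_IO_DONE flag, and the ordered extent's wait queue):

	/* sketch: at log-sync time, wait only for the data IO of each
	 * ordered extent the fsync attached to the log root */
	static void wait_logged_extents_sketch(struct btrfs_root *log, int index)
	{
		struct btrfs_ordered_extent *ordered;

		list_for_each_entry(ordered, &log->logged_list[index], log_list)
			wait_event(ordered->wait,
				   test_bit(BTRFS_ORDERED_IO_DONE,
					    &ordered->flags));
	}

Only after that wait does the log sync write the super.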
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a762f9137610..1db8a9938829 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1178,9 +1178,13 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, INIT_LIST_HEAD(&root->dirty_list); INIT_LIST_HEAD(&root->root_list); + INIT_LIST_HEAD(&root->logged_list[0]); + INIT_LIST_HEAD(&root->logged_list[1]); spin_lock_init(&root->orphan_lock); spin_lock_init(&root->inode_lock); spin_lock_init(&root->accounting_lock); + spin_lock_init(&root->log_extents_lock[0]); + spin_lock_init(&root->log_extents_lock[1]); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); init_waitqueue_head(&root->log_writer_wait); -- cgit v1.2.3 From a1897fddd28daf6b23d05a30dc2a18836f77f8e3 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 27 Dec 2012 09:01:23 +0000 Subject: Btrfs: record first logical byte in memory This saves us an rbtree search, which may become expensive in a large filesystem. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1db8a9938829..04f98e3ffd90 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2130,6 +2130,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT; + fs_info->first_logical_byte = (u64)-1; extent_io_tree_init(&fs_info->freed_extents[0], fs_info->btree_inode->i_mapping); -- cgit v1.2.3 From e2d845211eda9cf296e8edf6724b3d541f4fbfd5 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 29 Jan 2013 10:09:20 +0000 Subject: Btrfs: use percpu counter for dirty metadata count ->dirty_metadata_bytes is accessed very frequently, so use a percpu counter instead of the plain u64 variable to reduce the contention on the lock. This patch also fixes a problem where we accessed it without lock protection in __btrfs_btree_balance_dirty(), which could cause us to skip flushing dirty pages.
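Roughly, the pattern the patch switches to looks like this (a sketch using the field names from the diff below; BTRFS_DIRTY_METADATA_THRESH is assumed to keep the old 32MiB threshold):

	#include <linux/percpu_counter.h>

	#define BTRFS_DIRTY_METADATA_THRESH	(32 * 1024 * 1024)

	/* per-cpu add: only folds into the shared count once a CPU has
	 * accumulated 'batch' bytes, so the hot path stays lock-free */
	static void account_dirty_sketch(struct btrfs_fs_info *fs_info, long len)
	{
		__percpu_counter_add(&fs_info->dirty_metadata_bytes, len,
				     fs_info->dirty_metadata_batch);
	}

	/* cheap, slightly stale comparison; good enough for deciding
	 * whether to kick off writeback */
	static bool over_thresh_sketch(struct btrfs_fs_info *fs_info)
	{
		return percpu_counter_compare(&fs_info->dirty_metadata_bytes,
					      BTRFS_DIRTY_METADATA_THRESH) > 0;
	}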
Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 64 +++++++++++++++++++++++++++++------------------------- 1 file changed, 34 insertions(+), 30 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 04f98e3ffd90..34ace168eebc 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -946,18 +946,20 @@ static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct extent_io_tree *tree; + struct btrfs_fs_info *fs_info; + int ret; + tree = &BTRFS_I(mapping->host)->io_tree; if (wbc->sync_mode == WB_SYNC_NONE) { - struct btrfs_root *root = BTRFS_I(mapping->host)->root; - u64 num_dirty; - unsigned long thresh = 32 * 1024 * 1024; if (wbc->for_kupdate) return 0; + fs_info = BTRFS_I(mapping->host)->root->fs_info; /* this is a bit racy, but that's ok */ - num_dirty = root->fs_info->dirty_metadata_bytes; - if (num_dirty < thresh) + ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes, + BTRFS_DIRTY_METADATA_THRESH); + if (ret < 0) return 0; } return btree_write_cache_pages(mapping, wbc); @@ -1125,24 +1127,16 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { + struct btrfs_fs_info *fs_info = root->fs_info; + if (btrfs_header_generation(buf) == - root->fs_info->running_transaction->transid) { + fs_info->running_transaction->transid) { btrfs_assert_tree_locked(buf); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (root->fs_info->dirty_metadata_bytes >= buf->len) - root->fs_info->dirty_metadata_bytes -= buf->len; - else { - spin_unlock(&root->fs_info->delalloc_lock); - btrfs_panic(root->fs_info, -EOVERFLOW, - "Can't clear %lu bytes from " - " dirty_mdatadata_bytes (%llu)", - buf->len, - root->fs_info->dirty_metadata_bytes); - } - spin_unlock(&root->fs_info->delalloc_lock); - + __percpu_counter_add(&fs_info->dirty_metadata_bytes, + -buf->len, + fs_info->dirty_metadata_batch); /* ugh, clear_extent_buffer_dirty needs to lock the page */ btrfs_set_lock_blocking(buf); clear_extent_buffer_dirty(buf); @@ -2008,10 +2002,18 @@ int open_ctree(struct super_block *sb, goto fail_srcu; } + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0); + if (ret) { + err = ret; + goto fail_bdi; + } + fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * + (1 + ilog2(nr_cpu_ids)); + fs_info->btree_inode = new_inode(sb); if (!fs_info->btree_inode) { err = -ENOMEM; - goto fail_bdi; + goto fail_dirty_metadata_bytes; } mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -2266,6 +2268,7 @@ int open_ctree(struct super_block *sb, leafsize = btrfs_super_leafsize(disk_super); sectorsize = btrfs_super_sectorsize(disk_super); stripesize = btrfs_super_stripesize(disk_super); + fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids)); /* * mixed block groups end up with duplicate but slightly offset @@ -2728,6 +2731,8 @@ fail_iput: invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); +fail_dirty_metadata_bytes: + percpu_counter_destroy(&fs_info->dirty_metadata_bytes); fail_bdi: bdi_destroy(&fs_info->bdi); fail_srcu: @@ -3406,6 +3411,7 @@ int close_ctree(struct btrfs_root *root) btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); + percpu_counter_destroy(&fs_info->dirty_metadata_bytes); bdi_destroy(&fs_info->bdi); 
cleanup_srcu_struct(&fs_info->subvol_srcu); @@ -3448,11 +3454,10 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) (unsigned long long)transid, (unsigned long long)root->fs_info->generation); was_dirty = set_extent_buffer_dirty(buf); - if (!was_dirty) { - spin_lock(&root->fs_info->delalloc_lock); - root->fs_info->dirty_metadata_bytes += buf->len; - spin_unlock(&root->fs_info->delalloc_lock); - } + if (!was_dirty) + __percpu_counter_add(&root->fs_info->dirty_metadata_bytes, + buf->len, + root->fs_info->dirty_metadata_batch); } static void __btrfs_btree_balance_dirty(struct btrfs_root *root, @@ -3462,8 +3467,7 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root, * looks as though older kernels can get into trouble with * this code, they end up stuck in balance_dirty_pages forever */ - u64 num_dirty; - unsigned long thresh = 32 * 1024 * 1024; + int ret; if (current->flags & PF_MEMALLOC) return; @@ -3471,9 +3475,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root, if (flush_delayed) btrfs_balance_delayed_items(root); - num_dirty = root->fs_info->dirty_metadata_bytes; - - if (num_dirty > thresh) { + ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes, + BTRFS_DIRTY_METADATA_THRESH); + if (ret > 0) { balance_dirty_pages_ratelimited_nr( root->fs_info->btree_inode->i_mapping, 1); } -- cgit v1.2.3 From 963d678b0f7649300e3a67f2513ca9d830c6e303 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 29 Jan 2013 10:10:51 +0000 Subject: Btrfs: use percpu counter for fs_info->delalloc_bytes fs_info->delalloc_bytes is accessed very frequently, so use a percpu counter instead of the plain u64 variable to reduce the lock contention. This patch also fixes a problem where we accessed the variable without lock protection. At worst, we would not flush the delalloc inodes and would just return an ENOSPC error even though we still had some free space in the fs.
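The read side trades accuracy for cost; a sketch of the two options (percpu_counter_sum() is what the close_ctree() hunk below uses):

	static void delalloc_counts_sketch(struct btrfs_fs_info *fs_info)
	{
		/* hot paths: lock-free approximate read; can lag the true
		 * value by up to batch * nr_cpu_ids bytes */
		s64 approx = percpu_counter_read_positive(&fs_info->delalloc_bytes);

		/* unmount/diagnostics: fold in every CPU's partial count
		 * under the counter's lock for an exact total */
		s64 exact = percpu_counter_sum(&fs_info->delalloc_bytes);

		pr_info("btrfs: delalloc ~%lld exact %lld\n",
			(long long)approx, (long long)exact);
	}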
Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 34ace168eebc..2c9498aefe86 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2010,10 +2010,16 @@ int open_ctree(struct super_block *sb, fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * (1 + ilog2(nr_cpu_ids)); + ret = percpu_counter_init(&fs_info->delalloc_bytes, 0); + if (ret) { + err = ret; + goto fail_dirty_metadata_bytes; + } + fs_info->btree_inode = new_inode(sb); if (!fs_info->btree_inode) { err = -ENOMEM; - goto fail_dirty_metadata_bytes; + goto fail_delalloc_bytes; } mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -2269,6 +2275,7 @@ int open_ctree(struct super_block *sb, sectorsize = btrfs_super_sectorsize(disk_super); stripesize = btrfs_super_stripesize(disk_super); fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids)); + fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); /* * mixed block groups end up with duplicate but slightly offset @@ -2738,6 +2738,8 @@ fail_iput: invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); +fail_delalloc_bytes: + percpu_counter_destroy(&fs_info->delalloc_bytes); fail_dirty_metadata_bytes: percpu_counter_destroy(&fs_info->dirty_metadata_bytes); fail_bdi: @@ -3362,9 +3371,9 @@ int close_ctree(struct btrfs_root *root) btrfs_free_qgroup_config(root->fs_info); - if (fs_info->delalloc_bytes) { - printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", - (unsigned long long)fs_info->delalloc_bytes); + if (percpu_counter_sum(&fs_info->delalloc_bytes)) { + printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n", + percpu_counter_sum(&fs_info->delalloc_bytes)); } free_extent_buffer(fs_info->extent_root->node); @@ -3412,6 +3421,7 @@ int close_ctree(struct btrfs_root *root) btrfs_mapping_tree_free(&fs_info->mapping_tree); percpu_counter_destroy(&fs_info->dirty_metadata_bytes); + percpu_counter_destroy(&fs_info->delalloc_bytes); bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); -- cgit v1.2.3 From df0af1a57f72c74d53a9377c60ff20095afab97d Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 29 Jan 2013 10:11:59 +0000 Subject: Btrfs: use the inode own lock to protect its delalloc_bytes We need not use a global lock to protect the delalloc_bytes of the inode; just use its own lock. In this way, we can reduce the lock contention, and ->delalloc_lock will just protect the delalloc inode list.
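A sketch of the resulting locking split (assuming the existing per-inode spinlock BTRFS_I(inode)->lock and the delalloc list fields; illustrative only):

	static void add_delalloc_sketch(struct btrfs_fs_info *fs_info,
					struct btrfs_inode *inode, u64 len)
	{
		/* the inode's own lock protects its byte count */
		spin_lock(&inode->lock);
		inode->delalloc_bytes += len;
		spin_unlock(&inode->lock);

		/* ->delalloc_lock is left guarding only list membership */
		spin_lock(&fs_info->delalloc_lock);
		if (list_empty(&inode->delalloc_inodes))
			list_add_tail(&inode->delalloc_inodes,
				      &fs_info->delalloc_inodes);
		spin_unlock(&fs_info->delalloc_lock);
	}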
Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2c9498aefe86..11f6dbcb1191 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3690,6 +3690,8 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) delalloc_inodes); list_del_init(&btrfs_inode->delalloc_inodes); + clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &btrfs_inode->runtime_flags); btrfs_invalidate_inodes(btrfs_inode->root); } -- cgit v1.2.3 From de98ced9e743656d108de41841797def0f5cb951 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 29 Jan 2013 10:13:12 +0000 Subject: Btrfs: use seqlock to protect fs_info->avail_{data, metadata, system}_alloc_bits There is no lock to protect fs_info->avail_{data, metadata, system}_alloc_bits; that may introduce problems such as wrong profile information, so we add a seqlock to protect them. Signed-off-by: Zhao Lei Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 11f6dbcb1191..00b6742fdde7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2040,6 +2040,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->tree_mod_seq_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); + seqlock_init(&fs_info->profiles_lock); init_completion(&fs_info->kobj_unregister); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); -- cgit v1.2.3 From 87533c475187c1420794a2e164bc67a7974f1327 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 29 Jan 2013 10:14:48 +0000 Subject: Btrfs: use bit operation for ->fs_state There is no lock to protect fs_info->fs_state; this can introduce problems, such as one task's update being overwritten when several tasks modify the value concurrently. For example:

	Task0 - CPU0:  mov %fs_state rax
	Task0 - CPU0:  or $0x1 rax
	Task1 - CPU1:  mov %fs_state rax
	Task1 - CPU1:  or $0x2 rax
	Task0 - CPU0:  mov rax %fs_state
	Task1 - CPU1:  mov rax %fs_state

The expected value is 3, but in fact, it is 2. Though this problem doesn't happen now (because there is only one flag currently), the code is error prone: if we add other flags, the above problem is certain to happen. Now we use bit operations on it to fix the above problem. This makes the code more robust and makes it easy to add new flags. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 00b6742fdde7..8e58a1f90546 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2201,7 +2201,8 @@ int open_ctree(struct super_block *sb, goto fail_alloc; /* check FS state, whether FS is broken.
*/ - fs_info->fs_state |= btrfs_super_flags(disk_super); + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) + set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); if (ret) { @@ -3359,7 +3360,7 @@ int close_ctree(struct btrfs_root *root) printk(KERN_ERR "btrfs: commit super ret %d\n", ret); } - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) btrfs_error_commit_super(root); btrfs_put_block_group_cache(fs_info); -- cgit v1.2.3 From d1d3cd27a32b819a0b0e2b9b6b5f9e80890f9821 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 31 Jan 2013 00:54:53 +0000 Subject: btrfs: list_entry can't return NULL No need to test the result; we can't get a null pointer from list_entry(). Signed-off-by: Eric Sandeen Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8e58a1f90546..bd48bf21118e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3846,8 +3846,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) while (!list_empty(&list)) { t = list_entry(list.next, struct btrfs_transaction, list); - if (!t) - break; btrfs_destroy_ordered_operations(root); -- cgit v1.2.3 From eb12db690c7eb0f6593ba5792f5861409e88bc03 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 30 Jan 2013 16:03:59 -0500 Subject: Btrfs: fix freeing delayed ref head while still holding its mutex I hit this error when reproducing a bug that would end in a transaction abort. We take the delayed ref head's mutex to keep anybody from processing it while we're destroying it, but we fail to drop the mutex before we carry on and free the damned thing. Fix this by doing the remove logic for the head ourselves and unlocking the mutex; that way we can avoid use-after-frees or hung tasks waiting on that mutex to come back so they know the delayed ref completed. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index bd48bf21118e..8140cb01951f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3615,11 +3615,11 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, } while ((node = rb_first(&delayed_refs->root)) != NULL) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + struct btrfs_delayed_ref_head *head = NULL; + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); atomic_set(&ref->refs, 1); if (btrfs_delayed_ref_is_head(ref)) { - struct btrfs_delayed_ref_head *head; head = btrfs_delayed_node_to_head(ref); if (!mutex_trylock(&head->mutex)) { @@ -3641,10 +3641,12 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, delayed_refs->num_heads_ready--; list_del_init(&head->cluster); } + ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); delayed_refs->num_entries--; - + if (head) + mutex_unlock(&head->mutex); spin_unlock(&delayed_refs->lock); btrfs_put_delayed_ref(ref); -- cgit v1.2.3 From 779880ef35b60ac82eed1dcaec6db5b34a170df8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 31 Jan 2013 14:30:08 -0500 Subject: Btrfs: fix how we discard outstanding ordered extents on abort When we abort, we've been just freeing up all the ordered extents and hoping for the best.
This results in lots of warnings from various places, warnings from btrfs_destroy_inode() because its ENOSPC accounting isn't fixed. It will also screw up lots of pages that have been set private but never get cleared because the ordered extents are never allowed to be submitted. This patch fixes those warnings. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 33 +++++++-------------------------- 1 file changed, 7 insertions(+), 26 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8140cb01951f..259d7891d10c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3565,35 +3565,16 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root) static void btrfs_destroy_ordered_extents(struct btrfs_root *root) { - struct list_head splice; struct btrfs_ordered_extent *ordered; - struct inode *inode; - - INIT_LIST_HEAD(&splice); spin_lock(&root->fs_info->ordered_extent_lock); - - list_splice_init(&root->fs_info->ordered_extents, &splice); - while (!list_empty(&splice)) { - ordered = list_entry(splice.next, struct btrfs_ordered_extent, - root_extent_list); - - list_del_init(&ordered->root_extent_list); - atomic_inc(&ordered->refs); - - /* the inode may be getting freed (in sys_unlink path). */ - inode = igrab(ordered->inode); - - spin_unlock(&root->fs_info->ordered_extent_lock); - if (inode) - iput(inode); - - atomic_set(&ordered->refs, 1); - btrfs_put_ordered_extent(ordered); - - spin_lock(&root->fs_info->ordered_extent_lock); - } - + /* + * This will just short circuit the ordered completion stuff which will + * make sure the ordered extent gets properly cleaned up. + */ + list_for_each_entry(ordered, &root->fs_info->ordered_extents, + root_extent_list) + set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); spin_unlock(&root->fs_info->ordered_extent_lock); } -- cgit v1.2.3 From 2b8195bb5717729e4e94ab4ad73a543feaafb0a2 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 7 Feb 2013 06:01:35 +0000 Subject: Btrfs: fix the race between bio and btrfs_stop_workers open_ctree() needs to read the metadata to initialize the global information of btrfs. But it may fail after it submits some bios, and then it will jump to the error path. Unfortunately, it doesn't check if there are bios in flight, and just stops all the worker threads. As a result, when the submitted bios end, they cannot find any worker thread to deal with the subsequent work, and then an oops happens: kernel BUG at fs/btrfs/async-thread.c:605! Fix this problem by invoking invalidate_inode_pages2() before we stop the worker threads. This function will wait until the bios end, because it needs to lock the pages that are going to be invalidated, and a page under disk read IO stays locked; invalidate_inode_pages2() has to wait for the end-bio handler to unlock it.
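A sketch of the corrected error-path ordering (names as in the diff below; abridged, since the real path tears down many more worker pools):

	static void teardown_order_sketch(struct btrfs_fs_info *fs_info)
	{
		/* invalidate_inode_pages2() locks every page, and a page
		 * under read IO stays locked until its end_io work, run by
		 * these very workers, unlocks it, so it must come first */
		invalidate_inode_pages2(fs_info->btree_inode->i_mapping);

		/* all bios have now ended; stopping the workers is safe */
		btrfs_stop_workers(&fs_info->endio_workers);
		btrfs_stop_workers(&fs_info->endio_meta_workers);
	}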
Reported-and-Tested-by: Tsutomu Itoh Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 259d7891d10c..9fcce54ecde4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2711,13 +2711,13 @@ fail_cleaner: * kthreads */ filemap_write_and_wait(fs_info->btree_inode->i_mapping); - invalidate_inode_pages2(fs_info->btree_inode->i_mapping); fail_block_groups: btrfs_free_block_groups(fs_info); fail_tree_roots: free_root_pointers(fs_info, 1); + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); fail_sb_buffer: btrfs_stop_workers(&fs_info->generic_worker); @@ -2738,7 +2738,6 @@ fail_alloc: fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); - invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); fail_delalloc_bytes: percpu_counter_destroy(&fs_info->delalloc_bytes); -- cgit v1.2.3 From 569e0f358c0c37f6733702d4a5d2c412860f7169 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 13 Feb 2013 11:09:14 -0500 Subject: Btrfs: place ordered operations on a per transaction list Miao made the ordered operations stuff run async, which introduced a deadlock where we could get somebody (sync) racing in and committing the transaction while a commit was already happening. The new committer would try to flush ordered operations, which would hang waiting for the commit to finish because it is done asynchronously and no longer inherits the caller's trans handle. To fix this we need to make the ordered operations list a per transaction list. We can get new inodes added to the ordered operation list by truncating them and then having another process write to them. So this makes it so that anybody trying to add an ordered operation _must_ start a transaction in order to add itself to the list, which keeps new inodes from getting added to the ordered operations list after we start committing. This should fix the deadlock and also keep us from doing a lot more work than we need to during commit.
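A sketch of the producer side after this change (names follow the diff below; the point is that the caller must already hold a transaction handle):

	static void record_ordered_op_sketch(struct btrfs_trans_handle *trans,
					     struct btrfs_root *root,
					     struct inode *inode)
	{
		spin_lock(&root->fs_info->ordered_extent_lock);
		if (list_empty(&BTRFS_I(inode)->ordered_operations))
			list_add_tail(&BTRFS_I(inode)->ordered_operations,
				      &trans->transaction->ordered_operations);
		spin_unlock(&root->fs_info->ordered_extent_lock);
	}

Because every producer holds an open transaction handle, nothing new can land on the list once the commit has fenced out new handles.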
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9fcce54ecde4..e511d9f78c19 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -56,7 +56,8 @@ static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only); -static void btrfs_destroy_ordered_operations(struct btrfs_root *root); +static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, + struct btrfs_root *root); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_root *root); @@ -2029,7 +2030,6 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); INIT_LIST_HEAD(&fs_info->delalloc_inodes); - INIT_LIST_HEAD(&fs_info->ordered_operations); INIT_LIST_HEAD(&fs_info->caching_block_groups); spin_lock_init(&fs_info->delalloc_lock); spin_lock_init(&fs_info->trans_lock); @@ -3538,7 +3538,8 @@ void btrfs_error_commit_super(struct btrfs_root *root) btrfs_cleanup_transaction(root); } -static void btrfs_destroy_ordered_operations(struct btrfs_root *root) +static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, + struct btrfs_root *root) { struct btrfs_inode *btrfs_inode; struct list_head splice; @@ -3548,7 +3549,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root) mutex_lock(&root->fs_info->ordered_operations_mutex); spin_lock(&root->fs_info->ordered_extent_lock); - list_splice_init(&root->fs_info->ordered_operations, &splice); + list_splice_init(&t->ordered_operations, &splice); while (!list_empty(&splice)) { btrfs_inode = list_entry(splice.next, struct btrfs_inode, ordered_operations); @@ -3829,7 +3830,7 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) while (!list_empty(&list)) { t = list_entry(list.next, struct btrfs_transaction, list); - btrfs_destroy_ordered_operations(root); + btrfs_destroy_ordered_operations(t, root); btrfs_destroy_ordered_extents(root); -- cgit v1.2.3 From cdb4c5748cb3ac533889a6b0b95aa10651e68785 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 20 Feb 2013 00:55:13 +0000 Subject: btrfs: define BTRFS_MAGIC as a u64 value super.magic is an le64 but it's treated as an unterminated string when compared against BTRFS_MAGIC which is defined as a string. Instead define BTRFS_MAGIC as a normal hex value and use endian helpers to compare it to the super's magic. I tested this by mounting an fs made before the change and made sure that it didn't introduce sparse errors. This matches a similar cleanup that is pending in btrfs-progs. David Sterba pointed out that we should fix the kernel side as well :). 
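For reference, a sketch of the resulting check (the constant is just the old magic string "_BHRfS_M" read as a little-endian u64):

	#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */

	static bool magic_ok_sketch(struct btrfs_super_block *disk_super)
	{
		/* byte-swap the constant on big-endian hosts, never the
		 * on-disk data: ->magic stays le64 */
		return disk_super->magic == cpu_to_le64(BTRFS_MAGIC);
	}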
Signed-off-by: Zach Brown Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e511d9f78c19..39ff34a62a24 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2407,8 +2407,7 @@ int open_ctree(struct super_block *sb, sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); - if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, - sizeof(disk_super->magic))) { + if (disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) { printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); goto fail_sb_buffer; } @@ -2815,8 +2814,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) super = (struct btrfs_super_block *)bh->b_data; if (btrfs_super_bytenr(super) != bytenr || - strncmp((char *)(&super->magic), BTRFS_MAGIC, - sizeof(super->magic))) { + super->magic != cpu_to_le64(BTRFS_MAGIC)) { brelse(bh); continue; } -- cgit v1.2.3 From 3321719ed67440bba1b0c5ae19c30d640263ccc8 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 27 Feb 2013 13:28:24 +0000 Subject: Btrfs: fix memory leak of log roots When we abort a transaction while fsyncing, we'll skip the freeing of log roots that is part of committing a transaction, which leads to a memory leak. This adds a 'free log roots' step to putting the super, done when no more users hold references on the log roots, so it's safe and clean. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index eb7c14308521..5031e6dd5938 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3253,6 +3253,11 @@ void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) if (btrfs_root_refs(&root->root_item) == 0) synchronize_srcu(&fs_info->subvol_srcu); + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + btrfs_free_log(NULL, root); + btrfs_free_log_root_tree(NULL, fs_info); + } + __btrfs_remove_free_space_cache(root->free_ino_pinned); __btrfs_remove_free_space_cache(root->free_ino_ctl); free_fs_root(root); -- cgit v1.2.3 From 83c8266acc1d19debbf353a16aabbd892ef99462 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 1 Mar 2013 15:03:00 +0000 Subject: btrfs: try harder to allocate raid56 stripe cache The stripe hash table is large, starting at allocation order 4, and can go as high as order 7 in case lock debugging is turned on and structure padding happens. Observed mount failure: mount: page allocation failure: order:7, mode:0x200050 Pid: 8234, comm: mount Tainted: G W 3.8.0-default+ #267 Call Trace: [] warn_alloc_failed+0xf3/0x140 [] ? __alloc_pages_direct_compact+0x92/0x250 [] __alloc_pages_nodemask+0x733/0x9d0 [] ? cache_alloc_refill+0x3f8/0x840 [] cache_alloc_refill+0x43c/0x840 [] ? is_kernel_percpu_address+0x4b/0x90 [] ? btrfs_alloc_stripe_hash_table+0x5c/0x130 [btrfs] [] kmem_cache_alloc_trace+0x247/0x270 [] btrfs_alloc_stripe_hash_table+0x5c/0x130 [btrfs] [] open_ctree+0xb2f/0x1f90 [btrfs] [] ? string+0x49/0xe0 [] ? vsnprintf+0x443/0x5d0 [] btrfs_mount+0x526/0x600 [btrfs] [] ? cache_alloc_debugcheck_after+0x4c/0x200 [] mount_fs+0x20/0xe0 [] vfs_kern_mount+0x76/0x120 [] do_mount+0x386/0x980 [] ?
strndup_user+0x5b/0x80 [] sys_mount+0x90/0xe0 [] system_call_fastpath+0x16/0x1b Signed-off-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5031e6dd5938..02369a3c162e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2197,7 +2197,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_alloc_stripe_hash_table(fs_info); if (ret) { - err = -ENOMEM; + err = ret; goto fail_alloc; } -- cgit v1.2.3 From aec8030a8745221c8658f2033b22c98528897b13 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 4 Mar 2013 09:44:29 +0000 Subject: Btrfs: fix wrong handle at error path of create_snapshot() when the commit fails There are several bugs in the error path of create_snapshot() when the transaction commit fails. - accessing the freed transaction handle. At the end of the transaction commit, the transaction handle is freed, so we should not access it after the commit. - we were not aware of an error that happened during the snapshot creation if we submitted an async transaction commit. - pending snapshot access vs. pending snapshot free. When something went wrong after we submitted an async transaction commit, the transaction committer would clean up the pending snapshots and free them. But the snapshot creators were not aware of it and would access the freed pending snapshots. This patch fixes the above problems by: - removing the dangerous code that accessed the freed handle - assigning ->error if an error happens during the snapshot creation - making the transaction committer not free the pending snapshots; it just assigns the error number and evicts them before we unblock the transaction.
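A sketch of the creator side after this fix (abridged and illustrative; the error now travels through the pending snapshot instead of the freed handle):

	ret = btrfs_commit_transaction(trans, root->fs_info->extent_root);
	if (ret)
		goto fail;
	/* set by the committer, e.g. -ECANCELED when the pending
	 * snapshot was evicted on abort */
	ret = pending_snapshot->error;
	if (ret)
		goto fail;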
Reported-by: Dan Carpenter Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 02369a3c162e..7d84651e850b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -62,7 +62,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_root *root); -static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t); +static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t); static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root); static int btrfs_destroy_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, @@ -3687,7 +3687,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, return ret; } -static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) +static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t) { struct btrfs_pending_snapshot *snapshot; struct list_head splice; @@ -3700,10 +3700,8 @@ static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) snapshot = list_entry(splice.next, struct btrfs_pending_snapshot, list); - + snapshot->error = -ECANCELED; list_del_init(&snapshot->list); - - kfree(snapshot); } } @@ -3840,6 +3838,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->blocked = 1; wake_up(&root->fs_info->transaction_blocked_wait); + btrfs_evict_pending_snapshots(cur_trans); + cur_trans->blocked = 0; wake_up(&root->fs_info->transaction_wait); @@ -3849,8 +3849,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); - btrfs_destroy_pending_snapshots(cur_trans); - btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages, EXTENT_DIRTY); btrfs_destroy_pinned_extent(root, @@ -3894,6 +3892,8 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) wake_up(&root->fs_info->transaction_blocked_wait); + btrfs_evict_pending_snapshots(t); + t->blocked = 0; smp_mb(); if (waitqueue_active(&root->fs_info->transaction_wait)) @@ -3907,8 +3907,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); - btrfs_destroy_pending_snapshots(t); - btrfs_destroy_delalloc_inodes(root); spin_lock(&root->fs_info->trans_lock); -- cgit v1.2.3