From 64a167011bcabc1e855658387c8a4464b71f3138 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 15 Jul 2009 23:29:37 +0100 Subject: Btrfs: add rw argument to merge_bio_hook() We'll want to merge writes so they can fill a full RAID[56] stripe, but not necessarily reads. Signed-off-by: David Woodhouse Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 67ed24ae86bb..1b98c4ce3c6f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1566,7 +1566,7 @@ static void btrfs_clear_bit_hook(struct inode *inode, * extent_io.c merge_bio_hook, this must check the chunk tree to make sure * we don't create bios that span stripes or chunks */ -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, +int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags) { @@ -1581,7 +1581,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, length = bio->bi_size; map_length = length; - ret = btrfs_map_block(root->fs_info, READ, logical, + ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, NULL, 0); /* Will always return 0 with map_multi == NULL */ BUG_ON(ret < 0); -- cgit v1.2.3 From 53b381b3abeb86f12787a6c40fee9b2f71edc23b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 29 Jan 2013 18:40:14 -0500 Subject: Btrfs: RAID5 and RAID6 This builds on David Woodhouse's original Btrfs raid5/6 implementation. The code has changed quite a bit, blame Chris Mason for any bugs. Read/modify/write is done after the higher levels of the filesystem have prepared a given bio. This means the higher layers are not responsible for building full stripes, and they don't need to query for the topology of the extents that may get allocated during delayed allocation runs. It also means different files can easily share the same stripe. But, it does expose us to incorrect parity if we crash or lose power while doing a read/modify/write cycle. This will be addressed in a later commit. Scrub is unable to repair crc errors on raid5/6 chunks. Discard does not work on raid5/6 (yet) The stripe size is fixed at 64KiB per disk. This will be tunable in a later commit. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1b98c4ce3c6f..6f4e41dca970 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "compat.h" #include "ctree.h" #include "disk-io.h" @@ -6386,19 +6387,24 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, int async_submit = 0; map_length = orig_bio->bi_size; - ret = btrfs_map_block(root->fs_info, READ, start_sector << 9, + ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, &map_length, NULL, 0); if (ret) { bio_put(orig_bio); return -EIO; } - if (map_length >= orig_bio->bi_size) { bio = orig_bio; goto submit; } - async_submit = 1; + /* async crcs make it difficult to collect full stripe writes. */ + if (btrfs_get_alloc_profile(root, 1) & + (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) + async_submit = 0; + else + async_submit = 1; + bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); if (!bio) return -ENOMEM; @@ -6440,7 +6446,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, bio->bi_end_io = btrfs_end_dio_bio; map_length = orig_bio->bi_size; - ret = btrfs_map_block(root->fs_info, READ, + ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, &map_length, NULL, 0); if (ret) { @@ -6583,15 +6589,17 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + ssize_t ret; if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, offset, nr_segs)) return 0; - return __blockdev_direct_IO(rw, iocb, inode, + ret = __blockdev_direct_IO(rw, iocb, inode, BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, 0); + return ret; } #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) -- cgit v1.2.3 From 0e8c36a9fd8169a8b96c2ddc8446894bcd07b6b1 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 19 Dec 2012 06:59:51 +0000 Subject: Btrfs: fix lots of orphan inodes when the space is not enough We're running into having 50-100 orphans left over with xfstests 83 because of ENOSPC when trying to start the transaction for the inode update. But in fact, it makes no sense in updating the inode for the new size while we're deleting the stupid thing. This patch fixes this problem. Reported-by: Josef Bacik Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ca7ace7b7b52..1ea0988b82e1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3904,6 +3904,12 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } + ret = btrfs_commit_inode_delayed_inode(inode); + if (ret) { + btrfs_orphan_del(NULL, inode); + goto no_delete; + } + rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); if (!rsv) { btrfs_orphan_del(NULL, inode); @@ -3941,7 +3947,7 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } - trans = btrfs_start_transaction_lflush(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { btrfs_orphan_del(NULL, inode); btrfs_free_block_rsv(root, rsv); @@ -3955,9 +3961,6 @@ void btrfs_evict_inode(struct inode *inode) break; trans->block_rsv = &root->fs_info->trans_block_rsv; - ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); - btrfs_end_transaction(trans, root); trans = NULL; btrfs_btree_balance_dirty(root); -- cgit v1.2.3 From 4eee4fa4f8ab8c2b2623c22be4c3cb91d525aa57 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 21 Dec 2012 09:17:45 +0000 Subject: Btrfs: use wrapper page_offset Use wrapper page_offset to get byte-offset into filesystem object for page. Signed-off-by: Liu Bo Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1ea0988b82e1..35d152444932 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2062,7 +2062,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state, int mirror) { - size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); + size_t offset = start - page_offset(page); struct inode *inode = page->mapping->host; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; char *kaddr; @@ -6752,8 +6752,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) return; } lock_extent_bits(tree, page_start, page_end, 0, &cached_state); - ordered = btrfs_lookup_ordered_extent(inode, - page_offset(page)); + ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); if (ordered) { /* * IO on this page will never be started, so we need -- cgit v1.2.3 From 2ab28f322f9896782da904f5942f3873432addc8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 12 Oct 2012 15:27:49 -0400 Subject: Btrfs: wait on ordered extents at the last possible moment Since we don't actually copy the extent information from the source tree in the fast case we don't need to wait for ordered io to be completed in order to fsync, we just need to wait for the io to be completed. So when we're logging our file just attach all of the ordered extents to the log, and then when the log syncs just wait for IO_DONE on the ordered extents and then write the super. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 35d152444932..31a871ec48f2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -700,6 +700,8 @@ retry: em->start = async_extent->start; em->len = async_extent->ram_size; em->orig_start = em->start; + em->mod_start = em->start; + em->mod_len = em->len; em->block_start = ins.objectid; em->block_len = ins.offset; @@ -892,6 +894,8 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, em->orig_start = em->start; ram_size = ins.offset; em->len = ins.offset; + em->mod_start = em->start; + em->mod_len = em->len; em->block_start = ins.objectid; em->block_len = ins.offset; @@ -1338,6 +1342,8 @@ out_check: em->block_start = disk_bytenr; em->orig_block_len = disk_num_bytes; em->bdev = root->fs_info->fs_devices->latest_bdev; + em->mod_start = em->start; + em->mod_len = em->len; set_bit(EXTENT_FLAG_PINNED, &em->flags); set_bit(EXTENT_FLAG_FILLING, &em->flags); em->generation = -1; @@ -5966,6 +5972,8 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, em->start = start; em->orig_start = orig_start; + em->mod_start = start; + em->mod_len = len; em->len = len; em->block_len = block_len; em->block_start = block_start; -- cgit v1.2.3 From 51fab693477c39ba82c153d3fb30a4cc6de49874 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 27 Dec 2012 09:01:21 +0000 Subject: Btrfs: use token to avoid times mapping extent buffer The API in tree log code has done sort of changes, and it proves that we can benifit from using token, so do the same thing here. function_graph tracer's timer shows that it costs nearly half time of before(39.788us -> 22.391us). Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 63 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 28 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 31a871ec48f2..280e4d487636 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2715,34 +2715,41 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *item, struct inode *inode) { - btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); - btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); - btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); - btrfs_set_inode_mode(leaf, item, inode->i_mode); - btrfs_set_inode_nlink(leaf, item, inode->i_nlink); - - btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), - inode->i_atime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), - inode->i_atime.tv_nsec); - - btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), - inode->i_mtime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), - inode->i_mtime.tv_nsec); - - btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), - inode->i_ctime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), - inode->i_ctime.tv_nsec); - - btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); - btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); - btrfs_set_inode_sequence(leaf, item, inode->i_version); - btrfs_set_inode_transid(leaf, item, trans->transid); - btrfs_set_inode_rdev(leaf, item, inode->i_rdev); - btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); - btrfs_set_inode_block_group(leaf, item, 0); + struct btrfs_map_token token; + + btrfs_init_map_token(&token); + + btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); + btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); + btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size, + &token); + btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); + btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); + + btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_nsec, &token); + + btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_nsec, &token); + + btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_nsec, &token); + + btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), + &token); + btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, + &token); + btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); + btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); + btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); + btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); + btrfs_set_token_inode_block_group(leaf, item, 0, &token); } /* -- cgit v1.2.3 From 63607cc86ab808e077a895be2d7895dcbf6f4cf3 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 22 Jan 2013 10:50:35 +0000 Subject: Btrfs: traverse and flush the delalloc inodes once btrfs_start_delalloc_inodes() needn't traverse and flush the delalloc inodes repeatedly. It is because we can regard the data that the users write after we start delalloc inodes flush as the one which is after the delalloc inodes flush is done, and we can flush it next time. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 280e4d487636..d1e3a2d15f91 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7614,7 +7614,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) INIT_LIST_HEAD(&works); INIT_LIST_HEAD(&splice); -again: + spin_lock(&root->fs_info->delalloc_lock); list_splice_init(&root->fs_info->delalloc_inodes, &splice); while (!list_empty(&splice)) { @@ -7650,13 +7650,6 @@ again: btrfs_wait_and_free_delalloc_work(work); } - spin_lock(&root->fs_info->delalloc_lock); - if (!list_empty(&root->fs_info->delalloc_inodes)) { - spin_unlock(&root->fs_info->delalloc_lock); - goto again; - } - spin_unlock(&root->fs_info->delalloc_lock); - /* the filemap_flush will queue IO into the worker threads, but * we have to make sure the IO is actually started and that * ordered extents get created before we return -- cgit v1.2.3 From fe5fafbebd7e648aa6cb0a419dc812188b4d4ac8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 24 Jan 2013 13:49:14 -0500 Subject: Revert "Btrfs: fix permissions of empty files not affected by umask" This reverts commit 2794ed013b3551cbae887ea1b93c52aaacb7370d. Wasn't supposed to get used in btrfs_mknod, it was supposed to be in btrfs_create, which was done in commit 9185aa587b7425f8f4520da2e66792f5f3c2b815. Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d1e3a2d15f91..60ec7589900c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5036,12 +5036,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, goto out_unlock; } - err = btrfs_update_inode(trans, root, inode); - if (err) { - drop_inode = 1; - goto out_unlock; - } - /* * If the active LSM wants to access the inode during * d_instantiate it needs these. Smack checks to see -- cgit v1.2.3 From 55e301fd57a6239ec14b91a1cf2e70b3dd135194 Mon Sep 17 00:00:00 2001 From: Filipe Brandenburger Date: Tue, 29 Jan 2013 06:04:50 +0000 Subject: Btrfs: move fs/btrfs/ioctl.h to include/uapi/linux/btrfs.h The header file will then be installed under /usr/include/linux so that userspace applications can refer to Btrfs ioctls by name and use the same structs used internally in the kernel. Signed-off-by: Filipe Brandenburger Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 60ec7589900c..fc8aa8bf80a1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -39,12 +39,12 @@ #include #include #include +#include #include "compat.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "ioctl.h" #include "print-tree.h" #include "ordered-data.h" #include "xattr.h" -- cgit v1.2.3 From 963d678b0f7649300e3a67f2513ca9d830c6e303 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 29 Jan 2013 10:10:51 +0000 Subject: Btrfs: use percpu counter for fs_info->delalloc_bytes fs_info->delalloc_bytes is accessed very frequently, so use percpu counter instead of the u64 variant for it to reduce the lock contention. This patch also fixed the problem that we access the variant without the lock protection.At worst, we would not flush the delalloc inodes, and just return ENOSPC error when we still have some free space in the fs. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fc8aa8bf80a1..24c0b7805fe1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1516,7 +1516,8 @@ static void btrfs_set_bit_hook(struct inode *inode, spin_lock(&root->fs_info->delalloc_lock); BTRFS_I(inode)->delalloc_bytes += len; - root->fs_info->delalloc_bytes += len; + __percpu_counter_add(&root->fs_info->delalloc_bytes, len, + root->fs_info->delalloc_batch); if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) { list_add_tail(&BTRFS_I(inode)->delalloc_inodes, &root->fs_info->delalloc_inodes); @@ -1557,7 +1558,8 @@ static void btrfs_clear_bit_hook(struct inode *inode, btrfs_free_reserved_data_space(inode, len); spin_lock(&root->fs_info->delalloc_lock); - root->fs_info->delalloc_bytes -= len; + __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, + root->fs_info->delalloc_batch); BTRFS_I(inode)->delalloc_bytes -= len; if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && -- cgit v1.2.3 From df0af1a57f72c74d53a9377c60ff20095afab97d Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 29 Jan 2013 10:11:59 +0000 Subject: Btrfs: use the inode own lock to protect its delalloc_bytes We need not use a global lock to protect the delalloc_bytes of the inode, just use its own lock. In this way, we can reduce the lock contention and ->delalloc_lock will just protect delalloc inode list. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 47 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 13 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 24c0b7805fe1..82c7c66f8523 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1514,15 +1514,22 @@ static void btrfs_set_bit_hook(struct inode *inode, spin_unlock(&BTRFS_I(inode)->lock); } - spin_lock(&root->fs_info->delalloc_lock); - BTRFS_I(inode)->delalloc_bytes += len; __percpu_counter_add(&root->fs_info->delalloc_bytes, len, root->fs_info->delalloc_batch); - if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_add_tail(&BTRFS_I(inode)->delalloc_inodes, - &root->fs_info->delalloc_inodes); + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->delalloc_bytes += len; + if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags)) { + spin_lock(&root->fs_info->delalloc_lock); + if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_add_tail(&BTRFS_I(inode)->delalloc_inodes, + &root->fs_info->delalloc_inodes); + set_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags); + } + spin_unlock(&root->fs_info->delalloc_lock); } - spin_unlock(&root->fs_info->delalloc_lock); + spin_unlock(&BTRFS_I(inode)->lock); } } @@ -1557,16 +1564,22 @@ static void btrfs_clear_bit_hook(struct inode *inode, && do_list) btrfs_free_reserved_data_space(inode, len); - spin_lock(&root->fs_info->delalloc_lock); __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, root->fs_info->delalloc_batch); + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->delalloc_bytes -= len; - if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && - !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_del_init(&BTRFS_I(inode)->delalloc_inodes); + test_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags)) { + spin_lock(&root->fs_info->delalloc_lock); + if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_del_init(&BTRFS_I(inode)->delalloc_inodes); + clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags); + } + spin_unlock(&root->fs_info->delalloc_lock); } - spin_unlock(&root->fs_info->delalloc_lock); + spin_unlock(&BTRFS_I(inode)->lock); } } @@ -7326,14 +7339,19 @@ fail: static int btrfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { + u64 delalloc_bytes; struct inode *inode = dentry->d_inode; u32 blocksize = inode->i_sb->s_blocksize; generic_fillattr(inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; stat->blksize = PAGE_CACHE_SIZE; + + spin_lock(&BTRFS_I(inode)->lock); + delalloc_bytes = BTRFS_I(inode)->delalloc_bytes; + spin_unlock(&BTRFS_I(inode)->lock); stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + - ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9; + ALIGN(delalloc_bytes, blocksize)) >> 9; return 0; } @@ -7620,8 +7638,11 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) list_del_init(&binode->delalloc_inodes); inode = igrab(&binode->vfs_inode); - if (!inode) + if (!inode) { + clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &binode->runtime_flags); continue; + } list_add_tail(&binode->delalloc_inodes, &root->fs_info->delalloc_inodes); -- cgit v1.2.3 From 0bec9ef525e33233d7739b71be83bb78746f6e94 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 31 Jan 2013 14:58:00 -0500 Subject: Btrfs: unreserve space if our ordered extent fails to work When a transaction aborts or there's an EIO on an ordered extent or any error really we will not free up the space we reserved for this ordered extent. This results in warnings from the block group cache cleanup in the case of a transaction abort, or leaking space in the case of EIO on an ordered extent. Fix this up by free'ing the reserved space if we have an error at all trying to complete an ordered extent. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 82c7c66f8523..2b48d2d51d50 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2022,11 +2022,23 @@ out: if (trans) btrfs_end_transaction(trans, root); - if (ret) + if (ret) { clear_extent_uptodate(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, NULL, GFP_NOFS); + /* + * If the ordered extent had an IOERR or something else went + * wrong we need to return the space for this ordered extent + * back to the allocator. + */ + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) + btrfs_free_reserved_extent(root, ordered_extent->start, + ordered_extent->disk_len); + } + + /* * This needs to be done to make sure anybody waiting knows we are done * updating everything for this ordered extent. -- cgit v1.2.3 From 925396ecf251432d6d0f703a6cfd0cb9e651d936 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 1 Feb 2013 15:57:47 -0500 Subject: Btrfs: account for orphan inodes properly during cleanup Dave sent me a panic where we were doing the orphan cleanup and panic'ed trying to release our reservation from the orphan block rsv. The reason for this is because our orphan block rsv had been free'd out from underneath us because the transaction commit found that there were no orphan inodes according to its count and decided to free it. This is incorrect so make sure we inc the orphan inodes count so the accounting is all done properly. This would also cause the warning in the orphan commit code normally if you had any orphans to cleanup as they would only decrement the orphan count so you'd get a negative orphan count which could cause problems during runtime. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2b48d2d51d50..2dc59734e071 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2502,6 +2502,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) */ set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &BTRFS_I(inode)->runtime_flags); + atomic_inc(&root->orphan_inodes); /* if we have links, this was a truncate, lets do that */ if (inode->i_nlink) { -- cgit v1.2.3 From 3e04e7f10b68999e0d8321516ea19d9d5b044dee Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 6 Feb 2013 16:49:15 -0500 Subject: Btrfs: handle errors in compression submission path I noticed we would deadlock if we aborted a transaction while doing compressed io. This is because we don't unlock our pages if something goes horribly wrong. To fix this we need to make sure that we call extent_clear_unlock_delalloc in order to unlock all the pages. If we have to cow in the async submission thread we need to make sure to unlock our locked_page as the cow error path will not unlock the locked page as it depends on the caller to unlock that page. With this patch we no longer deadlock on the page lock when we have an aborted transaction. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2dc59734e071..348b7bb920ef 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -608,7 +608,7 @@ static noinline int submit_compressed_extents(struct inode *inode, if (list_empty(&async_cow->extents)) return 0; - +again: while (!list_empty(&async_cow->extents)) { async_extent = list_entry(async_cow->extents.next, struct async_extent, list); @@ -648,6 +648,8 @@ retry: async_extent->ram_size - 1, btrfs_get_extent, WB_SYNC_ALL); + else if (ret) + unlock_page(async_cow->locked_page); kfree(async_extent); cond_resched(); continue; @@ -672,6 +674,7 @@ retry: if (ret) { int i; + for (i = 0; i < async_extent->nr_pages; i++) { WARN_ON(async_extent->pages[i]->mapping); page_cache_release(async_extent->pages[i]); @@ -679,12 +682,10 @@ retry: kfree(async_extent->pages); async_extent->nr_pages = 0; async_extent->pages = NULL; - unlock_extent(io_tree, async_extent->start, - async_extent->start + - async_extent->ram_size - 1); + if (ret == -ENOSPC) goto retry; - goto out_free; /* JDM: Requeue? */ + goto out_free; } /* @@ -696,7 +697,8 @@ retry: async_extent->ram_size - 1, 0); em = alloc_extent_map(); - BUG_ON(!em); /* -ENOMEM */ + if (!em) + goto out_free_reserve; em->start = async_extent->start; em->len = async_extent->ram_size; em->orig_start = em->start; @@ -728,6 +730,9 @@ retry: async_extent->ram_size - 1, 0); } + if (ret) + goto out_free_reserve; + ret = btrfs_add_ordered_extent_compress(inode, async_extent->start, ins.objectid, @@ -735,7 +740,8 @@ retry: ins.offset, BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + goto out_free_reserve; /* * clear dirty, set writeback and unlock the pages. @@ -756,18 +762,30 @@ retry: ins.objectid, ins.offset, async_extent->pages, async_extent->nr_pages); - - BUG_ON(ret); /* -ENOMEM */ alloc_hint = ins.objectid + ins.offset; kfree(async_extent); + if (ret) + goto out; cond_resched(); } ret = 0; out: return ret; +out_free_reserve: + btrfs_free_reserved_extent(root, ins.objectid, ins.offset); out_free: + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + NULL, EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); kfree(async_extent); - goto out; + goto again; } static u64 get_extent_allocation_hint(struct inode *inode, u64 start, -- cgit v1.2.3 From 5d80366e9b5e56b3ffc1923b4995e83bbbf605e3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 7 Feb 2013 16:06:02 -0500 Subject: Btrfs: steal from global reserve if we are cleaning up orphans Sometimes xfstest 83 will fail to remount the scratch device because we've gotten ourselves so full that we cannot cleanup the orphan items. In this case check to see if we're doing the orphan cleanup and if we are allow us to steal our reservation from the global block rsv. With this patch I've not been able to reproduce the failed mount problem. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 348b7bb920ef..cf26778085e0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2218,11 +2218,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) } } -enum btrfs_orphan_cleanup_state { - ORPHAN_CLEANUP_STARTED = 1, - ORPHAN_CLEANUP_DONE = 2, -}; - /* * This is called in transaction commit time. If there are no orphan * files in the subvolume, it removes orphan item and frees block_rsv -- cgit v1.2.3 From 4a7d0f6854c4a4ad1dba00a3b128a32d39b9a742 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 7 Feb 2013 16:27:28 -0500 Subject: Btrfs: cleanup orphan reservation if truncate fails I noticed we were getting lots of warnings with xfstest 83 because we have reservations outstanding. This is because we moved the orphan add outside of the truncate, but we don't actually cleanup our reservation if something fails. This fixes the problem and I no longer see warnings. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cf26778085e0..16925807a9ed 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2538,6 +2538,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) goto out; ret = btrfs_truncate(inode); + if (ret) + btrfs_orphan_del(NULL, inode); } else { nr_unlink++; } -- cgit v1.2.3 From 0934856d4697e63c14056375e26e3bd6e8ebd34b Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 7 Feb 2013 10:12:07 +0000 Subject: Btrfs: fix deadlock due to unsubmitted The deadlock problem happened when running fsstress(a test program in LTP). Steps to reproduce: # mkfs.btrfs -b 100M # mount # /fsstress -p 3 -n 10000000 -d The reason is: btrfs_direct_IO() |->do_direct_IO() |->get_page() |->get_blocks() | |->btrfs_delalloc_resereve_space() | |->btrfs_add_ordered_extent() ------- Add a new ordered extent |->dio_send_cur_page(page0) -------------- We didn't submit bio here |->get_page() |->get_blocks() |->btrfs_delalloc_resereve_space() |->flush_space() |->btrfs_start_ordered_extent() |->wait_event() ---------- Wait the completion of the ordered extent that is mentioned above But because we didn't submit the bio that is mentioned above, the ordered extent can not complete, we would wait for its completion forever. There are two methods which can fix this deadlock problem: 1. submit the bio before we invoke get_blocks() 2. reserve the space before we do dio Though the 1st is the simplest way, we need modify the code of VFS, and it is likely to break contiguous requests, and introduce performance regression for the other filesystems. So we have to choose the 2nd way. Signed-off-by: Miao Xie Cc: Josef Bacik Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 81 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 41 insertions(+), 40 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 16925807a9ed..d11f38d8696c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6059,16 +6059,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, u64 len = bh_result->b_size; struct btrfs_trans_handle *trans; int unlock_bits = EXTENT_LOCKED; - int ret; + int ret = 0; if (create) { - ret = btrfs_delalloc_reserve_space(inode, len); - if (ret) - return ret; + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; - } else { + } else len = min_t(u64, len, root->sectorsize); - } lockstart = start; lockend = start + len - 1; @@ -6080,14 +6079,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create)) return -ENOTBLK; - if (create) { - ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, EXTENT_DELALLOC, NULL, - &cached_state, GFP_NOFS); - if (ret) - goto unlock_err; - } - em = btrfs_get_extent(inode, NULL, 0, start, len, 0); if (IS_ERR(em)) { ret = PTR_ERR(em); @@ -6119,7 +6110,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (!create && (em->block_start == EXTENT_MAP_HOLE || test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { free_extent_map(em); - ret = 0; goto unlock_err; } @@ -6217,6 +6207,11 @@ unlock: */ if (start + len > i_size_read(inode)) i_size_write(inode, start + len); + + ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockstart + len - 1, EXTENT_DELALLOC, NULL, + &cached_state, GFP_NOFS); + BUG_ON(ret); } /* @@ -6225,24 +6220,9 @@ unlock: * aren't using if there is any left over space. */ if (lockstart < lockend) { - if (create && len < lockend - lockstart) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockstart + len - 1, - unlock_bits | EXTENT_DEFRAG, 1, 0, - &cached_state, GFP_NOFS); - /* - * Beside unlock, we also need to cleanup reserved space - * for the left range by attaching EXTENT_DO_ACCOUNTING. - */ - clear_extent_bit(&BTRFS_I(inode)->io_tree, - lockstart + len, lockend, - unlock_bits | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS); - } else { - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, unlock_bits, 1, 0, - &cached_state, GFP_NOFS); - } + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, unlock_bits, 1, 0, + &cached_state, GFP_NOFS); } else { free_extent_state(cached_state); } @@ -6252,9 +6232,6 @@ unlock: return 0; unlock_err: - if (create) - unlock_bits |= EXTENT_DO_ACCOUNTING; - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, unlock_bits, 1, 0, &cached_state, GFP_NOFS); return ret; @@ -6692,15 +6669,39 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + size_t count = 0; + ssize_t ret; if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, offset, nr_segs)) return 0; - return __blockdev_direct_IO(rw, iocb, inode, - BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, - iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, - btrfs_submit_direct, 0); + if (rw & WRITE) { + count = iov_length(iov, nr_segs); + ret = btrfs_delalloc_reserve_space(inode, count); + if (ret) + return ret; + } + + ret = __blockdev_direct_IO(rw, iocb, inode, + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, + iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, + btrfs_submit_direct, 0); + + if (rw & WRITE) { + if (ret < 0 && ret != -EIOCBQUEUED) + btrfs_delalloc_release_space(inode, count); + else if (ret > 0 && (size_t)ret < count) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); + btrfs_delalloc_release_space(inode, + count - (size_t)ret); + } + btrfs_delalloc_release_metadata(inode, 0); + } + + return ret; } #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) -- cgit v1.2.3 From 2e60a51e62185cce48758e596ae7cb2da673b58f Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 8 Feb 2013 07:01:08 +0000 Subject: Btrfs: serialize unlocked dio reads with truncate Currently, we can do unlocked dio reads, but the following race is possible: dio_read_task truncate_task ->btrfs_setattr() ->btrfs_direct_IO ->__blockdev_direct_IO ->btrfs_get_block ->btrfs_truncate() #alloc truncated blocks #to other inode ->submit_io() #INFORMATION LEAK In order to avoid this problem, we must serialize unlocked dio reads with truncate. There are two approaches: - use extent lock to protect the extent that we truncate - use inode_dio_wait() to make sure the truncating task will wait for the read DIO. If we use the 1st one, we will meet the endless truncation problem due to the nonlocked read DIO after we implement the nonlocked write DIO. It is because we still need invoke inode_dio_wait() avoid the race between write DIO and truncation. By that time, we have to introduce btrfs_inode_{block, resume}_nolock_dio() again. That is we have to implement this patch again, so I choose the 2nd way to fix the problem. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d11f38d8696c..c6ee8f1063ff 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3888,6 +3888,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) /* we don't support swapfiles, so vmtruncate shouldn't fail */ truncate_setsize(inode, newsize); + + /* Disable nonlocked read DIO to avoid the end less truncate */ + btrfs_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + btrfs_inode_resume_unlocked_dio(inode); + ret = btrfs_truncate(inode); if (ret && inode->i_nlink) btrfs_orphan_del(NULL, inode); @@ -6670,6 +6676,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; size_t count = 0; + int flags = 0; + bool wakeup = false; ssize_t ret; if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, @@ -6681,13 +6689,22 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, ret = btrfs_delalloc_reserve_space(inode, count); if (ret) return ret; + } else { + atomic_inc(&inode->i_dio_count); + smp_mb__after_atomic_inc(); + if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, + &BTRFS_I(inode)->runtime_flags))) { + inode_dio_done(inode); + flags = DIO_LOCKING | DIO_SKIP_HOLES; + } else { + wakeup = true; + } } ret = __blockdev_direct_IO(rw, iocb, inode, BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, - btrfs_submit_direct, 0); - + btrfs_submit_direct, flags); if (rw & WRITE) { if (ret < 0 && ret != -EIOCBQUEUED) btrfs_delalloc_release_space(inode, count); @@ -6700,6 +6717,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, } btrfs_delalloc_release_metadata(inode, 0); } + if (wakeup) + inode_dio_done(inode); return ret; } -- cgit v1.2.3 From 38851cc19adbfa1def2b47106d8050a80e0a3673 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 8 Feb 2013 07:04:11 +0000 Subject: Btrfs: implement unlocked dio write This idea is from ext4. By this patch, we can make the dio write parallel, and improve the performance. But because we can not update isize without i_mutex, the unlocked dio write just can be done in front of the EOF. We needn't worry about the race between dio write and truncate, because the truncate need wait untill all the dio write end. And we also needn't worry about the race between dio write and punch hole, because we have extent lock to protect our operation. I ran fio to test the performance of this feature. == Hardware == CPU: Intel(R) Core(TM)2 Duo CPU E7500 @ 2.93GHz Mem: 2GB SSD: Intel X25-M 120GB (Test Partition: 60GB) == config file == [global] ioengine=psync direct=1 bs=4k size=32G runtime=60 directory=/mnt/btrfs/ filename=testfile group_reporting thread [file1] numjobs=1 # 2 4 rw=randwrite == result (KBps) == write 1 2 4 lock 24936 24738 24726 nolock 24962 30866 32101 == result (iops) == write 1 2 4 lock 6234 6184 6181 nolock 6240 7716 8025 Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c6ee8f1063ff..fce61991213c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6677,28 +6677,36 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, struct inode *inode = file->f_mapping->host; size_t count = 0; int flags = 0; - bool wakeup = false; + bool wakeup = true; + bool relock = false; ssize_t ret; if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, offset, nr_segs)) return 0; + atomic_inc(&inode->i_dio_count); + smp_mb__after_atomic_inc(); + if (rw & WRITE) { count = iov_length(iov, nr_segs); + /* + * If the write DIO is beyond the EOF, we need update + * the isize, but it is protected by i_mutex. So we can + * not unlock the i_mutex at this case. + */ + if (offset + count <= inode->i_size) { + mutex_unlock(&inode->i_mutex); + relock = true; + } ret = btrfs_delalloc_reserve_space(inode, count); if (ret) - return ret; - } else { - atomic_inc(&inode->i_dio_count); - smp_mb__after_atomic_inc(); - if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, - &BTRFS_I(inode)->runtime_flags))) { - inode_dio_done(inode); - flags = DIO_LOCKING | DIO_SKIP_HOLES; - } else { - wakeup = true; - } + goto out; + } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, + &BTRFS_I(inode)->runtime_flags))) { + inode_dio_done(inode); + flags = DIO_LOCKING | DIO_SKIP_HOLES; + wakeup = false; } ret = __blockdev_direct_IO(rw, iocb, inode, @@ -6717,8 +6725,11 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, } btrfs_delalloc_release_metadata(inode, 0); } +out: if (wakeup) inode_dio_done(inode); + if (relock) + mutex_lock(&inode->i_mutex); return ret; } -- cgit v1.2.3 From fa6ac8765c48a06dfed914e8c8c3a903f9d313a0 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 20 Feb 2013 14:10:23 +0000 Subject: Btrfs: fix cleaner thread not working with inode cache option Right now inode cache inode is treated as the same as space cache inode, ie. keep inode in memory till putting super. But this leads to an awkward situation. If we're going to delete a snapshot/subvolume, btrfs will not actually delete it and return free space, but will add it to dead roots list until the last inode on this snap/subvol being destroyed. Then we'll fetch deleted roots and cleanup them via cleaner thread. So here is the problem, if we enable inode cache option, each snap/subvol has a cached inode which is used to store inode allcation information. And this cache inode will be kept in memory, as the above said. So with inode cache, snap/subvol can only be added into dead roots list during freeing roots stage in umount, so that we can ONLY get space back after another remount(we cleanup dead roots on mount). But the real thing is we'll no more use the snap/subvol if we mark it deleted, so we can safely iput its cache inode when we delete snap/subvol. Another thing is that we need to change the rules of droping inode, we don't keep snap/subvol's cache inode in memory till end so that we can add snap/subvol into dead roots list in time. Reported-by: Mitch Harder Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fce61991213c..b009fb52bd8a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7315,8 +7315,9 @@ int btrfs_drop_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; + /* the snap/subvol tree is on deleting */ if (btrfs_root_refs(&root->root_item) == 0 && - !btrfs_is_free_space_inode(inode)) + root != root->fs_info->tree_root) return 1; else return generic_drop_inode(inode); -- cgit v1.2.3 From 24542bf7ea5e4fdfdb5157ff544c093fa4dcb536 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 16 Nov 2012 00:04:43 +0000 Subject: btrfs: limit fallocate extent reservation to 256MB Very large fallocate requests are cpu bound and result in extents with a repeating pattern of ever decreasing size: $ time fallocate -l 1T file real 0m13.039s ( an excerpt of the extents from btrfs-debug-tree: ) prealloc data disk byte 1536292564992 nr 397312 prealloc data disk byte 1536292962304 nr 196608 prealloc data disk byte 1536293158912 nr 98304 prealloc data disk byte 1536293257216 nr 49152 prealloc data disk byte 1536293306368 nr 24576 prealloc data disk byte 1536293330944 nr 12288 prealloc data disk byte 1536293343232 nr 8192 prealloc data disk byte 1536293351424 nr 4096 prealloc data disk byte 1536293355520 nr 4096 prealloc data disk byte 1536293359616 nr 4096 The excessive cpu use comes from __btrfs_prealloc_file_range() trying to allocate the entire remaining size after each extent is allocated. btrfs_reserve_extent() repeatedly cuts this requested size in half until it gets down to the size that the allocators can return. We limit the problem for now by capping each reservation at 256 meg. The small extents come from a masking bug when decreasing the requested reservation size. The high 32bits are cleared and the remaining low bits might happen to reserve a small size. Fix this by using round_down() which properly casts the mask. After these fixes huge fallocate requests are fast and result in nice large extents: $ time fallocate -l 1T file real 0m0.082s prealloc data disk byte 1112425889792 nr 268435456 prealloc data disk byte 1112694325248 nr 268435456 prealloc data disk byte 1112962760704 nr 268435456 Reported-by: Eric Sandeen Signed-off-by: Zach Brown Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4e6a11c2cfdd..3bc62b181ef8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7894,8 +7894,9 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, } } - ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, - 0, *alloc_hint, &ins, 1); + ret = btrfs_reserve_extent(trans, root, + min(num_bytes, 256ULL * 1024 * 1024), + min_size, 0, *alloc_hint, &ins, 1); if (ret) { if (own_trans) btrfs_end_transaction(trans, root); -- cgit v1.2.3 From 38c227d87c49ad5d173cb5d4374d49acec6a495d Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 29 Jan 2013 03:18:40 +0000 Subject: Btrfs: snapshot-aware defrag This comes from one of btrfs's project ideas, As we defragment files, we break any sharing from other snapshots. The balancing code will preserve the sharing, and defrag needs to grow this as well. Now we're able to fill the blank with this patch, in which we make full use of backref walking stuff. Here is the basic idea, o set the writeback ranges started by defragment with flag EXTENT_DEFRAG o at endio, after we finish updating fs tree, we use backref walking to find all parents of the ranges and re-link them with the new COWed file layout by adding corresponding backrefs. Signed-off-by: Li Zefan Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 654 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 654 insertions(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3bc62b181ef8..4d0aec0cf5d8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -55,6 +55,7 @@ #include "locking.h" #include "free-space-cache.h" #include "inode-map.h" +#include "backref.h" struct btrfs_iget_args { u64 ino; @@ -1932,6 +1933,640 @@ out: return ret; } +/* snapshot-aware defrag */ +struct sa_defrag_extent_backref { + struct rb_node node; + struct old_sa_defrag_extent *old; + u64 root_id; + u64 inum; + u64 file_pos; + u64 extent_offset; + u64 num_bytes; + u64 generation; +}; + +struct old_sa_defrag_extent { + struct list_head list; + struct new_sa_defrag_extent *new; + + u64 extent_offset; + u64 bytenr; + u64 offset; + u64 len; + int count; +}; + +struct new_sa_defrag_extent { + struct rb_root root; + struct list_head head; + struct btrfs_path *path; + struct inode *inode; + u64 file_pos; + u64 len; + u64 bytenr; + u64 disk_len; + u8 compress_type; +}; + +static int backref_comp(struct sa_defrag_extent_backref *b1, + struct sa_defrag_extent_backref *b2) +{ + if (b1->root_id < b2->root_id) + return -1; + else if (b1->root_id > b2->root_id) + return 1; + + if (b1->inum < b2->inum) + return -1; + else if (b1->inum > b2->inum) + return 1; + + if (b1->file_pos < b2->file_pos) + return -1; + else if (b1->file_pos > b2->file_pos) + return 1; + + /* + * [------------------------------] ===> (a range of space) + * |<--->| |<---->| =============> (fs/file tree A) + * |<---------------------------->| ===> (fs/file tree B) + * + * A range of space can refer to two file extents in one tree while + * refer to only one file extent in another tree. + * + * So we may process a disk offset more than one time(two extents in A) + * and locate at the same extent(one extent in B), then insert two same + * backrefs(both refer to the extent in B). + */ + return 0; +} + +static void backref_insert(struct rb_root *root, + struct sa_defrag_extent_backref *backref) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct sa_defrag_extent_backref *entry; + int ret; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct sa_defrag_extent_backref, node); + + ret = backref_comp(backref, entry); + if (ret < 0) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(&backref->node, parent, p); + rb_insert_color(&backref->node, root); +} + +/* + * Note the backref might has changed, and in this case we just return 0. + */ +static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, + void *ctx) +{ + struct btrfs_file_extent_item *extent; + struct btrfs_fs_info *fs_info; + struct old_sa_defrag_extent *old = ctx; + struct new_sa_defrag_extent *new = old->new; + struct btrfs_path *path = new->path; + struct btrfs_key key; + struct btrfs_root *root; + struct sa_defrag_extent_backref *backref; + struct extent_buffer *leaf; + struct inode *inode = new->inode; + int slot; + int ret; + u64 extent_offset; + u64 num_bytes; + + if (BTRFS_I(inode)->root->root_key.objectid == root_id && + inum == btrfs_ino(inode)) + return 0; + + key.objectid = root_id; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + fs_info = BTRFS_I(inode)->root->fs_info; + root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(root)) { + if (PTR_ERR(root) == -ENOENT) + return 0; + WARN_ON(1); + pr_debug("inum=%llu, offset=%llu, root_id=%llu\n", + inum, offset, root_id); + return PTR_ERR(root); + } + + key.objectid = inum; + key.type = BTRFS_EXTENT_DATA_KEY; + if (offset > (u64)-1 << 32) + key.offset = 0; + else + key.offset = offset; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + WARN_ON(1); + return ret; + } + + while (1) { + cond_resched(); + + leaf = path->nodes[0]; + slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + goto out; + } + continue; + } + + path->slots[0]++; + + btrfs_item_key_to_cpu(leaf, &key, slot); + + if (key.objectid > inum) + goto out; + + if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY) + continue; + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + + if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr) + continue; + + extent_offset = btrfs_file_extent_offset(leaf, extent); + if (key.offset - extent_offset != offset) + continue; + + num_bytes = btrfs_file_extent_num_bytes(leaf, extent); + if (extent_offset >= old->extent_offset + old->offset + + old->len || extent_offset + num_bytes <= + old->extent_offset + old->offset) + continue; + + break; + } + + backref = kmalloc(sizeof(*backref), GFP_NOFS); + if (!backref) { + ret = -ENOENT; + goto out; + } + + backref->root_id = root_id; + backref->inum = inum; + backref->file_pos = offset + extent_offset; + backref->num_bytes = num_bytes; + backref->extent_offset = extent_offset; + backref->generation = btrfs_file_extent_generation(leaf, extent); + backref->old = old; + backref_insert(&new->root, backref); + old->count++; +out: + btrfs_release_path(path); + WARN_ON(ret); + return ret; +} + +static noinline bool record_extent_backrefs(struct btrfs_path *path, + struct new_sa_defrag_extent *new) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info; + struct old_sa_defrag_extent *old, *tmp; + int ret; + + new->path = path; + + list_for_each_entry_safe(old, tmp, &new->head, list) { + ret = iterate_inodes_from_logical(old->bytenr, fs_info, + path, record_one_backref, + old); + BUG_ON(ret < 0 && ret != -ENOENT); + + /* no backref to be processed for this extent */ + if (!old->count) { + list_del(&old->list); + kfree(old); + } + } + + if (list_empty(&new->head)) + return false; + + return true; +} + +static int relink_is_mergable(struct extent_buffer *leaf, + struct btrfs_file_extent_item *fi, + u64 disk_bytenr) +{ + if (btrfs_file_extent_disk_bytenr(leaf, fi) != disk_bytenr) + return 0; + + if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) + return 0; + + if (btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + return 0; + + return 1; +} + +/* + * Note the backref might has changed, and in this case we just return 0. + */ +static noinline int relink_extent_backref(struct btrfs_path *path, + struct sa_defrag_extent_backref *prev, + struct sa_defrag_extent_backref *backref) +{ + struct btrfs_file_extent_item *extent; + struct btrfs_file_extent_item *item; + struct btrfs_ordered_extent *ordered; + struct btrfs_trans_handle *trans; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root; + struct btrfs_key key; + struct extent_buffer *leaf; + struct old_sa_defrag_extent *old = backref->old; + struct new_sa_defrag_extent *new = old->new; + struct inode *src_inode = new->inode; + struct inode *inode; + struct extent_state *cached = NULL; + int ret = 0; + u64 start; + u64 len; + u64 lock_start; + u64 lock_end; + bool merge = false; + int index; + + if (prev && prev->root_id == backref->root_id && + prev->inum == backref->inum && + prev->file_pos + prev->num_bytes == backref->file_pos) + merge = true; + + /* step 1: get root */ + key.objectid = backref->root_id; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + fs_info = BTRFS_I(src_inode)->root->fs_info; + index = srcu_read_lock(&fs_info->subvol_srcu); + + root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + if (PTR_ERR(root) == -ENOENT) + return 0; + return PTR_ERR(root); + } + if (btrfs_root_refs(&root->root_item) == 0) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + /* parse ENOENT to 0 */ + return 0; + } + + /* step 2: get inode */ + key.objectid = backref->inum; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + inode = btrfs_iget(fs_info->sb, &key, root, NULL); + if (IS_ERR(inode)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + return 0; + } + + srcu_read_unlock(&fs_info->subvol_srcu, index); + + /* step 3: relink backref */ + lock_start = backref->file_pos; + lock_end = backref->file_pos + backref->num_bytes - 1; + lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, + 0, &cached); + + ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); + if (ordered) { + btrfs_put_ordered_extent(ordered); + goto out_unlock; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_unlock; + } + + key.objectid = backref->inum; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = backref->file_pos; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out_free_path; + } else if (ret > 0) { + ret = 0; + goto out_free_path; + } + + extent = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + + if (btrfs_file_extent_generation(path->nodes[0], extent) != + backref->generation) + goto out_free_path; + + btrfs_release_path(path); + + start = backref->file_pos; + if (backref->extent_offset < old->extent_offset + old->offset) + start += old->extent_offset + old->offset - + backref->extent_offset; + + len = min(backref->extent_offset + backref->num_bytes, + old->extent_offset + old->offset + old->len); + len -= max(backref->extent_offset, old->extent_offset + old->offset); + + ret = btrfs_drop_extents(trans, root, inode, start, + start + len, 1); + if (ret) + goto out_free_path; +again: + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + if (merge) { + struct btrfs_file_extent_item *fi; + u64 extent_len; + struct btrfs_key found_key; + + ret = btrfs_search_slot(trans, root, &key, path, 1, 1); + if (ret < 0) + goto out_free_path; + + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_len = btrfs_file_extent_num_bytes(leaf, fi); + + if (relink_is_mergable(leaf, fi, new->bytenr) && + extent_len + found_key.offset == start) { + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_len + len); + btrfs_mark_buffer_dirty(leaf); + inode_add_bytes(inode, len); + + ret = 1; + goto out_free_path; + } else { + merge = false; + btrfs_release_path(path); + goto again; + } + } + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*extent)); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_free_path; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len); + btrfs_set_file_extent_offset(leaf, item, start - new->file_pos); + btrfs_set_file_extent_num_bytes(leaf, item, len); + btrfs_set_file_extent_ram_bytes(leaf, item, new->len); + btrfs_set_file_extent_generation(leaf, item, trans->transid); + btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_compression(leaf, item, new->compress_type); + btrfs_set_file_extent_encryption(leaf, item, 0); + btrfs_set_file_extent_other_encoding(leaf, item, 0); + + btrfs_mark_buffer_dirty(leaf); + inode_add_bytes(inode, len); + + ret = btrfs_inc_extent_ref(trans, root, new->bytenr, + new->disk_len, 0, + backref->root_id, backref->inum, + new->file_pos, 0); /* start - extent_offset */ + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_free_path; + } + + ret = 1; +out_free_path: + btrfs_release_path(path); + btrfs_end_transaction(trans, root); +out_unlock: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, + &cached, GFP_NOFS); + iput(inode); + return ret; +} + +static void relink_file_extents(struct new_sa_defrag_extent *new) +{ + struct btrfs_path *path; + struct old_sa_defrag_extent *old, *tmp; + struct sa_defrag_extent_backref *backref; + struct sa_defrag_extent_backref *prev = NULL; + struct inode *inode; + struct btrfs_root *root; + struct rb_node *node; + int ret; + + inode = new->inode; + root = BTRFS_I(inode)->root; + + path = btrfs_alloc_path(); + if (!path) + return; + + if (!record_extent_backrefs(path, new)) { + btrfs_free_path(path); + goto out; + } + btrfs_release_path(path); + + while (1) { + node = rb_first(&new->root); + if (!node) + break; + rb_erase(node, &new->root); + + backref = rb_entry(node, struct sa_defrag_extent_backref, node); + + ret = relink_extent_backref(path, prev, backref); + WARN_ON(ret < 0); + + kfree(prev); + + if (ret == 1) + prev = backref; + else + prev = NULL; + cond_resched(); + } + kfree(prev); + + btrfs_free_path(path); + + list_for_each_entry_safe(old, tmp, &new->head, list) { + list_del(&old->list); + kfree(old); + } +out: + atomic_dec(&root->fs_info->defrag_running); + wake_up(&root->fs_info->transaction_wait); + + kfree(new); +} + +static struct new_sa_defrag_extent * +record_old_file_extents(struct inode *inode, + struct btrfs_ordered_extent *ordered) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct btrfs_key key; + struct old_sa_defrag_extent *old, *tmp; + struct new_sa_defrag_extent *new; + int ret; + + new = kmalloc(sizeof(*new), GFP_NOFS); + if (!new) + return NULL; + + new->inode = inode; + new->file_pos = ordered->file_offset; + new->len = ordered->len; + new->bytenr = ordered->start; + new->disk_len = ordered->disk_len; + new->compress_type = ordered->compress_type; + new->root = RB_ROOT; + INIT_LIST_HEAD(&new->head); + + path = btrfs_alloc_path(); + if (!path) + goto out_kfree; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = new->file_pos; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out_free_path; + if (ret > 0 && path->slots[0] > 0) + path->slots[0]--; + + /* find out all the old extents for the file range */ + while (1) { + struct btrfs_file_extent_item *extent; + struct extent_buffer *l; + int slot; + u64 num_bytes; + u64 offset; + u64 end; + u64 disk_bytenr; + u64 extent_offset; + + l = path->nodes[0]; + slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out_free_list; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid != btrfs_ino(inode)) + break; + if (key.type != BTRFS_EXTENT_DATA_KEY) + break; + if (key.offset >= new->file_pos + new->len) + break; + + extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item); + + num_bytes = btrfs_file_extent_num_bytes(l, extent); + if (key.offset + num_bytes < new->file_pos) + goto next; + + disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent); + if (!disk_bytenr) + goto next; + + extent_offset = btrfs_file_extent_offset(l, extent); + + old = kmalloc(sizeof(*old), GFP_NOFS); + if (!old) + goto out_free_list; + + offset = max(new->file_pos, key.offset); + end = min(new->file_pos + new->len, key.offset + num_bytes); + + old->bytenr = disk_bytenr; + old->extent_offset = extent_offset; + old->offset = offset - key.offset; + old->len = end - offset; + old->new = new; + old->count = 0; + list_add_tail(&old->list, &new->head); +next: + path->slots[0]++; + cond_resched(); + } + + btrfs_free_path(path); + atomic_inc(&root->fs_info->defrag_running); + + return new; + +out_free_list: + list_for_each_entry_safe(old, tmp, &new->head, list) { + list_del(&old->list); + kfree(old); + } +out_free_path: + btrfs_free_path(path); +out_kfree: + kfree(new); + return NULL; +} + /* * helper function for btrfs_finish_ordered_io, this * just reads in some of the csum leaves to prime them into ram @@ -1949,6 +2584,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) struct btrfs_trans_handle *trans = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; + struct new_sa_defrag_extent *new = NULL; int compress_type = 0; int ret; bool nolock; @@ -1983,6 +2619,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ordered_extent->file_offset + ordered_extent->len - 1, 0, &cached_state); + ret = test_range_bit(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + EXTENT_DEFRAG, 1, cached_state); + if (ret) { + u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); + if (last_snapshot >= BTRFS_I(inode)->generation) + /* the inode is shared */ + new = record_old_file_extents(inode, ordered_extent); + + clear_extent_bit(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); + } + if (nolock) trans = btrfs_join_transaction_nolock(root); else @@ -2064,6 +2714,10 @@ out: */ btrfs_remove_ordered_extent(inode, ordered_extent); + /* for snapshot-aware defrag */ + if (new) + relink_file_extents(new); + /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ -- cgit v1.2.3 From 172a50497ffaf84d60dff37fbeb03894268fe5c2 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 21 Feb 2013 02:48:22 -0700 Subject: Btrfs: fix wrong outstanding_extents when doing DIO write When running the 083th case of xfstests on the filesystem with "compress-force=lzo", the following WARNINGs were triggered. WARNING: at fs/btrfs/inode.c:7908 WARNING: at fs/btrfs/inode.c:7909 WARNING: at fs/btrfs/inode.c:7911 WARNING: at fs/btrfs/extent-tree.c:4510 WARNING: at fs/btrfs/extent-tree.c:4511 This problem was introduced by the patch "Btrfs: fix deadlock due to unsubmitted". In this patch, there are two bugs which caused the above problem. The 1st one is a off-by-one bug, if the DIO write return 0, it is also a short write, we need release the reserved space for it. But we didn't do it in that patch. Fix it by change "ret > 0" to "ret >= 0". The 2nd one is ->outstanding_extents was increased twice when a short write happened. As we know, ->outstanding_extents is a counter to keep track of the number of extent items we may use duo to delalloc, when we reserve the free space for a delalloc write, we assume that the write will introduce just one extent item, so we increase ->outstanding_extents by 1 at that time. And then we will increase it every time we split the write, it is done at the beginning of btrfs_get_blocks_direct(). So when a short write happens, we needn't increase ->outstanding_extents again. But this patch done. In order to fix the 2nd problem, I re-write the logic for ->outstanding_extents operation. We don't increase it at the beginning of btrfs_get_blocks_direct(), instead, we just increase it when the split actually happens. Reported-by: Mitch Harder Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4d0aec0cf5d8..40d49da5e846 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6708,12 +6708,9 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, int unlock_bits = EXTENT_LOCKED; int ret = 0; - if (create) { - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->lock); + if (create) unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; - } else + else len = min_t(u64, len, root->sectorsize); lockstart = start; @@ -6855,6 +6852,10 @@ unlock: if (start + len > i_size_read(inode)) i_size_write(inode, start + len); + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); + ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockstart + len - 1, EXTENT_DELALLOC, NULL, &cached_state, GFP_NOFS); @@ -7362,14 +7363,11 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, if (rw & WRITE) { if (ret < 0 && ret != -EIOCBQUEUED) btrfs_delalloc_release_space(inode, count); - else if (ret > 0 && (size_t)ret < count) { - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->lock); + else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, count - (size_t)ret); - } - btrfs_delalloc_release_metadata(inode, 0); + else + btrfs_delalloc_release_metadata(inode, 0); } out: if (wakeup) -- cgit v1.2.3 From 496ad9aa8ef448058e36ca7a787c61f2e63f0f54 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 23 Jan 2013 17:07:38 -0500 Subject: new helper: file_inode(file) Signed-off-by: Al Viro --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 16d9e8e191e6..02d946a61ddd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4342,7 +4342,7 @@ unsigned char btrfs_filetype_table[] = { static int btrfs_real_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = file_inode(filp); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_item *item; struct btrfs_dir_item *di; @@ -6737,7 +6737,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = fdentry(vma->vm_file)->d_inode; + struct inode *inode = file_inode(vma->vm_file); struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; -- cgit v1.2.3 From f2bdf9a8f79052e9577936cbe5696c0e232aa0e3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 21 Feb 2013 15:28:28 -0500 Subject: Btrfs: make sure NODATACOW also gets NODATASUM set A user reported hitting the BUG_ON() in btrfs_finished_ordered_io() where we had csums on a NOCOW extent. This can happen if we have NODATACOW set but not NODATASUM set, which can happen in two cases, either we mount with -o nodatacow and then write into preallocated space, or chattr +C a directory and move a file into that directory. Liu has fixed the move case in a different place, but this fixes the mount -o nodatacow case. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 40d49da5e846..14c82cdbb696 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5574,7 +5574,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (btrfs_test_opt(root, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; if (btrfs_test_opt(root, NODATACOW)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM; } insert_inode_hash(inode); -- cgit v1.2.3 From 8c4ce81e911ab6c84e4f75e18d4ceb3fa555c35b Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 25 Feb 2013 04:04:42 +0000 Subject: Btrfs: do not change inode flags in rename Before we forced to change a file's NOCOW and COMPRESS flag due to the parent directory's, but this ends up a bad idea, because it confuses end users a lot about file's NOCOW status, eg. if someone change a file to NOCOW via 'chattr' and then rename it in the current directory which is without NOCOW attribute, the file will lose the NOCOW flag silently. This diables 'change flags in rename', so from now on we'll only inherit flags from the parent directory on creation stage while in other places we can use 'chattr' to set NOCOW or COMPRESS flags. Reported-by: Marios Titas Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 14c82cdbb696..be09654e11b9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8060,29 +8060,6 @@ static int btrfs_getattr(struct vfsmount *mnt, return 0; } -/* - * If a file is moved, it will inherit the cow and compression flags of the new - * directory. - */ -static void fixup_inode_flags(struct inode *dir, struct inode *inode) -{ - struct btrfs_inode *b_dir = BTRFS_I(dir); - struct btrfs_inode *b_inode = BTRFS_I(inode); - - if (b_dir->flags & BTRFS_INODE_NODATACOW) - b_inode->flags |= BTRFS_INODE_NODATACOW; - else - b_inode->flags &= ~BTRFS_INODE_NODATACOW; - - if (b_dir->flags & BTRFS_INODE_COMPRESS) { - b_inode->flags |= BTRFS_INODE_COMPRESS; - b_inode->flags &= ~BTRFS_INODE_NOCOMPRESS; - } else { - b_inode->flags &= ~(BTRFS_INODE_COMPRESS | - BTRFS_INODE_NOCOMPRESS); - } -} - static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { @@ -8248,8 +8225,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, } } - fixup_inode_flags(new_dir, old_inode); - ret = btrfs_add_link(trans, new_dir, old_inode, new_dentry->d_name.name, new_dentry->d_name.len, 0, index); -- cgit v1.2.3 From fda2832febb1928da0625b2c5d15559b29d7e740 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Feb 2013 08:10:22 +0000 Subject: btrfs: cleanup for open-coded alignment Though most of the btrfs codes are using ALIGN macro for page alignment, there are still some codes using open-coded alignment like the following: ------ u64 mask = ((u64)root->stripesize - 1); u64 ret = (val + mask) & ~mask; ------ Or even hidden one: ------ num_bytes = (end - start + blocksize) & ~(blocksize - 1); ------ Sometimes these open-coded alignment is not so easy to understand for newbie like me. This commit changes the open-coded alignment to the ALIGN macro for a better readability. Also there is a previous patch from David Sterba with similar changes, but the patch is for 3.2 kernel and seems not merged. http://www.spinics.net/lists/linux-btrfs/msg12747.html Cc: David Sterba Signed-off-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index be09654e11b9..9ef7a5b1b77e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -233,8 +233,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, u64 isize = i_size_read(inode); u64 actual_end = min(end + 1, isize); u64 inline_len = actual_end - start; - u64 aligned_end = (end + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); + u64 aligned_end = ALIGN(end, root->sectorsize); u64 data_len = inline_len; int ret; @@ -391,7 +390,7 @@ again: * a compressed extent to 128k. */ total_compressed = min(total_compressed, max_uncompressed); - num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = ALIGN(end - start + 1, blocksize); num_bytes = max(blocksize, num_bytes); total_in = 0; ret = 0; @@ -490,15 +489,13 @@ cont: * up to a block size boundary so the allocator does sane * things */ - total_compressed = (total_compressed + blocksize - 1) & - ~(blocksize - 1); + total_compressed = ALIGN(total_compressed, blocksize); /* * one last check to make sure the compression is really a * win, compare the page count read with the blocks on disk */ - total_in = (total_in + PAGE_CACHE_SIZE - 1) & - ~(PAGE_CACHE_SIZE - 1); + total_in = ALIGN(total_in, PAGE_CACHE_SIZE); if (total_compressed >= total_in) { will_compress = 0; } else { @@ -856,7 +853,7 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, BUG_ON(btrfs_is_free_space_inode(inode)); - num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = ALIGN(end - start + 1, blocksize); num_bytes = max(blocksize, num_bytes); disk_num_bytes = num_bytes; @@ -4015,7 +4012,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u64 extent_num_bytes = 0; u64 extent_offset = 0; u64 item_end = 0; - u64 mask = root->sectorsize - 1; u32 found_type = (u8)-1; int found_extent; int del_item; @@ -4039,7 +4035,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, * extent just the way it is. */ if (root->ref_cows || root == root->fs_info->tree_root) - btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0); + btrfs_drop_extent_cache(inode, ALIGN(new_size, + root->sectorsize), (u64)-1, 0); /* * This function is also used to drop the items in the log tree before @@ -4118,10 +4115,9 @@ search_again: if (!del_item) { u64 orig_num_bytes = btrfs_file_extent_num_bytes(leaf, fi); - extent_num_bytes = new_size - - found_key.offset + root->sectorsize - 1; - extent_num_bytes = extent_num_bytes & - ~((u64)root->sectorsize - 1); + extent_num_bytes = ALIGN(new_size - + found_key.offset, + root->sectorsize); btrfs_set_file_extent_num_bytes(leaf, fi, extent_num_bytes); num_dec = (orig_num_bytes - @@ -4357,9 +4353,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - u64 mask = root->sectorsize - 1; - u64 hole_start = (oldsize + mask) & ~mask; - u64 block_end = (size + mask) & ~mask; + u64 hole_start = ALIGN(oldsize, root->sectorsize); + u64 block_end = ALIGN(size, root->sectorsize); u64 last_byte; u64 cur_offset; u64 hole_size; @@ -4392,7 +4387,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) break; } last_byte = min(extent_map_end(em), block_end); - last_byte = (last_byte + mask) & ~mask; + last_byte = ALIGN(last_byte , root->sectorsize); if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { struct extent_map *hole_em; hole_size = last_byte - cur_offset; @@ -6111,8 +6106,7 @@ again: } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size_t size; size = btrfs_file_extent_inline_len(leaf, item); - extent_end = (extent_start + size + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); + extent_end = ALIGN(extent_start + size, root->sectorsize); } if (start >= extent_end) { @@ -6184,8 +6178,7 @@ again: copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, size - extent_offset); em->start = extent_start + extent_offset; - em->len = (copy_size + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); + em->len = ALIGN(copy_size, root->sectorsize); em->orig_block_len = em->len; em->orig_start = em->start; if (compress_type) { -- cgit v1.2.3 From bdc20e67e82cfc4901d3a5a0d79104b0e2296d83 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 28 Feb 2013 13:23:38 -0500 Subject: Btrfs: copy everything if we've created an inline extent I noticed while looking into a tree logging bug that we aren't logging inline extents properly. Since this requires copying and it shouldn't happen too often just force us to copy everything for the inode into the tree log when we have an inline extent. With this patch we have valid data after a crash when we write an inline extent. Thanks, Cc: stable@vger.kernel.org Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9ef7a5b1b77e..ecd9c4cdb0db 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -266,6 +266,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, return 1; } + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); return 0; -- cgit v1.2.3 From 154ea2893002618bc3f9a1e2d8186c65490968b1 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 5 Mar 2013 11:11:26 -0500 Subject: Btrfs: enforce min_bytes parameter during extent allocation Commit 24542bf7ea5e4fdfdb5157ff544c093fa4dcb536 changed preallocation of extents to cap the max size we try to allocate. It's a valid change, but the extent reservation code is also used by balance, and that can't tolerate a smaller extent being allocated. __btrfs_prealloc_file_range already has a min_size parameter, which is used by relocation to request a specific extent size. This commit adds an extra check to enforce that minimum extent size. Signed-off-by: Chris Mason Reported-by: Stefan Behrens --- fs/btrfs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ecd9c4cdb0db..13ab4de0a400 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8502,6 +8502,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, struct btrfs_key ins; u64 cur_offset = start; u64 i_size; + u64 cur_bytes; int ret = 0; bool own_trans = true; @@ -8516,8 +8517,9 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, } } - ret = btrfs_reserve_extent(trans, root, - min(num_bytes, 256ULL * 1024 * 1024), + cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); + cur_bytes = max(cur_bytes, min_size); + ret = btrfs_reserve_extent(trans, root, cur_bytes, min_size, 0, *alloc_hint, &ins, 1); if (ret) { if (own_trans) -- cgit v1.2.3 From a09a0a705dd6c80bc96b5e6f18dc103d4e1a7d63 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 11 Mar 2013 09:20:58 +0000 Subject: Btrfs: get better concurrency for snapshot-aware defrag work Using spinning case instead of blocking will result in better concurrency overall. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 13ab4de0a400..1f26888825e2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2312,6 +2312,7 @@ again: key.type = BTRFS_EXTENT_DATA_KEY; key.offset = start; + path->leave_spinning = 1; if (merge) { struct btrfs_file_extent_item *fi; u64 extent_len; @@ -2368,6 +2369,7 @@ again: btrfs_mark_buffer_dirty(leaf); inode_add_bytes(inode, len); + btrfs_release_path(path); ret = btrfs_inc_extent_ref(trans, root, new->bytenr, new->disk_len, 0, @@ -2381,6 +2383,7 @@ again: ret = 1; out_free_path: btrfs_release_path(path); + path->leave_spinning = 0; btrfs_end_transaction(trans, root); out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, -- cgit v1.2.3