From 2c30c71bd653afcbed7f6754e8fe3d16e0e708a1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Nov 2013 12:20:26 -0800 Subject: block: Convert various code to bio_for_each_segment() With immutable biovecs we don't want code accessing bi_io_vec directly - the uses this patch changes weren't incorrect since they all own the bio, but it makes the code harder to audit for no good reason - also, this will help with multipage bvecs later. Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: Alexander Viro Cc: Chris Mason Cc: Jaegeuk Kim Cc: Joern Engel Cc: Prasad Joshi Cc: Trond Myklebust --- fs/btrfs/inode.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f1a77449d032..d6630dc130ba 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6779,17 +6779,16 @@ unlock_err: static void btrfs_endio_direct_read(struct bio *bio, int err) { struct btrfs_dio_private *dip = bio->bi_private; - struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; - struct bio_vec *bvec = bio->bi_io_vec; + struct bio_vec *bvec; struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct bio *dio_bio; u32 *csums = (u32 *)dip->csum; - int index = 0; u64 start; + int i; start = dip->logical_offset; - do { + bio_for_each_segment_all(bvec, bio, i) { if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { struct page *page = bvec->bv_page; char *kaddr; @@ -6805,18 +6804,16 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) local_irq_restore(flags); flush_dcache_page(bvec->bv_page); - if (csum != csums[index]) { + if (csum != csums[i]) { btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", btrfs_ino(inode), start, csum, - csums[index]); + csums[i]); err = -EIO; } } start += bvec->bv_len; - bvec++; - index++; - } while (bvec <= bvec_end); + } unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, 
dip->logical_offset + dip->bytes - 1); -- cgit v1.2.3 From 4f024f3797c43cb4b73cd2c50cec728842d0e49e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 11 Oct 2013 15:44:27 -0700 Subject: block: Abstract out bvec iterator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Immutable biovecs are going to require an explicit iterator. To implement immutable bvecs, a later patch is going to add a bi_bvec_done member to this struct; for now, this patch effectively just renames things. Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: Geert Uytterhoeven Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "Ed L. Cashin" Cc: Nick Piggin Cc: Lars Ellenberg Cc: Jiri Kosina Cc: Matthew Wilcox Cc: Geoff Levand Cc: Yehuda Sadeh Cc: Sage Weil Cc: Alex Elder Cc: ceph-devel@vger.kernel.org Cc: Joshua Morris Cc: Philip Kelleher Cc: Rusty Russell Cc: "Michael S. Tsirkin" Cc: Konrad Rzeszutek Wilk Cc: Jeremy Fitzhardinge Cc: Neil Brown Cc: Alasdair Kergon Cc: Mike Snitzer Cc: dm-devel@redhat.com Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: linux390@de.ibm.com Cc: Boaz Harrosh Cc: Benny Halevy Cc: "James E.J. Bottomley" Cc: Greg Kroah-Hartman Cc: "Nicholas A. Bellinger" Cc: Alexander Viro Cc: Chris Mason Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Jaegeuk Kim Cc: Steven Whitehouse Cc: Dave Kleikamp Cc: Joern Engel Cc: Prasad Joshi Cc: Trond Myklebust Cc: KONISHI Ryusuke Cc: Mark Fasheh Cc: Joel Becker Cc: Ben Myers Cc: xfs@oss.sgi.com Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Len Brown Cc: Pavel Machek Cc: "Rafael J. 
Wysocki" Cc: Herton Ronaldo Krzesinski Cc: Ben Hutchings Cc: Andrew Morton Cc: Guo Chao Cc: Tejun Heo Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Wei Yongjun Cc: "Roger Pau Monné" Cc: Jan Beulich Cc: Stefano Stabellini Cc: Ian Campbell Cc: Sebastian Ott Cc: Christian Borntraeger Cc: Minchan Kim Cc: Jiang Liu Cc: Nitin Gupta Cc: Jerome Marchand Cc: Joe Perches Cc: Peng Tao Cc: Andy Adamson Cc: fanchaoting Cc: Jie Liu Cc: Sunil Mushran Cc: "Martin K. Petersen" Cc: Namjae Jeon Cc: Pankaj Kumar Cc: Dan Magenheimer Cc: Mel Gorman 6 --- fs/btrfs/inode.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d6630dc130ba..7ab0e94ad492 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1577,7 +1577,7 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, unsigned long bio_flags) { struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - u64 logical = (u64)bio->bi_sector << 9; + u64 logical = (u64)bio->bi_iter.bi_sector << 9; u64 length = 0; u64 map_length; int ret; @@ -1585,7 +1585,7 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, if (bio_flags & EXTENT_BIO_COMPRESSED) return 0; - length = bio->bi_size; + length = bio->bi_iter.bi_size; map_length = length; ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, NULL, 0); @@ -6894,7 +6894,8 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu " "sector %#Lx len %u err no %d\n", btrfs_ino(dip->inode), bio->bi_rw, - (unsigned long long)bio->bi_sector, bio->bi_size, err); + (unsigned long long)bio->bi_iter.bi_sector, + bio->bi_iter.bi_size, err); dip->errors = 1; /* @@ -6985,7 +6986,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, struct bio *bio; struct bio *orig_bio = dip->orig_bio; struct bio_vec *bvec = orig_bio->bi_io_vec; - u64 
start_sector = orig_bio->bi_sector; + u64 start_sector = orig_bio->bi_iter.bi_sector; u64 file_offset = dip->logical_offset; u64 submit_len = 0; u64 map_length; @@ -6993,7 +6994,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, int ret = 0; int async_submit = 0; - map_length = orig_bio->bi_size; + map_length = orig_bio->bi_iter.bi_size; ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, &map_length, NULL, 0); if (ret) { @@ -7001,7 +7002,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, return -EIO; } - if (map_length >= orig_bio->bi_size) { + if (map_length >= orig_bio->bi_iter.bi_size) { bio = orig_bio; goto submit; } @@ -7053,7 +7054,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; - map_length = orig_bio->bi_size; + map_length = orig_bio->bi_iter.bi_size; ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, &map_length, NULL, 0); @@ -7111,7 +7112,8 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, if (!skip_sum && !write) { csum_size = btrfs_super_csum_size(root->fs_info->super_copy); - sum_len = dio_bio->bi_size >> inode->i_sb->s_blocksize_bits; + sum_len = dio_bio->bi_iter.bi_size >> + inode->i_sb->s_blocksize_bits; sum_len *= csum_size; } else { sum_len = 0; @@ -7126,8 +7128,8 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, dip->private = dio_bio->bi_private; dip->inode = inode; dip->logical_offset = file_offset; - dip->bytes = dio_bio->bi_size; - dip->disk_bytenr = (u64)dio_bio->bi_sector << 9; + dip->bytes = dio_bio->bi_iter.bi_size; + dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; io_bio->bi_private = dip; dip->errors = 0; dip->orig_bio = io_bio; -- cgit v1.2.3 From dff6efc326a4d5f305797d4a6bba14f374fdd633 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2013 07:17:07 -0800 Subject: fs: fix iversion handling Currently notify_change 
directly updates i_version for size updates, which not only is counter to how all other fields are updated through struct iattr, but also breaks XFS, which needs inode updates to happen under its own lock, and synchronized to the structure that gets written to the log. Remove the update in the common code, and add it to btrfs and ext4; XFS already does a proper update internally and currently gets a double update with the existing code. IMHO this is 3.13 and -stable material and should go in through the XFS tree. Signed-off-by: Christoph Hellwig Reviewed-by: Andreas Dilger Acked-by: Jan Kara Reviewed-by: Dave Chinner Signed-off-by: Chris Mason Signed-off-by: Ben Myers --- fs/btrfs/inode.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f1a77449d032..471a4f7f4044 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4354,8 +4354,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. */ - if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) - inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); + if (newsize != oldsize) { + inode_inc_iversion(inode); + if (!(mask & (ATTR_CTIME | ATTR_MTIME))) + inode->i_ctime = inode->i_mtime = + current_fs_time(inode->i_sb); + } if (newsize > oldsize) { truncate_pagecache(inode, newsize); -- cgit v1.2.3 From 996a710d46418cacb5b4a519ab9341a74066551d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:43 -0800 Subject: btrfs: use generic posix ACL infrastructure Also don't bother to set up a .get_acl method for symlinks as we do not support access control (ACLs or even mode bits) for symlinks in Linux. 
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/btrfs/inode.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f1a77449d032..b1314300d9fc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4464,7 +4464,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) err = btrfs_dirty_inode(inode); if (!err && attr->ia_valid & ATTR_MODE) - err = btrfs_acl_chmod(inode); + err = posix_acl_chmod(inode, inode->i_mode); } return err; @@ -8649,12 +8649,14 @@ static const struct inode_operations btrfs_dir_inode_operations = { .removexattr = btrfs_removexattr, .permission = btrfs_permission, .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_dir_ro_inode_operations = { .lookup = btrfs_lookup, .permission = btrfs_permission, .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; @@ -8724,6 +8726,7 @@ static const struct inode_operations btrfs_file_inode_operations = { .permission = btrfs_permission, .fiemap = btrfs_fiemap, .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_special_inode_operations = { @@ -8735,6 +8738,7 @@ static const struct inode_operations btrfs_special_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_symlink_inode_operations = { @@ -8748,7 +8752,6 @@ static const struct inode_operations btrfs_symlink_inode_operations = { .getxattr = btrfs_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, - .get_acl = btrfs_get_acl, .update_time = btrfs_update_time, }; -- cgit v1.2.3 From 16e7549f045d33b0c5b0ebf19d08439e9221d40c Mon Sep 17 00:00:00 2001 From: 
Josef Bacik Date: Tue, 22 Oct 2013 12:18:51 -0400 Subject: Btrfs: incompatible format change to remove hole extents Btrfs has always had these filler extent data items for holes in inodes. This has made some things very easy, like logging hole punches and sending hole punches. However for large holey files these extent data items are pure overhead. So add an incompatible feature to no longer add hole extents to reduce the amount of metadata used by these sorts of files. This has a few changes for logging and send obviously since they will need to detect holes and log/send the holes if there are any. I've tested this thoroughly with xfstests and it doesn't cause any issues with and without the incompat format set. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 78 ++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 48 insertions(+), 30 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f1a77449d032..c0c0dc8f07fa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4203,6 +4203,49 @@ out: return ret; } +static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, + u64 offset, u64 len) +{ + struct btrfs_trans_handle *trans; + int ret; + + /* + * Still need to make sure the inode looks like it's been updated so + * that any holes get logged if we fsync. + */ + if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) { + BTRFS_I(inode)->last_trans = root->fs_info->generation; + BTRFS_I(inode)->last_sub_trans = root->log_transid; + BTRFS_I(inode)->last_log_commit = root->last_log_commit; + return 0; + } + + /* + * 1 - for the one we're dropping + * 1 - for the one we're adding + * 1 - for updating the inode. 
+ */ + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + return ret; + } + + ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, + 0, 0, len, 0, len, 0, 0, 0); + if (ret) + btrfs_abort_transaction(trans, root, ret); + else + btrfs_update_inode(trans, root, inode); + btrfs_end_transaction(trans, root); + return ret; +} + /* * This function puts in dummy file extents for the area we're creating a hole * for. So if we are truncating this file to a larger size we need to insert @@ -4211,7 +4254,6 @@ out: */ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) { - struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_map *em = NULL; @@ -4266,31 +4308,10 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) struct extent_map *hole_em; hole_size = last_byte - cur_offset; - trans = btrfs_start_transaction(root, 3); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); - break; - } - - err = btrfs_drop_extents(trans, root, inode, - cur_offset, - cur_offset + hole_size, 1); - if (err) { - btrfs_abort_transaction(trans, root, err); - btrfs_end_transaction(trans, root); - break; - } - - err = btrfs_insert_file_extent(trans, root, - btrfs_ino(inode), cur_offset, 0, - 0, hole_size, 0, hole_size, - 0, 0, 0); - if (err) { - btrfs_abort_transaction(trans, root, err); - btrfs_end_transaction(trans, root); + err = maybe_insert_hole(root, inode, cur_offset, + hole_size); + if (err) break; - } - btrfs_drop_extent_cache(inode, cur_offset, cur_offset + hole_size - 1, 0); hole_em = alloc_extent_map(); @@ -4309,7 +4330,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) hole_em->ram_bytes = hole_size; 
hole_em->bdev = root->fs_info->fs_devices->latest_bdev; hole_em->compress_type = BTRFS_COMPRESS_NONE; - hole_em->generation = trans->transid; + hole_em->generation = root->fs_info->generation; while (1) { write_lock(&em_tree->lock); @@ -4322,17 +4343,14 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) hole_size - 1, 0); } free_extent_map(hole_em); -next: - btrfs_update_inode(trans, root, inode); - btrfs_end_transaction(trans, root); } +next: free_extent_map(em); em = NULL; cur_offset = last_byte; if (cur_offset >= block_end) break; } - free_extent_map(em); unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, GFP_NOFS); -- cgit v1.2.3 From 99e22f783bbcd048819975b8a1463f39a9966bcf Mon Sep 17 00:00:00 2001 From: Valentina Giusti Date: Mon, 4 Nov 2013 22:34:22 +0100 Subject: btrfs: remove unused variable from btrfs_new_inode Variable owner in btrfs_new_inode is unused since commit d82a6f1d7e8b61ed5996334d0db66651bb43641d (Btrfs: kill BTRFS_I(inode)->block_group) Signed-off-by: Valentina Giusti Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c0c0dc8f07fa..41079c6ed968 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5372,7 +5372,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, u32 sizes[2]; unsigned long ptr; int ret; - int owner; path = btrfs_alloc_path(); if (!path) @@ -5418,11 +5417,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, */ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - if (S_ISDIR(mode)) - owner = 0; - else - owner = 1; - key[0].objectid = objectid; btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); key[0].offset = 0; -- cgit v1.2.3 From 75ac2dd907013b44edbdec16f8969d14811149c9 Mon Sep 17 00:00:00 2001 From: Kelley Nielsen Date: Mon, 4 Nov 2013 19:35:58 -0800 Subject: 
btrfs: expand btrfs_find_item() to include find_root_ref functionality This patch is the second step in bootstrapping the btrfs_find_item interface. The btrfs_find_root_ref() is similar to the former __inode_info(); it accepts four of its parameters, and duplicates the first half of its functionality. Replace the one former call to btrfs_find_root_ref() with a call to btrfs_find_item(), along with the defined key type that was used internally by btrfs_find_root_ref(), and a null found key. In btrfs_find_item(), add a test for the null key at the place where the functionality of btrfs_find_root_ref() ends; btrfs_find_item() then returns if the test passes. Finally, remove btrfs_find_root_ref(). Signed-off-by: Kelley Nielsen Suggested-by: Zach Brown Reviewed-by: Josh Triplett Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 41079c6ed968..5a5de36d39fc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4673,9 +4673,9 @@ static int fixup_tree_root_location(struct btrfs_root *root, } err = -ENOENT; - ret = btrfs_find_root_ref(root->fs_info->tree_root, path, - BTRFS_I(dir)->root->root_key.objectid, - location->objectid); + ret = btrfs_find_item(root->fs_info->tree_root, path, + BTRFS_I(dir)->root->root_key.objectid, + location->objectid, BTRFS_ROOT_REF_KEY, NULL); if (ret) { if (ret < 0) err = ret; -- cgit v1.2.3 From 131e404a2a54d30f894425ef723f9867a43bff4c Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Tue, 19 Nov 2013 22:29:35 +0000 Subject: Btrfs: fix very slow inode eviction and fs unmount The inode eviction can be very slow, because during eviction we tell the VFS to truncate all of the inode's pages. This results in calls to btrfs_invalidatepage() which in turn does calls to lock_extent_bits() and clear_extent_bit(). 
These calls result in too many merges and splits of extent_state structures, which consume a lot of time and cpu when the inode has many pages. In some scenarios I have experienced umount times higher than 15 minutes, even when there's no pending IO (after a btrfs fs sync). A quick way to reproduce this issue: $ mkfs.btrfs -f /dev/sdb3 $ mount /dev/sdb3 /mnt/btrfs $ cd /mnt/btrfs $ sysbench --test=fileio --file-num=128 --file-total-size=16G \ --file-test-mode=seqwr --num-threads=128 \ --file-block-size=16384 --max-time=60 --max-requests=0 run $ time btrfs fi sync . FSSync '.' real 0m25.457s user 0m0.000s sys 0m0.092s $ cd .. $ time umount /mnt/btrfs real 1m38.234s user 0m0.000s sys 1m25.760s The same test on ext4 runs much faster: $ mkfs.ext4 /dev/sdb3 $ mount /dev/sdb3 /mnt/ext4 $ cd /mnt/ext4 $ sysbench --test=fileio --file-num=128 --file-total-size=16G \ --file-test-mode=seqwr --num-threads=128 \ --file-block-size=16384 --max-time=60 --max-requests=0 run $ sync $ cd .. $ time umount /mnt/ext4 real 0m3.626s user 0m0.004s sys 0m3.012s After this patch, the unmount (inode evictions) is much faster: $ mkfs.btrfs -f /dev/sdb3 $ mount /dev/sdb3 /mnt/btrfs $ cd /mnt/btrfs $ sysbench --test=fileio --file-num=128 --file-total-size=16G \ --file-test-mode=seqwr --num-threads=128 \ --file-block-size=16384 --max-time=60 --max-requests=0 run $ time btrfs fi sync . FSSync '.' real 0m26.774s user 0m0.000s sys 0m0.084s $ cd .. 
$ time umount /mnt/btrfs real 0m1.811s user 0m0.000s sys 0m1.564s Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 84 insertions(+), 14 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5a5de36d39fc..e889779c9b37 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4488,6 +4488,62 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) return err; } +/* + * While truncating the inode pages during eviction, we get the VFS calling + * btrfs_invalidatepage() against each page of the inode. This is slow because + * the calls to btrfs_invalidatepage() result in a huge amount of calls to + * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting + * extent_state structures over and over, wasting lots of time. + * + * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all + * those expensive operations on a per page basis and do only the ordered io + * finishing, while we release here the extent_map and extent_state structures, + * without the excessive merging and splitting. 
+ */ +static void evict_inode_truncate_pages(struct inode *inode) +{ + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree; + struct rb_node *node; + + ASSERT(inode->i_state & I_FREEING); + truncate_inode_pages(&inode->i_data, 0); + + write_lock(&map_tree->lock); + while (!RB_EMPTY_ROOT(&map_tree->map)) { + struct extent_map *em; + + node = rb_first(&map_tree->map); + em = rb_entry(node, struct extent_map, rb_node); + remove_extent_mapping(map_tree, em); + free_extent_map(em); + } + write_unlock(&map_tree->lock); + + spin_lock(&io_tree->lock); + while (!RB_EMPTY_ROOT(&io_tree->state)) { + struct extent_state *state; + struct extent_state *cached_state = NULL; + + node = rb_first(&io_tree->state); + state = rb_entry(node, struct extent_state, rb_node); + atomic_inc(&state->refs); + spin_unlock(&io_tree->lock); + + lock_extent_bits(io_tree, state->start, state->end, + 0, &cached_state); + clear_extent_bit(io_tree, state->start, state->end, + EXTENT_LOCKED | EXTENT_DIRTY | + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 1, 1, + &cached_state, GFP_NOFS); + free_extent_state(state); + + spin_lock(&io_tree->lock); + } + spin_unlock(&io_tree->lock); +} + void btrfs_evict_inode(struct inode *inode) { struct btrfs_trans_handle *trans; @@ -4498,7 +4554,8 @@ void btrfs_evict_inode(struct inode *inode) trace_btrfs_inode_evict(inode); - truncate_inode_pages(&inode->i_data, 0); + evict_inode_truncate_pages(inode); + if (inode->i_nlink && ((btrfs_root_refs(&root->root_item) != 0 && root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || @@ -7379,6 +7436,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + int inode_evicting = inode->i_state & I_FREEING; /* * we have the page locked, so new writeback can't start, @@ -7394,17 +7452,21 @@ static 
void btrfs_invalidatepage(struct page *page, unsigned int offset, btrfs_releasepage(page, GFP_NOFS); return; } - lock_extent_bits(tree, page_start, page_end, 0, &cached_state); - ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); + + if (!inode_evicting) + lock_extent_bits(tree, page_start, page_end, 0, &cached_state); + ordered = btrfs_lookup_ordered_extent(inode, page_start); if (ordered) { /* * IO on this page will never be started, so we need * to account for any ordered extents now */ - clear_extent_bit(tree, page_start, page_end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS); + if (!inode_evicting) + clear_extent_bit(tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 1, 0, &cached_state, + GFP_NOFS); /* * whoever cleared the private bit is responsible * for the finish_ordered_io @@ -7428,14 +7490,22 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, btrfs_finish_ordered_io(ordered); } btrfs_put_ordered_extent(ordered); - cached_state = NULL; - lock_extent_bits(tree, page_start, page_end, 0, &cached_state); + if (!inode_evicting) { + cached_state = NULL; + lock_extent_bits(tree, page_start, page_end, 0, + &cached_state); + } + } + + if (!inode_evicting) { + clear_extent_bit(tree, page_start, page_end, + EXTENT_LOCKED | EXTENT_DIRTY | + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 1, 1, + &cached_state, GFP_NOFS); + + __btrfs_releasepage(page, GFP_NOFS); } - clear_extent_bit(tree, page_start, page_end, - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, - &cached_state, GFP_NOFS); - __btrfs_releasepage(page, GFP_NOFS); ClearPageChecked(page); if (PagePrivate(page)) { -- cgit v1.2.3 From 5662344b3c0d9ddd9afd48716d795166f982d5e2 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Fri, 13 Dec 2013 09:51:42 +0900 Subject: 
Btrfs: fix error check of btrfs_lookup_dentry() Clean up btrfs_lookup_dentry() to never return NULL, but ERR_PTR(-ENOENT) instead. This keeps the return value convention consistent. Callers who use btrfs_lookup_dentry() require a trivial update. create_snapshot() in particular looks like it can also lose a BUG_ON(!inode) which is not really needed - there seems less harm in returning ENOENT to userspace at that point in the stack than there is to crash the machine. Signed-off-by: Tsutomu Itoh Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e889779c9b37..2bd4f7590c83 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4992,7 +4992,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return ERR_PTR(ret); if (location.objectid == 0) - return NULL; + return ERR_PTR(-ENOENT); if (location.type == BTRFS_INODE_ITEM_KEY) { inode = btrfs_iget(dir->i_sb, &location, root, NULL); @@ -5056,10 +5056,17 @@ static void btrfs_dentry_release(struct dentry *dentry) static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - struct dentry *ret; + struct inode *inode; - ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry); - return ret; + inode = btrfs_lookup_dentry(dir, dentry); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -ENOENT) + inode = NULL; + else + return ERR_CAST(inode); + } + + return d_splice_alias(inode, dentry); } unsigned char btrfs_filetype_table[] = { -- cgit v1.2.3 From 663df053309c8d9200b87cc1a129729b8e97eb26 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Sun, 15 Dec 2013 11:39:42 +0800 Subject: Btrfs: remove dead comments for read_csums() Chris introduced the helper function read_csums() and this function has been removed, but we forgot to remove its corresponding comments. 
Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2bd4f7590c83..b3b3142f1734 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2543,12 +2543,6 @@ out_kfree: return NULL; } -/* - * helper function for btrfs_finish_ordered_io, this - * just reads in some of the csum leaves to prime them into ram - * before we start the transaction. It limits the amount of btree - * reads required while inside the transaction. - */ /* as ordered data IO finishes, this gets called so we can finish * an ordered extent if the range of bytes in the file it covers are * fully written. -- cgit v1.2.3 From 180589efde8a01b4a30af273f670ac81c8abf9c5 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Sat, 14 Dec 2013 15:27:31 +0800 Subject: Btrfs: fix a warning when iput a file See the warning below: [ 1209.102076] [] remove_extent_mapping+0x69/0x70 [btrfs] [ 1209.102084] [] btrfs_evict_inode+0x96/0x4d0 [btrfs] [ 1209.102089] [] ? 
wake_atomic_t_function+0x40/0x40 [ 1209.102092] [] evict+0x9e/0x190 [ 1209.102094] [] iput+0xf3/0x180 [ 1209.102101] [] btrfs_run_delayed_iputs+0xb1/0xd0 [btrfs] [ 1209.102107] [] __btrfs_end_transaction+0x268/0x350 [btrfs] clear extent bit here to avoid triggering WARN_ON() in remove_extent_mapping() Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b3b3142f1734..2ccf8e6b1e16 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4509,6 +4509,8 @@ static void evict_inode_truncate_pages(struct inode *inode) node = rb_first(&map_tree->map); em = rb_entry(node, struct extent_map, rb_node); + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); remove_extent_mapping(map_tree, em); free_extent_map(em); } -- cgit v1.2.3 From efe120a067c8674a8ae21b194f0e68f098b61ee2 Mon Sep 17 00:00:00 2001 From: Frank Holton Date: Fri, 20 Dec 2013 11:37:06 -0500 Subject: Btrfs: convert printk to btrfs_ and fix BTRFS prefix Convert all applicable cases of printk and pr_* to the btrfs_* macros. Fix all uses of the BTRFS prefix. 
Signed-off-by: Frank Holton Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2ccf8e6b1e16..06bcf5b53cb0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6966,8 +6966,8 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) struct btrfs_dio_private *dip = bio->bi_private; if (err) { - printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu " - "sector %#Lx len %u err no %d\n", + btrfs_err(BTRFS_I(dip->inode)->root->fs_info, + "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d", btrfs_ino(dip->inode), bio->bi_rw, (unsigned long long)bio->bi_sector, bio->bi_size, err); dip->errors = 1; -- cgit v1.2.3 From 67de11769bd5ec339a62169f500b04f304826c00 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 26 Dec 2013 13:07:06 +0800 Subject: Btrfs: introduce the delayed inode ref deletion for the single link inode The inode reference item is close to inode item, so we insert it simultaneously with the inode item insertion when we create a file/directory. In fact, we can also handle the inode reference deletion in the same way. So we made this patch to introduce the delayed inode reference deletion for the single link inode (in most cases the file has no hard links, so we don't take hard links into account). This function is based on the delayed inode mechanism. After applying this patch, we can reduce the time of the file/directory deletion by ~10%. 
Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 06bcf5b53cb0..9eaa1c8ed385 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3309,6 +3309,7 @@ static void btrfs_read_locked_inode(struct inode *inode) struct btrfs_timespec *tspec; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key location; + unsigned long ptr; int maybe_acls; u32 rdev; int ret; @@ -3332,7 +3333,7 @@ static void btrfs_read_locked_inode(struct inode *inode) leaf = path->nodes[0]; if (filled) - goto cache_acl; + goto cache_index; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); @@ -3375,6 +3376,30 @@ static void btrfs_read_locked_inode(struct inode *inode) BTRFS_I(inode)->index_cnt = (u64)-1; BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); + +cache_index: + path->slots[0]++; + if (inode->i_nlink != 1 || + path->slots[0] >= btrfs_header_nritems(leaf)) + goto cache_acl; + + btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); + if (location.objectid != btrfs_ino(inode)) + goto cache_acl; + + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + if (location.type == BTRFS_INODE_REF_KEY) { + struct btrfs_inode_ref *ref; + + ref = (struct btrfs_inode_ref *)ptr; + BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); + } else if (location.type == BTRFS_INODE_EXTREF_KEY) { + struct btrfs_inode_extref *extref; + + extref = (struct btrfs_inode_extref *)ptr; + BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, + extref); + } cache_acl: /* * try to precache a NULL acl entry for files that don't have @@ -3587,6 +3612,24 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, goto err; btrfs_release_path(path); + /* + * If we don't have dir index, we have to get it by looking up + * the inode ref, since we 
get the inode ref, remove it directly, + * it is unnecessary to do delayed deletion. + * + * But if we have dir index, needn't search inode ref to get it. + * Since the inode ref is close to the inode item, it is better + * that we delay to delete it, and just do this deletion when + * we update the inode item. + */ + if (BTRFS_I(inode)->dir_index) { + ret = btrfs_delayed_delete_inode_ref(inode); + if (!ret) { + index = BTRFS_I(inode)->dir_index; + goto skip_backref; + } + } + ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, dir_ino, &index); if (ret) { @@ -3596,7 +3639,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, root, ret); goto err; } - +skip_backref: ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); if (ret) { btrfs_abort_transaction(trans, root, ret); @@ -5465,6 +5508,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, * number */ BTRFS_I(inode)->index_cnt = 2; + BTRFS_I(inode)->dir_index = *index; BTRFS_I(inode)->root = root; BTRFS_I(inode)->generation = trans->transid; inode->i_generation = BTRFS_I(inode)->generation; @@ -5809,6 +5853,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, goto fail; } + /* There are several dir indexes for this inode, clear the cache. */ + BTRFS_I(inode)->dir_index = 0ULL; inc_nlink(inode); inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; @@ -7861,6 +7907,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->flags = 0; ei->csum_bytes = 0; ei->index_cnt = (u64)-1; + ei->dir_index = 0; ei->last_unlink_trans = 0; ei->last_log_commit = 0; @@ -8148,6 +8195,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (ret) goto out_fail; + BTRFS_I(old_inode)->dir_index = 0ULL; if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { /* force full log commit if subvolume involved. 
*/ root->fs_info->last_trans_log_full_commit = trans->transid; @@ -8236,6 +8284,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_fail; } + if (old_inode->i_nlink == 1) + BTRFS_I(old_inode)->dir_index = index; + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { struct dentry *parent = new_dentry->d_parent; btrfs_log_new_name(trans, old_inode, old_dir, parent); -- cgit v1.2.3 From eb653de15987612444b6cde3b0e67b1edd94625f Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Mon, 23 Dec 2013 11:53:02 +0000 Subject: Btrfs: reduce btree node locking duration on item update If we do a btree search with the goal of updating an existing item without changing its size (ins_len == 0 and cow == 1), then we never need to hold locks on upper level nodes (even when slot == 0) after we COW their child nodes/leaves, as we won't have node splits or merges in this scenario (that is, no key additions, removals or shifts on any nodes or leaves). Therefore release the locks immediately after COWing the child nodes/leaves while navigating the btree, even if their parent slot is 0, instead of returning a path to the caller with those nodes locked, which would get released only when the caller releases or frees the path (or if it calls btrfs_unlock_up_safe). This is a common scenario, for example when updating inode items in fs trees and block group items in the extent tree. The following benchmarks were performed on a quad core machine with 32Gb of ram, using a leaf/node size of 4Kb (to generate deeper fs trees more quickly). 
sysbench --test=fileio --file-num=131072 --file-total-size=8G \ --file-test-mode=seqwr --num-threads=512 --file-block-size=8192 \ --max-requests=100000 --file-io-mode=sync [prepare|run] Before this change: 49.85Mb/s (average of 5 runs) After this change: 50.38Mb/s (average of 5 runs) Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9eaa1c8ed385..8e45fdcdbd8e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3515,7 +3515,6 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, goto failed; } - btrfs_unlock_up_safe(path, 1); leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); -- cgit v1.2.3 From e77751aad1facc4973613a11e2ad98ee4bbb04e1 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 27 Dec 2013 21:11:50 +0800 Subject: Btrfs: fix the wrong nocow range check The following warning message was output when running the 274th case of xfstests with nodatacow option: BUG: Bad page state in process kswapd0 pfn:1c66f page:ffffea0000636848 count:0 mapcount:0 mapping:(null) index:0x78000 page flags: 0x1000000000100a(error|uptodate|private_2) It is because the check of nocow range was wrong, we should compare the start and end position of the extent with the write position to verify if the write position was in the extent, but the current code just used the start position to do the check, so we got the wrong extent and told the caller that it was a nocow write. And then when we write back the dirty pages, we found we should cow the extent, but at that time, there was no space in the fs, we had to set the error flag for the page. When someone reclaimed that page, the above warning was output. Fix it.
Reported-by: Tsutomu Itoh Signed-off-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8e45fdcdbd8e..ffb23e506762 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6503,6 +6503,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, int slot; int found_type; bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -6546,6 +6547,10 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) goto out; + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + if (extent_end <= offset) + goto out; + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); if (disk_bytenr == 0) goto out; @@ -6563,8 +6568,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); } - extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); - if (btrfs_extent_readonly(root, disk_bytenr)) goto out; btrfs_release_path(path); -- cgit v1.2.3 From 1acae57b161ef1282f565ef907f72aeed0eb71d9 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Tue, 7 Jan 2014 11:42:27 +0000 Subject: Btrfs: faster file extent item replace operations When writing to a file we drop existing file extent items that cover the write range and then add a new file extent item that represents that write range. Before this change we were doing a tree lookup to remove the file extent items, and then after we did another tree lookup to insert the new file extent item. Most of the time all the file extent items we need to drop are located within a single leaf - this is the leaf where our new file extent item ends up at. 
Therefore, in this common case just combine these 2 operations into a single one. By avoiding the second btree navigation for insertion of the new file extent item, we reduce btree node/leaf lock acquisitions/releases, btree block/leaf COW operations, CPU time on btree node/leaf key binary searches, etc. Besides for file writes, this is an operation that happens for file fsync's as well. However, log btrees are much less likely to be as big as regular fs btrees, therefore the impact of this change is smaller. The following benchmark was performed against an SSD drive and a HDD drive, both for random and sequential writes: sysbench --test=fileio --file-num=4096 --file-total-size=8G \ --file-test-mode=[rndwr|seqwr] --num-threads=512 \ --file-block-size=8192 \ --max-requests=1000000 \ --file-fsync-freq=0 --file-io-mode=sync [prepare|run] All results below are averages of 10 runs of the respective test. ** SSD sequential writes Before this change: 225.88 Mb/sec After this change: 277.26 Mb/sec ** SSD random writes Before this change: 49.91 Mb/sec After this change: 56.39 Mb/sec ** HDD sequential writes Before this change: 68.53 Mb/sec After this change: 69.87 Mb/sec ** HDD random writes Before this change: 13.04 Mb/sec After this change: 14.39 Mb/sec Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 87 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 55 insertions(+), 32 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ffb23e506762..23f18eb5fb55 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -125,13 +125,12 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, * no overlapping inline items exist in the btree */ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, + struct btrfs_path *path, int extent_inserted, struct btrfs_root *root, struct inode *inode, u64 start, size_t size,
size_t compressed_size, int compress_type, struct page **compressed_pages) { - struct btrfs_key key; - struct btrfs_path *path; struct extent_buffer *leaf; struct page *page = NULL; char *kaddr; @@ -140,29 +139,29 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, int err = 0; int ret; size_t cur_size = size; - size_t datasize; unsigned long offset; if (compressed_size && compressed_pages) cur_size = compressed_size; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + inode_add_bytes(inode, size); - path->leave_spinning = 1; + if (!extent_inserted) { + struct btrfs_key key; + size_t datasize; - key.objectid = btrfs_ino(inode); - key.offset = start; - btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); - datasize = btrfs_file_extent_calc_inline_size(cur_size); + key.objectid = btrfs_ino(inode); + key.offset = start; + btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); - inode_add_bytes(inode, size); - ret = btrfs_insert_empty_item(trans, root, path, &key, - datasize); - if (ret) { - err = ret; - goto fail; + datasize = btrfs_file_extent_calc_inline_size(cur_size); + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + if (ret) { + err = ret; + goto fail; + } } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], @@ -203,7 +202,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, page_cache_release(page); } btrfs_mark_buffer_dirty(leaf); - btrfs_free_path(path); + btrfs_release_path(path); /* * we're an inline extent, so nobody can @@ -219,7 +218,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, return ret; fail: - btrfs_free_path(path); return err; } @@ -242,6 +240,9 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, u64 aligned_end = ALIGN(end, root->sectorsize); u64 data_len = inline_len; int ret; + struct btrfs_path *path; + int extent_inserted = 0; + u32 extent_item_size; if (compressed_size) 
data_len = compressed_size; @@ -256,12 +257,27 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, return 1; } + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) + if (IS_ERR(trans)) { + btrfs_free_path(path); return PTR_ERR(trans); + } trans->block_rsv = &root->fs_info->delalloc_block_rsv; - ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1); + if (compressed_size && compressed_pages) + extent_item_size = btrfs_file_extent_calc_inline_size( + compressed_size); + else + extent_item_size = btrfs_file_extent_calc_inline_size( + inline_len); + + ret = __btrfs_drop_extents(trans, root, inode, path, + start, aligned_end, NULL, + 1, 1, extent_item_size, &extent_inserted); if (ret) { btrfs_abort_transaction(trans, root, ret); goto out; @@ -269,7 +285,8 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, if (isize > actual_end) inline_len = min_t(u64, isize, actual_end); - ret = insert_inline_extent(trans, root, inode, start, + ret = insert_inline_extent(trans, path, extent_inserted, + root, inode, start, inline_len, compressed_size, compress_type, compressed_pages); if (ret && ret != -ENOSPC) { @@ -284,6 +301,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); out: + btrfs_free_path(path); btrfs_end_transaction(trans, root); return ret; } @@ -1841,14 +1859,13 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key ins; + int extent_inserted = 0; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->leave_spinning = 1; - /* * we may be replacing one extent in the tree with another. 
* The new extent is pinned in the extent map, and we don't want @@ -1858,17 +1875,23 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, * the caller is expected to unpin it and allow it to be merged * with the others. */ - ret = btrfs_drop_extents(trans, root, inode, file_pos, - file_pos + num_bytes, 0); + ret = __btrfs_drop_extents(trans, root, inode, path, file_pos, + file_pos + num_bytes, NULL, 0, + 1, sizeof(*fi), &extent_inserted); if (ret) goto out; - ins.objectid = btrfs_ino(inode); - ins.offset = file_pos; - ins.type = BTRFS_EXTENT_DATA_KEY; - ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi)); - if (ret) - goto out; + if (!extent_inserted) { + ins.objectid = btrfs_ino(inode); + ins.offset = file_pos; + ins.type = BTRFS_EXTENT_DATA_KEY; + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &ins, + sizeof(*fi)); + if (ret) + goto out; + } leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); -- cgit v1.2.3 From 63541927c8d11d2686778b1e8ec71c14b4fd53e4 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Tue, 7 Jan 2014 11:47:46 +0000 Subject: Btrfs: add support for inode properties This change adds infrastructure to allow for generic properties for inodes. Properties are name/value pairs that can be associated with inodes for different purposes. They are stored as xattrs with the prefix "btrfs." Properties can be inherited - this means when a directory inode has inheritable properties set, these are added to new inodes created under that directory. Further, subvolumes can also have properties associated with them, and they can be inherited from their parent subvolume. Naturally, directory properties have priority over subvolume properties (in practice a subvolume property is just a regular property associated with the root inode, objectid 256, of the subvolume's fs tree). 
This change also adds one specific property implementation, named "compression", whose values can be "lzo" or "zlib" and it's an inheritable property. The corresponding changes to btrfs-progs were also implemented. A patch with xfstests for this feature will follow once there's agreement on this change/feature. Further, the script at the bottom of this commit message was used to do some benchmarks to measure any performance penalties of this feature. Basically the tests correspond to: Test 1 - create a filesystem and mount it with compress-force=lzo, then sequentially create N files of 64Kb each, measure how long it took to create the files, unmount the filesystem, mount the filesystem and perform an 'ls -lha' against the test directory holding the N files, and report the time the command took. Test 2 - create a filesystem and don't use any compression option when mounting it - instead set the compression property of the subvolume's root to 'lzo'. Then create N files of 64Kb, and report the time it took. Then unmount the filesystem, mount it again and perform an 'ls -lha' like in the former test. This means every single file ends up with a property (xattr) associated to it. Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the compression property, have no real effect other than adding more work when inheriting properties and taking more btree leaf space. Test 4 - same as test 3 but with 10 properties per file. Results (in seconds, and averages of 5 runs each), for different N numbers of files follow.
* Without properties (test 1) file creation time ls -lha time 10 000 files 3.49 0.76 100 000 files 47.19 8.37 1 000 000 files 518.51 107.06 * With 1 property (compression property set to lzo - test 2) file creation time ls -lha time 10 000 files 3.63 0.93 100 000 files 48.56 9.74 1 000 000 files 537.72 125.11 * With 4 properties (test 3) file creation time ls -lha time 10 000 files 3.94 1.20 100 000 files 52.14 11.48 1 000 000 files 572.70 142.13 * With 10 properties (test 4) file creation time ls -lha time 10 000 files 4.61 1.35 100 000 files 58.86 13.83 1 000 000 files 656.01 177.61 The increased latencies with properties are essentially because of: *) When creating an inode, we now synchronously write 1 more item (an xattr item) for each property inherited from the parent dir (or subvolume). This could be done in an asynchronous way such as we do for dir index items (delayed-inode.c), which could help reduce the file creation latency; *) With properties, we now have larger fs trees. For this particular test each xattr item uses 75 bytes of leaf space in the fs tree. This could be less by using a new item for xattr items, instead of the current btrfs_dir_item, since we could cut the 'location' and 'type' fields (saving 18 bytes) and maybe 'transid' too (saving a total of 26 bytes per xattr item) from the btrfs_dir_item type. Also tried batching the xattr insertions (ignoring proper hash collision handling, since it didn't exist) when creating files that inherit properties from their parent inode/subvolume, but the end results were (surprisingly) essentially the same. Test script: $ cat test.pl #!/usr/bin/perl -w use strict; use Time::HiRes qw(time); use constant NUM_FILES => 10_000; use constant FILE_SIZES => (64 * 1024); use constant DEV => '/dev/sdb4'; use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev'; use constant TEST_DIR => (MNT_POINT .
'/testdir'); system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!"; # following line for testing without properties #system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!"; # following 2 lines for testing with properties system("mount", DEV, MNT_POINT) == 0 or die "mount failed!"; system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!"; system("mkdir", TEST_DIR) == 0 or die "mkdir failed!"; my ($t1, $t2); $t1 = time(); for (my $i = 1; $i <= NUM_FILES; $i++) { my $p = TEST_DIR . '/file_' . $i; open(my $f, '>', $p) or die "Error opening file!"; $f->autoflush(1); for (my $j = 0; $j < FILE_SIZES; $j += 4096) { print $f ('A' x 4096) or die "Error writing to file!"; } close($f); } $t2 = time(); print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n"; system("umount", DEV) == 0 or die "umount failed!"; system("mount", DEV, MNT_POINT) == 0 or die "mount failed!"; $t1 = time(); system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!"; $t2 = time(); print "Time to ls -lha all files: " . ($t2 - $t1) . 
" seconds.\n"; system("umount", DEV) == 0 or die "umount failed!"; Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 42 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 23f18eb5fb55..1ea19cea96d0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -58,6 +58,7 @@ #include "inode-map.h" #include "backref.h" #include "hash.h" +#include "props.h" struct btrfs_iget_args { u64 ino; @@ -3265,7 +3266,8 @@ out: * slot is the slot the inode is in, objectid is the objectid of the inode */ static noinline int acls_after_inode_item(struct extent_buffer *leaf, - int slot, u64 objectid) + int slot, u64 objectid, + int *first_xattr_slot) { u32 nritems = btrfs_header_nritems(leaf); struct btrfs_key found_key; @@ -3281,6 +3283,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, } slot++; + *first_xattr_slot = -1; while (slot < nritems) { btrfs_item_key_to_cpu(leaf, &found_key, slot); @@ -3290,6 +3293,8 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, /* we found an xattr, assume we've got an acl */ if (found_key.type == BTRFS_XATTR_ITEM_KEY) { + if (*first_xattr_slot == -1) + *first_xattr_slot = slot; if (found_key.offset == xattr_access || found_key.offset == xattr_default) return 1; @@ -3318,6 +3323,8 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, * something larger than an xattr. 
We have to assume the inode * has acls */ + if (*first_xattr_slot == -1) + *first_xattr_slot = slot; return 1; } @@ -3337,6 +3344,7 @@ static void btrfs_read_locked_inode(struct inode *inode) u32 rdev; int ret; bool filled = false; + int first_xattr_slot; ret = btrfs_fill_inode(inode, &rdev); if (!ret) @@ -3346,7 +3354,6 @@ static void btrfs_read_locked_inode(struct inode *inode) if (!path) goto make_bad; - path->leave_spinning = 1; memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); @@ -3429,12 +3436,21 @@ cache_acl: * any xattrs or acls */ maybe_acls = acls_after_inode_item(leaf, path->slots[0], - btrfs_ino(inode)); + btrfs_ino(inode), &first_xattr_slot); + if (first_xattr_slot != -1) { + path->slots[0] = first_xattr_slot; + ret = btrfs_load_inode_props(inode, path); + if (ret) + btrfs_err(root->fs_info, + "error loading props for ino %llu (root %llu): %d\n", + btrfs_ino(inode), + root->root_key.objectid, ret); + } + btrfs_free_path(path); + if (!maybe_acls) cache_no_acl(inode); - btrfs_free_path(path); - switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &btrfs_aops; @@ -5607,6 +5623,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_update_root_times(trans, root); + ret = btrfs_inode_inherit_props(trans, inode, dir); + if (ret) + btrfs_err(root->fs_info, + "error inheriting props for ino %llu (root %llu): %d", + btrfs_ino(inode), root->root_key.objectid, ret); + return inode; fail: if (dir) @@ -7889,7 +7911,9 @@ out: * create a new subvolume directory/inode (helper for the ioctl). 
*/ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, u64 new_dirid) + struct btrfs_root *new_root, + struct btrfs_root *parent_root, + u64 new_dirid) { struct inode *inode; int err; @@ -7907,6 +7931,12 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, set_nlink(inode, 1); btrfs_i_size_write(inode, 0); + err = btrfs_subvol_inherit_props(trans, new_root, parent_root); + if (err) + btrfs_err(new_root->fs_info, + "error inheriting subvolume %llu properties: %d\n", + new_root->root_key.objectid, err); + err = btrfs_update_inode(trans, new_root, inode); iput(inode); -- cgit v1.2.3 From 2c21b4d733d6e50514e30ffd87110364ddda695b Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Tue, 14 Jan 2014 19:42:20 +0800 Subject: Btrfs: fix transaction abortion when remounting btrfs from RW to RO Steps to reproduce: # mkfs.btrfs -f /dev/sda8 # mount /dev/sda8 /mnt -o flushoncommit # dd if=/dev/zero of=/mnt/data bs=4k count=102400 & # mount /dev/sda8 /mnt -o remount, ro When remounting RW to RO, the logic is to firstly set flag to RO and then commit transaction, however with option flushoncommit enabled, we will do RO check within committing transaction, so we get a transaction abortion here. Actually, the check here is wrong, we should check if FS_STATE_ERROR is set, fix it.
Reported-by: Qu Wenruo Suggested-by: Miao Xie Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1ea19cea96d0..7b61ea3141e5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8477,7 +8477,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) { int ret; - if (root->fs_info->sb->s_flags & MS_RDONLY) + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return -EROFS; ret = __start_delalloc_inodes(root, delay_iput); @@ -8503,7 +8503,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput) struct list_head splice; int ret; - if (fs_info->sb->s_flags & MS_RDONLY) + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; INIT_LIST_HEAD(&splice); -- cgit v1.2.3 From 3c9665df0c5d3f471b07efc32181459386678ebd Mon Sep 17 00:00:00 2001 From: Gui Hecheng Date: Thu, 23 Jan 2014 13:41:09 +0800 Subject: btrfs: fix warning while merging two adjacent extents When we have two adjacent extents in relink_extent_backref, we try to merge them. When we use btrfs_search_slot to locate the slot for the current extent, we shouldn't set "ins_len = 1", because we will merge it into the previous extent rather than insert a new item. Otherwise, we may happen to create a new leaf in btrfs_search_slot and path->slot[0] will be 0. Then we try to fetch the previous item using "path->slots[0]--", and it will cause a warning as follows: [ 145.713385] WARNING: CPU: 3 PID: 1796 at fs/btrfs/extent_io.c:5043 map_private_extent_buffer+0xd4/0xe0 [ 145.713387] btrfs bad mapping eb start 5337088 len 4096, wanted 167772306 8 ... [ 145.713462] [] map_private_extent_buffer+0xd4/0xe0 [ 145.713476] [] ? 
btrfs_free_path+0x2a/0x40 [ 145.713485] [] btrfs_get_token_64+0x64/0xf0 [ 145.713498] [] relink_extent_backref+0x41c/0x820 [ 145.713508] [] btrfs_finish_ordered_io+0x239/0xa80 I encounter this warning when running defrag having mkfs.btrfs with option -M. At the same time there are read/writes & snapshots running in the background. Signed-off-by: Gui Hecheng Reviewed-by: Liu Bo Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7b61ea3141e5..3b6598783be9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2314,7 +2314,7 @@ again: u64 extent_len; struct btrfs_key found_key; - ret = btrfs_search_slot(trans, root, &key, path, 1, 1); + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) goto out_free_path; -- cgit v1.2.3 From 514ac8ad8793a097c0c9d89202c642479d6dfa34 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 3 Jan 2014 21:07:00 -0800 Subject: Btrfs: don't use ram_bytes for uncompressed inline items If we truncate an uncompressed inline item, ram_bytes isn't updated to reflect the new size. The fix uses the size directly from the item header when reading uncompressed inlines, and also fixes truncate to update the size as it goes.
Reported-by: Jens Axboe Signed-off-by: Chris Mason CC: stable@vger.kernel.org --- fs/btrfs/inode.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3b6598783be9..ad961a598c99 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1281,7 +1281,8 @@ next_slot: nocow = 1; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = found_key.offset + - btrfs_file_extent_inline_len(leaf, fi); + btrfs_file_extent_inline_len(leaf, + path->slots[0], fi); extent_end = ALIGN(extent_end, root->sectorsize); } else { BUG_ON(1); @@ -4023,7 +4024,7 @@ search_again: btrfs_file_extent_num_bytes(leaf, fi); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_inline_len(leaf, - fi); + path->slots[0], fi); } item_end--; } @@ -4093,6 +4094,12 @@ search_again: inode_sub_bytes(inode, item_end + 1 - new_size); } + + /* + * update the ram bytes to properly reflect + * the new size of our item + */ + btrfs_set_file_extent_ram_bytes(leaf, fi, size); size = btrfs_file_extent_calc_inline_size(size); btrfs_truncate_item(root, path, size, 1); @@ -6162,7 +6169,7 @@ again: btrfs_file_extent_num_bytes(leaf, item); } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size_t size; - size = btrfs_file_extent_inline_len(leaf, item); + size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); extent_end = ALIGN(extent_start + size, root->sectorsize); } next: @@ -6231,7 +6238,7 @@ next: goto out; } - size = btrfs_file_extent_inline_len(leaf, item); + size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); extent_offset = page_offset(page) + pg_offset - extent_start; copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, size - extent_offset); -- cgit v1.2.3 From 90d3e592e99b8e374ead2b45148abf506493a959 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 9 Jan 2014 17:28:00 -0800 Subject: Btrfs: setup inode location during 
btrfs_init_inode_locked We have a race during inode init because the BTRFS_I(inode)->location is setup after the inode hash table lock is dropped. btrfs_find_actor uses the location field, so our search might not find an existing inode in the hash table if we race with the inode init code. This commit changes things to setup the location field sooner. Also the find actor now uses only the location objectid to match inodes. For inode hashing, we just need a unique and stable test, it doesn't have to reflect the inode numbers we show to userland. Signed-off-by: Chris Mason CC: stable@vger.kernel.org --- fs/btrfs/inode.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ad961a598c99..fb74a536add3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -61,7 +61,7 @@ #include "props.h" struct btrfs_iget_args { - u64 ino; + struct btrfs_key *location; struct btrfs_root *root; }; @@ -4977,7 +4977,9 @@ again: static int btrfs_init_locked_inode(struct inode *inode, void *p) { struct btrfs_iget_args *args = p; - inode->i_ino = args->ino; + inode->i_ino = args->location->objectid; + memcpy(&BTRFS_I(inode)->location, args->location, + sizeof(*args->location)); BTRFS_I(inode)->root = args->root; return 0; } @@ -4985,19 +4987,19 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) static int btrfs_find_actor(struct inode *inode, void *opaque) { struct btrfs_iget_args *args = opaque; - return args->ino == btrfs_ino(inode) && + return args->location->objectid == BTRFS_I(inode)->location.objectid && args->root == BTRFS_I(inode)->root; } static struct inode *btrfs_iget_locked(struct super_block *s, - u64 objectid, + struct btrfs_key *location, struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; - unsigned long hashval = btrfs_inode_hash(objectid, root); + unsigned long hashval = btrfs_inode_hash(location->objectid, root); - args.ino = 
objectid; + args.location = location; args.root = root; inode = iget5_locked(s, hashval, btrfs_find_actor, @@ -5014,13 +5016,11 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, { struct inode *inode; - inode = btrfs_iget_locked(s, location->objectid, root); + inode = btrfs_iget_locked(s, location, root); if (!inode) return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { - BTRFS_I(inode)->root = root; - memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); btrfs_read_locked_inode(inode); if (!is_bad_inode(inode)) { inode_tree_add(inode); -- cgit v1.2.3