From 2c30c71bd653afcbed7f6754e8fe3d16e0e708a1 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Thu, 7 Nov 2013 12:20:26 -0800
Subject: block: Convert various code to bio_for_each_segment()

With immutable biovecs we don't want code accessing bi_io_vec directly -
the uses this patch changes weren't incorrect since they all own the
bio, but it makes the code harder to audit for no good reason - also,
this will help with multipage bvecs later.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Chris Mason <chris.mason@fusionio.com>
Cc: Jaegeuk Kim <jaegeuk.kim@samsung.com>
Cc: Joern Engel <joern@logfs.org>
Cc: Prasad Joshi <prasadjoshi.linux@gmail.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/btrfs/inode.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f1a77449d032..d6630dc130ba 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6779,17 +6779,16 @@ unlock_err:
 static void btrfs_endio_direct_read(struct bio *bio, int err)
 {
 	struct btrfs_dio_private *dip = bio->bi_private;
-	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
-	struct bio_vec *bvec = bio->bi_io_vec;
+	struct bio_vec *bvec;
 	struct inode *inode = dip->inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct bio *dio_bio;
 	u32 *csums = (u32 *)dip->csum;
-	int index = 0;
 	u64 start;
+	int i;
 
 	start = dip->logical_offset;
-	do {
+	bio_for_each_segment_all(bvec, bio, i) {
 		if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
 			struct page *page = bvec->bv_page;
 			char *kaddr;
@@ -6805,18 +6804,16 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
 			local_irq_restore(flags);
 
 			flush_dcache_page(bvec->bv_page);
-			if (csum != csums[index]) {
+			if (csum != csums[i]) {
 				btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
 					  btrfs_ino(inode), start, csum,
-					  csums[index]);
+					  csums[i]);
 				err = -EIO;
 			}
 		}
 
 		start += bvec->bv_len;
-		bvec++;
-		index++;
-	} while (bvec <= bvec_end);
+	}
 
 	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
 		      dip->logical_offset + dip->bytes - 1);
-- 
cgit v1.2.3


From 4f024f3797c43cb4b73cd2c50cec728842d0e49e Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Fri, 11 Oct 2013 15:44:27 -0700
Subject: block: Abstract out bvec iterator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Immutable biovecs are going to require an explicit iterator. To
implement immutable bvecs, a later patch is going to add a bi_bvec_done
member to this struct; for now, this patch effectively just renames
things.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Ed L. Cashin" <ecashin@coraid.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Lars Ellenberg <drbd-dev@lists.linbit.com>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Matthew Wilcox <willy@linux.intel.com>
Cc: Geoff Levand <geoff@infradead.org>
Cc: Yehuda Sadeh <yehuda@inktank.com>
Cc: Sage Weil <sage@inktank.com>
Cc: Alex Elder <elder@inktank.com>
Cc: ceph-devel@vger.kernel.org
Cc: Joshua Morris <josh.h.morris@us.ibm.com>
Cc: Philip Kelleher <pjk1939@linux.vnet.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Neil Brown <neilb@suse.de>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Mike Snitzer <snitzer@redhat.com>
Cc: dm-devel@redhat.com
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: linux390@de.ibm.com
Cc: Boaz Harrosh <bharrosh@panasas.com>
Cc: Benny Halevy <bhalevy@tonian.com>
Cc: "James E.J. Bottomley" <JBottomley@parallels.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Nicholas A. Bellinger" <nab@linux-iscsi.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Chris Mason <chris.mason@fusionio.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Jaegeuk Kim <jaegeuk.kim@samsung.com>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Dave Kleikamp <shaggy@kernel.org>
Cc: Joern Engel <joern@logfs.org>
Cc: Prasad Joshi <prasadjoshi.linux@gmail.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: KONISHI Ryusuke <konishi.ryusuke@lab.ntt.co.jp>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Ben Myers <bpm@sgi.com>
Cc: xfs@oss.sgi.com
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Herton Ronaldo Krzesinski <herton.krzesinski@canonical.com>
Cc: Ben Hutchings <ben@decadent.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Guo Chao <yan@linux.vnet.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Asai Thambi S P <asamymuthupa@micron.com>
Cc: Selvan Mani <smani@micron.com>
Cc: Sam Bradshaw <sbradshaw@micron.com>
Cc: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Cc: "Roger Pau Monné" <roger.pau@citrix.com>
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Cc: Ian Campbell <Ian.Campbell@citrix.com>
Cc: Sebastian Ott <sebott@linux.vnet.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Jiang Liu <jiang.liu@huawei.com>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Jerome Marchand <jmarchand@redhat.com>
Cc: Joe Perches <joe@perches.com>
Cc: Peng Tao <tao.peng@emc.com>
Cc: Andy Adamson <andros@netapp.com>
Cc: fanchaoting <fanchaoting@cn.fujitsu.com>
Cc: Jie Liu <jeff.liu@oracle.com>
Cc: Sunil Mushran <sunil.mushran@gmail.com>
Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
Cc: Namjae Jeon <namjae.jeon@samsung.com>
Cc: Pankaj Kumar <pankaj.km@samsung.com>
Cc: Dan Magenheimer <dan.magenheimer@oracle.com>
Cc: Mel Gorman <mgorman@suse.de>6
---
 fs/btrfs/inode.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d6630dc130ba..7ab0e94ad492 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1577,7 +1577,7 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
 			 unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-	u64 logical = (u64)bio->bi_sector << 9;
+	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
 	int ret;
@@ -1585,7 +1585,7 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
 	if (bio_flags & EXTENT_BIO_COMPRESSED)
 		return 0;
 
-	length = bio->bi_size;
+	length = bio->bi_iter.bi_size;
 	map_length = length;
 	ret = btrfs_map_block(root->fs_info, rw, logical,
 			      &map_length, NULL, 0);
@@ -6894,7 +6894,8 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
 		printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu "
 		      "sector %#Lx len %u err no %d\n",
 		      btrfs_ino(dip->inode), bio->bi_rw,
-		      (unsigned long long)bio->bi_sector, bio->bi_size, err);
+		      (unsigned long long)bio->bi_iter.bi_sector,
+		      bio->bi_iter.bi_size, err);
 		dip->errors = 1;
 
 		/*
@@ -6985,7 +6986,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 	struct bio *bio;
 	struct bio *orig_bio = dip->orig_bio;
 	struct bio_vec *bvec = orig_bio->bi_io_vec;
-	u64 start_sector = orig_bio->bi_sector;
+	u64 start_sector = orig_bio->bi_iter.bi_sector;
 	u64 file_offset = dip->logical_offset;
 	u64 submit_len = 0;
 	u64 map_length;
@@ -6993,7 +6994,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 	int ret = 0;
 	int async_submit = 0;
 
-	map_length = orig_bio->bi_size;
+	map_length = orig_bio->bi_iter.bi_size;
 	ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
 			      &map_length, NULL, 0);
 	if (ret) {
@@ -7001,7 +7002,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 		return -EIO;
 	}
 
-	if (map_length >= orig_bio->bi_size) {
+	if (map_length >= orig_bio->bi_iter.bi_size) {
 		bio = orig_bio;
 		goto submit;
 	}
@@ -7053,7 +7054,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 			bio->bi_private = dip;
 			bio->bi_end_io = btrfs_end_dio_bio;
 
-			map_length = orig_bio->bi_size;
+			map_length = orig_bio->bi_iter.bi_size;
 			ret = btrfs_map_block(root->fs_info, rw,
 					      start_sector << 9,
 					      &map_length, NULL, 0);
@@ -7111,7 +7112,8 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
 
 	if (!skip_sum && !write) {
 		csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
-		sum_len = dio_bio->bi_size >> inode->i_sb->s_blocksize_bits;
+		sum_len = dio_bio->bi_iter.bi_size >>
+			inode->i_sb->s_blocksize_bits;
 		sum_len *= csum_size;
 	} else {
 		sum_len = 0;
@@ -7126,8 +7128,8 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
 	dip->private = dio_bio->bi_private;
 	dip->inode = inode;
 	dip->logical_offset = file_offset;
-	dip->bytes = dio_bio->bi_size;
-	dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
+	dip->bytes = dio_bio->bi_iter.bi_size;
+	dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
 	io_bio->bi_private = dip;
 	dip->errors = 0;
 	dip->orig_bio = io_bio;
-- 
cgit v1.2.3


From dff6efc326a4d5f305797d4a6bba14f374fdd633 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 19 Nov 2013 07:17:07 -0800
Subject: fs: fix iversion handling

Currently notify_change directly updates i_version for size updates,
which not only is counter to how all other fields are updated through
struct iattr, but also breaks XFS, which need inode updates to happen
under its own lock, and synchronized to the structure that gets written
to the log.

Remove the update in the common code, and it to btrfs and ext4,
XFS already does a proper updaste internally and currently gets a
double update with the existing code.

IMHO this is 3.13 and -stable material and should go in through the XFS
tree.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Acked-by: Jan Kara <jack@suse.cz>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/btrfs/inode.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f1a77449d032..471a4f7f4044 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4354,8 +4354,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 	 * these flags set.  For all other operations the VFS set these flags
 	 * explicitly if it wants a timestamp update.
 	 */
-	if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
-		inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
+	if (newsize != oldsize) {
+		inode_inc_iversion(inode);
+		if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
+			inode->i_ctime = inode->i_mtime =
+				current_fs_time(inode->i_sb);
+	}
 
 	if (newsize > oldsize) {
 		truncate_pagecache(inode, newsize);
-- 
cgit v1.2.3


From 996a710d46418cacb5b4a519ab9341a74066551d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Fri, 20 Dec 2013 05:16:43 -0800
Subject: btrfs: use generic posix ACL infrastructure

Also don't bother to set up a .get_acl method for symlinks as we do not
support access control (ACLs or even mode bits) for symlinks in Linux.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/inode.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f1a77449d032..b1314300d9fc 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4464,7 +4464,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		err = btrfs_dirty_inode(inode);
 
 		if (!err && attr->ia_valid & ATTR_MODE)
-			err = btrfs_acl_chmod(inode);
+			err = posix_acl_chmod(inode, inode->i_mode);
 	}
 
 	return err;
@@ -8649,12 +8649,14 @@ static const struct inode_operations btrfs_dir_inode_operations = {
 	.removexattr	= btrfs_removexattr,
 	.permission	= btrfs_permission,
 	.get_acl	= btrfs_get_acl,
+	.set_acl	= btrfs_set_acl,
 	.update_time	= btrfs_update_time,
 };
 static const struct inode_operations btrfs_dir_ro_inode_operations = {
 	.lookup		= btrfs_lookup,
 	.permission	= btrfs_permission,
 	.get_acl	= btrfs_get_acl,
+	.set_acl	= btrfs_set_acl,
 	.update_time	= btrfs_update_time,
 };
 
@@ -8724,6 +8726,7 @@ static const struct inode_operations btrfs_file_inode_operations = {
 	.permission	= btrfs_permission,
 	.fiemap		= btrfs_fiemap,
 	.get_acl	= btrfs_get_acl,
+	.set_acl	= btrfs_set_acl,
 	.update_time	= btrfs_update_time,
 };
 static const struct inode_operations btrfs_special_inode_operations = {
@@ -8735,6 +8738,7 @@ static const struct inode_operations btrfs_special_inode_operations = {
 	.listxattr	= btrfs_listxattr,
 	.removexattr	= btrfs_removexattr,
 	.get_acl	= btrfs_get_acl,
+	.set_acl	= btrfs_set_acl,
 	.update_time	= btrfs_update_time,
 };
 static const struct inode_operations btrfs_symlink_inode_operations = {
@@ -8748,7 +8752,6 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
 	.getxattr	= btrfs_getxattr,
 	.listxattr	= btrfs_listxattr,
 	.removexattr	= btrfs_removexattr,
-	.get_acl	= btrfs_get_acl,
 	.update_time	= btrfs_update_time,
 };
 
-- 
cgit v1.2.3


From 16e7549f045d33b0c5b0ebf19d08439e9221d40c Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Tue, 22 Oct 2013 12:18:51 -0400
Subject: Btrfs: incompatible format change to remove hole extents

Btrfs has always had these filler extent data items for holes in inodes.  This
has made somethings very easy, like logging hole punches and sending hole
punches.  However for large holey files these extent data items are pure
overhead.  So add an incompatible feature to no longer add hole extents to
reduce the amount of metadata used by these sort of files.  This has a few
changes for logging and send obviously since they will need to detect holes and
log/send the holes if there are any.  I've tested this thoroughly with xfstests
and it doesn't cause any issues with and without the incompat format set.
Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/inode.c | 78 ++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 48 insertions(+), 30 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f1a77449d032..c0c0dc8f07fa 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4203,6 +4203,49 @@ out:
 	return ret;
 }
 
+static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
+			     u64 offset, u64 len)
+{
+	struct btrfs_trans_handle *trans;
+	int ret;
+
+	/*
+	 * Still need to make sure the inode looks like it's been updated so
+	 * that any holes get logged if we fsync.
+	 */
+	if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) {
+		BTRFS_I(inode)->last_trans = root->fs_info->generation;
+		BTRFS_I(inode)->last_sub_trans = root->log_transid;
+		BTRFS_I(inode)->last_log_commit = root->last_log_commit;
+		return 0;
+	}
+
+	/*
+	 * 1 - for the one we're dropping
+	 * 1 - for the one we're adding
+	 * 1 - for updating the inode.
+	 */
+	trans = btrfs_start_transaction(root, 3);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		btrfs_end_transaction(trans, root);
+		return ret;
+	}
+
+	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
+				       0, 0, len, 0, len, 0, 0, 0);
+	if (ret)
+		btrfs_abort_transaction(trans, root, ret);
+	else
+		btrfs_update_inode(trans, root, inode);
+	btrfs_end_transaction(trans, root);
+	return ret;
+}
+
 /*
  * This function puts in dummy file extents for the area we're creating a hole
  * for.  So if we are truncating this file to a larger size we need to insert
@@ -4211,7 +4254,6 @@ out:
  */
 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 {
-	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_map *em = NULL;
@@ -4266,31 +4308,10 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 			struct extent_map *hole_em;
 			hole_size = last_byte - cur_offset;
 
-			trans = btrfs_start_transaction(root, 3);
-			if (IS_ERR(trans)) {
-				err = PTR_ERR(trans);
-				break;
-			}
-
-			err = btrfs_drop_extents(trans, root, inode,
-						 cur_offset,
-						 cur_offset + hole_size, 1);
-			if (err) {
-				btrfs_abort_transaction(trans, root, err);
-				btrfs_end_transaction(trans, root);
-				break;
-			}
-
-			err = btrfs_insert_file_extent(trans, root,
-					btrfs_ino(inode), cur_offset, 0,
-					0, hole_size, 0, hole_size,
-					0, 0, 0);
-			if (err) {
-				btrfs_abort_transaction(trans, root, err);
-				btrfs_end_transaction(trans, root);
+			err = maybe_insert_hole(root, inode, cur_offset,
+						hole_size);
+			if (err)
 				break;
-			}
-
 			btrfs_drop_extent_cache(inode, cur_offset,
 						cur_offset + hole_size - 1, 0);
 			hole_em = alloc_extent_map();
@@ -4309,7 +4330,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 			hole_em->ram_bytes = hole_size;
 			hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
 			hole_em->compress_type = BTRFS_COMPRESS_NONE;
-			hole_em->generation = trans->transid;
+			hole_em->generation = root->fs_info->generation;
 
 			while (1) {
 				write_lock(&em_tree->lock);
@@ -4322,17 +4343,14 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 							hole_size - 1, 0);
 			}
 			free_extent_map(hole_em);
-next:
-			btrfs_update_inode(trans, root, inode);
-			btrfs_end_transaction(trans, root);
 		}
+next:
 		free_extent_map(em);
 		em = NULL;
 		cur_offset = last_byte;
 		if (cur_offset >= block_end)
 			break;
 	}
-
 	free_extent_map(em);
 	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
 			     GFP_NOFS);
-- 
cgit v1.2.3


From 99e22f783bbcd048819975b8a1463f39a9966bcf Mon Sep 17 00:00:00 2001
From: Valentina Giusti <valentina.giusti@microon.de>
Date: Mon, 4 Nov 2013 22:34:22 +0100
Subject: btrfs: remove unused variable from btrfs_new_inode

Variable owner in btrfs_new_inode is unused since commit
d82a6f1d7e8b61ed5996334d0db66651bb43641d
(Btrfs: kill BTRFS_I(inode)->block_group)

Signed-off-by: Valentina Giusti <valentina.giusti@microon.de>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/inode.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c0c0dc8f07fa..41079c6ed968 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5372,7 +5372,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	u32 sizes[2];
 	unsigned long ptr;
 	int ret;
-	int owner;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -5418,11 +5417,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	 */
 	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
 
-	if (S_ISDIR(mode))
-		owner = 0;
-	else
-		owner = 1;
-
 	key[0].objectid = objectid;
 	btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
 	key[0].offset = 0;
-- 
cgit v1.2.3


From 75ac2dd907013b44edbdec16f8969d14811149c9 Mon Sep 17 00:00:00 2001
From: Kelley Nielsen <kelleynnn@gmail.com>
Date: Mon, 4 Nov 2013 19:35:58 -0800
Subject: btrfs: expand btrfs_find_item() to include find_root_ref
 functionality

This patch is the second step in bootstrapping the btrfs_find_item
interface. The btrfs_find_root_ref() is similar to the former
__inode_info(); it accepts four of its parameters, and duplicates the
first half of its functionality.

Replace the one former call to btrfs_find_root_ref() with a call to
btrfs_find_item(), along with the defined key type that was used
internally by btrfs_find_root ref, and a null found key. In
btrfs_find_item(), add a test for the null key at the place where
the functionality of btrfs_find_root_ref() ends; btrfs_find_item()
then returns if the test passes. Finally, remove btrfs_find_root_ref().

Signed-off-by: Kelley Nielsen <kelleynnn@gmail.com>
Suggested-by: Zach Brown <zab@redhat.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/inode.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 41079c6ed968..5a5de36d39fc 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4673,9 +4673,9 @@ static int fixup_tree_root_location(struct btrfs_root *root,
 	}
 
 	err = -ENOENT;
-	ret = btrfs_find_root_ref(root->fs_info->tree_root, path,
-				  BTRFS_I(dir)->root->root_key.objectid,
-				  location->objectid);
+	ret = btrfs_find_item(root->fs_info->tree_root, path,
+				BTRFS_I(dir)->root->root_key.objectid,
+				location->objectid, BTRFS_ROOT_REF_KEY, NULL);
 	if (ret) {
 		if (ret < 0)
 			err = ret;
-- 
cgit v1.2.3


From 131e404a2a54d30f894425ef723f9867a43bff4c Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Tue, 19 Nov 2013 22:29:35 +0000
Subject: Btrfs: fix very slow inode eviction and fs unmount

The inode eviction can be very slow, because during eviction we
tell the VFS to truncate all of the inode's pages. This results
in calls to btrfs_invalidatepage() which in turn does calls to
lock_extent_bits() and clear_extent_bit(). These calls result in
too many merges and splits of extent_state structures, which
consume a lot of time and cpu when the inode has many pages. In
some scenarios I have experienced umount times higher than 15
minutes, even when there's no pending IO (after a btrfs fs sync).

A quick way to reproduce this issue:

$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ cd /mnt/btrfs
$ sysbench --test=fileio --file-num=128 --file-total-size=16G \
    --file-test-mode=seqwr --num-threads=128 \
    --file-block-size=16384 --max-time=60 --max-requests=0 run
$ time btrfs fi sync .
FSSync '.'

real	0m25.457s
user	0m0.000s
sys	0m0.092s
$ cd ..
$ time umount /mnt/btrfs

real	1m38.234s
user	0m0.000s
sys	1m25.760s

The same test on ext4 runs much faster:

$ mkfs.ext4 /dev/sdb3
$ mount /dev/sdb3 /mnt/ext4
$ cd /mnt/ext4
$ sysbench --test=fileio --file-num=128 --file-total-size=16G \
    --file-test-mode=seqwr --num-threads=128 \
    --file-block-size=16384 --max-time=60 --max-requests=0 run
$ sync
$ cd ..
$ time umount /mnt/ext4

real	0m3.626s
user	0m0.004s
sys	0m3.012s

After this patch, the unmount (inode evictions) is much faster:

$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ cd /mnt/btrfs
$ sysbench --test=fileio --file-num=128 --file-total-size=16G \
    --file-test-mode=seqwr --num-threads=128 \
    --file-block-size=16384 --max-time=60 --max-requests=0 run
$ time btrfs fi sync .
FSSync '.'

real	0m26.774s
user	0m0.000s
sys	0m0.084s
$ cd ..
$ time umount /mnt/btrfs

real	0m1.811s
user	0m0.000s
sys	0m1.564s

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/inode.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 84 insertions(+), 14 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5a5de36d39fc..e889779c9b37 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4488,6 +4488,62 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 	return err;
 }
 
+/*
+ * While truncating the inode pages during eviction, we get the VFS calling
+ * btrfs_invalidatepage() against each page of the inode. This is slow because
+ * the calls to btrfs_invalidatepage() result in a huge amount of calls to
+ * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
+ * extent_state structures over and over, wasting lots of time.
+ *
+ * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
+ * those expensive operations on a per page basis and do only the ordered io
+ * finishing, while we release here the extent_map and extent_state structures,
+ * without the excessive merging and splitting.
+ */
+static void evict_inode_truncate_pages(struct inode *inode)
+{
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
+	struct rb_node *node;
+
+	ASSERT(inode->i_state & I_FREEING);
+	truncate_inode_pages(&inode->i_data, 0);
+
+	write_lock(&map_tree->lock);
+	while (!RB_EMPTY_ROOT(&map_tree->map)) {
+		struct extent_map *em;
+
+		node = rb_first(&map_tree->map);
+		em = rb_entry(node, struct extent_map, rb_node);
+		remove_extent_mapping(map_tree, em);
+		free_extent_map(em);
+	}
+	write_unlock(&map_tree->lock);
+
+	spin_lock(&io_tree->lock);
+	while (!RB_EMPTY_ROOT(&io_tree->state)) {
+		struct extent_state *state;
+		struct extent_state *cached_state = NULL;
+
+		node = rb_first(&io_tree->state);
+		state = rb_entry(node, struct extent_state, rb_node);
+		atomic_inc(&state->refs);
+		spin_unlock(&io_tree->lock);
+
+		lock_extent_bits(io_tree, state->start, state->end,
+				 0, &cached_state);
+		clear_extent_bit(io_tree, state->start, state->end,
+				 EXTENT_LOCKED | EXTENT_DIRTY |
+				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+				 EXTENT_DEFRAG, 1, 1,
+				 &cached_state, GFP_NOFS);
+		free_extent_state(state);
+
+		spin_lock(&io_tree->lock);
+	}
+	spin_unlock(&io_tree->lock);
+}
+
 void btrfs_evict_inode(struct inode *inode)
 {
 	struct btrfs_trans_handle *trans;
@@ -4498,7 +4554,8 @@ void btrfs_evict_inode(struct inode *inode)
 
 	trace_btrfs_inode_evict(inode);
 
-	truncate_inode_pages(&inode->i_data, 0);
+	evict_inode_truncate_pages(inode);
+
 	if (inode->i_nlink &&
 	    ((btrfs_root_refs(&root->root_item) != 0 &&
 	      root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
@@ -7379,6 +7436,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
 	struct extent_state *cached_state = NULL;
 	u64 page_start = page_offset(page);
 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+	int inode_evicting = inode->i_state & I_FREEING;
 
 	/*
 	 * we have the page locked, so new writeback can't start,
@@ -7394,17 +7452,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
 		btrfs_releasepage(page, GFP_NOFS);
 		return;
 	}
-	lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
-	ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));
+
+	if (!inode_evicting)
+		lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
+	ordered = btrfs_lookup_ordered_extent(inode, page_start);
 	if (ordered) {
 		/*
 		 * IO on this page will never be started, so we need
 		 * to account for any ordered extents now
 		 */
-		clear_extent_bit(tree, page_start, page_end,
-				 EXTENT_DIRTY | EXTENT_DELALLOC |
-				 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
-				 EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS);
+		if (!inode_evicting)
+			clear_extent_bit(tree, page_start, page_end,
+					 EXTENT_DIRTY | EXTENT_DELALLOC |
+					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
+					 EXTENT_DEFRAG, 1, 0, &cached_state,
+					 GFP_NOFS);
 		/*
 		 * whoever cleared the private bit is responsible
 		 * for the finish_ordered_io
@@ -7428,14 +7490,22 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
 				btrfs_finish_ordered_io(ordered);
 		}
 		btrfs_put_ordered_extent(ordered);
-		cached_state = NULL;
-		lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
+		if (!inode_evicting) {
+			cached_state = NULL;
+			lock_extent_bits(tree, page_start, page_end, 0,
+					 &cached_state);
+		}
+	}
+
+	if (!inode_evicting) {
+		clear_extent_bit(tree, page_start, page_end,
+				 EXTENT_LOCKED | EXTENT_DIRTY |
+				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+				 EXTENT_DEFRAG, 1, 1,
+				 &cached_state, GFP_NOFS);
+
+		__btrfs_releasepage(page, GFP_NOFS);
 	}
-	clear_extent_bit(tree, page_start, page_end,
-		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
-		 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
-		 &cached_state, GFP_NOFS);
-	__btrfs_releasepage(page, GFP_NOFS);
 
 	ClearPageChecked(page);
 	if (PagePrivate(page)) {
-- 
cgit v1.2.3


From 5662344b3c0d9ddd9afd48716d795166f982d5e2 Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Fri, 13 Dec 2013 09:51:42 +0900
Subject: Btrfs: fix error check of btrfs_lookup_dentry()

Clean up btrfs_lookup_dentry() to never return NULL, but PTR_ERR(-ENOENT)
instead. This keeps the return value convention consistent.

Callers who use btrfs_lookup_dentry() require a trivial update.

create_snapshot() in particular looks like it can also lose a BUG_ON(!inode)
which is not really needed - there seems less harm in returning ENOENT to
userspace at that point in the stack than there is to crash the machine.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/inode.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e889779c9b37..2bd4f7590c83 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4992,7 +4992,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 		return ERR_PTR(ret);
 
 	if (location.objectid == 0)
-		return NULL;
+		return ERR_PTR(-ENOENT);
 
 	if (location.type == BTRFS_INODE_ITEM_KEY) {
 		inode = btrfs_iget(dir->i_sb, &location, root, NULL);
@@ -5056,10 +5056,17 @@ static void btrfs_dentry_release(struct dentry *dentry)
 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 				   unsigned int flags)
 {
-	struct dentry *ret;
+	struct inode *inode;
 
-	ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry);
-	return ret;
+	inode = btrfs_lookup_dentry(dir, dentry);
+	if (IS_ERR(inode)) {
+		if (PTR_ERR(inode) == -ENOENT)
+			inode = NULL;
+		else
+			return ERR_CAST(inode);
+	}
+
+	return d_splice_alias(inode, dentry);
 }
 
 unsigned char btrfs_filetype_table[] = {
-- 
cgit v1.2.3


From 663df053309c8d9200b87cc1a129729b8e97eb26 Mon Sep 17 00:00:00 2001
From: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Date: Sun, 15 Dec 2013 11:39:42 +0800
Subject: Btrfs: remove dead comments for read_csums()

Chris introduced hleper function  read_csums() and this function
has been removed, but we forgot to remove its corresponding comments.

Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/inode.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2bd4f7590c83..b3b3142f1734 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2543,12 +2543,6 @@ out_kfree:
 	return NULL;
 }
 
-/*
- * helper function for btrfs_finish_ordered_io, this
- * just reads in some of the csum leaves to prime them into ram
- * before we start the transaction.  It limits the amount of btree
- * reads required while inside the transaction.
- */
 /* as ordered data IO finishes, this gets called so we can finish
  * an ordered extent if the range of bytes in the file it covers are
  * fully written.
-- 
cgit v1.2.3


From 180589efde8a01b4a30af273f670ac81c8abf9c5 Mon Sep 17 00:00:00 2001
From: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Date: Sat, 14 Dec 2013 15:27:31 +0800
Subject: Btrfs: fix a warning when iput a file

See the warning below:

[ 1209.102076]  [<ffffffffa04721b9>] remove_extent_mapping+0x69/0x70 [btrfs]
[ 1209.102084]  [<ffffffffa0466b06>] btrfs_evict_inode+0x96/0x4d0 [btrfs]
[ 1209.102089]  [<ffffffff81073010>] ? wake_atomic_t_function+0x40/0x40
[ 1209.102092]  [<ffffffff8118ab2e>] evict+0x9e/0x190
[ 1209.102094]  [<ffffffff8118b313>] iput+0xf3/0x180
[ 1209.102101]  [<ffffffffa0461fd1>] btrfs_run_delayed_iputs+0xb1/0xd0 [btrfs]
[ 1209.102107]  [<ffffffffa045d358>] __btrfs_end_transaction+0x268/0x350 [btrfs]

clear extent bit here to avoid triggering WARN_ON() in remove_extent_mapping()

Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/inode.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b3b3142f1734..2ccf8e6b1e16 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4509,6 +4509,8 @@ static void evict_inode_truncate_pages(struct inode *inode)
 
 		node = rb_first(&map_tree->map);
 		em = rb_entry(node, struct extent_map, rb_node);
+		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+		clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
 		remove_extent_mapping(map_tree, em);
 		free_extent_map(em);
 	}
-- 
cgit v1.2.3


From efe120a067c8674a8ae21b194f0e68f098b61ee2 Mon Sep 17 00:00:00 2001
From: Frank Holton <fholton@gmail.com>
Date: Fri, 20 Dec 2013 11:37:06 -0500
Subject: Btrfs: convert printk to btrfs_ and fix BTRFS prefix

Convert all applicable cases of printk and pr_* to the btrfs_* macros.

Fix all uses of the BTRFS prefix.

Signed-off-by: Frank Holton <fholton@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2ccf8e6b1e16..06bcf5b53cb0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6966,8 +6966,8 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
 	struct btrfs_dio_private *dip = bio->bi_private;
 
 	if (err) {
-		printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu "
-		      "sector %#Lx len %u err no %d\n",
+		btrfs_err(BTRFS_I(dip->inode)->root->fs_info,
+			  "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
 		      btrfs_ino(dip->inode), bio->bi_rw,
 		      (unsigned long long)bio->bi_sector, bio->bi_size, err);
 		dip->errors = 1;
-- 
cgit v1.2.3


From 67de11769bd5ec339a62169f500b04f304826c00 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 26 Dec 2013 13:07:06 +0800
Subject: Btrfs: introduce the delayed inode ref deletion for the single link
 inode

The inode reference item is close to inode item, so we insert it simultaneously
with the inode item insertion when we create a file/directory.. In fact, we also
can handle the inode reference deletion by the same way. So we made this patch to
introduce the delayed inode reference deletion for the single link inode(At most
case, the file doesn't has hard link, so we don't take the hard link into account).

This function is based on the delayed inode mechanism. After applying this patch,
we can reduce the time of the file/directory deletion by ~10%.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/inode.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 53 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 06bcf5b53cb0..9eaa1c8ed385 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3309,6 +3309,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	struct btrfs_timespec *tspec;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key location;
+	unsigned long ptr;
 	int maybe_acls;
 	u32 rdev;
 	int ret;
@@ -3332,7 +3333,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	leaf = path->nodes[0];
 
 	if (filled)
-		goto cache_acl;
+		goto cache_index;
 
 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_inode_item);
@@ -3375,6 +3376,30 @@ static void btrfs_read_locked_inode(struct inode *inode)
 
 	BTRFS_I(inode)->index_cnt = (u64)-1;
 	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
+
+cache_index:
+	path->slots[0]++;
+	if (inode->i_nlink != 1 ||
+	    path->slots[0] >= btrfs_header_nritems(leaf))
+		goto cache_acl;
+
+	btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
+	if (location.objectid != btrfs_ino(inode))
+		goto cache_acl;
+
+	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+	if (location.type == BTRFS_INODE_REF_KEY) {
+		struct btrfs_inode_ref *ref;
+
+		ref = (struct btrfs_inode_ref *)ptr;
+		BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
+	} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
+		struct btrfs_inode_extref *extref;
+
+		extref = (struct btrfs_inode_extref *)ptr;
+		BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
+								     extref);
+	}
 cache_acl:
 	/*
 	 * try to precache a NULL acl entry for files that don't have
@@ -3587,6 +3612,24 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 		goto err;
 	btrfs_release_path(path);
 
+	/*
+	 * If we don't have dir index, we have to get it by looking up
+	 * the inode ref, since we get the inode ref, remove it directly,
+	 * it is unnecessary to do delayed deletion.
+	 *
+	 * But if we have dir index, needn't search inode ref to get it.
+	 * Since the inode ref is close to the inode item, it is better
+	 * that we delay to delete it, and just do this deletion when
+	 * we update the inode item.
+	 */
+	if (BTRFS_I(inode)->dir_index) {
+		ret = btrfs_delayed_delete_inode_ref(inode);
+		if (!ret) {
+			index = BTRFS_I(inode)->dir_index;
+			goto skip_backref;
+		}
+	}
+
 	ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
 				  dir_ino, &index);
 	if (ret) {
@@ -3596,7 +3639,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 		btrfs_abort_transaction(trans, root, ret);
 		goto err;
 	}
-
+skip_backref:
 	ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
 	if (ret) {
 		btrfs_abort_transaction(trans, root, ret);
@@ -5465,6 +5508,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	 * number
 	 */
 	BTRFS_I(inode)->index_cnt = 2;
+	BTRFS_I(inode)->dir_index = *index;
 	BTRFS_I(inode)->root = root;
 	BTRFS_I(inode)->generation = trans->transid;
 	inode->i_generation = BTRFS_I(inode)->generation;
@@ -5809,6 +5853,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 		goto fail;
 	}
 
+	/* There are several dir indexes for this inode, clear the cache. */
+	BTRFS_I(inode)->dir_index = 0ULL;
 	inc_nlink(inode);
 	inode_inc_iversion(inode);
 	inode->i_ctime = CURRENT_TIME;
@@ -7861,6 +7907,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei->flags = 0;
 	ei->csum_bytes = 0;
 	ei->index_cnt = (u64)-1;
+	ei->dir_index = 0;
 	ei->last_unlink_trans = 0;
 	ei->last_log_commit = 0;
 
@@ -8148,6 +8195,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (ret)
 		goto out_fail;
 
+	BTRFS_I(old_inode)->dir_index = 0ULL;
 	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
 		/* force full log commit if subvolume involved. */
 		root->fs_info->last_trans_log_full_commit = trans->transid;
@@ -8236,6 +8284,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		goto out_fail;
 	}
 
+	if (old_inode->i_nlink == 1)
+		BTRFS_I(old_inode)->dir_index = index;
+
 	if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
 		struct dentry *parent = new_dentry->d_parent;
 		btrfs_log_new_name(trans, old_inode, old_dir, parent);
-- 
cgit v1.2.3


From eb653de15987612444b6cde3b0e67b1edd94625f Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Mon, 23 Dec 2013 11:53:02 +0000
Subject: Btrfs: reduce btree node locking duration on item update

If we do a btree search with the goal of updating an existing item
without changing its size (ins_len == 0 and cow == 1), then we never
need to hold locks on upper level nodes (even when slot == 0) after we
COW their child nodes/leaves, as we won't have node splits or merges
in this scenario (that is, no key additions, removals or shifts on any
nodes or leaves).

Therefore release the locks immediately after COWing the child nodes/leaves
while navigating the btree, even if their parent slot is 0, instead of
returning a path to the caller with those nodes locked, which would get
released only when the caller releases or frees the path (or if it calls
btrfs_unlock_up_safe).

This is a common scenario, for example when updating inode items in fs
trees and block group items in the extent tree.

The following benchmarks were performed on a quad core machine with 32Gb
of ram, using a leaf/node size of 4Kb (to generate deeper fs trees more
quickly).

  sysbench --test=fileio --file-num=131072 --file-total-size=8G \
    --file-test-mode=seqwr --num-threads=512 --file-block-size=8192 \
    --max-requests=100000 --file-io-mode=sync [prepare|run]

Before this change:  49.85Mb/s (average of 5 runs)
After this change:   50.38Mb/s (average of 5 runs)

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/inode.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9eaa1c8ed385..8e45fdcdbd8e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3515,7 +3515,6 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
 		goto failed;
 	}
 
-	btrfs_unlock_up_safe(path, 1);
 	leaf = path->nodes[0];
 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_inode_item);
-- 
cgit v1.2.3


From e77751aad1facc4973613a11e2ad98ee4bbb04e1 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Fri, 27 Dec 2013 21:11:50 +0800
Subject: Btrfs: fix the wrong nocow range check

The following warning message was outputed when running the 274th case
of xfstests with nodatacow option:
 BUG: Bad page state in process kswapd0  pfn:1c66f
 page:ffffea0000636848 count:0 mapcount:0 mapping:(null) index:0x78000
 page flags: 0x1000000000100a(error|uptodate|private_2)

It is because the check of nocow range was wrong, we should compare the
start and end position of the extent with the write position to verify
if the write position was in the extent, but the current code just used
the start postion to do the check, so we got the wrong extent and told
the caller that it was a nocow write. And then when we write back the
dirty pages, we found we should cow the extent, but at that time, there
was no space in the fs, we had to the error flag for the page. When
someone reclaimed that page, the above warning outputed. Fix it.

Reported-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/inode.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8e45fdcdbd8e..ffb23e506762 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6503,6 +6503,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
 	int slot;
 	int found_type;
 	bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -6546,6 +6547,10 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
 	if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
 		goto out;
 
+	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+	if (extent_end <= offset)
+		goto out;
+
 	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 	if (disk_bytenr == 0)
 		goto out;
@@ -6563,8 +6568,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
 		*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
 	}
 
-	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
-
 	if (btrfs_extent_readonly(root, disk_bytenr))
 		goto out;
 	btrfs_release_path(path);
-- 
cgit v1.2.3


From 1acae57b161ef1282f565ef907f72aeed0eb71d9 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Tue, 7 Jan 2014 11:42:27 +0000
Subject: Btrfs: faster file extent item replace operations

When writing to a file we drop existing file extent items that cover the
write range and then add a new file extent item that represents that write
range.

Before this change we were doing a tree lookup to remove the file extent
items, and then after we did another tree lookup to insert the new file
extent item.
Most of the time all the file extent items we need to drop are located
within a single leaf - this is the leaf where our new file extent item ends
up at. Therefore, in this common case just combine these 2 operations into
a single one.

By avoiding the second btree navigation for insertion of the new file extent
item, we reduce btree node/leaf lock acquisitions/releases, btree block/leaf
COW operations, CPU time on btree node/leaf key binary searches, etc.

Besides for file writes, this is an operation that happens for file fsync's
as well. However log btrees are much less likely to big as big as regular
fs btrees, therefore the impact of this change is smaller.

The following benchmark was performed against an SSD drive and a
HDD drive, both for random and sequential writes:

  sysbench --test=fileio --file-num=4096 --file-total-size=8G \
     --file-test-mode=[rndwr|seqwr] --num-threads=512 \
     --file-block-size=8192 \ --max-requests=1000000 \
     --file-fsync-freq=0 --file-io-mode=sync [prepare|run]

All results below are averages of 10 runs of the respective test.

** SSD sequential writes

Before this change: 225.88 Mb/sec
After this change:  277.26 Mb/sec

** SSD random writes

Before this change: 49.91 Mb/sec
After this change:  56.39 Mb/sec

** HDD sequential writes

Before this change: 68.53 Mb/sec
After this change:  69.87 Mb/sec

** HDD random writes

Before this change: 13.04 Mb/sec
After this change:  14.39 Mb/sec

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/inode.c | 87 +++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 55 insertions(+), 32 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ffb23e506762..23f18eb5fb55 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -125,13 +125,12 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
  * no overlapping inline items exist in the btree
  */
 static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_path *path, int extent_inserted,
 				struct btrfs_root *root, struct inode *inode,
 				u64 start, size_t size, size_t compressed_size,
 				int compress_type,
 				struct page **compressed_pages)
 {
-	struct btrfs_key key;
-	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	struct page *page = NULL;
 	char *kaddr;
@@ -140,29 +139,29 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 	int err = 0;
 	int ret;
 	size_t cur_size = size;
-	size_t datasize;
 	unsigned long offset;
 
 	if (compressed_size && compressed_pages)
 		cur_size = compressed_size;
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
+	inode_add_bytes(inode, size);
 
-	path->leave_spinning = 1;
+	if (!extent_inserted) {
+		struct btrfs_key key;
+		size_t datasize;
 
-	key.objectid = btrfs_ino(inode);
-	key.offset = start;
-	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
-	datasize = btrfs_file_extent_calc_inline_size(cur_size);
+		key.objectid = btrfs_ino(inode);
+		key.offset = start;
+		btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
 
-	inode_add_bytes(inode, size);
-	ret = btrfs_insert_empty_item(trans, root, path, &key,
-				      datasize);
-	if (ret) {
-		err = ret;
-		goto fail;
+		datasize = btrfs_file_extent_calc_inline_size(cur_size);
+		path->leave_spinning = 1;
+		ret = btrfs_insert_empty_item(trans, root, path, &key,
+					      datasize);
+		if (ret) {
+			err = ret;
+			goto fail;
+		}
 	}
 	leaf = path->nodes[0];
 	ei = btrfs_item_ptr(leaf, path->slots[0],
@@ -203,7 +202,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 		page_cache_release(page);
 	}
 	btrfs_mark_buffer_dirty(leaf);
-	btrfs_free_path(path);
+	btrfs_release_path(path);
 
 	/*
 	 * we're an inline extent, so nobody can
@@ -219,7 +218,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 
 	return ret;
 fail:
-	btrfs_free_path(path);
 	return err;
 }
 
@@ -242,6 +240,9 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
 	u64 aligned_end = ALIGN(end, root->sectorsize);
 	u64 data_len = inline_len;
 	int ret;
+	struct btrfs_path *path;
+	int extent_inserted = 0;
+	u32 extent_item_size;
 
 	if (compressed_size)
 		data_len = compressed_size;
@@ -256,12 +257,27 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
 		return 1;
 	}
 
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
 	trans = btrfs_join_transaction(root);
-	if (IS_ERR(trans))
+	if (IS_ERR(trans)) {
+		btrfs_free_path(path);
 		return PTR_ERR(trans);
+	}
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
-	ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1);
+	if (compressed_size && compressed_pages)
+		extent_item_size = btrfs_file_extent_calc_inline_size(
+		   compressed_size);
+	else
+		extent_item_size = btrfs_file_extent_calc_inline_size(
+		    inline_len);
+
+	ret = __btrfs_drop_extents(trans, root, inode, path,
+				   start, aligned_end, NULL,
+				   1, 1, extent_item_size, &extent_inserted);
 	if (ret) {
 		btrfs_abort_transaction(trans, root, ret);
 		goto out;
@@ -269,7 +285,8 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
 
 	if (isize > actual_end)
 		inline_len = min_t(u64, isize, actual_end);
-	ret = insert_inline_extent(trans, root, inode, start,
+	ret = insert_inline_extent(trans, path, extent_inserted,
+				   root, inode, start,
 				   inline_len, compressed_size,
 				   compress_type, compressed_pages);
 	if (ret && ret != -ENOSPC) {
@@ -284,6 +301,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
 	btrfs_delalloc_release_metadata(inode, end + 1 - start);
 	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
 out:
+	btrfs_free_path(path);
 	btrfs_end_transaction(trans, root);
 	return ret;
 }
@@ -1841,14 +1859,13 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	struct btrfs_key ins;
+	int extent_inserted = 0;
 	int ret;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
-	path->leave_spinning = 1;
-
 	/*
 	 * we may be replacing one extent in the tree with another.
 	 * The new extent is pinned in the extent map, and we don't want
@@ -1858,17 +1875,23 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	 * the caller is expected to unpin it and allow it to be merged
 	 * with the others.
 	 */
-	ret = btrfs_drop_extents(trans, root, inode, file_pos,
-				 file_pos + num_bytes, 0);
+	ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
+				   file_pos + num_bytes, NULL, 0,
+				   1, sizeof(*fi), &extent_inserted);
 	if (ret)
 		goto out;
 
-	ins.objectid = btrfs_ino(inode);
-	ins.offset = file_pos;
-	ins.type = BTRFS_EXTENT_DATA_KEY;
-	ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
-	if (ret)
-		goto out;
+	if (!extent_inserted) {
+		ins.objectid = btrfs_ino(inode);
+		ins.offset = file_pos;
+		ins.type = BTRFS_EXTENT_DATA_KEY;
+
+		path->leave_spinning = 1;
+		ret = btrfs_insert_empty_item(trans, root, path, &ins,
+					      sizeof(*fi));
+		if (ret)
+			goto out;
+	}
 	leaf = path->nodes[0];
 	fi = btrfs_item_ptr(leaf, path->slots[0],
 			    struct btrfs_file_extent_item);
-- 
cgit v1.2.3


From 63541927c8d11d2686778b1e8ec71c14b4fd53e4 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Tue, 7 Jan 2014 11:47:46 +0000
Subject: Btrfs: add support for inode properties

This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."

Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).

This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.

The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.

Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.

Basically the tests correspond to:

Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.

Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
The unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.

Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.

Test 4 - same as test 3 but with 10 properties per file.

Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.

* Without properties (test 1)

                    file creation time        ls -lha time
10 000 files              3.49                   0.76
100 000 files            47.19                   8.37
1 000 000 files         518.51                 107.06

* With 1 property (compression property set to lzo - test 2)

                    file creation time        ls -lha time
10 000 files              3.63                    0.93
100 000 files            48.56                    9.74
1 000 000 files         537.72                  125.11

* With 4 properties (test 3)

                    file creation time        ls -lha time
10 000 files              3.94                    1.20
100 000 files            52.14                   11.48
1 000 000 files         572.70                  142.13

* With 10 properties (test 4)

                    file creation time        ls -lha time
10 000 files              4.61                    1.35
100 000 files            58.86                   13.83
1 000 000 files         656.01                  177.61

The increased latencies with properties are essencialy because of:

*) When creating an inode, we now synchronously write 1 more item
   (an xattr item) for each property inherited from the parent dir
   (or subvolume). This could be done in an asynchronous way such
   as we do for dir intex items (delayed-inode.c), which could help
   reduce the file creation latency;

*) With properties, we now have larger fs trees. For this particular
   test each xattr item uses 75 bytes of leaf space in the fs tree.
   This could be less by using a new item for xattr items, instead of
   the current btrfs_dir_item, since we could cut the 'location' and
   'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
   total of 26 bytes per xattr item) from the btrfs_dir_item type.

Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.

Test script:

$ cat test.pl
  #!/usr/bin/perl -w

  use strict;
  use Time::HiRes qw(time);
  use constant NUM_FILES => 10_000;
  use constant FILE_SIZES => (64 * 1024);
  use constant DEV => '/dev/sdb4';
  use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
  use constant TEST_DIR => (MNT_POINT . '/testdir');

  system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";

  # following line for testing without properties
  #system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";

  # following 2 lines for testing with properties
  system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
  system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";

  system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
  my ($t1, $t2);

  $t1 = time();
  for (my $i = 1; $i <= NUM_FILES; $i++) {
      my $p = TEST_DIR . '/file_' . $i;
      open(my $f, '>', $p) or die "Error opening file!";
      $f->autoflush(1);
      for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
          print $f ('A' x 4096) or die "Error writing to file!";
      }
      close($f);
  }
  $t2 = time();
  print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
  system("umount", DEV) == 0 or die "umount failed!";
  system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";

  $t1 = time();
  system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
  $t2 = time();
  print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
  system("umount", DEV) == 0 or die "umount failed!";

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/inode.c | 42 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 36 insertions(+), 6 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 23f18eb5fb55..1ea19cea96d0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -58,6 +58,7 @@
 #include "inode-map.h"
 #include "backref.h"
 #include "hash.h"
+#include "props.h"
 
 struct btrfs_iget_args {
 	u64 ino;
@@ -3265,7 +3266,8 @@ out:
  * slot is the slot the inode is in, objectid is the objectid of the inode
  */
 static noinline int acls_after_inode_item(struct extent_buffer *leaf,
-					  int slot, u64 objectid)
+					  int slot, u64 objectid,
+					  int *first_xattr_slot)
 {
 	u32 nritems = btrfs_header_nritems(leaf);
 	struct btrfs_key found_key;
@@ -3281,6 +3283,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 	}
 
 	slot++;
+	*first_xattr_slot = -1;
 	while (slot < nritems) {
 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
@@ -3290,6 +3293,8 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 
 		/* we found an xattr, assume we've got an acl */
 		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
+			if (*first_xattr_slot == -1)
+				*first_xattr_slot = slot;
 			if (found_key.offset == xattr_access ||
 			    found_key.offset == xattr_default)
 				return 1;
@@ -3318,6 +3323,8 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 	 * something larger than an xattr.  We have to assume the inode
 	 * has acls
 	 */
+	if (*first_xattr_slot == -1)
+		*first_xattr_slot = slot;
 	return 1;
 }
 
@@ -3337,6 +3344,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	u32 rdev;
 	int ret;
 	bool filled = false;
+	int first_xattr_slot;
 
 	ret = btrfs_fill_inode(inode, &rdev);
 	if (!ret)
@@ -3346,7 +3354,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	if (!path)
 		goto make_bad;
 
-	path->leave_spinning = 1;
 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
 
 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
@@ -3429,12 +3436,21 @@ cache_acl:
 	 * any xattrs or acls
 	 */
 	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
-					   btrfs_ino(inode));
+					   btrfs_ino(inode), &first_xattr_slot);
+	if (first_xattr_slot != -1) {
+		path->slots[0] = first_xattr_slot;
+		ret = btrfs_load_inode_props(inode, path);
+		if (ret)
+			btrfs_err(root->fs_info,
+				  "error loading props for ino %llu (root %llu): %d\n",
+				  btrfs_ino(inode),
+				  root->root_key.objectid, ret);
+	}
+	btrfs_free_path(path);
+
 	if (!maybe_acls)
 		cache_no_acl(inode);
 
-	btrfs_free_path(path);
-
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFREG:
 		inode->i_mapping->a_ops = &btrfs_aops;
@@ -5607,6 +5623,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 
 	btrfs_update_root_times(trans, root);
 
+	ret = btrfs_inode_inherit_props(trans, inode, dir);
+	if (ret)
+		btrfs_err(root->fs_info,
+			  "error inheriting props for ino %llu (root %llu): %d",
+			  btrfs_ino(inode), root->root_key.objectid, ret);
+
 	return inode;
 fail:
 	if (dir)
@@ -7889,7 +7911,9 @@ out:
  * create a new subvolume directory/inode (helper for the ioctl).
  */
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *new_root, u64 new_dirid)
+			     struct btrfs_root *new_root,
+			     struct btrfs_root *parent_root,
+			     u64 new_dirid)
 {
 	struct inode *inode;
 	int err;
@@ -7907,6 +7931,12 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 	set_nlink(inode, 1);
 	btrfs_i_size_write(inode, 0);
 
+	err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
+	if (err)
+		btrfs_err(new_root->fs_info,
+			  "error inheriting subvolume %llu properties: %d\n",
+			  new_root->root_key.objectid, err);
+
 	err = btrfs_update_inode(trans, new_root, inode);
 
 	iput(inode);
-- 
cgit v1.2.3


From 2c21b4d733d6e50514e30ffd87110364ddda695b Mon Sep 17 00:00:00 2001
From: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Date: Tue, 14 Jan 2014 19:42:20 +0800
Subject: Btrfs: fix transaction abortion when remounting btrfs from RW to RO

Steps to reproduce:
 # mkfs.btrfs -f /dev/sda8
 # mount /dev/sda8 /mnt -o flushoncommit
 # dd if=/dev/zero of=/mnt/data bs=4k count=102400 &
 # mount /dev/sda8 /mnt -o remount, ro

When remounting RW to RO, the logic is to firstly set flag
to RO and then commit transaction, however with option
flushoncommit enabled,we will do RO check within committing
transaction, so we get a transaction abortion here.

Actually,here check is wrong, we should check if FS_STATE_ERROR
is set, fix it.

Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Suggested-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1ea19cea96d0..7b61ea3141e5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8477,7 +8477,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 {
 	int ret;
 
-	if (root->fs_info->sb->s_flags & MS_RDONLY)
+	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
 		return -EROFS;
 
 	ret = __start_delalloc_inodes(root, delay_iput);
@@ -8503,7 +8503,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
 	struct list_head splice;
 	int ret;
 
-	if (fs_info->sb->s_flags & MS_RDONLY)
+	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
 		return -EROFS;
 
 	INIT_LIST_HEAD(&splice);
-- 
cgit v1.2.3


From 3c9665df0c5d3f471b07efc32181459386678ebd Mon Sep 17 00:00:00 2001
From: Gui Hecheng <guihc.fnst@cn.fujitsu.com>
Date: Thu, 23 Jan 2014 13:41:09 +0800
Subject: btrfs: fix warning while merging two adjacent extents

When we have two adjacent extents in relink_extent_backref,
we try to merge them. When we use btrfs_search_slot to locate the
slot for the current extent, we shouldn't set "ins_len = 1",
because we will merge it into the previous extent rather than
insert a new item. Otherwise, we may happen to create a new leaf
in btrfs_search_slot and path->slot[0] will be 0. Then we try to
fetch the previous item using "path->slots[0]--", and it will cause
a warning as follows:

	[  145.713385] WARNING: CPU: 3 PID: 1796 at fs/btrfs/extent_io.c:5043 map_private_extent_buffer+0xd4/0xe0
	[  145.713387] btrfs bad mapping eb start 5337088 len 4096, wanted 167772306 8
	...
	[  145.713462]  [<ffffffffa034b1f4>] map_private_extent_buffer+0xd4/0xe0
	[  145.713476]  [<ffffffffa030097a>] ? btrfs_free_path+0x2a/0x40
	[  145.713485]  [<ffffffffa0340864>] btrfs_get_token_64+0x64/0xf0
	[  145.713498]  [<ffffffffa033472c>] relink_extent_backref+0x41c/0x820
	[  145.713508]  [<ffffffffa0334d69>] btrfs_finish_ordered_io+0x239/0xa80

I encounter this warning when running defrag having mkfs.btrfs
with option -M. At the same time there are read/writes & snapshots
running at background.

Signed-off-by: Gui Hecheng <guihc.fnst@cn.fujitsu.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7b61ea3141e5..3b6598783be9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2314,7 +2314,7 @@ again:
 		u64 extent_len;
 		struct btrfs_key found_key;
 
-		ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
+		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 		if (ret < 0)
 			goto out_free_path;
 
-- 
cgit v1.2.3


From 514ac8ad8793a097c0c9d89202c642479d6dfa34 Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Fri, 3 Jan 2014 21:07:00 -0800
Subject: Btrfs: don't use ram_bytes for uncompressed inline items

If we truncate an uncompressed inline item, ram_bytes isn't updated to reflect
the new size.  The fixe uses the size directly from the item header when
reading uncompressed inlines, and also fixes truncate to update the
size as it goes.

Reported-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
CC: stable@vger.kernel.org
---
 fs/btrfs/inode.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3b6598783be9..ad961a598c99 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1281,7 +1281,8 @@ next_slot:
 			nocow = 1;
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 			extent_end = found_key.offset +
-				btrfs_file_extent_inline_len(leaf, fi);
+				btrfs_file_extent_inline_len(leaf,
+						     path->slots[0], fi);
 			extent_end = ALIGN(extent_end, root->sectorsize);
 		} else {
 			BUG_ON(1);
@@ -4023,7 +4024,7 @@ search_again:
 				    btrfs_file_extent_num_bytes(leaf, fi);
 			} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 				item_end += btrfs_file_extent_inline_len(leaf,
-									 fi);
+							 path->slots[0], fi);
 			}
 			item_end--;
 		}
@@ -4093,6 +4094,12 @@ search_again:
 					inode_sub_bytes(inode, item_end + 1 -
 							new_size);
 				}
+
+				/*
+				 * update the ram bytes to properly reflect
+				 * the new size of our item
+				 */
+				btrfs_set_file_extent_ram_bytes(leaf, fi, size);
 				size =
 				    btrfs_file_extent_calc_inline_size(size);
 				btrfs_truncate_item(root, path, size, 1);
@@ -6162,7 +6169,7 @@ again:
 		       btrfs_file_extent_num_bytes(leaf, item);
 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 		size_t size;
-		size = btrfs_file_extent_inline_len(leaf, item);
+		size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
 		extent_end = ALIGN(extent_start + size, root->sectorsize);
 	}
 next:
@@ -6231,7 +6238,7 @@ next:
 			goto out;
 		}
 
-		size = btrfs_file_extent_inline_len(leaf, item);
+		size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
 		extent_offset = page_offset(page) + pg_offset - extent_start;
 		copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
 				size - extent_offset);
-- 
cgit v1.2.3


From 90d3e592e99b8e374ead2b45148abf506493a959 Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Thu, 9 Jan 2014 17:28:00 -0800
Subject: Btrfs: setup inode location during btrfs_init_inode_locked

We have a race during inode init because the BTRFS_I(inode)->location is setup
after the inode hash table lock is dropped.  btrfs_find_actor uses the location
field, so our search might not find an existing inode in the hash table if we
race with the inode init code.

This commit changes things to setup the location field sooner.  Also the find actor now
uses only the location objectid to match inodes.  For inode hashing, we just
need a unique and stable test, it doesn't have to reflect the inode numbers we
show to userland.

Signed-off-by: Chris Mason <clm@fb.com>
CC: stable@vger.kernel.org
---
 fs/btrfs/inode.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ad961a598c99..fb74a536add3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -61,7 +61,7 @@
 #include "props.h"
 
 struct btrfs_iget_args {
-	u64 ino;
+	struct btrfs_key *location;
 	struct btrfs_root *root;
 };
 
@@ -4977,7 +4977,9 @@ again:
 static int btrfs_init_locked_inode(struct inode *inode, void *p)
 {
 	struct btrfs_iget_args *args = p;
-	inode->i_ino = args->ino;
+	inode->i_ino = args->location->objectid;
+	memcpy(&BTRFS_I(inode)->location, args->location,
+	       sizeof(*args->location));
 	BTRFS_I(inode)->root = args->root;
 	return 0;
 }
@@ -4985,19 +4987,19 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 static int btrfs_find_actor(struct inode *inode, void *opaque)
 {
 	struct btrfs_iget_args *args = opaque;
-	return args->ino == btrfs_ino(inode) &&
+	return args->location->objectid == BTRFS_I(inode)->location.objectid &&
 		args->root == BTRFS_I(inode)->root;
 }
 
 static struct inode *btrfs_iget_locked(struct super_block *s,
-				       u64 objectid,
+				       struct btrfs_key *location,
 				       struct btrfs_root *root)
 {
 	struct inode *inode;
 	struct btrfs_iget_args args;
-	unsigned long hashval = btrfs_inode_hash(objectid, root);
+	unsigned long hashval = btrfs_inode_hash(location->objectid, root);
 
-	args.ino = objectid;
+	args.location = location;
 	args.root = root;
 
 	inode = iget5_locked(s, hashval, btrfs_find_actor,
@@ -5014,13 +5016,11 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
 {
 	struct inode *inode;
 
-	inode = btrfs_iget_locked(s, location->objectid, root);
+	inode = btrfs_iget_locked(s, location, root);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
 	if (inode->i_state & I_NEW) {
-		BTRFS_I(inode)->root = root;
-		memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
 		btrfs_read_locked_inode(inode);
 		if (!is_bad_inode(inode)) {
 			inode_tree_add(inode);
-- 
cgit v1.2.3