From 67710892ec983aa79ad1e2a2642fe8e3a4a194ea Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 6 Jun 2016 11:51:25 +0100
Subject: Btrfs: be more precise on errors when getting an inode from disk

When we attempt to read an inode from disk, we end up always returning an
-ESTALE error to the caller regardless of the actual failure reason, which
can be an out of memory problem (when allocating a path), some error found
when reading from the fs/subvolume btree (like a genuine IO error) or the
inode does not exists. So lets start returning the real error code to the
callers so that they don't treat all -ESTALE errors as meaning that the
inode does not exists (such as during orphan cleanup). This will also be
needed for a subsequent patch in the same series dealing with a special
fsync case.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/inode.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3e61bd1f3f65..f9686541997b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3428,10 +3428,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 		found_key.offset = 0;
 		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
 		ret = PTR_ERR_OR_ZERO(inode);
-		if (ret && ret != -ESTALE)
+		if (ret && ret != -ENOENT)
 			goto out;
 
-		if (ret == -ESTALE && root == root->fs_info->tree_root) {
+		if (ret == -ENOENT && root == root->fs_info->tree_root) {
 			struct btrfs_root *dead_root;
 			struct btrfs_fs_info *fs_info = root->fs_info;
 			int is_dead_root = 0;
@@ -3467,7 +3467,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 		 * Inode is already gone but the orphan item is still there,
 		 * kill the orphan item.
 		 */
-		if (ret == -ESTALE) {
+		if (ret == -ENOENT) {
 			trans = btrfs_start_transaction(root, 1);
 			if (IS_ERR(trans)) {
 				ret = PTR_ERR(trans);
@@ -3626,7 +3626,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 /*
  * read an inode from the btree into the in-memory inode
  */
-static void btrfs_read_locked_inode(struct inode *inode)
+static int btrfs_read_locked_inode(struct inode *inode)
 {
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
@@ -3645,14 +3645,19 @@ static void btrfs_read_locked_inode(struct inode *inode)
 		filled = true;
 
 	path = btrfs_alloc_path();
-	if (!path)
+	if (!path) {
+		ret = -ENOMEM;
 		goto make_bad;
+	}
 
 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
 
 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
-	if (ret)
+	if (ret) {
+		if (ret > 0)
+			ret = -ENOENT;
 		goto make_bad;
+	}
 
 	leaf = path->nodes[0];
 
@@ -3805,11 +3810,12 @@ cache_acl:
 	}
 
 	btrfs_update_iflags(inode);
-	return;
+	return 0;
 
 make_bad:
 	btrfs_free_path(path);
 	make_bad_inode(inode);
+	return ret;
 }
 
 /*
@@ -5595,7 +5601,9 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
 		return ERR_PTR(-ENOMEM);
 
 	if (inode->i_state & I_NEW) {
-		btrfs_read_locked_inode(inode);
+		int ret;
+
+		ret = btrfs_read_locked_inode(inode);
 		if (!is_bad_inode(inode)) {
 			inode_tree_add(inode);
 			unlock_new_inode(inode);
@@ -5604,7 +5612,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
 		} else {
 			unlock_new_inode(inode);
 			iput(inode);
-			inode = ERR_PTR(-ESTALE);
+			ASSERT(ret < 0);
+			inode = ERR_PTR(ret < 0 ? ret : -ESTALE);
 		}
 	}
 
-- 
cgit v1.2.3


From 44f714dae50a2e795d3268a6831762aa6fa54f55 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 6 Jun 2016 16:11:13 +0100
Subject: Btrfs: improve performance on fsync against new inode after
 rename/unlink

With commit 56f23fdbb600 ("Btrfs: fix file/data loss caused by fsync after
rename and new inode") we got simple fix for a functional issue when the
following sequence of actions is done:

  at transaction N
  create file A at directory D
  at transaction N + M (where M >= 1)
  move/rename existing file A from directory D to directory E
  create a new file named A at directory D
  fsync the new file
  power fail

The solution was to simply detect such scenario and fallback to a full
transaction commit when we detect it. However this turned out to had a
significant impact on throughput (and a bit on latency too) for benchmarks
using the dbench tool, which simulates real workloads from smbd (Samba)
servers. For example on a test vm (with a debug kernel):

Unpatched:
Throughput 19.1572 MB/sec  32 clients  32 procs  max_latency=1005.229 ms

Patched:
Throughput 23.7015 MB/sec  32 clients  32 procs  max_latency=809.206 ms

The patched results (this patch is applied) are similar to the results of
a kernel with the commit 56f23fdbb600 ("Btrfs: fix file/data loss caused
by fsync after rename and new inode") reverted.

This change avoids the fallback to a transaction commit and instead makes
sure all the names of the conflicting inode (the one that had a name in a
past transaction that matches the name of the new file in the same parent
directory) are logged so that at log replay time we don't lose neither the
new file nor the old file, and the old file gets the name it was renamed
to.

This also ends up avoiding a full transaction commit for a similar case
that involves an unlink instead of a rename of the old file:

  at transaction N
  create file A at directory D
  at transaction N + M (where M >= 1)
  remove file A
  create a new file named A at directory D
  fsync the new file
  power fail

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/inode.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f9686541997b..453b9d0da5f1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4203,6 +4203,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	int err = 0;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_trans_handle *trans;
+	u64 last_unlink_trans;
 
 	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 		return -ENOTEMPTY;
@@ -4225,11 +4226,27 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (err)
 		goto out;
 
+	last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
+
 	/* now the directory is empty */
 	err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry),
 				 dentry->d_name.name, dentry->d_name.len);
-	if (!err)
+	if (!err) {
 		btrfs_i_size_write(inode, 0);
+		/*
+		 * Propagate the last_unlink_trans value of the deleted dir to
+		 * its parent directory. This is to prevent an unrecoverable
+		 * log tree in the case we do something like this:
+		 * 1) create dir foo
+		 * 2) create snapshot under dir foo
+		 * 3) delete the snapshot
+		 * 4) rmdir foo
+		 * 5) mkdir foo
+		 * 6) fsync foo or some file inside foo
+		 */
+		if (last_unlink_trans >= trans->transid)
+			BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
+	}
 out:
 	btrfs_end_transaction(trans, root);
 	btrfs_btree_balance_dirty(root);
-- 
cgit v1.2.3