From e4545de5b035c7debb73d260c78377dbb69cbfb5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 17 Jun 2015 12:49:23 +0100 Subject: Btrfs: fix fsync data loss after append write If we do an append write to a file (which increases its inode's i_size) that does not have the flag BTRFS_INODE_NEEDS_FULL_SYNC set in its inode, and the previous transaction added a new hard link to the file, which sets the flag BTRFS_INODE_COPY_EVERYTHING in the file's inode, and then fsync the file, the inode's new i_size isn't logged. This has the consequence that after the fsync log is replayed, the file size remains what it was before the append write operation, which means users/applications will not be able to read the data that was successsfully fsync'ed before. This happens because neither the inode item nor the delayed inode get their i_size updated when the append write is made - doing so would require starting a transaction in the buffered write path, something that we do not do intentionally for performance reasons. Fix this by making sure that when the flag BTRFS_INODE_COPY_EVERYTHING is set the inode is logged with its current i_size (log the in-memory inode into the log tree). This issue is not a recent regression and is easy to reproduce with the following test case for fstests: seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" here=`pwd` tmp=/tmp/$$ status=1 # failure is the default! _cleanup() { _cleanup_flakey rm -f $tmp.* } trap "_cleanup; exit \$status" 0 1 2 3 15 # get standard environment, filters and checks . ./common/rc . ./common/filter . ./common/dmflakey # real QA test starts here _supported_fs generic _supported_os Linux _need_to_be_root _require_scratch _require_dm_flakey _require_metadata_journaling $SCRATCH_DEV _crash_and_mount() { # Simulate a crash/power loss. _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey # Allow writes again and mount. This makes the fs replay its fsync log. _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey } rm -f $seqres.full _scratch_mkfs >> $seqres.full 2>&1 _init_flakey _mount_flakey # Create the test file with some initial data and then fsync it. # The fsync here is only needed to trigger the issue in btrfs, as it causes the # the flag BTRFS_INODE_NEEDS_FULL_SYNC to be removed from the btrfs inode. $XFS_IO_PROG -f -c "pwrite -S 0xaa 0 32k" \ -c "fsync" \ $SCRATCH_MNT/foo | _filter_xfs_io sync # Add a hard link to our file. # On btrfs this sets the flag BTRFS_INODE_COPY_EVERYTHING on the btrfs inode, # which is a necessary condition to trigger the issue. ln $SCRATCH_MNT/foo $SCRATCH_MNT/bar # Sync the filesystem to force a commit of the current btrfs transaction, this # is a necessary condition to trigger the bug on btrfs. sync # Now append more data to our file, increasing its size, and fsync the file. # In btrfs because the inode flag BTRFS_INODE_COPY_EVERYTHING was set and the # write path did not update the inode item in the btree nor the delayed inode # item (in memory struture) in the current transaction (created by the fsync # handler), the fsync did not record the inode's new i_size in the fsync # log/journal. This made the data unavailable after the fsync log/journal is # replayed. $XFS_IO_PROG -c "pwrite -S 0xbb 32K 32K" \ -c "fsync" \ $SCRATCH_MNT/foo | _filter_xfs_io echo "File content after fsync and before crash:" od -t x1 $SCRATCH_MNT/foo _crash_and_mount echo "File content after crash and log replay:" od -t x1 $SCRATCH_MNT/foo status=0 exit The expected file output before and after the crash/power failure expects the appended data to be available, which is: 0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa * 0100000 bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb * 0200000 Cc: stable@vger.kernel.org Signed-off-by: Filipe Manana Reviewed-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1ce80c1c4eb6..76c4f9d1b80a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4155,6 +4155,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, u64 ino = btrfs_ino(inode); struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; u64 logged_isize = 0; + bool need_log_inode_item = true; path = btrfs_alloc_path(); if (!path) @@ -4263,11 +4264,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } else { if (inode_only == LOG_INODE_ALL) fast_search = true; - ret = log_inode_item(trans, log, dst_path, inode); - if (ret) { - err = ret; - goto out_unlock; - } goto log_extents; } @@ -4290,6 +4286,9 @@ again: if (min_key.type > max_key.type) break; + if (min_key.type == BTRFS_INODE_ITEM_KEY) + need_log_inode_item = false; + src = path->nodes[0]; if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { ins_nr++; @@ -4360,6 +4359,11 @@ next_slot: log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); + if (need_log_inode_item) { + err = log_inode_item(trans, log, dst_path, inode); + if (err) + goto out_unlock; + } if (fast_search) { /* * Some ordered extents started by fsync might have completed -- cgit v1.2.3 From 36283bf777d963fac099213297e155d071096994 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sat, 20 Jun 2015 00:44:51 +0100 Subject: Btrfs: fix fsync xattr loss in the fast fsync path After commit 4f764e515361 ("Btrfs: remove deleted xattrs on fsync log replay"), we can end up in a situation where during log replay we end up deleting xattrs that were never deleted when their file was last fsynced. This happens in the fast fsync path (flag BTRFS_INODE_NEEDS_FULL_SYNC is not set in the inode) if the inode has the flag BTRFS_INODE_COPY_EVERYTHING set, the xattr was added in a past transaction and the leaf where the xattr is located was not updated (COWed or created) in the current transaction. In this scenario the xattr item never ends up in the log tree and therefore at log replay time, which makes the replay code delete the xattr from the fs/subvol tree as it thinks that xattr was deleted prior to the last fsync. Fix this by always logging all xattrs, which is the simplest and most reliable way to detect deleted xattrs and replay the deletes at log replay time. This issue is reproducible with the following test case for fstests: seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" here=`pwd` tmp=/tmp/$$ status=1 # failure is the default! _cleanup() { _cleanup_flakey rm -f $tmp.* } trap "_cleanup; exit \$status" 0 1 2 3 15 # get standard environment, filters and checks . ./common/rc . ./common/filter . ./common/dmflakey . ./common/attr # real QA test starts here # We create a lot of xattrs for a single file. Only btrfs and xfs are currently # able to store such a large mount of xattrs per file, other filesystems such # as ext3/4 and f2fs for example, fail with ENOSPC even if we attempt to add # less than 1000 xattrs with very small values. _supported_fs btrfs xfs _supported_os Linux _need_to_be_root _require_scratch _require_dm_flakey _require_attrs _require_metadata_journaling $SCRATCH_DEV rm -f $seqres.full _scratch_mkfs >> $seqres.full 2>&1 _init_flakey _mount_flakey # Create the test file with some initial data and make sure everything is # durably persisted. $XFS_IO_PROG -f -c "pwrite -S 0xaa 0 32k" $SCRATCH_MNT/foo | _filter_xfs_io sync # Add many small xattrs to our file. # We create such a large amount because it's needed to trigger the issue found # in btrfs - we need to have an amount that causes the fs to have at least 3 # btree leafs with xattrs stored in them, and it must work on any leaf size # (maximum leaf/node size is 64Kb). num_xattrs=2000 for ((i = 1; i <= $num_xattrs; i++)); do name="user.attr_$(printf "%04d" $i)" $SETFATTR_PROG -n $name -v "val_$(printf "%04d" $i)" $SCRATCH_MNT/foo done # Sync the filesystem to force a commit of the current btrfs transaction, this # is a necessary condition to trigger the bug on btrfs. sync # Now update our file's data and fsync the file. # After a successful fsync, if the fsync log/journal is replayed we expect to # see all the xattrs we added before with the same values (and the updated file # data of course). Btrfs used to delete some of these xattrs when it replayed # its fsync log/journal. $XFS_IO_PROG -c "pwrite -S 0xbb 8K 16K" \ -c "fsync" \ $SCRATCH_MNT/foo | _filter_xfs_io # Simulate a crash/power loss. _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey # Allow writes again and mount. This makes the fs replay its fsync log. _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey echo "File content after crash and log replay:" od -t x1 $SCRATCH_MNT/foo echo "File xattrs after crash and log replay:" for ((i = 1; i <= $num_xattrs; i++)); do name="user.attr_$(printf "%04d" $i)" echo -n "$name=" $GETFATTR_PROG --absolute-names -n $name --only-values $SCRATCH_MNT/foo echo done status=0 exit The golden output expects all xattrs to be available, and with the correct values, after the fsync log is replayed. Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 76c4f9d1b80a..66f87156882f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4117,6 +4117,86 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode, return 0; } +/* + * At the moment we always log all xattrs. This is to figure out at log replay + * time which xattrs must have their deletion replayed. If a xattr is missing + * in the log tree and exists in the fs/subvol tree, we delete it. This is + * because if a xattr is deleted, the inode is fsynced and a power failure + * happens, causing the log to be replayed the next time the fs is mounted, + * we want the xattr to not exist anymore (same behaviour as other filesystems + * with a journal, ext3/4, xfs, f2fs, etc). + */ +static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path) +{ + int ret; + struct btrfs_key key; + const u64 ino = btrfs_ino(inode); + int ins_nr = 0; + int start_slot = 0; + + key.objectid = ino; + key.type = BTRFS_XATTR_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + + while (true) { + int slot = path->slots[0]; + struct extent_buffer *leaf = path->nodes[0]; + int nritems = btrfs_header_nritems(leaf); + + if (slot >= nritems) { + if (ins_nr > 0) { + u64 last_extent = 0; + + ret = copy_items(trans, inode, dst_path, path, + &last_extent, start_slot, + ins_nr, 1, 0); + /* can't be 1, extent items aren't processed */ + ASSERT(ret <= 0); + if (ret < 0) + return ret; + ins_nr = 0; + } + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) + break; + + if (ins_nr == 0) + start_slot = slot; + ins_nr++; + path->slots[0]++; + cond_resched(); + } + if (ins_nr > 0) { + u64 last_extent = 0; + + ret = copy_items(trans, inode, dst_path, path, + &last_extent, start_slot, + ins_nr, 1, 0); + /* can't be 1, extent items aren't processed */ + ASSERT(ret <= 0); + if (ret < 0) + return ret; + } + + return 0; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -4289,6 +4369,25 @@ again: if (min_key.type == BTRFS_INODE_ITEM_KEY) need_log_inode_item = false; + /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ + if (min_key.type == BTRFS_XATTR_ITEM_KEY) { + if (ins_nr == 0) + goto next_slot; + ret = copy_items(trans, inode, dst_path, path, + &last_extent, ins_start_slot, + ins_nr, inode_only, logged_isize); + if (ret < 0) { + err = ret; + goto out_unlock; + } + ins_nr = 0; + if (ret) { + btrfs_release_path(path); + continue; + } + goto next_slot; + } + src = path->nodes[0]; if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { ins_nr++; @@ -4356,6 +4455,11 @@ next_slot: ins_nr = 0; } + btrfs_release_path(path); + btrfs_release_path(dst_path); + err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); + if (err) + goto out_unlock; log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); -- cgit v1.2.3 From a89ca6f24ffe435edad57de02eaabd37a2c6bff6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 25 Jun 2015 04:17:46 +0100 Subject: Btrfs: fix fsync after truncate when no_holes feature is enabled When we have the no_holes feature enabled, if a we truncate a file to a smaller size, truncate it again but to a size greater than or equals to its original size and fsync it, the log tree will not have any information about the hole covering the range [truncate_1_offset, new_file_size[. Which means if the fsync log is replayed, the file will remain with the state it had before both truncate operations. Without the no_holes feature this does not happen, since when the inode is logged (full sync flag is set) it will find in the fs/subvol tree a leaf with a generation matching the current transaction id that has an explicit extent item representing the hole. Fix this by adding an explicit extent item representing a hole between the last extent and the inode's i_size if we are doing a full sync. The issue is easy to reproduce with the following test case for fstests: . ./common/rc . ./common/filter . ./common/dmflakey _need_to_be_root _supported_fs generic _supported_os Linux _require_scratch _require_dm_flakey # This test was motivated by an issue found in btrfs when the btrfs # no-holes feature is enabled (introduced in kernel 3.14). So enable # the feature if the fs being tested is btrfs. if [ $FSTYP == "btrfs" ]; then _require_btrfs_fs_feature "no_holes" _require_btrfs_mkfs_feature "no-holes" MKFS_OPTIONS="$MKFS_OPTIONS -O no-holes" fi rm -f $seqres.full _scratch_mkfs >>$seqres.full 2>&1 _init_flakey _mount_flakey # Create our test files and make sure everything is durably persisted. $XFS_IO_PROG -f -c "pwrite -S 0xaa 0 64K" \ -c "pwrite -S 0xbb 64K 61K" \ $SCRATCH_MNT/foo | _filter_xfs_io $XFS_IO_PROG -f -c "pwrite -S 0xee 0 64K" \ -c "pwrite -S 0xff 64K 61K" \ $SCRATCH_MNT/bar | _filter_xfs_io sync # Now truncate our file foo to a smaller size (64Kb) and then truncate # it to the size it had before the shrinking truncate (125Kb). Then # fsync our file. If a power failure happens after the fsync, we expect # our file to have a size of 125Kb, with the first 64Kb of data having # the value 0xaa and the second 61Kb of data having the value 0x00. $XFS_IO_PROG -c "truncate 64K" \ -c "truncate 125K" \ -c "fsync" \ $SCRATCH_MNT/foo # Do something similar to our file bar, but the first truncation sets # the file size to 0 and the second truncation expands the size to the # double of what it was initially. $XFS_IO_PROG -c "truncate 0" \ -c "truncate 253K" \ -c "fsync" \ $SCRATCH_MNT/bar _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey # Allow writes again, mount to trigger log replay and validate file # contents. _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey # We expect foo to have a size of 125Kb, the first 64Kb of data all # having the value 0xaa and the remaining 61Kb to be a hole (all bytes # with value 0x00). echo "File foo content after log replay:" od -t x1 $SCRATCH_MNT/foo # We expect bar to have a size of 253Kb and no extents (any byte read # from bar has the value 0x00). echo "File bar content after log replay:" od -t x1 $SCRATCH_MNT/bar status=0 exit The expected file contents in the golden output are: File foo content after log replay: 0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa * 0200000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 0372000 File bar content after log replay: 0000000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 0772000 Without this fix, their contents are: File foo content after log replay: 0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa * 0200000 bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb * 0372000 File bar content after log replay: 0000000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee * 0200000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff * 0372000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 0772000 A test case submission for fstests follows soon. Signed-off-by: Filipe Manana Reviewed-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 66f87156882f..9c45431e69ab 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4197,6 +4197,107 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, return 0; } +/* + * If the no holes feature is enabled we need to make sure any hole between the + * last extent and the i_size of our inode is explicitly marked in the log. This + * is to make sure that doing something like: + * + * 1) create file with 128Kb of data + * 2) truncate file to 64Kb + * 3) truncate file to 256Kb + * 4) fsync file + * 5) + * 6) mount fs and trigger log replay + * + * Will give us a file with a size of 256Kb, the first 64Kb of data match what + * the file had in its first 64Kb of data at step 1 and the last 192Kb of the + * file correspond to a hole. The presence of explicit holes in a log tree is + * what guarantees that log replay will remove/adjust file extent items in the + * fs/subvol tree. + * + * Here we do not need to care about holes between extents, that is already done + * by copy_items(). We also only need to do this in the full sync path, where we + * lookup for extents from the fs/subvol tree only. In the fast path case, we + * lookup the list of modified extent maps and if any represents a hole, we + * insert a corresponding extent representing a hole in the log tree. + */ +static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path) +{ + int ret; + struct btrfs_key key; + u64 hole_start; + u64 hole_size; + struct extent_buffer *leaf; + struct btrfs_root *log = root->log_root; + const u64 ino = btrfs_ino(inode); + const u64 i_size = i_size_read(inode); + + if (!btrfs_fs_incompat(root->fs_info, NO_HOLES)) + return 0; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ASSERT(ret != 0); + if (ret < 0) + return ret; + + ASSERT(path->slots[0] > 0); + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { + /* inode does not have any extents */ + hole_start = 0; + hole_size = i_size; + } else { + struct btrfs_file_extent_item *extent; + u64 len; + + /* + * If there's an extent beyond i_size, an explicit hole was + * already inserted by copy_items(). + */ + if (key.offset >= i_size) + return 0; + + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, extent) == + BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_inline_len(leaf, + path->slots[0], + extent); + ASSERT(len == i_size); + return 0; + } + + len = btrfs_file_extent_num_bytes(leaf, extent); + /* Last extent goes beyond i_size, no need to log a hole. */ + if (key.offset + len > i_size) + return 0; + hole_start = key.offset + len; + hole_size = i_size - hole_start; + } + btrfs_release_path(path); + + /* Last extent ends at i_size. */ + if (hole_size == 0) + return 0; + + hole_size = ALIGN(hole_size, root->sectorsize); + ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, + hole_size, 0, hole_size, 0, 0, 0); + return ret; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -4460,6 +4561,13 @@ next_slot: err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); if (err) goto out_unlock; + if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { + btrfs_release_path(path); + btrfs_release_path(dst_path); + err = btrfs_log_trailing_hole(trans, root, inode, path); + if (err) + goto out_unlock; + } log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); -- cgit v1.2.3