From 0a3404dcff29a7589e8d6ce8c36f97c5411f85b4 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 28 Jan 2013 12:34:55 +0000 Subject: Btrfs: fix wrong sync_writers decrement in btrfs_file_aio_write() If the checks at the beginning of btrfs_file_aio_write() fail, we needn't decrease ->sync_writers, because we have not increased it. Fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 841cfe3be0e0..a902faab7161 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1595,9 +1595,10 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, if (err < 0 && num_written > 0) num_written = err; } -out: + if (sync) atomic_dec(&BTRFS_I(inode)->sync_writers); +out: sb_end_write(inode->i_sb); current->backing_dev_info = NULL; return num_written ? num_written : err; -- cgit v1.2.3 From 6f1c36055f96e80031c7fdda3fd5be826b8d7782 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 29 Jan 2013 03:22:10 +0000 Subject: Btrfs: fix race between snapshot deletion and getting inode While running snapshot testscript created by Mitch and David, the race between autodefrag and snapshot deletion can lead to corruption of dead_root list so that we can get crash on btrfs_clean_old_snapshots(). And besides autodefrag, scrub also does the same thing, ie. read root first and get inode. Here is the story(take autodefrag as an example): (1) when we delete a snapshot or subvolume, it will set its root's refs to zero and do a iput() on its own inode, and if this inode happens to be the only active in-meory one in root's inode rbtree, it will add itself to the global dead_roots list for later cleanup. (2) after (1), the autodefrag thread may read another inode for defrag and the inode is just in the deleted snapshot/subvolume, but all of these are without checking if the root is still valid(refs > 0). So the end up result is adding the deleted snapshot/subvolume's root to the global dead_roots list AGAIN. Fortunately, we already have a srcu lock to avoid the race, ie. subvol_srcu. So all we need to do is to take the lock to protect 'read root and get inode', since we synchronize to wait for the rcu grace period before adding something to the global dead_roots list. Reported-by: Mitch Harder Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a902faab7161..b06d289f998f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -293,15 +293,24 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, struct btrfs_key key; struct btrfs_ioctl_defrag_range_args range; int num_defrag; + int index; + int ret; /* get the inode */ key.objectid = defrag->root; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + inode_root = btrfs_read_fs_root_no_name(fs_info, &key); if (IS_ERR(inode_root)) { - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - return PTR_ERR(inode_root); + ret = PTR_ERR(inode_root); + goto cleanup; + } + if (btrfs_root_refs(&inode_root->root_item) == 0) { + ret = -ENOENT; + goto cleanup; } key.objectid = defrag->ino; @@ -309,9 +318,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); if (IS_ERR(inode)) { - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - return PTR_ERR(inode); + ret = PTR_ERR(inode); + goto cleanup; } + srcu_read_unlock(&fs_info->subvol_srcu, index); /* do a chunk of defrag */ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); @@ -346,6 +356,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, iput(inode); return 0; +cleanup: + srcu_read_unlock(&fs_info->subvol_srcu, index); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + return ret; } /* -- cgit v1.2.3 From 2ab28f322f9896782da904f5942f3873432addc8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 12 Oct 2012 15:27:49 -0400 Subject: Btrfs: wait on ordered extents at the last possible moment Since we don't actually copy the extent information from the source tree in the fast case we don't need to wait for ordered io to be completed in order to fsync, we just need to wait for the io to be completed. So when we're logging our file just attach all of the ordered extents to the log, and then when the log syncs just wait for IO_DONE on the ordered extents and then write the super. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b06d289f998f..083abca56055 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1655,16 +1655,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; struct btrfs_trans_handle *trans; + bool full_sync = 0; trace_btrfs_sync_file(file, datasync); /* * We write the dirty pages in the range and wait until they complete * out of the ->i_mutex. If so, we can flush the dirty pages by - * multi-task, and make the performance up. + * multi-task, and make the performance up. See + * btrfs_wait_ordered_range for an explanation of the ASYNC check. */ atomic_inc(&BTRFS_I(inode)->sync_writers); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); atomic_dec(&BTRFS_I(inode)->sync_writers); if (ret) return ret; @@ -1676,7 +1681,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * range being left. */ atomic_inc(&root->log_batch); - btrfs_wait_ordered_range(inode, start, end - start + 1); + full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + if (full_sync) + btrfs_wait_ordered_range(inode, start, end - start + 1); atomic_inc(&root->log_batch); /* @@ -1743,13 +1751,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret != BTRFS_NO_LOG_SYNC) { if (ret > 0) { + /* + * If we didn't already wait for ordered extents we need + * to do that now. + */ + if (!full_sync) + btrfs_wait_ordered_range(inode, start, + end - start + 1); ret = btrfs_commit_transaction(trans, root); } else { ret = btrfs_sync_log(trans, root); - if (ret == 0) + if (ret == 0) { ret = btrfs_end_transaction(trans, root); - else + } else { + if (!full_sync) + btrfs_wait_ordered_range(inode, start, + end - + start + 1); ret = btrfs_commit_transaction(trans, root); + } } } else { ret = btrfs_end_transaction(trans, root); -- cgit v1.2.3 From 55e301fd57a6239ec14b91a1cf2e70b3dd135194 Mon Sep 17 00:00:00 2001 From: Filipe Brandenburger Date: Tue, 29 Jan 2013 06:04:50 +0000 Subject: Btrfs: move fs/btrfs/ioctl.h to include/uapi/linux/btrfs.h The header file will then be installed under /usr/include/linux so that userspace applications can refer to Btrfs ioctls by name and use the same structs used internally in the kernel. Signed-off-by: Filipe Brandenburger Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 083abca56055..13c78ea3ebce 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -30,11 +30,11 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "ioctl.h" #include "print-tree.h" #include "tree-log.h" #include "locking.h" -- cgit v1.2.3 From 87533c475187c1420794a2e164bc67a7974f1327 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 29 Jan 2013 10:14:48 +0000 Subject: Btrfs: use bit operation for ->fs_state There is no lock to protect fs_info->fs_state, it will introduce some problems, such as the value may be covered by the other task when several tasks modify it. For example: Task0 - CPU0 Task1 - CPU1 mov %fs_state rax or $0x1 rax mov %fs_state rax or $0x2 rax mov rax %fs_state mov rax %fs_state The expected value is 3, but in fact, it is 2. Though this problem doesn't happen now (because there is only one flag currently), the code is error prone, if we add other flags, the above problem will happen to a certainty. Now we use bit operation for it to fix the above problem. In this way, we can make the code more robust and be easy to add new flags. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 13c78ea3ebce..75d0fe134be3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1545,7 +1545,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, * although we have opened a file as writable, we have * to stop this write operation to ensure FS consistency. */ - if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { mutex_unlock(&inode->i_mutex); err = -EROFS; goto out; -- cgit v1.2.3 From 569e0f358c0c37f6733702d4a5d2c412860f7169 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 13 Feb 2013 11:09:14 -0500 Subject: Btrfs: place ordered operations on a per transaction list Miao made the ordered operations stuff run async, which introduced a deadlock where we could get somebody (sync) racing in and committing the transaction while a commit was already happening. The new committer would try and flush ordered operations which would hang waiting for the commit to finish because it is done asynchronously and no longer inherits the callers trans handle. To fix this we need to make the ordered operations list a per transaction list. We can get new inodes added to the ordered operation list by truncating them and then having another process writing to them, so this makes it so that anybody trying to add an ordered operation _must_ start a transaction in order to add itself to the list, which will keep new inodes from getting added to the ordered operations list after we start committing. This should fix the deadlock and also keeps us from doing a lot more work than we need to during commit. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 75d0fe134be3..b12ba52c4505 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1628,7 +1628,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp) */ if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, &BTRFS_I(inode)->runtime_flags)) { - btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + + /* + * We need to block on a committing transaction to keep us from + * throwing a ordered operation on to the list and causing + * something like sync to deadlock trying to flush out this + * inode. + */ + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode); + btrfs_end_transaction(trans, root); if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) filemap_flush(inode->i_mapping); } -- cgit v1.2.3 From dc81cdc58ad2f413b96b9004f8d681e5dc554473 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 20 Feb 2013 23:32:52 -0700 Subject: Btrfs: fix remount vs autodefrag If we remount the fs to close the auto defragment or make the fs R/O, we should stop the auto defragment. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/file.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9f67e623206d..6e6dd8cdad92 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -374,6 +374,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) atomic_inc(&fs_info->defrag_running); while(1) { + /* Pause the auto defragger. */ + if (test_bit(BTRFS_FS_STATE_REMOUNTING, + &fs_info->fs_state)) + break; + if (!__need_auto_defrag(fs_info->tree_root)) break; -- cgit v1.2.3 From 496ad9aa8ef448058e36ca7a787c61f2e63f0f54 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 23 Jan 2013 17:07:38 -0500 Subject: new helper: file_inode(file) Signed-off-by: Al Viro --- fs/btrfs/file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 77061bf43edb..4118e0b6e339 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1211,7 +1211,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, struct extent_state *cached_state = NULL; int i; unsigned long index = pos >> PAGE_CACHE_SHIFT; - struct inode *inode = fdentry(file)->d_inode; + struct inode *inode = file_inode(file); gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int err = 0; int faili = 0; @@ -1298,7 +1298,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, struct iov_iter *i, loff_t pos) { - struct inode *inode = fdentry(file)->d_inode; + struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; unsigned long first_index; @@ -1486,7 +1486,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; - struct inode *inode = fdentry(file)->d_inode; + struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; loff_t *ppos = &iocb->ki_pos; u64 start_pos; @@ -2087,7 +2087,7 @@ out: static long btrfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct extent_state *cached_state = NULL; u64 cur_offset; u64 last_byte; -- cgit v1.2.3 From fda2832febb1928da0625b2c5d15559b29d7e740 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Feb 2013 08:10:22 +0000 Subject: btrfs: cleanup for open-coded alignment Though most of the btrfs codes are using ALIGN macro for page alignment, there are still some codes using open-coded alignment like the following: ------ u64 mask = ((u64)root->stripesize - 1); u64 ret = (val + mask) & ~mask; ------ Or even hidden one: ------ num_bytes = (end - start + blocksize) & ~(blocksize - 1); ------ Sometimes these open-coded alignment is not so easy to understand for newbie like me. This commit changes the open-coded alignment to the ALIGN macro for a better readability. Also there is a previous patch from David Sterba with similar changes, but the patch is for 3.2 kernel and seems not merged. http://www.spinics.net/lists/linux-btrfs/msg12747.html Cc: David Sterba Signed-off-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 6e6dd8cdad92..83c790d84038 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -510,8 +510,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, loff_t isize = i_size_read(inode); start_pos = pos & ~((u64)root->sectorsize - 1); - num_bytes = (write_bytes + pos - start_pos + - root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize); end_of_last_block = start_pos + num_bytes - 1; err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, -- cgit v1.2.3 From 3b2775942d6ccb14342f3aae55f22fbbfea8db14 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 15 Mar 2013 08:46:39 -0600 Subject: Btrfs: fix warning of free_extent_map Users report that an extent map's list is still linked when it's actually going to be freed from cache. The story is that a) when we're going to drop an extent map and may split this large one into smaller ems, and if this large one is flagged as EXTENT_FLAG_LOGGING which means that it's on the list to be logged, then the smaller ems split from it will also be flagged as EXTENT_FLAG_LOGGING, and this is _not_ expected. b) we'll keep ems from unlinking the list and freeing when they are flagged with EXTENT_FLAG_LOGGING, because the log code holds one reference. The end result is the warning, but the truth is that we set the flag EXTENT_FLAG_LOGGING only during fsync. So clear flag EXTENT_FLAG_LOGGING for extent maps split from a large one. Reported-by: Johannes Hirte Reported-by: Darrick J. Wong Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 83c790d84038..7bdb47faa12e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -591,6 +591,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, } compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); clear_bit(EXTENT_FLAG_PINNED, &em->flags); + clear_bit(EXTENT_FLAG_LOGGING, &flags); remove_extent_mapping(em_tree, em); if (no_splits) goto next; -- cgit v1.2.3