diff options
Diffstat (limited to 'fs/btrfs')
33 files changed, 410 insertions, 238 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index ff0b0be92d61..5456937836b8 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -265,16 +265,17 @@ out: } } -static void run_ordered_work(struct __btrfs_workqueue *wq) +static void run_ordered_work(struct __btrfs_workqueue *wq, + struct btrfs_work *self) { struct list_head *list = &wq->ordered_list; struct btrfs_work *work; spinlock_t *lock = &wq->list_lock; unsigned long flags; + void *wtag; + bool free_self = false; while (1) { - void *wtag; - spin_lock_irqsave(lock, flags); if (list_empty(list)) break; @@ -300,16 +301,47 @@ static void run_ordered_work(struct __btrfs_workqueue *wq) list_del(&work->ordered_list); spin_unlock_irqrestore(lock, flags); - /* - * We don't want to call the ordered free functions with the - * lock held though. Save the work as tag for the trace event, - * because the callback could free the structure. - */ - wtag = work; - work->ordered_free(work); - trace_btrfs_all_work_done(wq->fs_info, wtag); + if (work == self) { + /* + * This is the work item that the worker is currently + * executing. + * + * The kernel workqueue code guarantees non-reentrancy + * of work items. I.e., if a work item with the same + * address and work function is queued twice, the second + * execution is blocked until the first one finishes. A + * work item may be freed and recycled with the same + * work function; the workqueue code assumes that the + * original work item cannot depend on the recycled work + * item in that case (see find_worker_executing_work()). + * + * Note that the work of one Btrfs filesystem may depend + * on the work of another Btrfs filesystem via, e.g., a + * loop device. Therefore, we must not allow the current + * work item to be recycled until we are really done, + * otherwise we break the above assumption and can + * deadlock. + */ + free_self = true; + } else { + /* + * We don't want to call the ordered free functions with + * the lock held though. Save the work as tag for the + * trace event, because the callback could free the + * structure. + */ + wtag = work; + work->ordered_free(work); + trace_btrfs_all_work_done(wq->fs_info, wtag); + } } spin_unlock_irqrestore(lock, flags); + + if (free_self) { + wtag = self; + self->ordered_free(self); + trace_btrfs_all_work_done(wq->fs_info, wtag); + } } static void normal_work_helper(struct btrfs_work *work) @@ -337,7 +369,7 @@ static void normal_work_helper(struct btrfs_work *work) work->func(work); if (need_order) { set_bit(WORK_DONE_BIT, &work->flags); - run_ordered_work(wq); + run_ordered_work(wq, work); } if (!need_order) trace_btrfs_all_work_done(wq->fs_info, wtag); @@ -415,3 +447,11 @@ void btrfs_set_work_high_priority(struct btrfs_work *work) { set_bit(WORK_HIGH_PRIO_BIT, &work->flags); } + +void btrfs_flush_workqueue(struct btrfs_workqueue *wq) +{ + if (wq->high) + flush_workqueue(wq->high->normal_wq); + + flush_workqueue(wq->normal->normal_wq); +} diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 1f9597355c9d..a0f6986806a4 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -85,4 +85,6 @@ void btrfs_set_work_high_priority(struct btrfs_work *work); struct btrfs_fs_info *btrfs_work_owner(struct btrfs_work *work); struct btrfs_fs_info *btrfs_workqueue_owner(struct __btrfs_workqueue *wq); bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq); +void btrfs_flush_workqueue(struct btrfs_workqueue *wq); + #endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 85dc7ab8f89e..2973d256bb44 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2018,13 +2018,19 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, extent_item_objectid); if (!search_commit_root) { - trans = btrfs_join_transaction(fs_info->extent_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); + trans = btrfs_attach_transaction(fs_info->extent_root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT && + PTR_ERR(trans) != -EROFS) + return PTR_ERR(trans); + trans = NULL; + } + } + + if (trans) btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); - } else { + else down_read(&fs_info->commit_root_sem); - } ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, tree_mod_seq_elem.seq, &refs, @@ -2056,7 +2062,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, free_leaf_list(refs); out: - if (!search_commit_root) { + if (trans) { btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); btrfs_end_transaction(trans, fs_info->extent_root); } else { diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index c94d3390cbfc..b5ebb43b1824 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -331,26 +331,6 @@ struct tree_mod_elem { struct tree_mod_root old_root; }; -static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info) -{ - read_lock(&fs_info->tree_mod_log_lock); -} - -static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info) -{ - read_unlock(&fs_info->tree_mod_log_lock); -} - -static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info) -{ - write_lock(&fs_info->tree_mod_log_lock); -} - -static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info) -{ - write_unlock(&fs_info->tree_mod_log_lock); -} - /* * Pull a new tree mod seq number for our operation. */ @@ -370,14 +350,12 @@ static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) { - tree_mod_log_write_lock(fs_info); - spin_lock(&fs_info->tree_mod_seq_lock); + write_lock(&fs_info->tree_mod_log_lock); if (!elem->seq) { elem->seq = btrfs_inc_tree_mod_seq(fs_info); list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); } - spin_unlock(&fs_info->tree_mod_seq_lock); - tree_mod_log_write_unlock(fs_info); + write_unlock(&fs_info->tree_mod_log_lock); return elem->seq; } @@ -396,7 +374,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, if (!seq_putting) return; - spin_lock(&fs_info->tree_mod_seq_lock); + write_lock(&fs_info->tree_mod_log_lock); list_del(&elem->list); elem->seq = 0; @@ -407,29 +385,27 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, * blocker with lower sequence number exists, we * cannot remove anything from the log */ - spin_unlock(&fs_info->tree_mod_seq_lock); + write_unlock(&fs_info->tree_mod_log_lock); return; } min_seq = cur_elem->seq; } } - spin_unlock(&fs_info->tree_mod_seq_lock); /* * anything that's lower than the lowest existing (read: blocked) * sequence number can be removed from the tree. */ - tree_mod_log_write_lock(fs_info); tm_root = &fs_info->tree_mod_log; for (node = rb_first(tm_root); node; node = next) { next = rb_next(node); tm = container_of(node, struct tree_mod_elem, node); - if (tm->seq > min_seq) + if (tm->seq >= min_seq) continue; rb_erase(node, tm_root); kfree(tm); } - tree_mod_log_write_unlock(fs_info); + write_unlock(&fs_info->tree_mod_log_lock); } /* @@ -440,7 +416,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, * for root replace operations, or the logical address of the affected * block for all other operations. * - * Note: must be called with write lock (tree_mod_log_write_lock). + * Note: must be called with write lock for fs_info::tree_mod_log_lock. */ static noinline int __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) @@ -480,7 +456,7 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it * returns zero with the tree_mod_log_lock acquired. The caller must hold * this until all tree mod log insertions are recorded in the rb tree and then - * call tree_mod_log_write_unlock() to release. + * write unlock fs_info::tree_mod_log_lock. */ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { @@ -490,9 +466,9 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, if (eb && btrfs_header_level(eb) == 0) return 1; - tree_mod_log_write_lock(fs_info); + write_lock(&fs_info->tree_mod_log_lock); if (list_empty(&(fs_info)->tree_mod_seq_list)) { - tree_mod_log_write_unlock(fs_info); + write_unlock(&fs_info->tree_mod_log_lock); return 1; } @@ -556,7 +532,7 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, } ret = __tree_mod_log_insert(fs_info, tm); - tree_mod_log_write_unlock(fs_info); + write_unlock(&eb->fs_info->tree_mod_log_lock); if (ret) kfree(tm); @@ -620,7 +596,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, ret = __tree_mod_log_insert(fs_info, tm); if (ret) goto free_tms; - tree_mod_log_write_unlock(fs_info); + write_unlock(&eb->fs_info->tree_mod_log_lock); kfree(tm_list); return 0; @@ -631,7 +607,7 @@ free_tms: kfree(tm_list[i]); } if (locked) - tree_mod_log_write_unlock(fs_info); + write_unlock(&eb->fs_info->tree_mod_log_lock); kfree(tm_list); kfree(tm); @@ -712,7 +688,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, if (!ret) ret = __tree_mod_log_insert(fs_info, tm); - tree_mod_log_write_unlock(fs_info); + write_unlock(&fs_info->tree_mod_log_lock); if (ret) goto free_tms; kfree(tm_list); @@ -739,7 +715,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, struct tree_mod_elem *cur = NULL; struct tree_mod_elem *found = NULL; - tree_mod_log_read_lock(fs_info); + read_lock(&fs_info->tree_mod_log_lock); tm_root = &fs_info->tree_mod_log; node = tm_root->rb_node; while (node) { @@ -767,7 +743,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, break; } } - tree_mod_log_read_unlock(fs_info); + read_unlock(&fs_info->tree_mod_log_lock); return found; } @@ -848,7 +824,7 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, goto free_tms; } - tree_mod_log_write_unlock(fs_info); + write_unlock(&fs_info->tree_mod_log_lock); kfree(tm_list); return 0; @@ -860,7 +836,7 @@ free_tms: kfree(tm_list[i]); } if (locked) - tree_mod_log_write_unlock(fs_info); + write_unlock(&fs_info->tree_mod_log_lock); kfree(tm_list); return ret; @@ -920,7 +896,7 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) goto free_tms; ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems); - tree_mod_log_write_unlock(fs_info); + write_unlock(&eb->fs_info->tree_mod_log_lock); if (ret) goto free_tms; kfree(tm_list); @@ -1271,7 +1247,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, unsigned long p_size = sizeof(struct btrfs_key_ptr); n = btrfs_header_nritems(eb); - tree_mod_log_read_lock(fs_info); + read_lock(&fs_info->tree_mod_log_lock); while (tm && tm->seq >= time_seq) { /* * all the operations are recorded with the operator used for @@ -1326,7 +1302,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, if (tm->logical != first_tm->logical) break; } - tree_mod_log_read_unlock(fs_info); + read_unlock(&fs_info->tree_mod_log_lock); btrfs_set_header_nritems(eb, n); } @@ -1406,6 +1382,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) struct tree_mod_elem *tm; struct extent_buffer *eb = NULL; struct extent_buffer *eb_root; + u64 eb_root_owner = 0; struct extent_buffer *old; struct tree_mod_root *old_root = NULL; u64 old_generation = 0; @@ -1439,6 +1416,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) free_extent_buffer(old); } } else if (old_root) { + eb_root_owner = btrfs_header_owner(eb_root); btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); eb = alloc_dummy_extent_buffer(root->fs_info, logical, @@ -1457,7 +1435,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) if (old_root) { btrfs_set_header_bytenr(eb, eb->start); btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); - btrfs_set_header_owner(eb, btrfs_header_owner(eb_root)); + btrfs_set_header_owner(eb, eb_root_owner); btrfs_set_header_level(eb, old_root->level); btrfs_set_header_generation(eb, old_generation); } @@ -2971,6 +2949,10 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, again: b = get_old_root(root, time_seq); + if (!b) { + ret = -EIO; + goto done; + } level = btrfs_header_level(b); p->locks[level] = BTRFS_READ_LOCK; @@ -5465,6 +5447,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, advance_left = advance_right = 0; while (1) { + cond_resched(); if (advance_left && !left_end_reached) { ret = tree_advance(left_root, left_path, &left_level, left_root_level, diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a423c36bcd72..2bc37d03d407 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -851,14 +851,12 @@ struct btrfs_fs_info { struct list_head delayed_iputs; struct mutex cleaner_delayed_iput_mutex; - /* this protects tree_mod_seq_list */ - spinlock_t tree_mod_seq_lock; atomic64_t tree_mod_seq; - struct list_head tree_mod_seq_list; - /* this protects tree_mod_log */ + /* this protects tree_mod_log and tree_mod_seq_list */ rwlock_t tree_mod_log_lock; struct rb_root tree_mod_log; + struct list_head tree_mod_seq_list; atomic_t nr_async_submits; atomic_t async_submit_draining; diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 8d93854a4b4f..c1ca4ce11e69 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -193,8 +193,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, ref->in_tree = 0; btrfs_put_delayed_ref(ref); atomic_dec(&delayed_refs->num_entries); - if (trans->delayed_ref_updates) - trans->delayed_ref_updates--; } static bool merge_ref(struct btrfs_trans_handle *trans, @@ -281,7 +279,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, if (head->is_data) return; - spin_lock(&fs_info->tree_mod_seq_lock); + read_lock(&fs_info->tree_mod_log_lock); if (!list_empty(&fs_info->tree_mod_seq_list)) { struct seq_list *elem; @@ -289,7 +287,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, struct seq_list, list); seq = elem->seq; } - spin_unlock(&fs_info->tree_mod_seq_lock); + read_unlock(&fs_info->tree_mod_log_lock); ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, list); @@ -317,7 +315,7 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem; int ret = 0; - spin_lock(&fs_info->tree_mod_seq_lock); + read_lock(&fs_info->tree_mod_log_lock); if (!list_empty(&fs_info->tree_mod_seq_list)) { elem = list_first_entry(&fs_info->tree_mod_seq_list, struct seq_list, list); @@ -331,7 +329,7 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, } } - spin_unlock(&fs_info->tree_mod_seq_lock); + read_unlock(&fs_info->tree_mod_log_lock); return ret; } @@ -445,7 +443,6 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, add_tail: list_add_tail(&ref->list, &href->ref_list); atomic_inc(&root->num_entries); - trans->delayed_ref_updates++; spin_unlock(&href->lock); return ret; } diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index fb973cc0af66..395b07764269 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -511,18 +511,27 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, } btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1); - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); - return PTR_ERR(trans); + while (1) { + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans, root); + WARN_ON(ret); + mutex_lock(&uuid_mutex); + /* keep away write_all_supers() during the finishing procedure */ + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + mutex_lock(&root->fs_info->chunk_mutex); + if (src_device->has_pending_chunks) { + mutex_unlock(&root->fs_info->chunk_mutex); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&uuid_mutex); + } else { + break; + } } - ret = btrfs_commit_transaction(trans, root); - WARN_ON(ret); - mutex_lock(&uuid_mutex); - /* keep away write_all_supers() during the finishing procedure */ - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); - mutex_lock(&root->fs_info->chunk_mutex); btrfs_dev_replace_lock(dev_replace, 1); dev_replace->replace_state = scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9d3352fe8dc9..1de017051928 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1712,8 +1712,8 @@ static void end_workqueue_fn(struct btrfs_work *work) bio->bi_error = end_io_wq->error; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; - kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); bio_endio(bio); + kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); } static int cleaner_kthread(void *arg) @@ -2104,7 +2104,7 @@ static void free_root_extent_buffers(struct btrfs_root *root) } /* helper to cleanup tree roots */ -static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) +static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) { free_root_extent_buffers(info->tree_root); @@ -2113,7 +2113,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) free_root_extent_buffers(info->csum_root); free_root_extent_buffers(info->quota_root); free_root_extent_buffers(info->uuid_root); - if (chunk_root) + if (free_chunk_root) free_root_extent_buffers(info->chunk_root); free_root_extent_buffers(info->free_space_root); } @@ -2519,7 +2519,6 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->free_chunk_lock); - spin_lock_init(&fs_info->tree_mod_seq_lock); spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->qgroup_op_lock); spin_lock_init(&fs_info->buffer_lock); @@ -2980,6 +2979,7 @@ retry_root_backup: /* do not make disk changes in broken FS or nologreplay is given */ if (btrfs_super_log_root(disk_super) != 0 && !btrfs_test_opt(tree_root->fs_info, NOLOGREPLAY)) { + btrfs_info(fs_info, "start tree-log replay"); ret = btrfs_replay_log(fs_info, fs_devices); if (ret) { err = ret; @@ -3136,7 +3136,7 @@ fail_block_groups: btrfs_free_block_groups(fs_info); fail_tree_roots: - free_root_pointers(fs_info, 1); + free_root_pointers(fs_info, true); invalidate_inode_pages2(fs_info->btree_inode->i_mapping); fail_sb_buffer: @@ -3165,7 +3165,7 @@ recovery_tree_root: if (!btrfs_test_opt(tree_root->fs_info, USEBACKUPROOT)) goto fail_tree_roots; - free_root_pointers(fs_info, 0); + free_root_pointers(fs_info, false); /* don't use the log in recovery mode, it won't be valid */ btrfs_set_super_log_root(disk_super, 0); @@ -3825,6 +3825,19 @@ void close_ctree(struct btrfs_root *root) */ btrfs_delete_unused_bgs(root->fs_info); + /* + * There might be existing delayed inode workers still running + * and holding an empty delayed inode item. We must wait for + * them to complete first because they can create a transaction. + * This happens when someone calls btrfs_balance_delayed_items() + * and then a transaction commit runs the same delayed nodes + * before any delayed worker has done something with the nodes. + * We must wait for any worker here and not at transaction + * commit time since that could cause a deadlock. + * This is a very rare case. + */ + btrfs_flush_workqueue(fs_info->delayed_workers); + ret = btrfs_commit_super(root); if (ret) btrfs_err(fs_info, "commit super ret %d", ret); @@ -3862,7 +3875,7 @@ void close_ctree(struct btrfs_root *root) btrfs_stop_all_workers(fs_info); clear_bit(BTRFS_FS_OPEN, &fs_info->flags); - free_root_pointers(fs_info, 1); + free_root_pointers(fs_info, true); iput(fs_info->btree_inode); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7938c48c72ff..538f378eea52 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7571,6 +7571,14 @@ search: */ if ((flags & extra) && !(block_group->flags & extra)) goto loop; + + /* + * This block group has different flags than we want. + * It's possible that we have MIXED_GROUP flag but no + * block group is mixed. Just skip such block group. + */ + btrfs_release_block_group(block_group, delalloc); + continue; } have_block_group: @@ -10317,6 +10325,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) btrfs_err(info, "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", cache->key.objectid); + btrfs_put_block_group(cache); ret = -EINVAL; goto error; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4d901200be13..1372d3e5d90b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4060,6 +4060,14 @@ retry: */ scanned = 1; index = 0; + + /* + * If we're looping we could run into a page that is locked by a + * writer and that writer could be waiting on writeback for a + * page in our current bio, and thus deadlock, so flush the + * write bio here. + */ + flush_write_bio(data); goto retry; } @@ -4994,12 +5002,14 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, return eb; eb = alloc_dummy_extent_buffer(fs_info, start, nodesize); if (!eb) - return NULL; + return ERR_PTR(-ENOMEM); eb->fs_info = fs_info; again: ret = radix_tree_preload(GFP_NOFS); - if (ret) + if (ret) { + exists = ERR_PTR(ret); goto free_eb; + } spin_lock(&fs_info->buffer_lock); ret = radix_tree_insert(&fs_info->buffer_radix, start >> PAGE_SHIFT, eb); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 26f9ac719d20..4f59b4089eb0 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -227,6 +227,17 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) struct extent_map *merge = NULL; struct rb_node *rb; + /* + * We can't modify an extent map that is in the tree and that is being + * used by another task, as it can cause that other task to see it in + * inconsistent state during the merging. We always have 1 reference for + * the tree and 1 for this task (which is unpinning the extent map or + * clearing the logging flag), so anything > 2 means it's being used by + * other tasks too. + */ + if (atomic_read(&em->refs) > 2) + return; + if (em->start != 0) { rb = rb_prev(&em->rb_node); if (rb) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 437544846e4e..03661b744eaf 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1555,6 +1555,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, break; } + only_release_metadata = false; sector_offset = pos & (root->sectorsize - 1); reserve_bytes = round_up(write_bytes + sector_offset, root->sectorsize); @@ -1704,7 +1705,6 @@ again: set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, EXTENT_NORESERVE, NULL, NULL, GFP_NOFS); - only_release_metadata = false; } btrfs_drop_pages(pages, num_pages); @@ -1952,6 +1952,18 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) u64 len; /* + * If the inode needs a full sync, make sure we use a full range to + * avoid log tree corruption, due to hole detection racing with ordered + * extent completion for adjacent ranges, and assertion failures during + * hole detection. + */ + if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags)) { + start = 0; + end = LLONG_MAX; + } + + /* * The range length can be represented by u64, we have to do the typecasts * to avoid signed overflow if it's [0, LLONG_MAX] eg. from fsync() */ @@ -2634,6 +2646,11 @@ out_only_mutex: * for detecting, at fsync time, if the inode isn't yet in the * log tree or it's there but not up to date. */ + struct timespec now = current_time(inode); + + inode_inc_iversion(inode); + inode->i_mtime = now; + inode->i_ctime = now; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { err = PTR_ERR(trans); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 69a3c11af9d4..a84a1ceb260a 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -391,6 +391,12 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode if (uptodate && !PageUptodate(page)) { btrfs_readpage(NULL, page); lock_page(page); + if (page->mapping != inode->i_mapping) { + btrfs_err(BTRFS_I(inode)->root->fs_info, + "free space cache page truncated"); + io_ctl_drop_pages(io_ctl); + return -EIO; + } if (!PageUptodate(page)) { btrfs_err(BTRFS_I(inode)->root->fs_info, "error reading free space cache"); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index d27014b8bf72..075b59516c8c 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -159,6 +159,7 @@ static void start_caching(struct btrfs_root *root) spin_lock(&root->ino_cache_lock); root->ino_cache_state = BTRFS_CACHE_FINISHED; spin_unlock(&root->ino_cache_lock); + wake_up(&root->ino_cache_wait); return; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a609d2017a63..f56610ad64bd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5576,7 +5576,6 @@ static void inode_tree_del(struct inode *inode) spin_unlock(&root->inode_lock); if (empty && btrfs_root_refs(&root->root_item) == 0) { - synchronize_srcu(&root->fs_info->subvol_srcu); spin_lock(&root->inode_lock); empty = RB_EMPTY_ROOT(&root->inode_tree); spin_unlock(&root->inode_lock); @@ -9596,9 +9595,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, return -EXDEV; /* close the race window with snapshot create/destroy ioctl */ - if (old_ino == BTRFS_FIRST_FREE_OBJECTID) - down_read(&root->fs_info->subvol_sem); - if (new_ino == BTRFS_FIRST_FREE_OBJECTID) + if (old_ino == BTRFS_FIRST_FREE_OBJECTID || + new_ino == BTRFS_FIRST_FREE_OBJECTID) down_read(&dest->fs_info->subvol_sem); /* @@ -9615,6 +9613,9 @@ static int btrfs_rename_exchange(struct inode *old_dir, goto out_notrans; } + if (dest != root) + btrfs_record_root_in_trans(trans, dest); + /* * We need to find a free sequence number both in the source and * in the destination directory for the exchange. @@ -9781,9 +9782,8 @@ out_fail: ret2 = btrfs_end_transaction(trans, root); ret = ret ? ret : ret2; out_notrans: - if (new_ino == BTRFS_FIRST_FREE_OBJECTID) - up_read(&dest->fs_info->subvol_sem); - if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + if (new_ino == BTRFS_FIRST_FREE_OBJECTID || + old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&root->fs_info->subvol_sem); return ret; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 242584a0d3b5..eefe103c65da 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -385,6 +385,16 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + /* + * If the fs is mounted with nologreplay, which requires it to be + * mounted in RO mode as well, we can not allow discard on free space + * inside block groups, because log trees refer to extents that are not + * pinned in a block group's free space cache (pinning the extents is + * precisely the first phase of replaying a log tree). + */ + if (btrfs_test_opt(fs_info, NOLOGREPLAY)) + return -EROFS; + rcu_read_lock(); list_for_each_entry_rcu(device, &fs_info->fs_devices->devices, dev_list) { @@ -600,12 +610,18 @@ static noinline int create_subvol(struct inode *dir, btrfs_i_size_write(dir, dir->i_size + namelen * 2); ret = btrfs_update_inode(trans, root, dir); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, objectid, root->root_key.objectid, btrfs_ino(dir), index, name, namelen); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root, root_item->uuid, BTRFS_UUID_KEY_SUBVOL, diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index b2d1e95de7be..7dc2284017fa 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -837,10 +837,15 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) } btrfs_start_ordered_extent(inode, ordered, 1); end = ordered->file_offset; + /* + * If the ordered extent had an error save the error but don't + * exit without waiting first for all other ordered extents in + * the range to complete. + */ if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) ret = -EIO; btrfs_put_ordered_extent(ordered); - if (ret || end == 0 || end == start) + if (end == 0 || end == start) break; end--; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index f25233093d68..0355e6d9e21c 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -759,10 +759,10 @@ out: return ret; } -static int update_qgroup_status_item(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_root *root) +static int update_qgroup_status_item(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *quota_root = fs_info->quota_root; struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *l; @@ -778,7 +778,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); if (ret > 0) ret = -ENOENT; @@ -1863,7 +1863,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; spin_unlock(&fs_info->qgroup_lock); - ret = update_qgroup_status_item(trans, fs_info, quota_root); + ret = update_qgroup_status_item(trans); if (ret) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; @@ -2380,9 +2380,6 @@ out: btrfs_free_path(path); mutex_lock(&fs_info->qgroup_rescan_lock); - if (!btrfs_fs_closing(fs_info)) - fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; - if (err > 0 && fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; @@ -2398,16 +2395,30 @@ out: trans = btrfs_start_transaction(fs_info->quota_root, 1); if (IS_ERR(trans)) { err = PTR_ERR(trans); + trans = NULL; btrfs_err(fs_info, "fail to start transaction for status update: %d\n", err); - goto done; } - ret = update_qgroup_status_item(trans, fs_info, fs_info->quota_root); - if (ret < 0) { - err = ret; - btrfs_err(fs_info, "fail to update qgroup status: %d", err); + + mutex_lock(&fs_info->qgroup_rescan_lock); + if (!btrfs_fs_closing(fs_info)) + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + if (trans) { + ret = update_qgroup_status_item(trans); + if (ret < 0) { + err = ret; + btrfs_err(fs_info, "fail to update qgroup status: %d", + err); + } } + fs_info->qgroup_rescan_running = false; + complete_all(&fs_info->qgroup_rescan_completion); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + if (!trans) + return; + btrfs_end_transaction(trans, fs_info->quota_root); if (btrfs_fs_closing(fs_info)) { @@ -2418,12 +2429,6 @@ out: } else { btrfs_err(fs_info, "qgroup scan failed with %d", err); } - -done: - mutex_lock(&fs_info->qgroup_rescan_lock); - fs_info->qgroup_rescan_running = false; - mutex_unlock(&fs_info->qgroup_rescan_lock); - complete_all(&fs_info->qgroup_rescan_completion); } /* diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index af6a776fa18c..5aa07de5750e 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2395,8 +2395,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, bitmap_clear(rbio->dbitmap, pagenr, 1); kunmap(p); - for (stripe = 0; stripe < rbio->real_stripes; stripe++) + for (stripe = 0; stripe < nr_data; stripe++) kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); + kunmap(p_page); } __free_page(p_page); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 75bab76739be..0d1565d71231 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -734,21 +734,19 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, static void reada_start_machine_worker(struct btrfs_work *work) { struct reada_machine_work *rmw; - struct btrfs_fs_info *fs_info; int old_ioprio; rmw = container_of(work, struct reada_machine_work, work); - fs_info = rmw->fs_info; - - kfree(rmw); old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current), task_nice_ioprio(current)); set_task_ioprio(current, BTRFS_IOPRIO_READA); - __reada_start_machine(fs_info); + __reada_start_machine(rmw->fs_info); set_task_ioprio(current, old_ioprio); - atomic_dec(&fs_info->reada_works_cnt); + atomic_dec(&rmw->fs_info->reada_works_cnt); + + kfree(rmw); } static void __reada_start_machine(struct btrfs_fs_info *fs_info) @@ -759,6 +757,7 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info) u64 total = 0; int i; +again: do { enqueued = 0; mutex_lock(&fs_devices->device_list_mutex); @@ -771,6 +770,10 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info) mutex_unlock(&fs_devices->device_list_mutex); total += enqueued; } while (enqueued && total < 10000); + if (fs_devices->seed) { + fs_devices = fs_devices->seed; + goto again; + } if (enqueued == 0) return; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b0c3a6afe664..1003b983a8d7 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -537,8 +537,8 @@ static int should_ignore_root(struct btrfs_root *root) if (!reloc_root) return 0; - if (btrfs_root_last_snapshot(&reloc_root->root_item) == - root->fs_info->running_transaction->transid - 1) + if (btrfs_header_generation(reloc_root->commit_root) == + root->fs_info->running_transaction->transid) return 0; /* * if there is reloc tree and it was created in previous @@ -1185,7 +1185,7 @@ out: free_backref_node(cache, lower); } - free_backref_node(cache, node); + remove_backref_node(cache, node); return ERR_PTR(err); } ASSERT(!node || !node->detached); @@ -1296,7 +1296,7 @@ static int __must_check __add_reloc_root(struct btrfs_root *root) if (!node) return -ENOMEM; - node->bytenr = root->node->start; + node->bytenr = root->commit_root->start; node->data = root; spin_lock(&rc->reloc_root_tree.lock); @@ -1328,10 +1328,11 @@ static void __del_reloc_root(struct btrfs_root *root) if (rc && root->node) { spin_lock(&rc->reloc_root_tree.lock); rb_node = tree_search(&rc->reloc_root_tree.rb_root, - root->node->start); + root->commit_root->start); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); + RB_CLEAR_NODE(&node->rb_node); } spin_unlock(&rc->reloc_root_tree.lock); if (!node) @@ -1349,7 +1350,7 @@ static void __del_reloc_root(struct btrfs_root *root) * helper to update the 'address of tree root -> reloc tree' * mapping */ -static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) +static int __update_reloc_root(struct btrfs_root *root) { struct rb_node *rb_node; struct mapping_node *node = NULL; @@ -1357,7 +1358,7 @@ static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) spin_lock(&rc->reloc_root_tree.lock); rb_node = tree_search(&rc->reloc_root_tree.rb_root, - root->node->start); + root->commit_root->start); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); @@ -1369,7 +1370,7 @@ static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) BUG_ON((struct btrfs_root *)node->data != root); spin_lock(&rc->reloc_root_tree.lock); - node->bytenr = new_bytenr; + node->bytenr = root->node->start; rb_node = tree_insert(&rc->reloc_root_tree.rb_root, node->bytenr, &node->rb_node); spin_unlock(&rc->reloc_root_tree.lock); @@ -1519,6 +1520,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, } if (reloc_root->commit_root != reloc_root->node) { + __update_reloc_root(reloc_root); btrfs_set_root_node(root_item, reloc_root->node); free_extent_buffer(reloc_root->commit_root); reloc_root->commit_root = btrfs_root_node(reloc_root); @@ -2457,7 +2459,21 @@ out: free_reloc_roots(&reloc_roots); } - BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); + /* + * We used to have + * + * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); + * + * here, but it's wrong. If we fail to start the transaction in + * prepare_to_merge() we will have only 0 ref reloc roots, none of which + * have actually been removed from the reloc_root_tree rb tree. This is + * fine because we're bailing here, and we hold a reference on the root + * for the list that holds it, so these roots will be cleaned up when we + * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root + * will be cleaned up on unmount. + * + * The remaining nodes will be cleaned up by free_reloc_control. + */ } static void free_block_list(struct rb_root *blocks) @@ -4587,6 +4603,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) reloc_root->root_key.offset); if (IS_ERR(fs_root)) { err = PTR_ERR(fs_root); + list_add_tail(&reloc_root->root_list, &reloc_roots); goto out_free; } @@ -4702,11 +4719,6 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, BUG_ON(rc->stage == UPDATE_DATA_PTRS && root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - if (buf == root->node) - __update_reloc_root(root, cow->start); - } - level = btrfs_header_level(buf); if (btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item)) diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index edae751e870c..307b8baaf0e9 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -144,10 +144,8 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root return -ENOMEM; ret = btrfs_search_slot(trans, root, key, path, 0, 1); - if (ret < 0) { - btrfs_abort_transaction(trans, ret); + if (ret < 0) goto out; - } if (ret != 0) { btrfs_print_leaf(root, path->nodes[0]); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index a45f26ac5da7..edfc7ba38b33 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -37,6 +37,14 @@ #include "compression.h" /* + * Maximum number of references an extent can have in order for us to attempt to + * issue clone operations instead of write operations. This currently exists to + * avoid hitting limitations of the backreference walking code (taking a lot of + * time and using too much memory for extents with large number of references). + */ +#define SEND_MAX_EXTENT_REFS 64 + +/* * A fs_path is a helper to dynamically build path names with unknown size. * It reallocates the internal buffer on demand. * It allows fast adding of path elements on the right side (normal path) and @@ -1327,6 +1335,7 @@ static int find_extent_clone(struct send_ctx *sctx, struct clone_root *cur_clone_root; struct btrfs_key found_key; struct btrfs_path *tmp_path; + struct btrfs_extent_item *ei; int compressed; u32 i; @@ -1376,7 +1385,6 @@ static int find_extent_clone(struct send_ctx *sctx, ret = extent_from_logical(fs_info, disk_byte, tmp_path, &found_key, &flags); up_read(&fs_info->commit_root_sem); - btrfs_release_path(tmp_path); if (ret < 0) goto out; @@ -1385,6 +1393,21 @@ static int find_extent_clone(struct send_ctx *sctx, goto out; } + ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0], + struct btrfs_extent_item); + /* + * Backreference walking (iterate_extent_inodes() below) is currently + * too expensive when an extent has a large number of references, both + * in time spent and used memory. So for now just fallback to write + * operations instead of clone operations when an extent has more than + * a certain amount of references. + */ + if (btrfs_extent_refs(tmp_path->nodes[0], ei) > SEND_MAX_EXTENT_REFS) { + ret = -ENOENT; + goto out; + } + btrfs_release_path(tmp_path); + /* * Setup the clone roots. */ @@ -5835,68 +5858,21 @@ static int changed_extent(struct send_ctx *sctx, { int ret = 0; - if (sctx->cur_ino != sctx->cmp_key->objectid) { - - if (result == BTRFS_COMPARE_TREE_CHANGED) { - struct extent_buffer *leaf_l; - struct extent_buffer *leaf_r; - struct btrfs_file_extent_item *ei_l; - struct btrfs_file_extent_item *ei_r; - - leaf_l = sctx->left_path->nodes[0]; - leaf_r = sctx->right_path->nodes[0]; - ei_l = btrfs_item_ptr(leaf_l, - sctx->left_path->slots[0], - struct btrfs_file_extent_item); - ei_r = btrfs_item_ptr(leaf_r, - sctx->right_path->slots[0], - struct btrfs_file_extent_item); - - /* - * We may have found an extent item that has changed - * only its disk_bytenr field and the corresponding - * inode item was not updated. This case happens due to - * very specific timings during relocation when a leaf - * that contains file extent items is COWed while - * relocation is ongoing and its in the stage where it - * updates data pointers. So when this happens we can - * safely ignore it since we know it's the same extent, - * but just at different logical and physical locations - * (when an extent is fully replaced with a new one, we - * know the generation number must have changed too, - * since snapshot creation implies committing the current - * transaction, and the inode item must have been updated - * as well). - * This replacement of the disk_bytenr happens at - * relocation.c:replace_file_extents() through - * relocation.c:btrfs_reloc_cow_block(). - */ - if (btrfs_file_extent_generation(leaf_l, ei_l) == - btrfs_file_extent_generation(leaf_r, ei_r) && - btrfs_file_extent_ram_bytes(leaf_l, ei_l) == - btrfs_file_extent_ram_bytes(leaf_r, ei_r) && - btrfs_file_extent_compression(leaf_l, ei_l) == - btrfs_file_extent_compression(leaf_r, ei_r) && - btrfs_file_extent_encryption(leaf_l, ei_l) == - btrfs_file_extent_encryption(leaf_r, ei_r) && - btrfs_file_extent_other_encoding(leaf_l, ei_l) == - btrfs_file_extent_other_encoding(leaf_r, ei_r) && - btrfs_file_extent_type(leaf_l, ei_l) == - btrfs_file_extent_type(leaf_r, ei_r) && - btrfs_file_extent_disk_bytenr(leaf_l, ei_l) != - btrfs_file_extent_disk_bytenr(leaf_r, ei_r) && - btrfs_file_extent_disk_num_bytes(leaf_l, ei_l) == - btrfs_file_extent_disk_num_bytes(leaf_r, ei_r) && - btrfs_file_extent_offset(leaf_l, ei_l) == - btrfs_file_extent_offset(leaf_r, ei_r) && - btrfs_file_extent_num_bytes(leaf_l, ei_l) == - btrfs_file_extent_num_bytes(leaf_r, ei_r)) - return 0; - } - - inconsistent_snapshot_error(sctx, result, "extent"); - return -EIO; - } + /* + * We have found an extent item that changed without the inode item + * having changed. This can happen either after relocation (where the + * disk_bytenr of an extent item is replaced at + * relocation.c:replace_file_extents()) or after deduplication into a + * file in both the parent and send snapshots (where an extent item can + * get modified or replaced with a new one). Note that deduplication + * updates the inode item, but it only changes the iversion (sequence + * field in the inode item) of the inode, so if a file is deduplicated + * the same amount of times in both the parent and send snapshots, its + * iversion becames the same in both snapshots, whence the inode item is + * the same on both snapshots. + */ + if (sctx->cur_ino != sctx->cmp_key->objectid) + return 0; if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { if (result != BTRFS_COMPARE_TREE_DELETED) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a7b69deb6d70..9286603a6a98 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1809,6 +1809,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } if (btrfs_super_log_root(fs_info->super_copy) != 0) { + btrfs_warn(fs_info, + "mount required to replay tree-log, cannot remount read-write"); ret = -EINVAL; goto restore; } @@ -2164,7 +2166,15 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) */ thresh = 4 * 1024 * 1024; - if (!mixed && total_free_meta - thresh < block_rsv->size) + /* + * We only want to claim there's no available space if we can no longer + * allocate chunks for our metadata profile and our global reserve will + * not fit in the free metadata space. If we aren't ->full then we + * still can allocate chunks and thus are fine using the currently + * calculated f_bavail. + */ + if (!mixed && block_rsv->space_info->full && + total_free_meta - thresh < block_rsv->size) buf->f_bavail = 0; buf->f_type = BTRFS_SUPER_MAGIC; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 1f157fba8940..510cad48e519 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -751,7 +751,12 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, fs_devs->fsid_kobj.kset = btrfs_kset; error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, parent, "%pU", fs_devs->fsid); - return error; + if (error) { + kobject_put(&fs_devs->fsid_kobj); + return error; + } + + return 0; } int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index bf62ad919a95..9edc2674b8a7 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -112,7 +112,6 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void) spin_lock_init(&fs_info->qgroup_op_lock); spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); - spin_lock_init(&fs_info->tree_mod_seq_lock); mutex_init(&fs_info->qgroup_ioctl_lock); mutex_init(&fs_info->qgroup_rescan_lock); rwlock_init(&fs_info->tree_mod_log_lock); diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index a724d9a79bd2..5e3b875d87e2 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -476,9 +476,9 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, root->node = alloc_test_extent_buffer(root->fs_info, nodesize, nodesize); - if (!root->node) { - test_msg("Couldn't allocate dummy buffer\n"); - ret = -ENOMEM; + if (IS_ERR(root->node)) { + test_msg("couldn't allocate dummy buffer\n"); + ret = PTR_ERR(root->node); goto out; } btrfs_set_header_level(root->node, 0); diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 9c6666692341..e0aa6b9786fa 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -488,9 +488,9 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) */ root->node = alloc_test_extent_buffer(root->fs_info, nodesize, nodesize); - if (!root->node) { + if (IS_ERR(root->node)) { test_msg("Couldn't allocate dummy buffer\n"); - ret = -ENOMEM; + ret = PTR_ERR(root->node); goto out; } btrfs_set_header_level(root->node, 0); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index fd6c74662e9a..31df020634cd 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1917,6 +1917,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_transaction *prev_trans = NULL; int ret; + /* + * Some places just start a transaction to commit it. We need to make + * sure that if this commit fails that the abort code actually marks the + * transaction as failed, so set trans->dirty to make the abort code do + * the right thing. + */ + trans->dirty = true; + /* Stop the commit early if ->aborted is set */ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 47d11a30bee7..f79682937faf 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2827,6 +2827,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, log->log_transid = root->log_transid; root->log_start_pid = 0; /* + * Update or create log root item under the root's log_mutex to prevent + * races with concurrent log syncs that can lead to failure to update + * log root item because it was not created yet. + */ + ret = update_log_root(trans, log); + /* * IO has been started, blocks of the log tree have WRITTEN flag set * in their headers. new modifications of the log will be written to * new positions. so it's safe to allow log writers to go in. @@ -2845,8 +2851,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&log_root_tree->log_mutex); - ret = update_log_root(trans, log); - mutex_lock(&log_root_tree->log_mutex); if (atomic_dec_and_test(&log_root_tree->log_writers)) { /* @@ -3343,9 +3347,16 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - /* find the first key from this transaction again */ + /* + * Find the first key from this transaction again. See the note for + * log_new_dir_dentries, if we're logging a directory recursively we + * won't be holding its i_mutex, which means we can modify the directory + * while we're logging it. If we remove an entry between our first + * search and this search we'll not find the key again and can just + * bail. + */ ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); - if (WARN_ON(ret != 0)) + if (ret != 0) goto done; /* @@ -4432,13 +4443,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, extent) == - BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_inline_len(leaf, - path->slots[0], - extent); - ASSERT(len == i_size); + BTRFS_FILE_EXTENT_INLINE) return 0; - } len = btrfs_file_extent_num_bytes(leaf, extent); /* Last extent goes beyond i_size, no need to log a hole. */ @@ -4835,7 +4841,7 @@ again: err = btrfs_log_inode(trans, root, other_inode, LOG_OTHER_INODE, 0, LLONG_MAX, ctx); - iput(other_inode); + btrfs_add_delayed_iput(other_inode); if (err) goto out_unlock; else @@ -5253,7 +5259,7 @@ process_leaf: } if (btrfs_inode_in_log(di_inode, trans->transid)) { - iput(di_inode); + btrfs_add_delayed_iput(di_inode); break; } @@ -5265,7 +5271,7 @@ process_leaf: if (!ret && btrfs_must_commit_transaction(trans, di_inode)) ret = 1; - iput(di_inode); + btrfs_add_delayed_iput(di_inode); if (ret) goto next_dir_inode; if (ctx->log_new_dentries) { @@ -5411,7 +5417,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (!ret && ctx && ctx->log_new_dentries) ret = log_new_dir_dentries(trans, root, dir_inode, ctx); - iput(dir_inode); + btrfs_add_delayed_iput(dir_inode); if (ret) goto out; } @@ -5691,9 +5697,28 @@ again: wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); if (IS_ERR(wc.replay_dest)) { ret = PTR_ERR(wc.replay_dest); + + /* + * We didn't find the subvol, likely because it was + * deleted. This is ok, simply skip this log and go to + * the next one. + * + * We need to exclude the root because we can't have + * other log replays overwriting this log as we'll read + * it back in a few more times. This will keep our + * block from being modified, and we'll just bail for + * each subsequent pass. + */ + if (ret == -ENOENT) + ret = btrfs_pin_extent_for_log_replay(fs_info->extent_root, + log->node->start, + log->node->len); free_extent_buffer(log->node); free_extent_buffer(log->commit_root); kfree(log); + + if (!ret) + goto next; btrfs_handle_fs_error(fs_info, ret, "Couldn't read target root for tree log recovery."); goto error; @@ -5725,7 +5750,6 @@ again: &root->highest_objectid); } - key.offset = found_key.offset - 1; wc.replay_dest->log_root = NULL; free_extent_buffer(log->node); free_extent_buffer(log->commit_root); @@ -5733,9 +5757,10 @@ again: if (ret) goto error; - +next: if (found_key.offset == 0) break; + key.offset = found_key.offset - 1; } btrfs_release_path(path); diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 83bb2f2aa83c..ee1c76cf8886 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -335,6 +335,8 @@ again_search_slot: } if (ret < 0 && ret != -ENOENT) goto out; + key.offset++; + goto again_search_slot; } item_size -= sizeof(subid_le); offset += sizeof(subid_le); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c063ac57c30e..70aa22a8a9cc 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4876,6 +4876,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, for (i = 0; i < map->num_stripes; i++) { num_bytes = map->stripes[i].dev->bytes_used + stripe_size; btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); + map->stripes[i].dev->has_pending_chunks = true; } spin_lock(&extent_root->fs_info->free_chunk_lock); @@ -5071,8 +5072,7 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map) if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_DUP)) { + BTRFS_BLOCK_GROUP_RAID5)) { max_errors = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { max_errors = 2; @@ -7250,6 +7250,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, for (i = 0; i < map->num_stripes; i++) { dev = map->stripes[i].dev; dev->commit_bytes_used = dev->bytes_used; + dev->has_pending_chunks = false; } } unlock_chunks(root); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 9c09aa29d6bd..96c1b847def6 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -62,6 +62,11 @@ struct btrfs_device { spinlock_t io_lock ____cacheline_aligned; int running_pending; + /* When true means this device has pending chunk alloc in + * current transaction. Protected by chunk_mutex. + */ + bool has_pending_chunks; + /* regular prio bios */ struct btrfs_pending_bios pending_bios; /* WRITE_SYNC bios */ @@ -307,7 +312,6 @@ struct btrfs_bio { u64 map_type; /* get from map_lookup->type */ bio_end_io_t *end_io; struct bio *orig_bio; - unsigned long flags; void *private; atomic_t error; int max_errors; |