From 19fd294957e426bfdd8e19085096467ec18df5c4 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 Jun 2011 10:47:30 +0000 Subject: btrfs: fix wrong reservation when doing delayed inode operations We have migrated the space for the delayed inode items from trans_block_rsv to global_block_rsv, but we forgot to set trans->block_rsv to global_block_rsv when we doing delayed inode operations, and the following Oops happened: [ 9792.654889] ------------[ cut here ]------------ [ 9792.654898] WARNING: at fs/btrfs/extent-tree.c:5681 btrfs_alloc_free_block+0xca/0x27c [btrfs]() [ 9792.654899] Hardware name: To Be Filled By O.E.M. [ 9792.654900] Modules linked in: btrfs zlib_deflate libcrc32c ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_filter ip6_tables arc4 rt61pci rt2x00pci rt2x00lib snd_hda_codec_hdmi mac80211 snd_hda_codec_realtek cfg80211 snd_hda_intel edac_core snd_seq rfkill pcspkr serio_raw snd_hda_codec eeprom_93cx6 edac_mce_amd sp5100_tco i2c_piix4 k10temp snd_hwdep snd_seq_device snd_pcm floppy r8169 xhci_hcd mii snd_timer snd soundcore snd_page_alloc ipv6 firewire_ohci pata_acpi ata_generic firewire_core pata_via crc_itu_t radeon ttm drm_kms_helper drm i2c_algo_bit i2c_core [last unloaded: scsi_wait_scan] [ 9792.654919] Pid: 2762, comm: rm Tainted: G W 2.6.39+ #1 [ 9792.654920] Call Trace: [ 9792.654922] [] warn_slowpath_common+0x83/0x9b [ 9792.654925] [] warn_slowpath_null+0x1a/0x1c [ 9792.654933] [] btrfs_alloc_free_block+0xca/0x27c [btrfs] [ 9792.654945] [] ? map_extent_buffer+0x6e/0xa8 [btrfs] [ 9792.654953] [] __btrfs_cow_block+0xfc/0x30c [btrfs] [ 9792.654963] [] ? btrfs_buffer_uptodate+0x47/0x58 [btrfs] [ 9792.654970] [] ? read_block_for_search+0x94/0x368 [btrfs] [ 9792.654978] [] btrfs_cow_block+0xfe/0x146 [btrfs] [ 9792.654986] [] btrfs_search_slot+0x14d/0x4b6 [btrfs] [ 9792.654997] [] ? map_extent_buffer+0x6e/0xa8 [btrfs] [ 9792.655022] [] btrfs_lookup_inode+0x2f/0x8f [btrfs] [ 9792.655025] [] ? _cond_resched+0xe/0x22 [ 9792.655027] [] ? mutex_lock+0x29/0x50 [ 9792.655039] [] btrfs_update_delayed_inode+0x72/0x137 [btrfs] [ 9792.655051] [] btrfs_run_delayed_items+0x90/0xdb [btrfs] [ 9792.655062] [] btrfs_commit_transaction+0x228/0x654 [btrfs] [ 9792.655064] [] ? remove_wait_queue+0x3a/0x3a [ 9792.655075] [] btrfs_evict_inode+0x14d/0x202 [btrfs] [ 9792.655077] [] evict+0x71/0x111 [ 9792.655079] [] iput+0x12a/0x132 [ 9792.655081] [] do_unlinkat+0x106/0x155 [ 9792.655083] [] ? path_put+0x1f/0x23 [ 9792.655085] [] ? audit_syscall_entry+0x145/0x171 [ 9792.655087] [] ? putname+0x34/0x36 [ 9792.655090] [] sys_unlinkat+0x29/0x2b [ 9792.655092] [] system_call_fastpath+0x16/0x1b [ 9792.655093] ---[ end trace 02b696eb02b3f768 ]--- This patch fix it by setting the reservation of the transaction handle to the correct one. Reported-by: Josef Bacik Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/delayed-inode.c') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 6462c29d2d37..fc515b787e8c 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -297,7 +297,6 @@ struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) item->data_len = data_len; item->ins_or_del = 0; item->bytes_reserved = 0; - item->block_rsv = NULL; item->delayed_node = NULL; atomic_set(&item->refs, 1); } @@ -593,10 +592,8 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, num_bytes = btrfs_calc_trans_metadata_size(root, 1); ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (!ret) { + if (!ret) item->bytes_reserved = num_bytes; - item->block_rsv = dst_rsv; - } return ret; } @@ -604,10 +601,13 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, struct btrfs_delayed_item *item) { + struct btrfs_block_rsv *rsv; + if (!item->bytes_reserved) return; - btrfs_block_rsv_release(root, item->block_rsv, + rsv = &root->fs_info->global_block_rsv; + btrfs_block_rsv_release(root, rsv, item->bytes_reserved); } @@ -1014,6 +1014,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_delayed_root *delayed_root; struct btrfs_delayed_node *curr_node, *prev_node; struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; int ret = 0; path = btrfs_alloc_path(); @@ -1021,6 +1022,9 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, return -ENOMEM; path->leave_spinning = 1; + block_rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->global_block_rsv; + delayed_root = btrfs_get_delayed_root(root); curr_node = btrfs_first_delayed_node(delayed_root); @@ -1045,6 +1049,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, } btrfs_free_path(path); + trans->block_rsv = block_rsv; return ret; } @@ -1052,6 +1057,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_delayed_node *node) { struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; int ret; path = btrfs_alloc_path(); @@ -1059,6 +1065,9 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, return -ENOMEM; path->leave_spinning = 1; + block_rsv = trans->block_rsv; + trans->block_rsv = &node->root->fs_info->global_block_rsv; + ret = btrfs_insert_delayed_items(trans, path, node->root, node); if (!ret) ret = btrfs_delete_delayed_items(trans, path, node->root, node); @@ -1066,6 +1075,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, ret = btrfs_update_delayed_inode(trans, node->root, path, node); btrfs_free_path(path); + trans->block_rsv = block_rsv; return ret; } @@ -1116,6 +1126,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) struct btrfs_path *path; struct btrfs_delayed_node *delayed_node = NULL; struct btrfs_root *root; + struct btrfs_block_rsv *block_rsv; unsigned long nr = 0; int need_requeue = 0; int ret; @@ -1134,6 +1145,9 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) if (IS_ERR(trans)) goto free_path; + block_rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->global_block_rsv; + ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); if (!ret) ret = btrfs_delete_delayed_items(trans, path, root, @@ -1176,6 +1190,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) nr = trans->blocks_used; + trans->block_rsv = block_rsv; btrfs_end_transaction_dmeta(trans, root); __btrfs_btree_balance_dirty(root, nr); free_path: -- cgit v1.2.3 From e999376f094162aa425ae749aa1df95ab928d010 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 17 Jun 2011 16:14:09 -0400 Subject: Btrfs: avoid delayed metadata items during commits Snapshot creation has two phases. One is the initial snapshot setup, and the second is done during commit, while nobody is allowed to modify the root we are snapshotting. The delayed metadata insertion code can break that rule, it does a delayed inode update on the inode of the parent of the snapshot, and delayed directory item insertion. This makes sure to run the pending delayed operations before we record the snapshot root, which avoids corruptions. Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/btrfs/delayed-inode.c') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index fc515b787e8c..f1cbd028f7b3 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1237,6 +1237,13 @@ again: return 0; } +void btrfs_assert_delayed_root_empty(struct btrfs_root *root) +{ + struct btrfs_delayed_root *delayed_root; + delayed_root = btrfs_get_delayed_root(root); + WARN_ON(btrfs_first_delayed_node(delayed_root)); +} + void btrfs_balance_delayed_items(struct btrfs_root *root) { struct btrfs_delayed_root *delayed_root; -- cgit v1.2.3 From 2f7e33d432d097a2a7f467b031bf18be91cb3d49 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 23 Jun 2011 07:27:13 +0000 Subject: btrfs: fix inconsonant inode information When iputting the inode, We may leave the delayed nodes if they have some delayed items that have not been dealt with. So when the inode is read again, we must look up the relative delayed node, and use the information in it to initialize the inode. Or we will get inconsonant inode information, it may cause that the same directory index number is allocated again, and hit the following oops: [ 5447.554187] err add delayed dir index item(name: pglog_0.965_0) into the insertion tree of the delayed node(root id: 262, inode id: 258, errno: -17) [ 5447.569766] ------------[ cut here ]------------ [ 5447.575361] kernel BUG at fs/btrfs/delayed-inode.c:1301! [SNIP] [ 5447.790721] Call Trace: [ 5447.793191] [] btrfs_insert_dir_item+0x189/0x1bb [btrfs] [ 5447.800156] [] btrfs_add_link+0x12b/0x191 [btrfs] [ 5447.806517] [] btrfs_add_nondir+0x31/0x58 [btrfs] [ 5447.812876] [] btrfs_create+0xf9/0x197 [btrfs] [ 5447.818961] [] vfs_create+0x72/0x92 [ 5447.824090] [] do_last+0x22c/0x40b [ 5447.829133] [] path_openat+0xc0/0x2ef [ 5447.834438] [] ? __perf_event_task_sched_out+0x24/0x44 [ 5447.841216] [] ? perf_event_task_sched_out+0x59/0x67 [ 5447.847846] [] do_filp_open+0x3d/0x87 [ 5447.853156] [] ? strncpy_from_user+0x43/0x4d [ 5447.859072] [] ? getname_flags+0x2e/0x80 [ 5447.864636] [] ? do_getname+0x14b/0x173 [ 5447.870112] [] ? audit_getname+0x16/0x26 [ 5447.875682] [] ? spin_lock+0xe/0x10 [ 5447.880882] [] do_sys_open+0x69/0xae [ 5447.886153] [] sys_open+0x20/0x22 [ 5447.891114] [] system_call_fastpath+0x16/0x1b Fix it by reusing the old delayed node. Reported-by: Jim Schutt Signed-off-by: Miao Xie Tested-by: Jim Schutt Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 104 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 80 insertions(+), 24 deletions(-) (limited to 'fs/btrfs/delayed-inode.c') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index f1cbd028f7b3..98c68e658a9b 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -82,19 +82,16 @@ static inline struct btrfs_delayed_root *btrfs_get_delayed_root( return root->fs_info->delayed_root; } -static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( - struct inode *inode) +static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode) { - struct btrfs_delayed_node *node; struct btrfs_inode *btrfs_inode = BTRFS_I(inode); struct btrfs_root *root = btrfs_inode->root; u64 ino = btrfs_ino(inode); - int ret; + struct btrfs_delayed_node *node; -again: node = ACCESS_ONCE(btrfs_inode->delayed_node); if (node) { - atomic_inc(&node->refs); /* can be accessed */ + atomic_inc(&node->refs); return node; } @@ -102,8 +99,10 @@ again: node = radix_tree_lookup(&root->delayed_nodes_tree, ino); if (node) { if (btrfs_inode->delayed_node) { + atomic_inc(&node->refs); /* can be accessed */ + BUG_ON(btrfs_inode->delayed_node != node); spin_unlock(&root->inode_lock); - goto again; + return node; } btrfs_inode->delayed_node = node; atomic_inc(&node->refs); /* can be accessed */ @@ -113,6 +112,23 @@ again: } spin_unlock(&root->inode_lock); + return NULL; +} + +static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( + struct inode *inode) +{ + struct btrfs_delayed_node *node; + struct btrfs_inode *btrfs_inode = BTRFS_I(inode); + struct btrfs_root *root = btrfs_inode->root; + u64 ino = btrfs_ino(inode); + int ret; + +again: + node = btrfs_get_delayed_node(inode); + if (node) + return node; + node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS); if (!node) return ERR_PTR(-ENOMEM); @@ -548,19 +564,6 @@ struct btrfs_delayed_item *__btrfs_next_delayed_item( return next; } -static inline struct btrfs_delayed_node *btrfs_get_delayed_node( - struct inode *inode) -{ - struct btrfs_inode *btrfs_inode = BTRFS_I(inode); - struct btrfs_delayed_node *delayed_node; - - delayed_node = btrfs_inode->delayed_node; - if (delayed_node) - atomic_inc(&delayed_node->refs); - - return delayed_node; -} - static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root, u64 root_id) { @@ -1404,8 +1407,7 @@ end: int btrfs_inode_delayed_dir_index_count(struct inode *inode) { - struct btrfs_delayed_node *delayed_node = BTRFS_I(inode)->delayed_node; - int ret = 0; + struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); if (!delayed_node) return -ENOENT; @@ -1415,11 +1417,14 @@ int btrfs_inode_delayed_dir_index_count(struct inode *inode) * a new directory index is added into the delayed node and index_cnt * is updated now. So we needn't lock the delayed node. */ - if (!delayed_node->index_cnt) + if (!delayed_node->index_cnt) { + btrfs_release_delayed_node(delayed_node); return -EINVAL; + } BTRFS_I(inode)->index_cnt = delayed_node->index_cnt; - return ret; + btrfs_release_delayed_node(delayed_node); + return 0; } void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, @@ -1613,6 +1618,57 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, inode->i_ctime.tv_nsec); } +int btrfs_fill_inode(struct inode *inode, u32 *rdev) +{ + struct btrfs_delayed_node *delayed_node; + struct btrfs_inode_item *inode_item; + struct btrfs_timespec *tspec; + + delayed_node = btrfs_get_delayed_node(inode); + if (!delayed_node) + return -ENOENT; + + mutex_lock(&delayed_node->mutex); + if (!delayed_node->inode_dirty) { + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return -ENOENT; + } + + inode_item = &delayed_node->inode_item; + + inode->i_uid = btrfs_stack_inode_uid(inode_item); + inode->i_gid = btrfs_stack_inode_gid(inode_item); + btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); + inode->i_mode = btrfs_stack_inode_mode(inode_item); + inode->i_nlink = btrfs_stack_inode_nlink(inode_item); + inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); + BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); + BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item); + inode->i_rdev = 0; + *rdev = btrfs_stack_inode_rdev(inode_item); + BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); + + tspec = btrfs_inode_atime(inode_item); + inode->i_atime.tv_sec = btrfs_stack_timespec_sec(tspec); + inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + + tspec = btrfs_inode_mtime(inode_item); + inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(tspec); + inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + + tspec = btrfs_inode_ctime(inode_item); + inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(tspec); + inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + + inode->i_generation = BTRFS_I(inode)->generation; + BTRFS_I(inode)->index_cnt = (u64)-1; + + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; +} + int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode) { -- cgit v1.2.3 From bd681513fa6f2ff29aa391f01e413a2d1c59fd77 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Sat, 16 Jul 2011 15:23:14 -0400 Subject: Btrfs: switch the btrfs tree locks to reader/writer The btrfs metadata btree is the source of significant lock contention, especially in the root node. This commit changes our locking to use a reader/writer lock. The lock is built on top of rw spinlocks, and it extends the lock tracking to remember if we have a read lock or a write lock when we go to blocking. Atomics count the number of blocking readers or writers at any given time. It removes all of the adaptive spinning from the old code and uses only the spinning/blocking hints inside of btrfs to decide when it should continue spinning. In read heavy workloads this is dramatically faster. In write heavy workloads we're still faster because of less contention on the root node lock. We suffer slightly in dbench because we schedule more often during write locks, but all other benchmarks so far are improved. Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/delayed-inode.c') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 98c68e658a9b..b52c672f4c18 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -735,7 +735,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans, } /* reset all the locked nodes in the patch to spinning locks. */ - btrfs_clear_path_blocking(path, NULL); + btrfs_clear_path_blocking(path, NULL, 0); /* insert the keys of the items */ ret = setup_items_for_insert(trans, root, path, keys, data_size, -- cgit v1.2.3