From 16e7549f045d33b0c5b0ebf19d08439e9221d40c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 22 Oct 2013 12:18:51 -0400 Subject: Btrfs: incompatible format change to remove hole extents Btrfs has always had these filler extent data items for holes in inodes. This has made somethings very easy, like logging hole punches and sending hole punches. However for large holey files these extent data items are pure overhead. So add an incompatible feature to no longer add hole extents to reduce the amount of metadata used by these sort of files. This has a few changes for logging and send obviously since they will need to detect holes and log/send the holes if there are any. I've tested this thoroughly with xfstests and it doesn't cause any issues with and without the incompat format set. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 172 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 162 insertions(+), 10 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9f7fc51ca334..e7d7a837512a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3194,7 +3194,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans, static noinline int copy_items(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *dst_path, - struct extent_buffer *src, + struct btrfs_path *src_path, u64 *last_extent, int start_slot, int nr, int inode_only) { unsigned long src_offset; @@ -3202,6 +3202,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_root *log = BTRFS_I(inode)->root->log_root; struct btrfs_file_extent_item *extent; struct btrfs_inode_item *inode_item; + struct extent_buffer *src = src_path->nodes[0]; + struct btrfs_key first_key, last_key, key; int ret; struct btrfs_key *ins_keys; u32 *ins_sizes; @@ -3209,6 +3211,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, int i; struct list_head ordered_sums; int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + bool has_extents = false; + bool need_find_last_extent = (*last_extent == 0); + bool done = false; INIT_LIST_HEAD(&ordered_sums); @@ -3217,6 +3222,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, if (!ins_data) return -ENOMEM; + first_key.objectid = (u64)-1; + ins_sizes = (u32 *)ins_data; ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); @@ -3237,6 +3244,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src_offset = btrfs_item_ptr_offset(src, start_slot + i); + if ((i == (nr - 1))) + last_key = ins_keys[i]; + if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_path->slots[0], @@ -3248,6 +3258,21 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src_offset, ins_sizes[i]); } + /* + * We set need_find_last_extent here in case we know we were + * processing other items and then walk into the first extent in + * the inode. If we don't hit an extent then nothing changes, + * we'll do the last search the next time around. + */ + if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { + has_extents = true; + if (need_find_last_extent && + first_key.objectid == (u64)-1) + first_key = ins_keys[i]; + } else { + need_find_last_extent = false; + } + /* take a reference on file data extents so that truncates * or deletes of this inode don't have to relog the inode * again @@ -3312,6 +3337,126 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, list_del(&sums->list); kfree(sums); } + + if (!has_extents) + return ret; + + /* + * Because we use btrfs_search_forward we could skip leaves that were + * not modified and then assume *last_extent is valid when it really + * isn't. So back up to the previous leaf and read the end of the last + * extent before we go and fill in holes. + */ + if (need_find_last_extent) { + u64 len; + + ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path); + if (ret < 0) + return ret; + if (ret) + goto fill_holes; + if (src_path->slots[0]) + src_path->slots[0]--; + src = src_path->nodes[0]; + btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) + goto fill_holes; + extent = btrfs_item_ptr(src, src_path->slots[0], + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(src, extent) == + BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_inline_len(src, extent); + *last_extent = ALIGN(key.offset + len, + log->sectorsize); + } else { + len = btrfs_file_extent_num_bytes(src, extent); + *last_extent = key.offset + len; + } + } +fill_holes: + /* So we did prev_leaf, now we need to move to the next leaf, but a few + * things could have happened + * + * 1) A merge could have happened, so we could currently be on a leaf + * that holds what we were copying in the first place. + * 2) A split could have happened, and now not all of the items we want + * are on the same leaf. + * + * So we need to adjust how we search for holes, we need to drop the + * path and re-search for the first extent key we found, and then walk + * forward until we hit the last one we copied. + */ + if (need_find_last_extent) { + /* btrfs_prev_leaf could return 1 without releasing the path */ + btrfs_release_path(src_path); + ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key, + src_path, 0, 0); + if (ret < 0) + return ret; + ASSERT(ret == 0); + src = src_path->nodes[0]; + i = src_path->slots[0]; + } else { + i = start_slot; + } + + /* + * Ok so here we need to go through and fill in any holes we may have + * to make sure that holes are punched for those areas in case they had + * extents previously. + */ + while (!done) { + u64 offset, len; + u64 extent_end; + + if (i >= btrfs_header_nritems(src_path->nodes[0])) { + ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path); + if (ret < 0) + return ret; + ASSERT(ret == 0); + src = src_path->nodes[0]; + i = 0; + } + + btrfs_item_key_to_cpu(src, &key, i); + if (!btrfs_comp_cpu_keys(&key, &last_key)) + done = true; + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) { + i++; + continue; + } + extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(src, extent) == + BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_inline_len(src, extent); + extent_end = ALIGN(key.offset + len, log->sectorsize); + } else { + len = btrfs_file_extent_num_bytes(src, extent); + extent_end = key.offset + len; + } + i++; + + if (*last_extent == key.offset) { + *last_extent = extent_end; + continue; + } + offset = *last_extent; + len = key.offset - *last_extent; + ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), + offset, 0, 0, len, 0, len, 0, + 0, 0); + if (ret) + break; + *last_extent = offset + len; + } + /* + * Need to let the callers know we dropped the path so they should + * re-search. + */ + if (!ret && need_find_last_extent) + ret = 1; return ret; } @@ -3630,6 +3775,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key max_key; struct btrfs_root *log = root->log_root; struct extent_buffer *src = NULL; + u64 last_extent = 0; int err = 0; int ret; int nritems; @@ -3745,11 +3891,15 @@ again: goto next_slot; } - ret = copy_items(trans, inode, dst_path, src, ins_start_slot, - ins_nr, inode_only); - if (ret) { + ret = copy_items(trans, inode, dst_path, path, &last_extent, + ins_start_slot, ins_nr, inode_only); + if (ret < 0) { err = ret; goto out_unlock; + } if (ret) { + ins_nr = 0; + btrfs_release_path(path); + continue; } ins_nr = 1; ins_start_slot = path->slots[0]; @@ -3763,13 +3913,14 @@ next_slot: goto again; } if (ins_nr) { - ret = copy_items(trans, inode, dst_path, src, - ins_start_slot, + ret = copy_items(trans, inode, dst_path, path, + &last_extent, ins_start_slot, ins_nr, inode_only); - if (ret) { + if (ret < 0) { err = ret; goto out_unlock; } + ret = 0; ins_nr = 0; } btrfs_release_path(path); @@ -3784,12 +3935,13 @@ next_slot: } } if (ins_nr) { - ret = copy_items(trans, inode, dst_path, src, ins_start_slot, - ins_nr, inode_only); - if (ret) { + ret = copy_items(trans, inode, dst_path, path, &last_extent, + ins_start_slot, ins_nr, inode_only); + if (ret < 0) { err = ret; goto out_unlock; } + ret = 0; ins_nr = 0; } -- cgit v1.2.3 From 3f870c28990015a1fd6c67807efcdb02a75b35e1 Mon Sep 17 00:00:00 2001 From: Kelley Nielsen Date: Mon, 4 Nov 2013 19:37:39 -0800 Subject: btrfs: expand btrfs_find_item() to include find_orphan_item functionality This is the third step in bootstrapping the btrfs_find_item interface. The function find_orphan_item(), in orphan.c, is similar to the two functions already replaced by the new interface. It uses two parameters, which are already present in the interface, and is nearly identical to the function brought in in the previous patch. Replace the two calls to find_orphan_item() with calls to btrfs_find_item(), with the defined objectid and type that was used internally by find_orphan_item(), a null path, and a null key. Add a test for a null path to btrfs_find_item, and if it passes, allocate and free the path. Finally, remove find_orphan_item(). Signed-off-by: Kelley Nielsen Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e7d7a837512a..ba2f15109dac 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1238,7 +1238,8 @@ static int insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset) { int ret; - ret = btrfs_find_orphan_item(root, offset); + ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID, + offset, BTRFS_ORPHAN_ITEM_KEY, NULL); if (ret > 0) ret = btrfs_insert_orphan_item(trans, root, offset); return ret; -- cgit v1.2.3 From 1acae57b161ef1282f565ef907f72aeed0eb71d9 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Tue, 7 Jan 2014 11:42:27 +0000 Subject: Btrfs: faster file extent item replace operations When writing to a file we drop existing file extent items that cover the write range and then add a new file extent item that represents that write range. Before this change we were doing a tree lookup to remove the file extent items, and then after we did another tree lookup to insert the new file extent item. Most of the time all the file extent items we need to drop are located within a single leaf - this is the leaf where our new file extent item ends up at. Therefore, in this common case just combine these 2 operations into a single one. By avoiding the second btree navigation for insertion of the new file extent item, we reduce btree node/leaf lock acquisitions/releases, btree block/leaf COW operations, CPU time on btree node/leaf key binary searches, etc. Besides for file writes, this is an operation that happens for file fsync's as well. However log btrees are much less likely to big as big as regular fs btrees, therefore the impact of this change is smaller. The following benchmark was performed against an SSD drive and a HDD drive, both for random and sequential writes: sysbench --test=fileio --file-num=4096 --file-total-size=8G \ --file-test-mode=[rndwr|seqwr] --num-threads=512 \ --file-block-size=8192 \ --max-requests=1000000 \ --file-fsync-freq=0 --file-io-mode=sync [prepare|run] All results below are averages of 10 runs of the respective test. ** SSD sequential writes Before this change: 225.88 Mb/sec After this change: 277.26 Mb/sec ** SSD random writes Before this change: 49.91 Mb/sec After this change: 56.39 Mb/sec ** HDD sequential writes Before this change: 68.53 Mb/sec After this change: 69.87 Mb/sec ** HDD random writes Before this change: 13.04 Mb/sec After this change: 14.39 Mb/sec Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ba2f15109dac..b561e7a4007d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3495,21 +3495,27 @@ static int log_one_extent(struct btrfs_trans_handle *trans, int ret; int index = log->log_transid % 2; bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - - ret = __btrfs_drop_extents(trans, log, inode, path, em->start, - em->start + em->len, NULL, 0); - if (ret) - return ret; + int extent_inserted = 0; INIT_LIST_HEAD(&ordered_sums); btrfs_init_map_token(&token); - key.objectid = btrfs_ino(inode); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = em->start; - ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi)); + ret = __btrfs_drop_extents(trans, log, inode, path, em->start, + em->start + em->len, NULL, 0, 1, + sizeof(*fi), &extent_inserted); if (ret) return ret; + + if (!extent_inserted) { + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = em->start; + + ret = btrfs_insert_empty_item(trans, log, path, &key, + sizeof(*fi)); + if (ret) + return ret; + } leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); -- cgit v1.2.3 From 23c671a58831a5aaca3b56b915c8394a274a96df Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 14 Jan 2014 20:31:52 +0800 Subject: Btrfs: flush the dirty pages of the ordered extent aggressively during logging csum The performance of fsync dropped down suddenly sometimes, the main reason of this problem was that we might only flush part dirty pages in a ordered extent, then got that ordered extent, wait for the csum calcucation. But if no task flushed the left part, we would wait until the flusher flushed them, sometimes we need wait for several seconds, it made the performance drop down suddenly. (On my box, it drop down from 56MB/s to 4-10MB/s) This patch improves the above problem by flushing left dirty pages aggressively. Test Environment: CPU: 2CPU * 2Cores Memory: 4GB Partition: 20GB(HDD) Test Command: # sysbench --num-threads=8 --test=fileio --file-num=1 \ > --file-total-size=8G --file-block-size=32768 \ > --file-io-mode=sync --file-fsync-freq=100 \ > --file-fsync-end=no --max-requests=10000 \ > --file-test-mode=rndwr run Signed-off-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index b561e7a4007d..b142b6dc96c3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3637,7 +3637,11 @@ again: * start over after this. */ - wait_event(ordered->wait, ordered->csum_bytes_left == 0); + if (ordered->csum_bytes_left) { + btrfs_start_ordered_extent(inode, ordered, 0); + wait_event(ordered->wait, + ordered->csum_bytes_left == 0); + } list_for_each_entry(sum, &ordered->list, list) { ret = btrfs_csum_file_blocks(trans, log, sum); -- cgit v1.2.3 From 514ac8ad8793a097c0c9d89202c642479d6dfa34 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 3 Jan 2014 21:07:00 -0800 Subject: Btrfs: don't use ram_bytes for uncompressed inline items If we truncate an uncompressed inline item, ram_bytes isn't updated to reflect the new size. The fixe uses the size directly from the item header when reading uncompressed inlines, and also fixes truncate to update the size as it goes. Reported-by: Jens Axboe Signed-off-by: Chris Mason CC: stable@vger.kernel.org --- fs/btrfs/tree-log.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index b142b6dc96c3..39d83da03e03 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -570,7 +570,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, if (btrfs_file_extent_disk_bytenr(eb, item) == 0) nbytes = 0; } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - size = btrfs_file_extent_inline_len(eb, item); + size = btrfs_file_extent_inline_len(eb, slot, item); nbytes = btrfs_file_extent_ram_bytes(eb, item); extent_end = ALIGN(start + size, root->sectorsize); } else { @@ -3367,7 +3367,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item); if (btrfs_file_extent_type(src, extent) == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_inline_len(src, extent); + len = btrfs_file_extent_inline_len(src, + src_path->slots[0], + extent); *last_extent = ALIGN(key.offset + len, log->sectorsize); } else { @@ -3431,7 +3433,7 @@ fill_holes: extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); if (btrfs_file_extent_type(src, extent) == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_inline_len(src, extent); + len = btrfs_file_extent_inline_len(src, i, extent); extent_end = ALIGN(key.offset + len, log->sectorsize); } else { len = btrfs_file_extent_num_bytes(src, extent); -- cgit v1.2.3