From d3922a777f9b4c4df898d326fa940f239af4f9b6 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Mon, 1 Jul 2013 08:12:37 -0400 Subject: ext4: improve extent cache shrink mechanism to avoid to burn CPU time Now we maintain an proper in-order LRU list in ext4 to reclaim entries from extent status tree when we are under heavy memory pressure. For keeping this order, a spin lock is used to protect this list. But this lock burns a lot of CPU time. We can use the following steps to trigger it. % cd /dev/shm % dd if=/dev/zero of=ext4-img bs=1M count=2k % mkfs.ext4 ext4-img % mount -t ext4 -o loop ext4-img /mnt % cd /mnt % for ((i=0;i<160;i++)); do truncate -s 64g $i; done % for ((i=0;i<160;i++)); do cp $i /dev/null &; done % perf record -a -g % perf report This commit tries to fix this problem. Now a new member called i_touch_when is added into ext4_inode_info to record the last access time for an inode. Meanwhile we never need to keep a proper in-order LRU list. So this can avoid to burns some CPU time. When we try to reclaim some entries from extent status tree, we use list_sort() to get a proper in-order list. Then we traverse this list to discard some entries. In ext4_sb_info, we use s_es_last_sorted to record the last time of sorting this list. When we traverse the list, we skip the inode that is newer than this time, and move this inode to the tail of LRU list. When the head of the list is newer than s_es_last_sorted, we will sort the LRU list again. In this commit, we break the loop if s_extent_cache_cnt == 0 because that means that all extents in extent status tree have been reclaimed. Meanwhile in this commit, ext4_es_{un}register_shrinker()'s prototype is changed to save a local variable in these functions. Reported-by: Dave Hansen Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents_status.c | 75 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 20 deletions(-) (limited to 'fs/ext4/extents_status.c') diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index e6941e622d31..ee018d5f397e 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -10,6 +10,7 @@ * Ext4 extents status tree core functions. */ #include +#include #include "ext4.h" #include "extents_status.h" #include "ext4_extents.h" @@ -291,7 +292,6 @@ out: read_unlock(&EXT4_I(inode)->i_es_lock); - ext4_es_lru_add(inode); trace_ext4_es_find_delayed_extent_range_exit(inode, es); } @@ -672,7 +672,6 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, error: write_unlock(&EXT4_I(inode)->i_es_lock); - ext4_es_lru_add(inode); ext4_es_print_tree(inode); return err; @@ -734,7 +733,6 @@ out: read_unlock(&EXT4_I(inode)->i_es_lock); - ext4_es_lru_add(inode); trace_ext4_es_lookup_extent_exit(inode, es, found); return found; } @@ -878,12 +876,28 @@ int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex) EXTENT_STATUS_WRITTEN); } +static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, + struct list_head *b) +{ + struct ext4_inode_info *eia, *eib; + eia = list_entry(a, struct ext4_inode_info, i_es_lru); + eib = list_entry(b, struct ext4_inode_info, i_es_lru); + + if (eia->i_touch_when == eib->i_touch_when) + return 0; + if (time_after(eia->i_touch_when, eib->i_touch_when)) + return 1; + else + return -1; +} + static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) { struct ext4_sb_info *sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); struct ext4_inode_info *ei; - struct list_head *cur, *tmp, scanned; + struct list_head *cur, *tmp; + LIST_HEAD(skiped); int nr_to_scan = sc->nr_to_scan; int ret, nr_shrunk = 0; @@ -893,23 +907,41 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) if (!nr_to_scan) return ret; - INIT_LIST_HEAD(&scanned); - spin_lock(&sbi->s_es_lru_lock); + + /* + * If the inode that is at the head of LRU list is newer than + * last_sorted time, that means that we need to sort this list. + */ + ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru); + if (sbi->s_es_last_sorted < ei->i_touch_when) { + list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); + sbi->s_es_last_sorted = jiffies; + } + list_for_each_safe(cur, tmp, &sbi->s_es_lru) { - list_move_tail(cur, &scanned); + /* + * If we have already reclaimed all extents from extent + * status tree, just stop the loop immediately. + */ + if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0) + break; ei = list_entry(cur, struct ext4_inode_info, i_es_lru); - read_lock(&ei->i_es_lock); - if (ei->i_es_lru_nr == 0) { - read_unlock(&ei->i_es_lock); + /* Skip the inode that is newer than the last_sorted time */ + if (sbi->s_es_last_sorted < ei->i_touch_when) { + list_move_tail(cur, &skiped); continue; } - read_unlock(&ei->i_es_lock); + + if (ei->i_es_lru_nr == 0) + continue; write_lock(&ei->i_es_lock); ret = __es_try_to_reclaim_extents(ei, nr_to_scan); + if (ei->i_es_lru_nr == 0) + list_del_init(&ei->i_es_lru); write_unlock(&ei->i_es_lock); nr_shrunk += ret; @@ -917,7 +949,9 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) if (nr_to_scan == 0) break; } - list_splice_tail(&scanned, &sbi->s_es_lru); + + /* Move the newer inodes into the tail of the LRU list. */ + list_splice_tail(&skiped, &sbi->s_es_lru); spin_unlock(&sbi->s_es_lru_lock); ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); @@ -925,21 +959,19 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) return ret; } -void ext4_es_register_shrinker(struct super_block *sb) +void ext4_es_register_shrinker(struct ext4_sb_info *sbi) { - struct ext4_sb_info *sbi; - - sbi = EXT4_SB(sb); INIT_LIST_HEAD(&sbi->s_es_lru); spin_lock_init(&sbi->s_es_lru_lock); + sbi->s_es_last_sorted = 0; sbi->s_es_shrinker.shrink = ext4_es_shrink; sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; register_shrinker(&sbi->s_es_shrinker); } -void ext4_es_unregister_shrinker(struct super_block *sb) +void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) { - unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker); + unregister_shrinker(&sbi->s_es_shrinker); } void ext4_es_lru_add(struct inode *inode) @@ -947,11 +979,14 @@ void ext4_es_lru_add(struct inode *inode) struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ei->i_touch_when = jiffies; + + if (!list_empty(&ei->i_es_lru)) + return; + spin_lock(&sbi->s_es_lru_lock); if (list_empty(&ei->i_es_lru)) list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); - else - list_move_tail(&ei->i_es_lru, &sbi->s_es_lru); spin_unlock(&sbi->s_es_lru_lock); } -- cgit v1.2.3 From bdafe42aaf72859166f784f0fad3e6b4a815fa6d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 13 Jul 2013 00:40:31 -0400 Subject: ext4: fix spelling errors and a comment in extent_status tree Replace "assertation" with "assertion" in lots and lots of debugging messages. Correct the comment stating when ext4_es_insert_extent() is used. It was no doubt tree at one point, but it is no longer true... Signed-off-by: "Theodore Ts'o" Cc: Zheng Liu --- fs/ext4/extents_status.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'fs/ext4/extents_status.c') diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index ee018d5f397e..4b8df7fbb10a 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -439,7 +439,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, */ if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { if (in_range(es->es_lblk, ee_block, ee_len)) { - pr_warn("ES insert assertation failed for " + pr_warn("ES insert assertion failed for " "inode: %lu we can find an extent " "at block [%d/%d/%llu/%c], but we " "want to add an delayed/hole extent " @@ -458,7 +458,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, */ if (es->es_lblk < ee_block || ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { - pr_warn("ES insert assertation failed for inode: %lu " + pr_warn("ES insert assertion failed for inode: %lu " "ex_status [%d/%d/%llu/%c] != " "es_status [%d/%d/%llu/%c]\n", inode->i_ino, ee_block, ee_len, ee_start, @@ -468,7 +468,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, } if (ee_status ^ es_status) { - pr_warn("ES insert assertation failed for inode: %lu " + pr_warn("ES insert assertion failed for inode: %lu " "ex_status [%d/%d/%llu/%c] != " "es_status [%d/%d/%llu/%c]\n", inode->i_ino, ee_block, ee_len, ee_start, @@ -481,7 +481,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, * that we don't want to add an written/unwritten extent. */ if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { - pr_warn("ES insert assertation failed for inode: %lu " + pr_warn("ES insert assertion failed for inode: %lu " "can't find an extent at block %d but we want " "to add an written/unwritten extent " "[%d/%d/%llu/%llx]\n", inode->i_ino, @@ -519,7 +519,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode, * We want to add a delayed/hole extent but this * block has been allocated. */ - pr_warn("ES insert assertation failed for inode: %lu " + pr_warn("ES insert assertion failed for inode: %lu " "We can find blocks but we want to add a " "delayed/hole extent [%d/%d/%llu/%llx]\n", inode->i_ino, es->es_lblk, es->es_len, @@ -527,13 +527,13 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode, return; } else if (ext4_es_is_written(es)) { if (retval != es->es_len) { - pr_warn("ES insert assertation failed for " + pr_warn("ES insert assertion failed for " "inode: %lu retval %d != es_len %d\n", inode->i_ino, retval, es->es_len); return; } if (map.m_pblk != ext4_es_pblock(es)) { - pr_warn("ES insert assertation failed for " + pr_warn("ES insert assertion failed for " "inode: %lu m_pblk %llu != " "es_pblk %llu\n", inode->i_ino, map.m_pblk, @@ -549,7 +549,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode, } } else if (retval == 0) { if (ext4_es_is_written(es)) { - pr_warn("ES insert assertation failed for inode: %lu " + pr_warn("ES insert assertion failed for inode: %lu " "We can't find the block but we want to add " "an written extent [%d/%d/%llu/%llx]\n", inode->i_ino, es->es_lblk, es->es_len, @@ -632,10 +632,8 @@ out: } /* - * ext4_es_insert_extent() adds a space to a extent status tree. - * - * ext4_es_insert_extent is called by ext4_da_write_begin and - * ext4_es_remove_extent. + * ext4_es_insert_extent() adds information to an inode's extent + * status tree. * * Return 0 on success, error code on failure. */ -- cgit v1.2.3 From e15f742ce816076497549b955fbec3254820db85 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 15 Jul 2013 00:12:14 -0400 Subject: ext4: make the extent_status code more robust against ENOMEM failures Some callers of ext4_es_remove_extent() and ext4_es_insert_extent() may not be completely robust against ENOMEM failures (or the consequences of reflecting ENOMEM back up to userspace may lead to xfstest or user application failure). To mitigate against this, when trying to insert an entry in the extent status tree, try to shrink the inode's extent status tree before returning ENOMEM. If there are entries which don't record information about extents under delayed allocations, freeing one of them is preferable to returning ENOMEM. Signed-off-by: "Theodore Ts'o" Reviewed-by: Zheng Liu --- fs/ext4/extents_status.c | 51 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 12 deletions(-) (limited to 'fs/ext4/extents_status.c') diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 4b8df7fbb10a..91cb110da1b4 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -148,6 +148,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t end); static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, int nr_to_scan); +static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, + struct ext4_inode_info *locked_ei); int __init ext4_init_es(void) { @@ -665,7 +667,13 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, err = __es_remove_extent(inode, lblk, end); if (err != 0) goto error; +retry: err = __es_insert_extent(inode, &newes); + if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, + EXT4_I(inode))) + goto retry; + if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) + err = 0; error: write_unlock(&EXT4_I(inode)->i_es_lock); @@ -744,8 +752,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status orig_es; ext4_lblk_t len1, len2; ext4_fsblk_t block; - int err = 0; + int err; +retry: + err = 0; es = __es_tree_search(&tree->root, lblk); if (!es) goto out; @@ -780,6 +790,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, if (err) { es->es_lblk = orig_es.es_lblk; es->es_len = orig_es.es_len; + if ((err == -ENOMEM) && + __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, + EXT4_I(inode))) + goto retry; goto out; } } else { @@ -889,22 +903,14 @@ static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, return -1; } -static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) +static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, + struct ext4_inode_info *locked_ei) { - struct ext4_sb_info *sbi = container_of(shrink, - struct ext4_sb_info, s_es_shrinker); struct ext4_inode_info *ei; struct list_head *cur, *tmp; LIST_HEAD(skiped); - int nr_to_scan = sc->nr_to_scan; int ret, nr_shrunk = 0; - ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); - trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret); - - if (!nr_to_scan) - return ret; - spin_lock(&sbi->s_es_lru_lock); /* @@ -933,7 +939,7 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) continue; } - if (ei->i_es_lru_nr == 0) + if (ei->i_es_lru_nr == 0 || ei == locked_ei) continue; write_lock(&ei->i_es_lock); @@ -952,6 +958,27 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) list_splice_tail(&skiped, &sbi->s_es_lru); spin_unlock(&sbi->s_es_lru_lock); + if (locked_ei && nr_shrunk == 0) + nr_shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); + + return nr_shrunk; +} + +static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) +{ + struct ext4_sb_info *sbi = container_of(shrink, + struct ext4_sb_info, s_es_shrinker); + int nr_to_scan = sc->nr_to_scan; + int ret, nr_shrunk; + + ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); + trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret); + + if (!nr_to_scan) + return ret; + + nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); + ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret); return ret; -- cgit v1.2.3