From 6282adbf932c226f76e1b83e074448c79976fe75 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 25 Jul 2015 00:29:17 -0700 Subject: f2fs: call set_page_dirty to attach i_wb for cgroup The cgroup attaches inode->i_wb via mark_inode_dirty and when set_page_writeback is called, __inc_wb_stat() updates i_wb's stat. So, we need to explicitly call set_page_dirty->__mark_inode_dirty in prior to any writebacking pages. This patch should resolve the following kernel panic reported by Andreas Reis. https://bugzilla.kernel.org/show_bug.cgi?id=101801 --- Comment #2 from Andreas Reis --- BUG: unable to handle kernel NULL pointer dereference at 00000000000000a8 IP: [] __percpu_counter_add+0x1a/0x90 PGD 2951ff067 PUD 2df43f067 PMD 0 Oops: 0000 [#1] PREEMPT SMP Modules linked in: CPU: 7 PID: 10356 Comm: gcc Tainted: G W 4.2.0-1-cu #1 Hardware name: Gigabyte Technology Co., Ltd. G1.Sniper M5/G1.Sniper M5, BIOS T01 02/03/2015 task: ffff880295044f80 ti: ffff880295140000 task.ti: ffff880295140000 RIP: 0010:[] [] __percpu_counter_add+0x1a/0x90 RSP: 0018:ffff880295143ac8 EFLAGS: 00010082 RAX: 0000000000000003 RBX: ffffea000a526d40 RCX: 0000000000000001 RDX: 0000000000000020 RSI: 0000000000000001 RDI: 0000000000000088 RBP: ffff880295143ae8 R08: 0000000000000000 R09: ffff88008f69bb30 R10: 00000000fffffffa R11: 0000000000000000 R12: 0000000000000088 R13: 0000000000000001 R14: ffff88041d099000 R15: ffff880084a205d0 FS: 00007f8549374700(0000) GS:ffff88042f3c0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000a8 CR3: 000000033e1d5000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Stack: 0000000000000000 ffffea000a526d40 ffff880084a20738 ffff880084a20750 ffff880295143b48 ffffffff811cc91e ffff880000000000 0000000000000296 0000000000000000 ffff880417090198 0000000000000000 ffffea000a526d40 Call Trace: [] __test_set_page_writeback+0xde/0x1d0 [] do_write_data_page+0xe7/0x3a0 [] gc_data_segment+0x5aa/0x640 [] do_garbage_collect+0x138/0x150 [] f2fs_gc+0x1be/0x3e0 [] f2fs_balance_fs+0x81/0x90 [] f2fs_unlink+0x47/0x1d0 [] vfs_unlink+0x109/0x1b0 [] do_unlinkat+0x287/0x2c0 [] SyS_unlink+0x16/0x20 [] entry_SYSCALL_64_fastpath+0x12/0x71 Code: 41 5e 5d c3 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 41 55 49 89 f5 41 54 49 89 fc 53 48 83 ec 08 65 ff 05 e6 d9 b6 7e <48> 8b 47 20 48 63 ca 65 8b 18 48 63 db 48 01 f3 48 39 cb 7d 0a RIP [] __percpu_counter_add+0x1a/0x90 RSP CR2: 00000000000000a8 ---[ end trace 5132449a58ed93a3 ]--- note: gcc[10356] exited with preempt_count 2 Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/f2fs/segment.c') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1eb343768781..61b97f9cb9f6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -257,6 +257,7 @@ void commit_inmem_pages(struct inode *inode, bool abort) if (!abort) { lock_page(cur->page); if (cur->page->mapping == inode->i_mapping) { + set_page_dirty(cur->page); f2fs_wait_on_page_writeback(cur->page, DATA); if (clear_page_dirty_for_io(cur->page)) inode_dec_dirty_pages(inode); -- cgit v1.2.3 From 1b38dc8e74a366b92986755c304591e330f3c3e0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 19 Jun 2015 15:36:07 -0700 Subject: f2fs: shrink nat_cache entries This patch registers shrinking nat_cache entries. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs/f2fs/segment.c') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 61b97f9cb9f6..d5ee99258cbc 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -306,8 +306,12 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) /* try to shrink extent cache when there is no enough memory */ f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); - /* check the # of cached NAT entries and prefree segments */ - if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) || + /* check the # of cached NAT entries */ + if (!available_free_memory(sbi, NAT_ENTRIES)) + try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); + + /* checkpoint is the only way to shrink partial cached entries */ + if (!available_free_memory(sbi, NAT_ENTRIES) || excess_prefree_segs(sbi) || !available_free_memory(sbi, INO_ENTRIES)) f2fs_sync_fs(sbi->sb, true); -- cgit v1.2.3 From 554df79e523d14dab475eb6650cb96617256ceea Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 19 Jun 2015 13:41:23 -0700 Subject: f2fs: shrink extent_cache entries This patch registers shrinking extent_caches. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/f2fs/segment.c') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d5ee99258cbc..f7bfc3b7d934 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -304,7 +304,8 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi) void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) { /* try to shrink extent cache when there is no enough memory */ - f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); + if (!available_free_memory(sbi, EXTENT_CACHE)) + f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); /* check the # of cached NAT entries */ if (!available_free_memory(sbi, NAT_ENTRIES)) -- cgit v1.2.3 From edb27deea7cabfff8feb8c62aae647b7673be734 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 25 Jul 2015 00:52:52 -0700 Subject: f2fs: handle error cases in commit_inmem_pages This patch adds to handle error cases in commit_inmem_pages. If an error occurs, it stops to write the pages and return the error right away. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs/f2fs/segment.c') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f7bfc3b7d934..509a2c4bb7d3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -227,7 +227,7 @@ retry: trace_f2fs_register_inmem_page(page, INMEM); } -void commit_inmem_pages(struct inode *inode, bool abort) +int commit_inmem_pages(struct inode *inode, bool abort) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); @@ -239,6 +239,7 @@ void commit_inmem_pages(struct inode *inode, bool abort) .rw = WRITE_SYNC | REQ_PRIO, .encrypted_page = NULL, }; + int err = 0; /* * The abort is true only when f2fs_evict_inode is called. @@ -263,8 +264,12 @@ void commit_inmem_pages(struct inode *inode, bool abort) inode_dec_dirty_pages(inode); trace_f2fs_commit_inmem_page(cur->page, INMEM); fio.page = cur->page; - do_write_data_page(&fio); + err = do_write_data_page(&fio); submit_bio = true; + if (err) { + unlock_page(cur->page); + break; + } } f2fs_put_page(cur->page, 1); } else { @@ -283,6 +288,7 @@ void commit_inmem_pages(struct inode *inode, bool abort) if (submit_bio) f2fs_submit_merged_bio(sbi, DATA, WRITE); } + return err; } /* -- cgit v1.2.3 From e90c2d2850d9d034e814a328725a4b15878f0357 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 28 Jul 2015 18:36:47 +0800 Subject: f2fs: invalidate temporary meta page To avoid meeting garbage data in next free node block at the end of warm node chain when doing recovery, we will try to zero out that invalid block. If the device is not support discard, our way for zeroing out block is: grabbing a temporary zeroed page in meta inode, then, issue write request with this page. But, we forget to release that temporary page, so our memory usage will increase without gaining any hit ratio benefit, so it's better to free it for saving memory. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs/f2fs/segment.c') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 509a2c4bb7d3..1f1200487c44 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -514,7 +514,7 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); } -void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) +bool discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) { int err = -ENOTSUPP; @@ -524,13 +524,16 @@ void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) unsigned int offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); if (f2fs_test_bit(offset, se->discard_map)) - return; + return false; err = f2fs_issue_discard(sbi, blkaddr, 1); } - if (err) + if (err) { update_meta_page(sbi, NULL, blkaddr); + return true; + } + return false; } static void __add_discard_entry(struct f2fs_sb_info *sbi, -- cgit v1.2.3 From decd36b6c43a1051bab97571cf4c0ec8450268b0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 7 Aug 2015 18:42:09 +0800 Subject: f2fs: remove inmem radix tree Previously, we use radix tree to index all registered page entries for atomic file, but now we only use radix tree to see whether current page is indexed or not, since the other user of radix tree is gone in commit 042b7816aaeb ("f2fs: remove unnecessary call to invalidate inmemory pages"). So in this patch, we try to use one more efficient way: Introducing a macro ATOMIC_WRITTEN_PAGE, and setting it as page private value to indicate page indexing status. By using this way, we can save memory and lookup time. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) (limited to 'fs/f2fs/segment.c') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1f1200487c44..7d53cb44c617 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -197,28 +197,20 @@ void register_inmem_page(struct inode *inode, struct page *page) { struct f2fs_inode_info *fi = F2FS_I(inode); struct inmem_pages *new; - int err; - SetPagePrivate(page); f2fs_trace_pid(page); + set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); + SetPagePrivate(page); + new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); /* add atomic page indices to the list */ new->page = page; INIT_LIST_HEAD(&new->list); -retry: + /* increase reference count with clean state */ mutex_lock(&fi->inmem_lock); - err = radix_tree_insert(&fi->inmem_root, page->index, new); - if (err == -EEXIST) { - mutex_unlock(&fi->inmem_lock); - kmem_cache_free(inmem_entry_slab, new); - return; - } else if (err) { - mutex_unlock(&fi->inmem_lock); - goto retry; - } get_page(page); list_add_tail(&new->list, &fi->inmem_pages); inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); @@ -255,8 +247,8 @@ int commit_inmem_pages(struct inode *inode, bool abort) mutex_lock(&fi->inmem_lock); list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { + lock_page(cur->page); if (!abort) { - lock_page(cur->page); if (cur->page->mapping == inode->i_mapping) { set_page_dirty(cur->page); f2fs_wait_on_page_writeback(cur->page, DATA); @@ -271,12 +263,13 @@ int commit_inmem_pages(struct inode *inode, bool abort) break; } } - f2fs_put_page(cur->page, 1); } else { trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP); - put_page(cur->page); } - radix_tree_delete(&fi->inmem_root, cur->page->index); + set_page_private(cur->page, 0); + ClearPagePrivate(cur->page); + f2fs_put_page(cur->page, 1); + list_del(&cur->list); kmem_cache_free(inmem_entry_slab, cur); dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); -- cgit v1.2.3 From 47e70ca46f9074efe6573263c0de5bef0af829de Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 11 Aug 2015 10:17:27 -0700 Subject: f2fs: do not assign a new segment for dio under space shortage If there is not enough free segment, we should not assign a new segment explicitly. Otherwise, we can run out of free segment. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/f2fs/segment.c') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7d53cb44c617..bf1605dbce93 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1225,7 +1225,8 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, mutex_lock(&sit_i->sentry_lock); /* direct_io'ed data is aligned to the segment for better performance */ - if (direct_io && curseg->next_blkoff) + if (direct_io && curseg->next_blkoff && + !has_not_enough_free_secs(sbi, 0)) __allocate_new_segments(sbi, type); *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); -- cgit v1.2.3 From 31696580bf4c042a0f7b06d855e04441488d18b1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 28 Jul 2015 18:33:46 +0800 Subject: f2fs: shrink free_nids entries This patch introduces __count_free_nids/try_to_free_nids and registers them in slab shrinker for shrinking under memory pressure. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/f2fs/segment.c') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index bf1605dbce93..1b4265639f07 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -310,6 +310,9 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) if (!available_free_memory(sbi, NAT_ENTRIES)) try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); + if (!available_free_memory(sbi, FREE_NIDS)) + try_to_free_nids(sbi, NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES); + /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || excess_prefree_segs(sbi) || -- cgit v1.2.3 From 740432f835608d11b5386321ab5aa8f61e07fb27 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Aug 2015 11:43:56 -0700 Subject: f2fs: handle failed bio allocation As the below comment of bio_alloc_bioset, f2fs can allocate multiple bios at the same time. So, we can't guarantee that bio is allocated all the time. " * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be * able to allocate a bio. This is due to the mempool guarantees. To make this * work, callers must never allocate more than 1 bio at a time from this pool. * Callers that need to allocate more than 1 bio must always submit the * previously allocated bio for IO before attempting to allocate a new one. * Failure to do so can cause deadlocks under memory pressure. " Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'fs/f2fs/segment.c') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1b4265639f07..6273e2cde93e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -330,10 +330,12 @@ repeat: return 0; if (!llist_empty(&fcc->issue_list)) { - struct bio *bio = bio_alloc(GFP_NOIO, 0); + struct bio *bio; struct flush_cmd *cmd, *next; int ret; + bio = f2fs_bio_alloc(0); + fcc->dispatch_list = llist_del_all(&fcc->issue_list); fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); @@ -365,8 +367,15 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) if (test_opt(sbi, NOBARRIER)) return 0; - if (!test_opt(sbi, FLUSH_MERGE)) - return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL); + if (!test_opt(sbi, FLUSH_MERGE)) { + struct bio *bio = f2fs_bio_alloc(0); + int ret; + + bio->bi_bdev = sbi->sb->s_bdev; + ret = submit_bio_wait(WRITE_FLUSH, bio); + bio_put(bio); + return ret; + } init_completion(&cmd.wait); -- cgit v1.2.3 From 80c545055dc7c1f7f487176fe0aac17896a4b7af Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 20 Aug 2015 08:51:56 -0700 Subject: f2fs: use __GFP_NOFAIL to avoid infinite loop __GFP_NOFAIL can avoid retrying the whole path of kmem_cache_alloc and bio_alloc. And, it also fixes the use cases of GFP_ATOMIC correctly. Suggested-by: Chao Yu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/f2fs/segment.c') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6273e2cde93e..78e6d0696847 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1753,7 +1753,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, static struct sit_entry_set *grab_sit_entry_set(void) { struct sit_entry_set *ses = - f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_ATOMIC); + f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_NOFS); ses->entry_cnt = 0; INIT_LIST_HEAD(&ses->set_list); -- cgit v1.2.3