From badcf2b7b816130a60152d9f5a06705176596925 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Jul 2013 18:15:46 +0400 Subject: constify touch_atime() Signed-off-by: Al Viro --- fs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/inode.c') diff --git a/fs/inode.c b/fs/inode.c index d6dfb09c8280..93a0625b46e4 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1525,7 +1525,7 @@ static int update_time(struct inode *inode, struct timespec *time, int flags) * This function automatically handles read only file systems and media, * as well as the "noatime" flag and inode specific "noatime" markers. */ -void touch_atime(struct path *path) +void touch_atime(const struct path *path) { struct vfsmount *mnt = path->mnt; struct inode *inode = path->dentry->d_inode; -- cgit v1.2.3 From 3942c07ccf98e66b8893f396dca98f5b076f905f Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Wed, 28 Aug 2013 10:17:53 +1000 Subject: fs: bump inode and dentry counters to long MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This series reworks our current object cache shrinking infrastructure in two main ways: * Noticing that a lot of users copy and paste their own version of LRU lists for objects, we put some effort in providing a generic version. It is modeled after the filesystem users: dentries, inodes, and xfs (for various tasks), but we expect that other users could benefit in the near future with little or no modification. Let us know if you have any issues. * The underlying list_lru being proposed automatically and transparently keeps the elements in per-node lists, and is able to manipulate the node lists individually. Given this infrastructure, we are able to modify the up-to-now hammer called shrink_slab to proceed with node-reclaim instead of always searching memory from all over like it has been doing. Per-node lru lists are also expected to lead to less contention in the lru locks on multi-node scans, since we are now no longer fighting for a global lock. The locks usually disappear from the profilers with this change. Although we have no official benchmarks for this version - be our guest to independently evaluate this - earlier versions of this series were performance tested (details at http://permalink.gmane.org/gmane.linux.kernel.mm/100537) yielding no visible performance regressions while yielding a better qualitative behavior in NUMA machines. With this infrastructure in place, we can use the list_lru entry point to provide memcg isolation and per-memcg targeted reclaim. Historically, those two pieces of work have been posted together. This version presents only the infrastructure work, deferring the memcg work for a later time, so we can focus on getting this part tested. You can see more about the history of such work at http://lwn.net/Articles/552769/ Dave Chinner (18): dcache: convert dentry_stat.nr_unused to per-cpu counters dentry: move to per-sb LRU locks dcache: remove dentries from LRU before putting on dispose list mm: new shrinker API shrinker: convert superblock shrinkers to new API list: add a new LRU list type inode: convert inode lru list to generic lru list code. dcache: convert to use new lru list infrastructure list_lru: per-node list infrastructure shrinker: add node awareness fs: convert inode and dentry shrinking to be node aware xfs: convert buftarg LRU to generic code xfs: rework buffer dispose list tracking xfs: convert dquot cache lru to list_lru fs: convert fs shrinkers to new scan/count API drivers: convert shrinkers to new count/scan API shrinker: convert remaining shrinkers to count/scan API shrinker: Kill old ->shrink API. Glauber Costa (7): fs: bump inode and dentry counters to long super: fix calculation of shrinkable objects for small numbers list_lru: per-node API vmscan: per-node deferred work i915: bail out earlier when shrinker cannot acquire mutex hugepage: convert huge zero page shrinker to new shrinker API list_lru: dynamically adjust node arrays This patch: There are situations in very large machines in which we can have a large quantity of dirty inodes, unused dentries, etc. This is particularly true when umounting a filesystem, where eventually since every live object will eventually be discarded. Dave Chinner reported a problem with this while experimenting with the shrinker revamp patchset. So we believe it is time for a change. This patch just moves int to longs. Machines where it matters should have a big long anyway. Signed-off-by: Glauber Costa Cc: Dave Chinner Cc: "Theodore Ts'o" Cc: Adrian Hunter Cc: Al Viro Cc: Artem Bityutskiy Cc: Arve Hjønnevåg Cc: Carlos Maiolino Cc: Christoph Hellwig Cc: Chuck Lever Cc: Daniel Vetter Cc: Dave Chinner Cc: David Rientjes Cc: Gleb Natapov Cc: Greg Thelen Cc: J. Bruce Fields Cc: Jan Kara Cc: Jerome Glisse Cc: John Stultz Cc: KAMEZAWA Hiroyuki Cc: Kent Overstreet Cc: Kirill A. Shutemov Cc: Marcelo Tosatti Cc: Mel Gorman Cc: Steven Whitehouse Cc: Thomas Hellstrom Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/inode.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs/inode.c') diff --git a/fs/inode.c b/fs/inode.c index 93a0625b46e4..2a3c37ea823d 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -70,33 +70,33 @@ EXPORT_SYMBOL(empty_aops); */ struct inodes_stat_t inodes_stat; -static DEFINE_PER_CPU(unsigned int, nr_inodes); -static DEFINE_PER_CPU(unsigned int, nr_unused); +static DEFINE_PER_CPU(unsigned long, nr_inodes); +static DEFINE_PER_CPU(unsigned long, nr_unused); static struct kmem_cache *inode_cachep __read_mostly; -static int get_nr_inodes(void) +static long get_nr_inodes(void) { int i; - int sum = 0; + long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_inodes, i); return sum < 0 ? 0 : sum; } -static inline int get_nr_inodes_unused(void) +static inline long get_nr_inodes_unused(void) { int i; - int sum = 0; + long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_unused, i); return sum < 0 ? 0 : sum; } -int get_nr_dirty_inodes(void) +long get_nr_dirty_inodes(void) { /* not actually dirty inodes, but a wild approximation */ - int nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); + long nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); return nr_dirty > 0 ? nr_dirty : 0; } @@ -109,7 +109,7 @@ int proc_nr_inodes(ctl_table *table, int write, { inodes_stat.nr_inodes = get_nr_inodes(); inodes_stat.nr_unused = get_nr_inodes_unused(); - return proc_dointvec(table, write, buffer, lenp, ppos); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #endif -- cgit v1.2.3 From 0a234c6dcb79a270803f5c9773ed650b78730962 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 28 Aug 2013 10:17:57 +1000 Subject: shrinker: convert superblock shrinkers to new API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert superblock shrinker to use the new count/scan API, and propagate the API changes through to the filesystem callouts. The filesystem callouts already use a count/scan API, so it's just changing counters to longs to match the VM API. This requires the dentry and inode shrinker callouts to be converted to the count/scan API. This is mainly a mechanical change. [glommer@openvz.org: use mult_frac for fractional proportions, build fixes] Signed-off-by: Dave Chinner Signed-off-by: Glauber Costa Acked-by: Mel Gorman Cc: "Theodore Ts'o" Cc: Adrian Hunter Cc: Al Viro Cc: Artem Bityutskiy Cc: Arve Hjønnevåg Cc: Carlos Maiolino Cc: Christoph Hellwig Cc: Chuck Lever Cc: Daniel Vetter Cc: David Rientjes Cc: Gleb Natapov Cc: Greg Thelen Cc: J. Bruce Fields Cc: Jan Kara Cc: Jerome Glisse Cc: John Stultz Cc: KAMEZAWA Hiroyuki Cc: Kent Overstreet Cc: Kirill A. Shutemov Cc: Marcelo Tosatti Cc: Mel Gorman Cc: Steven Whitehouse Cc: Thomas Hellstrom Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/inode.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/inode.c') diff --git a/fs/inode.c b/fs/inode.c index 2a3c37ea823d..021d64768a55 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -706,10 +706,11 @@ static int can_unuse(struct inode *inode) * LRU does not have strict ordering. Hence we don't want to reclaim inodes * with this flag set because they are the inodes that are out of order. */ -void prune_icache_sb(struct super_block *sb, int nr_to_scan) +long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan) { LIST_HEAD(freeable); - int nr_scanned; + long nr_scanned; + long freed = 0; unsigned long reap = 0; spin_lock(&sb->s_inode_lru_lock); @@ -779,6 +780,7 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan) list_move(&inode->i_lru, &freeable); sb->s_nr_inodes_unused--; this_cpu_dec(nr_unused); + freed++; } if (current_is_kswapd()) __count_vm_events(KSWAPD_INODESTEAL, reap); @@ -789,6 +791,7 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan) current->reclaim_state->reclaimed_slab += reap; dispose_list(&freeable); + return freed; } static void __wait_on_freeing_inode(struct inode *inode); -- cgit v1.2.3 From bc3b14cb2d505dda969dbe3a31038dbb24aca945 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 28 Aug 2013 10:17:58 +1000 Subject: inode: convert inode lru list to generic lru list code. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [glommer@openvz.org: adapted for new LRU return codes] Signed-off-by: Dave Chinner Signed-off-by: Glauber Costa Cc: "Theodore Ts'o" Cc: Adrian Hunter Cc: Al Viro Cc: Artem Bityutskiy Cc: Arve Hjønnevåg Cc: Carlos Maiolino Cc: Christoph Hellwig Cc: Chuck Lever Cc: Daniel Vetter Cc: David Rientjes Cc: Gleb Natapov Cc: Greg Thelen Cc: J. Bruce Fields Cc: Jan Kara Cc: Jerome Glisse Cc: John Stultz Cc: KAMEZAWA Hiroyuki Cc: Kent Overstreet Cc: Kirill A. Shutemov Cc: Marcelo Tosatti Cc: Mel Gorman Cc: Steven Whitehouse Cc: Thomas Hellstrom Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/inode.c | 175 +++++++++++++++++++++++++------------------------------------ 1 file changed, 70 insertions(+), 105 deletions(-) (limited to 'fs/inode.c') diff --git a/fs/inode.c b/fs/inode.c index 021d64768a55..a973d268c157 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -17,6 +17,7 @@ #include #include /* for inode_has_buffers */ #include +#include #include "internal.h" /* @@ -24,7 +25,7 @@ * * inode->i_lock protects: * inode->i_state, inode->i_hash, __iget() - * inode->i_sb->s_inode_lru_lock protects: + * Inode LRU list locks protect: * inode->i_sb->s_inode_lru, inode->i_lru * inode_sb_list_lock protects: * sb->s_inodes, inode->i_sb_list @@ -37,7 +38,7 @@ * * inode_sb_list_lock * inode->i_lock - * inode->i_sb->s_inode_lru_lock + * Inode LRU list locks * * bdi->wb.list_lock * inode->i_lock @@ -401,13 +402,8 @@ EXPORT_SYMBOL(ihold); static void inode_lru_list_add(struct inode *inode) { - spin_lock(&inode->i_sb->s_inode_lru_lock); - if (list_empty(&inode->i_lru)) { - list_add(&inode->i_lru, &inode->i_sb->s_inode_lru); - inode->i_sb->s_nr_inodes_unused++; + if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_inc(nr_unused); - } - spin_unlock(&inode->i_sb->s_inode_lru_lock); } /* @@ -425,13 +421,9 @@ void inode_add_lru(struct inode *inode) static void inode_lru_list_del(struct inode *inode) { - spin_lock(&inode->i_sb->s_inode_lru_lock); - if (!list_empty(&inode->i_lru)) { - list_del_init(&inode->i_lru); - inode->i_sb->s_nr_inodes_unused--; + + if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_dec(nr_unused); - } - spin_unlock(&inode->i_sb->s_inode_lru_lock); } /** @@ -675,24 +667,8 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) return busy; } -static int can_unuse(struct inode *inode) -{ - if (inode->i_state & ~I_REFERENCED) - return 0; - if (inode_has_buffers(inode)) - return 0; - if (atomic_read(&inode->i_count)) - return 0; - if (inode->i_data.nrpages) - return 0; - return 1; -} - /* - * Walk the superblock inode LRU for freeable inodes and attempt to free them. - * This is called from the superblock shrinker function with a number of inodes - * to trim from the LRU. Inodes to be freed are moved to a temporary list and - * then are freed outside inode_lock by dispose_list(). + * Isolate the inode from the LRU in preparation for freeing it. * * Any inodes which are pinned purely because of attached pagecache have their * pagecache removed. If the inode has metadata buffers attached to @@ -706,90 +682,79 @@ static int can_unuse(struct inode *inode) * LRU does not have strict ordering. Hence we don't want to reclaim inodes * with this flag set because they are the inodes that are out of order. */ -long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan) +static enum lru_status +inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) { - LIST_HEAD(freeable); - long nr_scanned; - long freed = 0; - unsigned long reap = 0; + struct list_head *freeable = arg; + struct inode *inode = container_of(item, struct inode, i_lru); - spin_lock(&sb->s_inode_lru_lock); - for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) { - struct inode *inode; + /* + * we are inverting the lru lock/inode->i_lock here, so use a trylock. + * If we fail to get the lock, just skip it. + */ + if (!spin_trylock(&inode->i_lock)) + return LRU_SKIP; - if (list_empty(&sb->s_inode_lru)) - break; + /* + * Referenced or dirty inodes are still in use. Give them another pass + * through the LRU as we canot reclaim them now. + */ + if (atomic_read(&inode->i_count) || + (inode->i_state & ~I_REFERENCED)) { + list_del_init(&inode->i_lru); + spin_unlock(&inode->i_lock); + this_cpu_dec(nr_unused); + return LRU_REMOVED; + } - inode = list_entry(sb->s_inode_lru.prev, struct inode, i_lru); + /* recently referenced inodes get one more pass */ + if (inode->i_state & I_REFERENCED) { + inode->i_state &= ~I_REFERENCED; + spin_unlock(&inode->i_lock); + return LRU_ROTATE; + } - /* - * we are inverting the sb->s_inode_lru_lock/inode->i_lock here, - * so use a trylock. If we fail to get the lock, just move the - * inode to the back of the list so we don't spin on it. - */ - if (!spin_trylock(&inode->i_lock)) { - list_move(&inode->i_lru, &sb->s_inode_lru); - continue; + if (inode_has_buffers(inode) || inode->i_data.nrpages) { + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(lru_lock); + if (remove_inode_buffers(inode)) { + unsigned long reap; + reap = invalidate_mapping_pages(&inode->i_data, 0, -1); + if (current_is_kswapd()) + __count_vm_events(KSWAPD_INODESTEAL, reap); + else + __count_vm_events(PGINODESTEAL, reap); + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += reap; } + iput(inode); + spin_lock(lru_lock); + return LRU_RETRY; + } - /* - * Referenced or dirty inodes are still in use. Give them - * another pass through the LRU as we canot reclaim them now. - */ - if (atomic_read(&inode->i_count) || - (inode->i_state & ~I_REFERENCED)) { - list_del_init(&inode->i_lru); - spin_unlock(&inode->i_lock); - sb->s_nr_inodes_unused--; - this_cpu_dec(nr_unused); - continue; - } + WARN_ON(inode->i_state & I_NEW); + inode->i_state |= I_FREEING; + spin_unlock(&inode->i_lock); - /* recently referenced inodes get one more pass */ - if (inode->i_state & I_REFERENCED) { - inode->i_state &= ~I_REFERENCED; - list_move(&inode->i_lru, &sb->s_inode_lru); - spin_unlock(&inode->i_lock); - continue; - } - if (inode_has_buffers(inode) || inode->i_data.nrpages) { - __iget(inode); - spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_lru_lock); - if (remove_inode_buffers(inode)) - reap += invalidate_mapping_pages(&inode->i_data, - 0, -1); - iput(inode); - spin_lock(&sb->s_inode_lru_lock); - - if (inode != list_entry(sb->s_inode_lru.next, - struct inode, i_lru)) - continue; /* wrong inode or list_empty */ - /* avoid lock inversions with trylock */ - if (!spin_trylock(&inode->i_lock)) - continue; - if (!can_unuse(inode)) { - spin_unlock(&inode->i_lock); - continue; - } - } - WARN_ON(inode->i_state & I_NEW); - inode->i_state |= I_FREEING; - spin_unlock(&inode->i_lock); + list_move(&inode->i_lru, freeable); + this_cpu_dec(nr_unused); + return LRU_REMOVED; +} - list_move(&inode->i_lru, &freeable); - sb->s_nr_inodes_unused--; - this_cpu_dec(nr_unused); - freed++; - } - if (current_is_kswapd()) - __count_vm_events(KSWAPD_INODESTEAL, reap); - else - __count_vm_events(PGINODESTEAL, reap); - spin_unlock(&sb->s_inode_lru_lock); - if (current->reclaim_state) - current->reclaim_state->reclaimed_slab += reap; +/* + * Walk the superblock inode LRU for freeable inodes and attempt to free them. + * This is called from the superblock shrinker function with a number of inodes + * to trim from the LRU. Inodes to be freed are moved to a temporary list and + * then are freed outside inode_lock by dispose_list(). + */ +long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan) +{ + LIST_HEAD(freeable); + long freed; + freed = list_lru_walk(&sb->s_inode_lru, inode_lru_isolate, + &freeable, nr_to_scan); dispose_list(&freeable); return freed; } -- cgit v1.2.3 From d38fa6986e9124f827aa6ea4a9dde01e67a37be7 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Wed, 28 Aug 2013 10:17:59 +1000 Subject: inode: move inode to a different list inside lock When removing an element from the lru, this will be done today after the lock is released. This is a clear mistake, although we are not sure if the bugs we are seeing are related to this. All list manipulations are done inside the lock, and so should this one. Signed-off-by: Glauber Costa Tested-by: Michal Hocko Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/inode.c') diff --git a/fs/inode.c b/fs/inode.c index a973d268c157..5b1ec47c5d39 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -735,9 +735,9 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; + list_move(&inode->i_lru, freeable); spin_unlock(&inode->i_lock); - list_move(&inode->i_lru, freeable); this_cpu_dec(nr_unused); return LRU_REMOVED; } -- cgit v1.2.3 From 9b17c62382dd2e7507984b9890bf44e070cdd8bb Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 28 Aug 2013 10:18:05 +1000 Subject: fs: convert inode and dentry shrinking to be node aware MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that the shrinker is passing a node in the scan control structure, we can pass this to the the generic LRU list code to isolate reclaim to the lists on matching nodes. Signed-off-by: Dave Chinner Signed-off-by: Glauber Costa Acked-by: Mel Gorman Cc: "Theodore Ts'o" Cc: Adrian Hunter Cc: Al Viro Cc: Artem Bityutskiy Cc: Arve Hjønnevåg Cc: Carlos Maiolino Cc: Christoph Hellwig Cc: Chuck Lever Cc: Daniel Vetter Cc: David Rientjes Cc: Gleb Natapov Cc: Greg Thelen Cc: J. Bruce Fields Cc: Jan Kara Cc: Jerome Glisse Cc: John Stultz Cc: KAMEZAWA Hiroyuki Cc: Kent Overstreet Cc: Kirill A. Shutemov Cc: Marcelo Tosatti Cc: Mel Gorman Cc: Steven Whitehouse Cc: Thomas Hellstrom Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/inode.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/inode.c') diff --git a/fs/inode.c b/fs/inode.c index 5b1ec47c5d39..b33ba8e021cc 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -748,13 +748,14 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) * to trim from the LRU. Inodes to be freed are moved to a temporary list and * then are freed outside inode_lock by dispose_list(). */ -long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan) +long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, + int nid) { LIST_HEAD(freeable); long freed; - freed = list_lru_walk(&sb->s_inode_lru, inode_lru_isolate, - &freeable, nr_to_scan); + freed = list_lru_walk_node(&sb->s_inode_lru, nid, inode_lru_isolate, + &freeable, &nr_to_scan); dispose_list(&freeable); return freed; } -- cgit v1.2.3