From 15570086b590a69d59183b08a7770e316cca20a7 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Mon, 2 Sep 2013 11:38:06 -0700
Subject: vfs: reimplement d_rcu_to_refcount() using lockref_get_or_lock()

This moves __d_rcu_to_refcount() from <linux/dcache.h> into fs/namei.c
and re-implements it using the lockref infrastructure instead.  It also
adds a lot of comments about what is actually going on, because turning
a dentry that was looked up using RCU into a long-lived reference-counted
entry is one of the more subtle parts of the rcu walk.

We also used to be _particularly_ subtle in unlazy_walk(), where we
re-validate both the dentry and its parent using the same sequence
count.  We used to do it by nesting the locks and then verifying the
sequence count just once.

That was silly, because nested locking is expensive, but the sequence
count check is not.  So this just re-validates the dentry and the parent
separately, avoiding the nested locking, and making the lockref lookup
possible.

Acked-by: Waiman Long
Signed-off-by: Linus Torvalds
---
 include/linux/dcache.h | 22 ----------------------
 1 file changed, 22 deletions(-)

(limited to 'include/linux/dcache.h')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index efdc94434c30..9169b91ea2d2 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -304,28 +304,6 @@ extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *);
 extern struct dentry *__d_lookup_rcu(const struct dentry *parent,
 				const struct qstr *name,
 				unsigned *seq);
 
-/**
- * __d_rcu_to_refcount - take a refcount on dentry if sequence check is ok
- * @dentry: dentry to take a ref on
- * @seq: seqcount to verify against
- * Returns: 0 on failure, else 1.
- *
- * __d_rcu_to_refcount operates on a dentry,seq pair that was returned
- * by __d_lookup_rcu, to get a reference on an rcu-walk dentry.
- */
-static inline int __d_rcu_to_refcount(struct dentry *dentry, unsigned seq)
-{
-	int ret = 0;
-
-	assert_spin_locked(&dentry->d_lock);
-	if (!read_seqcount_retry(&dentry->d_seq, seq)) {
-		ret = 1;
-		dentry->d_lockref.count++;
-	}
-
-	return ret;
-}
-
 static inline unsigned d_count(const struct dentry *dentry)
 {
 	return dentry->d_lockref.count;
--
cgit v1.2.3

From 848ac114e847af3f1f9141c90a39ebe79bdb13b3 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi
Date: Thu, 5 Sep 2013 11:44:36 +0200
Subject: vfs: check submounts and drop atomically

We check submounts before doing d_drop() on a non-empty directory
dentry in NFS (have_submounts()), but we do not exclude a racing mount:

 Process A: have_submounts() -> returns false
 Process B: mount() -> success
 Process A: d_drop()

This patch prepares the ground for the fix by doing the following
operations all under the same rename lock:

 have_submounts()
 shrink_dcache_parent()
 d_drop()

This is actually an optimization, since have_submounts() and
shrink_dcache_parent() both traverse the same dentry tree separately.
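For illustration, a minimal, hedged sketch of how a filesystem's
->d_revalidate() might use the resulting helper instead of the racy
open-coded pair.  The surrounding function and the staleness check are
hypothetical stand-ins, not code from this patch:

	/* dentry_is_stale() is a hypothetical fs-specific check */
	static int example_d_revalidate(struct dentry *dentry, unsigned int flags)
	{
		if (!dentry_is_stale(dentry))
			return 1;		/* still valid, keep it */

		/*
		 * Racy pattern this helper replaces:
		 *	if (!have_submounts(dentry))
		 *		d_drop(dentry);
		 * A mount can slip in between the two calls.
		 */
		if (check_submounts_and_drop(dentry) != 0)
			return 1;		/* something is mounted here */

		return 0;			/* dropped: force a fresh lookup */
	}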
Signed-off-by: Miklos Szeredi
CC: David Howells
CC: Steven Whitehouse
CC: Trond Myklebust
CC: Greg Kroah-Hartman
Signed-off-by: Al Viro
---
 include/linux/dcache.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux/dcache.h')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 9169b91ea2d2..c6d3fe50ff96 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -253,6 +253,7 @@ extern void d_prune_aliases(struct inode *);
 
 /* test whether we have any submounts in a subdir tree */
 extern int have_submounts(struct dentry *);
+extern int check_submounts_and_drop(struct dentry *);
 
 /*
  * This adds the entry to the hash queues.
--
cgit v1.2.3

From f0d3b3ded999daae1cf451d051018038c0a05bae Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Thu, 5 Sep 2013 12:11:29 -0400
Subject: constify dcache.c inlined helpers where possible

Signed-off-by: Al Viro
---
 include/linux/dcache.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux/dcache.h')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index c6d3fe50ff96..fe50f3db3af9 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -212,7 +212,7 @@ struct dentry_operations {
 
 extern seqlock_t rename_lock;
 
-static inline int dname_external(struct dentry *dentry)
+static inline int dname_external(const struct dentry *dentry)
 {
 	return dentry->d_name.name != dentry->d_iname;
 }
@@ -358,17 +358,17 @@ extern struct dentry *dget_parent(struct dentry *dentry);
  * Returns true if the dentry passed is not currently hashed.
  */
 
-static inline int d_unhashed(struct dentry *dentry)
+static inline int d_unhashed(const struct dentry *dentry)
 {
 	return hlist_bl_unhashed(&dentry->d_hash);
 }
 
-static inline int d_unlinked(struct dentry *dentry)
+static inline int d_unlinked(const struct dentry *dentry)
 {
 	return d_unhashed(dentry) && !IS_ROOT(dentry);
 }
 
-static inline int cant_mount(struct dentry *dentry)
+static inline int cant_mount(const struct dentry *dentry)
 {
 	return (dentry->d_flags & DCACHE_CANT_MOUNT);
 }
@@ -382,12 +382,12 @@ static inline void dont_mount(struct dentry *dentry)
 
 extern void dput(struct dentry *);
 
-static inline bool d_managed(struct dentry *dentry)
+static inline bool d_managed(const struct dentry *dentry)
 {
 	return dentry->d_flags & DCACHE_MANAGED_DENTRY;
 }
 
-static inline bool d_mountpoint(struct dentry *dentry)
+static inline bool d_mountpoint(const struct dentry *dentry)
 {
 	return dentry->d_flags & DCACHE_MOUNTED;
 }
--
cgit v1.2.3

From 8aab6a27332bbf2abfcb35224738394e784d940b Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sun, 8 Sep 2013 13:26:18 -0700
Subject: vfs: reorganize dput() memory accesses

This is me being a bit OCD after all the dentry optimization work this
merge window: profiles end up showing 'dput()' as a rather expensive
operation, and there were two unrelated bad reasons for that.

The first reason was reading d_lockref.count for debugging purposes,
which touches the lockref cacheline (for reads) before we really need
to.  More importantly, the debugging test in question is _wrong_, and
has hidden bugs.  It's true that we can only sleep when the count goes
down to zero, but the test as-is hides the much more subtle bug that
happens if we race with somebody else deleting the file.

Anyway, we _will_ touch that cacheline, but let's do it for a write and
in the right routine (ie in "lockref_put_or_lock()"), which annotates
the costs better.  So remove the misleading debug code.
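A simplified sketch of the resulting fast path (the slow-path helper
named below is a hypothetical stand-in for the real code in
fs/dcache.c):

	void dput(struct dentry *dentry)
	{
		if (unlikely(!dentry))
			return;

		/*
		 * The only write to the lockref cacheline happens here:
		 * either the count was > 1 and is dropped locklessly, or
		 * this returns with d_lock held for the slow path.
		 */
		if (lockref_put_or_lock(&dentry->d_lockref))
			return;

		/* count reached zero, d_lock is held */
		dentry_kill_slowpath(dentry);	/* hypothetical stand-in */
	}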
The other reason was an unnecessary access to the cacheline that
contains the d_lru list, just to check whether we already were on the
LRU list or not.  This is exactly what we have d_flags for, so that we
can avoid touching extra cache lines in the common case.  So just add
another bit for "is this dentry on the LRU".

Finally, mark the tests properly likely/unlikely, so that the common
fast-paths are dense in the instruction stream.

This makes the profiles look much saner.

Signed-off-by: Linus Torvalds
---
 include/linux/dcache.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux/dcache.h')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index fe50f3db3af9..feaa8d88eef7 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -208,6 +208,7 @@ struct dentry_operations {
 #define DCACHE_MANAGED_DENTRY \
 	(DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT)
 
+#define DCACHE_LRU_LIST		0x80000
 #define DCACHE_DENTRY_KILLED	0x100000
 
 extern seqlock_t rename_lock;
--
cgit v1.2.3

From 3942c07ccf98e66b8893f396dca98f5b076f905f Mon Sep 17 00:00:00 2001
From: Glauber Costa
Date: Wed, 28 Aug 2013 10:17:53 +1000
Subject: fs: bump inode and dentry counters to long
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This series reworks our current object cache shrinking infrastructure
in two main ways:

 * Noticing that a lot of users copy and paste their own version of LRU
   lists for objects, we put some effort into providing a generic
   version.  It is modeled after the filesystem users: dentries, inodes,
   and xfs (for various tasks), but we expect that other users could
   benefit in the near future with little or no modification.  Let us
   know if you have any issues.

 * The underlying list_lru being proposed automatically and
   transparently keeps the elements in per-node lists, and is able to
   manipulate the node lists individually.  Given this infrastructure,
   we are able to modify the up-to-now hammer called shrink_slab to
   proceed with node reclaim instead of always searching memory from
   all over like it has been doing.

Per-node lru lists are also expected to lead to less contention in the
lru locks on multi-node scans, since we are no longer fighting for a
global lock.  The locks usually disappear from the profilers with this
change.

Although we have no official benchmarks for this version - be our
guest to independently evaluate this - earlier versions of this series
were performance tested (details at
http://permalink.gmane.org/gmane.linux.kernel.mm/100537) yielding no
visible performance regressions while yielding a better qualitative
behavior in NUMA machines.

With this infrastructure in place, we can use the list_lru entry point
to provide memcg isolation and per-memcg targeted reclaim.
Historically, those two pieces of work have been posted together.
This version presents only the infrastructure work, deferring the
memcg work for a later time, so we can focus on getting this part
tested.  You can see more about the history of such work at
http://lwn.net/Articles/552769/

Dave Chinner (18):
  dcache: convert dentry_stat.nr_unused to per-cpu counters
  dentry: move to per-sb LRU locks
  dcache: remove dentries from LRU before putting on dispose list
  mm: new shrinker API
  shrinker: convert superblock shrinkers to new API
  list: add a new LRU list type
  inode: convert inode lru list to generic lru list code.
  dcache: convert to use new lru list infrastructure
  list_lru: per-node list infrastructure
  shrinker: add node awareness
  fs: convert inode and dentry shrinking to be node aware
  xfs: convert buftarg LRU to generic code
  xfs: rework buffer dispose list tracking
  xfs: convert dquot cache lru to list_lru
  fs: convert fs shrinkers to new scan/count API
  drivers: convert shrinkers to new count/scan API
  shrinker: convert remaining shrinkers to count/scan API
  shrinker: Kill old ->shrink API.

Glauber Costa (7):
  fs: bump inode and dentry counters to long
  super: fix calculation of shrinkable objects for small numbers
  list_lru: per-node API
  vmscan: per-node deferred work
  i915: bail out earlier when shrinker cannot acquire mutex
  hugepage: convert huge zero page shrinker to new shrinker API
  list_lru: dynamically adjust node arrays

This patch:

There are situations in very large machines in which we can have a
large quantity of dirty inodes, unused dentries, etc.  This is
particularly true when umounting a filesystem, where eventually every
live object will be discarded.

Dave Chinner reported a problem with this while experimenting with the
shrinker revamp patchset, so we believe it is time for a change.  This
patch just converts the counters from int to long.  Machines where it
matters should have a big long anyway.

Signed-off-by: Glauber Costa
Cc: Dave Chinner
Cc: "Theodore Ts'o"
Cc: Adrian Hunter
Cc: Al Viro
Cc: Artem Bityutskiy
Cc: Arve Hjønnevåg
Cc: Carlos Maiolino
Cc: Christoph Hellwig
Cc: Chuck Lever
Cc: Daniel Vetter
Cc: Dave Chinner
Cc: David Rientjes
Cc: Gleb Natapov
Cc: Greg Thelen
Cc: J. Bruce Fields
Cc: Jan Kara
Cc: Jerome Glisse
Cc: John Stultz
Cc: KAMEZAWA Hiroyuki
Cc: Kent Overstreet
Cc: Kirill A. Shutemov
Cc: Marcelo Tosatti
Cc: Mel Gorman
Cc: Steven Whitehouse
Cc: Thomas Hellstrom
Cc: Trond Myklebust
Signed-off-by: Andrew Morton
Signed-off-by: Al Viro
---
 include/linux/dcache.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux/dcache.h')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index feaa8d88eef7..844a1ef387e4 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -55,11 +55,11 @@ struct qstr {
 #define hashlen_len(hashlen) ((u32)((hashlen) >> 32))
 
 struct dentry_stat_t {
-	int nr_dentry;
-	int nr_unused;
-	int age_limit;		/* age in seconds */
-	int want_pages;		/* pages requested by system */
-	int dummy[2];
+	long nr_dentry;
+	long nr_unused;
+	long age_limit;		/* age in seconds */
+	long want_pages;	/* pages requested by system */
+	long dummy[2];
 };
 extern struct dentry_stat_t dentry_stat;
--
cgit v1.2.3

From 55f841ce9395a72c6285fbcc4c403c0c786e1c74 Mon Sep 17 00:00:00 2001
From: Glauber Costa
Date: Wed, 28 Aug 2013 10:17:53 +1000
Subject: super: fix calculation of shrinkable objects for small numbers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The sysctl knob sysctl_vfs_cache_pressure is used to determine which
percentage of the shrinkable objects in our cache we should actively
try to shrink.  It works great in situations in which we have many
objects (at least more than 100), because the approximation errors
will be negligible.  But if this is not the case, especially when
total_objects < 100, we may end up concluding that we have no objects
at all (total / 100 = 0 whenever total < 100).

This is certainly not the biggest killer in the world, but may matter
in very low kernel memory situations.
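The fix replaces the divide-then-multiply with mult_frac().  A small
standalone demonstration of the arithmetic (userspace C; mult_frac()
is reproduced from include/linux/kernel.h so the example compiles on
its own):

	#include <stdio.h>

	/* Divide last, so small values are not truncated to zero
	 * before the multiplication. */
	#define mult_frac(x, numer, denom) ({			\
		typeof(x) quot = (x) / (denom);			\
		typeof(x) rem  = (x) % (denom);			\
		(quot * (numer)) + ((rem * (numer)) / (denom));	\
	})

	int main(void)
	{
		unsigned long total_objects = 80;
		unsigned long pressure = 100;	/* sysctl_vfs_cache_pressure */

		/* old: (80 / 100) * 100 == 0, every object is hidden */
		printf("old: %lu\n", (total_objects / 100) * pressure);
		/* new: (80 * 100) / 100 == 80 */
		printf("new: %lu\n", mult_frac(total_objects, pressure, 100));
		return 0;
	}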
Signed-off-by: Glauber Costa
Reviewed-by: Carlos Maiolino
Acked-by: KAMEZAWA Hiroyuki
Acked-by: Mel Gorman
Cc: Dave Chinner
Cc: Al Viro
Cc: "Theodore Ts'o"
Cc: Adrian Hunter
Cc: Al Viro
Cc: Artem Bityutskiy
Cc: Arve Hjønnevåg
Cc: Carlos Maiolino
Cc: Christoph Hellwig
Cc: Chuck Lever
Cc: Daniel Vetter
Cc: David Rientjes
Cc: Gleb Natapov
Cc: Greg Thelen
Cc: J. Bruce Fields
Cc: Jan Kara
Cc: Jerome Glisse
Cc: John Stultz
Cc: KAMEZAWA Hiroyuki
Cc: Kent Overstreet
Cc: Kirill A. Shutemov
Cc: Marcelo Tosatti
Cc: Mel Gorman
Cc: Steven Whitehouse
Cc: Thomas Hellstrom
Cc: Trond Myklebust
Signed-off-by: Andrew Morton
Signed-off-by: Al Viro
---
 include/linux/dcache.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux/dcache.h')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 844a1ef387e4..59066e0b4ff1 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -395,4 +395,8 @@ static inline bool d_mountpoint(const struct dentry *dentry)
 
 extern int sysctl_vfs_cache_pressure;
 
+static inline unsigned long vfs_pressure_ratio(unsigned long val)
+{
+	return mult_frac(val, sysctl_vfs_cache_pressure, 100);
+}
 #endif /* __LINUX_DCACHE_H */
--
cgit v1.2.3
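In use, a shrinker's count callback ends up scaling its totals through
this helper.  A hedged sketch, modeled loosely on fs/super.c after this
series (the two counting helpers are hypothetical stand-ins):

	static unsigned long example_cache_count(struct shrinker *shrink,
						 struct shrink_control *sc)
	{
		unsigned long total_objects;

		/* hypothetical per-sb counters standing in for the real ones */
		total_objects = count_unused_dentries(sc) + count_unused_inodes(sc);

		/* scale by sysctl_vfs_cache_pressure without truncating
		 * small totals to zero */
		return vfs_pressure_ratio(total_objects);
	}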