From 15570086b590a69d59183b08a7770e316cca20a7 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Mon, 2 Sep 2013 11:38:06 -0700
Subject: vfs: reimplement d_rcu_to_refcount() using lockref_get_or_lock()

This moves __d_rcu_to_refcount() from <linux/dcache.h> into fs/namei.c
and re-implements it using the lockref infrastructure instead.  It also
adds a lot of comments about what is actually going on, because turning
a dentry that was looked up using RCU into a long-lived reference-counted
entry is one of the more subtle parts of the rcu walk.

We also used to be _particularly_ subtle in unlazy_walk(), where we
re-validate both the dentry and its parent using the same sequence
count.  We used to do it by nesting the locks and then verifying the
sequence count just once.

That was silly, because nested locking is expensive, but the sequence
count check is not.  So this just re-validates the dentry and the parent
separately, avoiding the nested locking, and making the lockref lookup
possible.

Acked-by: Waiman Long
Signed-off-by: Linus Torvalds
---
 include/linux/dcache.h | 22 ----------------------
 1 file changed, 22 deletions(-)

(limited to 'include/linux/dcache.h')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index efdc94434c30..9169b91ea2d2 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -304,28 +304,6 @@ extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *);
 extern struct dentry *__d_lookup_rcu(const struct dentry *parent,
 				const struct qstr *name,
 				unsigned *seq);
 
-/**
- * __d_rcu_to_refcount - take a refcount on dentry if sequence check is ok
- * @dentry: dentry to take a ref on
- * @seq: seqcount to verify against
- * Returns: 0 on failure, else 1.
- *
- * __d_rcu_to_refcount operates on a dentry,seq pair that was returned
- * by __d_lookup_rcu, to get a reference on an rcu-walk dentry.
- */
-static inline int __d_rcu_to_refcount(struct dentry *dentry, unsigned seq)
-{
-	int ret = 0;
-
-	assert_spin_locked(&dentry->d_lock);
-	if (!read_seqcount_retry(&dentry->d_seq, seq)) {
-		ret = 1;
-		dentry->d_lockref.count++;
-	}
-
-	return ret;
-}
-
 static inline unsigned d_count(const struct dentry *dentry)
 {
 	return dentry->d_lockref.count;
--
cgit v1.2.3

From 848ac114e847af3f1f9141c90a39ebe79bdb13b3 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi
Date: Thu, 5 Sep 2013 11:44:36 +0200
Subject: vfs: check submounts and drop atomically

We check submounts before doing d_drop() on a non-empty directory
dentry in NFS (have_submounts()), but we do not exclude a racing mount:

 Process A: have_submounts() -> returns false
 Process B: mount() -> success
 Process A: d_drop()

This patch prepares the ground for the fix by doing the following
operations all under the same rename lock:

 have_submounts()
 shrink_dcache_parent()
 d_drop()

This is actually an optimization, since have_submounts() and
shrink_dcache_parent() both traverse the same dentry tree separately.
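For illustration, a minimal, hedged sketch of how a filesystem's
->d_revalidate() might use the resulting helper instead of the racy
open-coded pair.  The surrounding function and the staleness check are
hypothetical stand-ins, not code from this patch:

	/* dentry_is_stale() is a hypothetical fs-specific check */
	static int example_d_revalidate(struct dentry *dentry, unsigned int flags)
	{
		if (!dentry_is_stale(dentry))
			return 1;		/* still valid, keep it */

		/*
		 * Racy pattern this helper replaces:
		 *	if (!have_submounts(dentry))
		 *		d_drop(dentry);
		 * A mount can slip in between the two calls.
		 */
		if (check_submounts_and_drop(dentry) != 0)
			return 1;		/* something is mounted here */

		return 0;			/* dropped: force a fresh lookup */
	}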
Signed-off-by: Miklos Szeredi
CC: David Howells
CC: Steven Whitehouse
CC: Trond Myklebust
CC: Greg Kroah-Hartman
Signed-off-by: Al Viro
---
 include/linux/dcache.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux/dcache.h')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 9169b91ea2d2..c6d3fe50ff96 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -253,6 +253,7 @@ extern void d_prune_aliases(struct inode *);
 
 /* test whether we have any submounts in a subdir tree */
 extern int have_submounts(struct dentry *);
+extern int check_submounts_and_drop(struct dentry *);
 
 /*
  * This adds the entry to the hash queues.
--
cgit v1.2.3

From f0d3b3ded999daae1cf451d051018038c0a05bae Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Thu, 5 Sep 2013 12:11:29 -0400
Subject: constify dcache.c inlined helpers where possible

Signed-off-by: Al Viro
---
 include/linux/dcache.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux/dcache.h')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index c6d3fe50ff96..fe50f3db3af9 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -212,7 +212,7 @@ struct dentry_operations {
 
 extern seqlock_t rename_lock;
 
-static inline int dname_external(struct dentry *dentry)
+static inline int dname_external(const struct dentry *dentry)
 {
 	return dentry->d_name.name != dentry->d_iname;
 }
@@ -358,17 +358,17 @@ extern struct dentry *dget_parent(struct dentry *dentry);
  * Returns true if the dentry passed is not currently hashed.
  */
 
-static inline int d_unhashed(struct dentry *dentry)
+static inline int d_unhashed(const struct dentry *dentry)
 {
 	return hlist_bl_unhashed(&dentry->d_hash);
 }
 
-static inline int d_unlinked(struct dentry *dentry)
+static inline int d_unlinked(const struct dentry *dentry)
 {
 	return d_unhashed(dentry) && !IS_ROOT(dentry);
 }
 
-static inline int cant_mount(struct dentry *dentry)
+static inline int cant_mount(const struct dentry *dentry)
 {
 	return (dentry->d_flags & DCACHE_CANT_MOUNT);
 }
@@ -382,12 +382,12 @@ static inline void dont_mount(struct dentry *dentry)
 
 extern void dput(struct dentry *);
 
-static inline bool d_managed(struct dentry *dentry)
+static inline bool d_managed(const struct dentry *dentry)
 {
 	return dentry->d_flags & DCACHE_MANAGED_DENTRY;
 }
 
-static inline bool d_mountpoint(struct dentry *dentry)
+static inline bool d_mountpoint(const struct dentry *dentry)
 {
 	return dentry->d_flags & DCACHE_MOUNTED;
 }
--
cgit v1.2.3

From 8aab6a27332bbf2abfcb35224738394e784d940b Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sun, 8 Sep 2013 13:26:18 -0700
Subject: vfs: reorganize dput() memory accesses

This is me being a bit OCD after all the dentry optimization work this
merge window: profiles end up showing 'dput()' as a rather expensive
operation, and there were two unrelated bad reasons for that.

The first reason was reading d_lockref.count for debugging purposes,
which touches the lockref cacheline (for reads) before we really need
to.  More importantly, the debugging test in question is _wrong_, and
has hidden bugs.  It's true that we can only sleep when the count goes
down to zero, but the test as-is hides the much more subtle bug that
happens if we race with somebody else deleting the file.

Anyway, we _will_ touch that cacheline, but let's do it for a write and
in the right routine (ie in "lockref_put_or_lock()"), which annotates
the costs better.  So remove the misleading debug code.
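A simplified sketch of the resulting fast path (the slow-path helper
named below is a hypothetical stand-in for the real code in
fs/dcache.c):

	void dput(struct dentry *dentry)
	{
		if (unlikely(!dentry))
			return;

		/*
		 * The only write to the lockref cacheline happens here:
		 * either the count was > 1 and is dropped locklessly, or
		 * this returns with d_lock held for the slow path.
		 */
		if (lockref_put_or_lock(&dentry->d_lockref))
			return;

		/* count reached zero, d_lock is held */
		dentry_kill_slowpath(dentry);	/* hypothetical stand-in */
	}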
The other reason was an unnecessary access to the cacheline that
contains the d_lru list, just to check whether we already were on the
LRU list or not.  This is exactly what we have d_flags for, so that we
can avoid touching extra cache lines in the common case.  So just add
another bit for "is this dentry on the LRU".

Finally, mark the tests properly likely/unlikely, so that the common
fast-paths are dense in the instruction stream.

This makes the profiles look much saner.

Signed-off-by: Linus Torvalds
---
 include/linux/dcache.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux/dcache.h')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index fe50f3db3af9..feaa8d88eef7 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -208,6 +208,7 @@ struct dentry_operations {
 #define DCACHE_MANAGED_DENTRY \
 	(DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT)
 
+#define DCACHE_LRU_LIST		0x80000
 #define DCACHE_DENTRY_KILLED	0x100000
 
 extern seqlock_t rename_lock;
--
cgit v1.2.3

From 3942c07ccf98e66b8893f396dca98f5b076f905f Mon Sep 17 00:00:00 2001
From: Glauber Costa
Date: Wed, 28 Aug 2013 10:17:53 +1000
Subject: fs: bump inode and dentry counters to long
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This series reworks our current object cache shrinking infrastructure
in two main ways:

 * Noticing that a lot of users copy and paste their own version of LRU
   lists for objects, we put some effort into providing a generic
   version.  It is modeled after the filesystem users: dentries, inodes,
   and xfs (for various tasks), but we expect that other users could
   benefit in the near future with little or no modification.  Let us
   know if you have any issues.

 * The underlying list_lru being proposed automatically and
   transparently keeps the elements in per-node lists, and is able to
   manipulate the node lists individually.  Given this infrastructure,
   we are able to modify the up-to-now hammer called shrink_slab to
   proceed with node reclaim instead of always searching memory from
   all over like it has been doing.

Per-node lru lists are also expected to lead to less contention in the
lru locks on multi-node scans, since we are no longer fighting for a
global lock.  The locks usually disappear from the profilers with this
change.

Although we have no official benchmarks for this version - be our
guest to independently evaluate this - earlier versions of this series
were performance tested (details at
http://permalink.gmane.org/gmane.linux.kernel.mm/100537) yielding no
visible performance regressions while yielding a better qualitative
behavior in NUMA machines.

With this infrastructure in place, we can use the list_lru entry point
to provide memcg isolation and per-memcg targeted reclaim.
Historically, those two pieces of work have been posted together.
This version presents only the infrastructure work, deferring the
memcg work for a later time, so we can focus on getting this part
tested.  You can see more about the history of such work at
http://lwn.net/Articles/552769/

Dave Chinner (18):
  dcache: convert dentry_stat.nr_unused to per-cpu counters
  dentry: move to per-sb LRU locks
  dcache: remove dentries from LRU before putting on dispose list
  mm: new shrinker API
  shrinker: convert superblock shrinkers to new API
  list: add a new LRU list type
  inode: convert inode lru list to generic lru list code.
  dcache: convert to use new lru list infrastructure
  list_lru: per-node list infrastructure
  shrinker: add node awareness
  fs: convert inode and dentry shrinking to be node aware
  xfs: convert buftarg LRU to generic code
  xfs: rework buffer dispose list tracking
  xfs: convert dquot cache lru to list_lru
  fs: convert fs shrinkers to new scan/count API
  drivers: convert shrinkers to new count/scan API
  shrinker: convert remaining shrinkers to count/scan API
  shrinker: Kill old ->shrink API.

Glauber Costa (7):
  fs: bump inode and dentry counters to long
  super: fix calculation of shrinkable objects for small numbers
  list_lru: per-node API
  vmscan: per-node deferred work
  i915: bail out earlier when shrinker cannot acquire mutex
  hugepage: convert huge zero page shrinker to new shrinker API
  list_lru: dynamically adjust node arrays

This patch:

There are situations in very large machines in which we can have a
large quantity of dirty inodes, unused dentries, etc.  This is
particularly true when umounting a filesystem, where eventually every
live object will be discarded.

Dave Chinner reported a problem with this while experimenting with the
shrinker revamp patchset, so we believe it is time for a change.  This
patch just converts the counters from int to long.  Machines where it
matters should have a big long anyway.

Signed-off-by: Glauber Costa
Cc: Dave Chinner
Cc: "Theodore Ts'o"
Cc: Adrian Hunter
Cc: Al Viro
Cc: Artem Bityutskiy
Cc: Arve Hjønnevåg
Cc: Carlos Maiolino
Cc: Christoph Hellwig
Cc: Chuck Lever
Cc: Daniel Vetter
Cc: Dave Chinner
Cc: David Rientjes
Cc: Gleb Natapov
Cc: Greg Thelen
Cc: J. Bruce Fields
Cc: Jan Kara
Cc: Jerome Glisse
Cc: John Stultz
Cc: KAMEZAWA Hiroyuki
Cc: Kent Overstreet
Cc: Kirill A. Shutemov
Cc: Marcelo Tosatti
Cc: Mel Gorman
Cc: Steven Whitehouse
Cc: Thomas Hellstrom
Cc: Trond Myklebust
Signed-off-by: Andrew Morton
Signed-off-by: Al Viro
---
 include/linux/dcache.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux/dcache.h')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index feaa8d88eef7..844a1ef387e4 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -55,11 +55,11 @@ struct qstr {
 #define hashlen_len(hashlen) ((u32)((hashlen) >> 32))
 
 struct dentry_stat_t {
-	int nr_dentry;
-	int nr_unused;
-	int age_limit;		/* age in seconds */
-	int want_pages;		/* pages requested by system */
-	int dummy[2];
+	long nr_dentry;
+	long nr_unused;
+	long age_limit;		/* age in seconds */
+	long want_pages;	/* pages requested by system */
+	long dummy[2];
 };
 extern struct dentry_stat_t dentry_stat;
--
cgit v1.2.3

From 55f841ce9395a72c6285fbcc4c403c0c786e1c74 Mon Sep 17 00:00:00 2001
From: Glauber Costa
Date: Wed, 28 Aug 2013 10:17:53 +1000
Subject: super: fix calculation of shrinkable objects for small numbers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The sysctl knob sysctl_vfs_cache_pressure is used to determine which
percentage of the shrinkable objects in our cache we should actively
try to shrink.  It works great in situations in which we have many
objects (at least more than 100), because the approximation errors
will be negligible.  But if this is not the case, especially when
total_objects < 100, we may end up concluding that we have no objects
at all (total / 100 = 0 whenever total < 100).

This is certainly not the biggest killer in the world, but may matter
in very low kernel memory situations.
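The fix replaces the divide-then-multiply with mult_frac().  A small
standalone demonstration of the arithmetic (userspace C; mult_frac()
is reproduced from include/linux/kernel.h so the example compiles on
its own):

	#include <stdio.h>

	/* Divide last, so small values are not truncated to zero
	 * before the multiplication. */
	#define mult_frac(x, numer, denom) ({			\
		typeof(x) quot = (x) / (denom);			\
		typeof(x) rem  = (x) % (denom);			\
		(quot * (numer)) + ((rem * (numer)) / (denom));	\
	})

	int main(void)
	{
		unsigned long total_objects = 80;
		unsigned long pressure = 100;	/* sysctl_vfs_cache_pressure */

		/* old: (80 / 100) * 100 == 0, every object is hidden */
		printf("old: %lu\n", (total_objects / 100) * pressure);
		/* new: (80 * 100) / 100 == 80 */
		printf("new: %lu\n", mult_frac(total_objects, pressure, 100));
		return 0;
	}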
Signed-off-by: Glauber Costa
Reviewed-by: Carlos Maiolino
Acked-by: KAMEZAWA Hiroyuki
Acked-by: Mel Gorman
Cc: Dave Chinner
Cc: Al Viro
Cc: "Theodore Ts'o"
Cc: Adrian Hunter
Cc: Al Viro
Cc: Artem Bityutskiy
Cc: Arve Hjønnevåg
Cc: Carlos Maiolino
Cc: Christoph Hellwig
Cc: Chuck Lever
Cc: Daniel Vetter
Cc: David Rientjes
Cc: Gleb Natapov
Cc: Greg Thelen
Cc: J. Bruce Fields
Cc: Jan Kara
Cc: Jerome Glisse
Cc: John Stultz
Cc: KAMEZAWA Hiroyuki
Cc: Kent Overstreet
Cc: Kirill A. Shutemov
Cc: Marcelo Tosatti
Cc: Mel Gorman
Cc: Steven Whitehouse
Cc: Thomas Hellstrom
Cc: Trond Myklebust
Signed-off-by: Andrew Morton
Signed-off-by: Al Viro
---
 include/linux/dcache.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux/dcache.h')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 844a1ef387e4..59066e0b4ff1 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -395,4 +395,8 @@ static inline bool d_mountpoint(const struct dentry *dentry)
 
 extern int sysctl_vfs_cache_pressure;
 
+static inline unsigned long vfs_pressure_ratio(unsigned long val)
+{
+	return mult_frac(val, sysctl_vfs_cache_pressure, 100);
+}
 #endif /* __LINUX_DCACHE_H */
--
cgit v1.2.3
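In use, a shrinker's count callback ends up scaling its totals through
this helper.  A hedged sketch, modeled loosely on fs/super.c after this
series (the two counting helpers are hypothetical stand-ins):

	static unsigned long example_cache_count(struct shrinker *shrink,
						 struct shrink_control *sc)
	{
		unsigned long total_objects;

		/* hypothetical per-sb counters standing in for the real ones */
		total_objects = count_unused_dentries(sc) + count_unused_inodes(sc);

		/* scale by sysctl_vfs_cache_pressure without truncating
		 * small totals to zero */
		return vfs_pressure_ratio(total_objects);
	}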