From df3d0bbcdb2cafa23a70223d806655bd37e64a9b Mon Sep 17 00:00:00 2001
From: Waiman Long
Date: Mon, 2 Sep 2013 11:29:22 -0700
Subject: vfs: use lockref_get_not_zero() for optimistic lockless dget_parent()

A valid parent pointer is always going to have a non-zero reference
count, but if we look up the parent optimistically without locking, we
have to protect against the (very unlikely) race of a rename changing
the parent from under us.

We do that by using lockref_get_not_zero(), and then re-checking the
parent pointer after getting a valid reference.

[ This is a re-implementation of a chunk from the original patch by
  Waiman Long: "dcache: Enable lockless update of dentry's refcount".
  I've completely rewritten the patch-series and split it up, but I'm
  attributing this part to Waiman as it's close enough to his earlier
  patch - Linus ]

Signed-off-by: Waiman Long
Signed-off-by: Linus Torvalds
---
 fs/dcache.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'fs/dcache.c')

diff --git a/fs/dcache.c b/fs/dcache.c
index b949af850cd6..2d244227999d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -611,8 +611,23 @@ static inline void __dget(struct dentry *dentry)
 struct dentry *dget_parent(struct dentry *dentry)
 {
+	int gotref;
 	struct dentry *ret;
 
+	/*
+	 * Do optimistic parent lookup without any
+	 * locking.
+	 */
+	rcu_read_lock();
+	ret = ACCESS_ONCE(dentry->d_parent);
+	gotref = lockref_get_not_zero(&ret->d_lockref);
+	rcu_read_unlock();
+	if (likely(gotref)) {
+		if (likely(ret == ACCESS_ONCE(dentry->d_parent)))
+			return ret;
+		dput(ret);
+	}
+
 repeat:
 	/*
 	 * Don't need rcu_dereference because we re-check it was correct under
-- cgit v1.2.3

From 15570086b590a69d59183b08a7770e316cca20a7 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Mon, 2 Sep 2013 11:38:06 -0700
Subject: vfs: reimplement d_rcu_to_refcount() using lockref_get_or_lock()

This moves __d_rcu_to_refcount() into fs/namei.c and re-implements it
using the lockref infrastructure instead.  It also adds a lot of
comments about what is actually going on, because turning a dentry that
was looked up using RCU into a long-lived reference counted entry is one
of the more subtle parts of the rcu walk.

We also used to be _particularly_ subtle in unlazy_walk() where we
re-validate both the dentry and its parent using the same sequence
count.  We used to do it by nesting the locks and then verifying the
sequence count just once.

That was silly, because nested locking is expensive, but the sequence
count check is not.  So this just re-validates the dentry and the
parent separately, avoiding the nested locking, and making the lockref
lookup possible.

Acked-by: Waiman Long
Signed-off-by: Linus Torvalds
---
 fs/dcache.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/dcache.c')

diff --git a/fs/dcache.c b/fs/dcache.c
index 2d244227999d..96655f4f4574 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1786,7 +1786,7 @@ static noinline enum slow_d_compare slow_dentry_cmp(
  * without taking d_lock and checking d_seq sequence count against @seq
  * returned here.
  *
- * A refcount may be taken on the found dentry with the __d_rcu_to_refcount
+ * A refcount may be taken on the found dentry with the d_rcu_to_refcount
  * function.
 *
 * Alternatively, __d_lookup_rcu may be called again to look up the child of
-- cgit v1.2.3

From 590fb51f1cf99c4a48a3b1bd65885192e877b561 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Tue, 13 Aug 2013 15:42:02 +0800
Subject: vfs: call d_op->d_prune() before unhashing dentry

The d_prune dentry operation is used to notify a filesystem when the
VFS is about to prune a hashed dentry from the dcache.  There are three
code paths that prune dentries: shrink_dcache_for_umount_subtree(),
prune_dcache_sb() and d_prune_aliases().  For the d_prune_aliases()
case, the VFS unhashes the dentry first, then calls the d_prune dentry
operation.  This confuses ceph_d_prune() (ceph uses the d_prune dentry
operation to maintain a flag indicating whether the complete contents
of a directory are in the dcache; pruning an unhashed dentry does not
affect the directory's completeness).

This patch fixes the issue by calling the d_prune dentry operation in
d_prune_aliases() before unhashing the dentry.  It also makes the VFS
call the d_prune dentry operation only for hashed dentries, to avoid
calling the operation twice when a dentry is pruned by
d_prune_aliases().

Signed-off-by: Yan, Zheng
Signed-off-by: Al Viro
---
 fs/dcache.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'fs/dcache.c')

diff --git a/fs/dcache.c b/fs/dcache.c
index 83cfb834db03..2e5f9ca83285 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -472,7 +472,7 @@ relock:
 	 * inform the fs via d_prune that this dentry is about to be
 	 * unhashed and destroyed.
 	 */
-	if (dentry->d_flags & DCACHE_OP_PRUNE)
+	if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry))
 		dentry->d_op->d_prune(dentry);
 
 	dentry_lru_del(dentry);
@@ -719,6 +719,14 @@ restart:
 	hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
 		spin_lock(&dentry->d_lock);
 		if (!dentry->d_count) {
+			/*
+			 * inform the fs via d_prune that this dentry
+			 * is about to be unhashed and destroyed.
+			 */
+			if ((dentry->d_flags & DCACHE_OP_PRUNE) &&
+			    !d_unhashed(dentry))
+				dentry->d_op->d_prune(dentry);
+
 			__dget_dlock(dentry);
 			__d_drop(dentry);
 			spin_unlock(&dentry->d_lock);
@@ -907,7 +915,8 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 		 * inform the fs that this dentry is about to be
 		 * unhashed and destroyed.
 		 */
-		if (dentry->d_flags & DCACHE_OP_PRUNE)
+		if ((dentry->d_flags & DCACHE_OP_PRUNE) &&
+		    !d_unhashed(dentry))
 			dentry->d_op->d_prune(dentry);
 
 		dentry_lru_del(dentry);
-- cgit v1.2.3

From 01ddc4ede5f09cdaed015d49ab66eec179085a2e Mon Sep 17 00:00:00 2001
From: Miklos Szeredi
Date: Thu, 5 Sep 2013 11:44:34 +0200
Subject: vfs: restructure d_genocide()

It shouldn't matter when we decrement the refcount during the walk as
long as we do it exactly once.

Restructure d_genocide() to do the killing on entering the dentry
instead of when leaving it.  This helps create a common helper for tree
walking.
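[ Editor's illustration, not part of the original commit: the restructuring
  is safe because the refcount decrement is idempotent per dentry -- it is
  guarded by the DCACHE_GENOCIDE flag bit -- so it may happen on entry just
  as well as on exit, even across a restarted walk.  A minimal standalone C
  model of that invariant; all names here are hypothetical:

	#include <stdio.h>

	#define GENOCIDE 0x1

	struct node {
		unsigned int flags;
		int count;
		struct node *child;	/* toy tree: a single child chain */
	};

	static void kill_on_enter(struct node *n)
	{
		for (; n; n = n->child) {
			/* flag guard: at most one decrement per node */
			if (!(n->flags & GENOCIDE)) {
				n->flags |= GENOCIDE;
				n->count--;
			}
		}
	}

	int main(void)
	{
		struct node leaf = { 0, 1, NULL };
		struct node root = { 0, 1, &leaf };

		kill_on_enter(&root);
		kill_on_enter(&root);	/* a retried walk changes nothing */
		printf("%d %d\n", root.count, leaf.count);	/* prints: 0 0 */
		return 0;
	} ]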
Signed-off-by: Miklos Szeredi
Signed-off-by: Al Viro
---
 fs/dcache.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'fs/dcache.c')

diff --git a/fs/dcache.c b/fs/dcache.c
index 5aa53bc056ba..f792e9f20eca 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2952,6 +2952,10 @@ resume:
 			spin_unlock(&dentry->d_lock);
 			continue;
 		}
+		if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
+			dentry->d_flags |= DCACHE_GENOCIDE;
+			dentry->d_lockref.count--;
+		}
 		if (!list_empty(&dentry->d_subdirs)) {
 			spin_unlock(&this_parent->d_lock);
 			spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
@@ -2959,18 +2963,10 @@ resume:
 			spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
 			goto repeat;
 		}
-		if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
-			dentry->d_flags |= DCACHE_GENOCIDE;
-			dentry->d_lockref.count--;
-		}
 		spin_unlock(&dentry->d_lock);
 	}
 	if (this_parent != root) {
 		struct dentry *child = this_parent;
-		if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
-			this_parent->d_flags |= DCACHE_GENOCIDE;
-			this_parent->d_lockref.count--;
-		}
 		this_parent = try_to_ascend(this_parent, locked, seq);
 		if (!this_parent)
 			goto rename_retry;
-- cgit v1.2.3

From db14fc3abcd5dcc9b32ad5b9dd5b8f9e16295a39 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi
Date: Thu, 5 Sep 2013 11:44:35 +0200
Subject: vfs: add d_walk()

This one replaces three instances of open-coded tree walking
(have_submounts, select_parent, d_genocide) with a common helper.

In addition to slightly reducing the kernel size, this simplifies the
callers and makes them less bug-prone.

Signed-off-by: Miklos Szeredi
Signed-off-by: Al Viro
---
 fs/dcache.c | 309 +++++++++++++++++++++++++++++------------------------------
 1 file changed, 148 insertions(+), 161 deletions(-)

(limited to 'fs/dcache.c')

diff --git a/fs/dcache.c b/fs/dcache.c
index f792e9f20eca..043c5b478a77 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1031,34 +1031,56 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq
 	return new;
 }
 
+/**
+ * enum d_walk_ret - action to take during tree walk
+ * @D_WALK_CONTINUE:	continue walk
+ * @D_WALK_QUIT:	quit walk
+ * @D_WALK_NORETRY:	quit when retry is needed
+ * @D_WALK_SKIP:	skip this dentry and its children
+ */
+enum d_walk_ret {
+	D_WALK_CONTINUE,
+	D_WALK_QUIT,
+	D_WALK_NORETRY,
+	D_WALK_SKIP,
+};
 
-/*
- * Search for at least 1 mount point in the dentry's subdirs.
- * We descend to the next level whenever the d_subdirs
- * list is non-empty and continue searching.
- */
-
 /**
- * have_submounts - check for mounts over a dentry
- * @parent: dentry to check.
+ * d_walk - walk the dentry tree
+ * @parent:	start of walk
+ * @data:	data passed to @enter() and @finish()
+ * @enter:	callback when first entering the dentry
+ * @finish:	callback when successfully finished the walk
  *
- * Return true if the parent or its subdirectories contain
- * a mount point
+ * The @enter() and @finish() callbacks are called with d_lock held.
  */
-int have_submounts(struct dentry *parent)
+static void d_walk(struct dentry *parent, void *data,
+		   enum d_walk_ret (*enter)(void *, struct dentry *),
+		   void (*finish)(void *))
 {
 	struct dentry *this_parent;
 	struct list_head *next;
 	unsigned seq;
 	int locked = 0;
+	enum d_walk_ret ret;
+	bool retry = true;
 
 	seq = read_seqbegin(&rename_lock);
 again:
 	this_parent = parent;
-
-	if (d_mountpoint(parent))
-		goto positive;
 	spin_lock(&this_parent->d_lock);
+
+	ret = enter(data, this_parent);
+	switch (ret) {
+	case D_WALK_CONTINUE:
+		break;
+	case D_WALK_QUIT:
+	case D_WALK_SKIP:
+		goto out_unlock;
+	case D_WALK_NORETRY:
+		retry = false;
+		break;
+	}
 repeat:
 	next = this_parent->d_subdirs.next;
 resume:
@@ -1068,12 +1090,22 @@ resume:
 		next = tmp->next;
 
 		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-		/* Have we found a mount point ? */
-		if (d_mountpoint(dentry)) {
+
+		ret = enter(data, dentry);
+		switch (ret) {
+		case D_WALK_CONTINUE:
+			break;
+		case D_WALK_QUIT:
 			spin_unlock(&dentry->d_lock);
-			spin_unlock(&this_parent->d_lock);
-			goto positive;
+			goto out_unlock;
+		case D_WALK_NORETRY:
+			retry = false;
+			break;
+		case D_WALK_SKIP:
+			spin_unlock(&dentry->d_lock);
+			continue;
 		}
+
 		if (!list_empty(&dentry->d_subdirs)) {
 			spin_unlock(&this_parent->d_lock);
 			spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
@@ -1094,26 +1126,61 @@ resume:
 		next = child->d_u.d_child.next;
 		goto resume;
 	}
-	spin_unlock(&this_parent->d_lock);
-	if (!locked && read_seqretry(&rename_lock, seq))
-		goto rename_retry;
-	if (locked)
-		write_sequnlock(&rename_lock);
-	return 0; /* No mount points found in tree */
-positive:
-	if (!locked && read_seqretry(&rename_lock, seq))
+	if (!locked && read_seqretry(&rename_lock, seq)) {
+		spin_unlock(&this_parent->d_lock);
 		goto rename_retry;
+	}
+	if (finish)
+		finish(data);
+
+out_unlock:
+	spin_unlock(&this_parent->d_lock);
 	if (locked)
 		write_sequnlock(&rename_lock);
-	return 1;
+	return;
 
 rename_retry:
+	if (!retry)
+		return;
 	if (locked)
 		goto again;
 	locked = 1;
 	write_seqlock(&rename_lock);
 	goto again;
 }
+
+/*
+ * Search for at least 1 mount point in the dentry's subdirs.
+ * We descend to the next level whenever the d_subdirs
+ * list is non-empty and continue searching.
+ */
+
+/**
+ * have_submounts - check for mounts over a dentry
+ * @parent: dentry to check.
+ *
+ * Return true if the parent or its subdirectories contain
+ * a mount point
+ */
+
+static enum d_walk_ret check_mount(void *data, struct dentry *dentry)
+{
+	int *ret = data;
+	if (d_mountpoint(dentry)) {
+		*ret = 1;
+		return D_WALK_QUIT;
+	}
+	return D_WALK_CONTINUE;
+}
+
+int have_submounts(struct dentry *parent)
+{
+	int ret = 0;
+
+	d_walk(parent, &ret, check_mount, NULL);
+
+	return ret;
+}
 EXPORT_SYMBOL(have_submounts);
 
 /*
@@ -1130,93 +1197,46 @@ EXPORT_SYMBOL(have_submounts);
  * drop the lock and return early due to latency
  * constraints.
  */
-static int select_parent(struct dentry *parent, struct list_head *dispose)
-{
-	struct dentry *this_parent;
-	struct list_head *next;
-	unsigned seq;
-	int found = 0;
-	int locked = 0;
-
-	seq = read_seqbegin(&rename_lock);
-again:
-	this_parent = parent;
-	spin_lock(&this_parent->d_lock);
-repeat:
-	next = this_parent->d_subdirs.next;
-resume:
-	while (next != &this_parent->d_subdirs) {
-		struct list_head *tmp = next;
-		struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
-		next = tmp->next;
-
-		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+struct select_data {
+	struct dentry *start;
+	struct list_head dispose;
+	int found;
+};
 
-		/*
-		 * move only zero ref count dentries to the dispose list.
-		 *
-		 * Those which are presently on the shrink list, being processed
-		 * by shrink_dentry_list(), shouldn't be moved.  Otherwise the
-		 * loop in shrink_dcache_parent() might not make any progress
-		 * and loop forever.
-		 */
-		if (dentry->d_lockref.count) {
-			dentry_lru_del(dentry);
-		} else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
-			dentry_lru_move_list(dentry, dispose);
-			dentry->d_flags |= DCACHE_SHRINK_LIST;
-			found++;
-		}
-		/*
-		 * We can return to the caller if we have found some (this
-		 * ensures forward progress). We'll be coming back to find
-		 * the rest.
-		 */
-		if (found && need_resched()) {
-			spin_unlock(&dentry->d_lock);
-			goto out;
-		}
+static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
+{
+	struct select_data *data = _data;
+	enum d_walk_ret ret = D_WALK_CONTINUE;
 
-		/*
-		 * Descend a level if the d_subdirs list is non-empty.
-		 */
-		if (!list_empty(&dentry->d_subdirs)) {
-			spin_unlock(&this_parent->d_lock);
-			spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
-			this_parent = dentry;
-			spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
-			goto repeat;
-		}
+	if (data->start == dentry)
+		goto out;
 
-		spin_unlock(&dentry->d_lock);
-	}
 	/*
-	 * All done at this level ... ascend and resume the search.
+	 * move only zero ref count dentries to the dispose list.
+	 *
+	 * Those which are presently on the shrink list, being processed
+	 * by shrink_dentry_list(), shouldn't be moved.  Otherwise the
+	 * loop in shrink_dcache_parent() might not make any progress
+	 * and loop forever.
 	 */
-	if (this_parent != parent) {
-		struct dentry *child = this_parent;
-		this_parent = try_to_ascend(this_parent, locked, seq);
-		if (!this_parent)
-			goto rename_retry;
-		next = child->d_u.d_child.next;
-		goto resume;
+	if (dentry->d_lockref.count) {
+		dentry_lru_del(dentry);
+	} else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
+		dentry_lru_move_list(dentry, &data->dispose);
+		dentry->d_flags |= DCACHE_SHRINK_LIST;
+		data->found++;
+		ret = D_WALK_NORETRY;
 	}
+	/*
+	 * We can return to the caller if we have found some (this
+	 * ensures forward progress). We'll be coming back to find
+	 * the rest.
+	 */
+	if (data->found && need_resched())
+		ret = D_WALK_QUIT;
 out:
-	spin_unlock(&this_parent->d_lock);
-	if (!locked && read_seqretry(&rename_lock, seq))
-		goto rename_retry;
-	if (locked)
-		write_sequnlock(&rename_lock);
-	return found;
-
-rename_retry:
-	if (found)
-		return found;
-	if (locked)
-		goto again;
-	locked = 1;
-	write_seqlock(&rename_lock);
-	goto again;
+	return ret;
 }
 
 /**
@@ -1225,13 +1245,20 @@ rename_retry:
  *
  * Prune the dcache to remove unused children of the parent dentry.
  */
-void shrink_dcache_parent(struct dentry * parent)
+void shrink_dcache_parent(struct dentry *parent)
 {
-	LIST_HEAD(dispose);
-	int found;
+	for (;;) {
+		struct select_data data;
 
-	while ((found = select_parent(parent, &dispose)) != 0) {
-		shrink_dentry_list(&dispose);
+		INIT_LIST_HEAD(&data.dispose);
+		data.start = parent;
+		data.found = 0;
+
+		d_walk(parent, &data, select_collect, NULL);
+		if (!data.found)
+			break;
+
+		shrink_dentry_list(&data.dispose);
 		cond_resched();
 	}
 }
@@ -2928,64 +2955,24 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
 	return result;
 }
 
-void d_genocide(struct dentry *root)
+static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry)
 {
-	struct dentry *this_parent;
-	struct list_head *next;
-	unsigned seq;
-	int locked = 0;
+	struct dentry *root = data;
+	if (dentry != root) {
+		if (d_unhashed(dentry) || !dentry->d_inode)
+			return D_WALK_SKIP;
 
-	seq = read_seqbegin(&rename_lock);
-again:
-	this_parent = root;
-	spin_lock(&this_parent->d_lock);
-repeat:
-	next = this_parent->d_subdirs.next;
-resume:
-	while (next != &this_parent->d_subdirs) {
-		struct list_head *tmp = next;
-		struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
-		next = tmp->next;
-
-		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-		if (d_unhashed(dentry) || !dentry->d_inode) {
-			spin_unlock(&dentry->d_lock);
-			continue;
-		}
 		if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
 			dentry->d_flags |= DCACHE_GENOCIDE;
 			dentry->d_lockref.count--;
 		}
-		if (!list_empty(&dentry->d_subdirs)) {
-			spin_unlock(&this_parent->d_lock);
-			spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
-			this_parent = dentry;
-			spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
-			goto repeat;
-		}
-		spin_unlock(&dentry->d_lock);
 	}
-	if (this_parent != root) {
-		struct dentry *child = this_parent;
-		this_parent = try_to_ascend(this_parent, locked, seq);
-		if (!this_parent)
-			goto rename_retry;
-		next = child->d_u.d_child.next;
-		goto resume;
-	}
-	spin_unlock(&this_parent->d_lock);
-	if (!locked && read_seqretry(&rename_lock, seq))
-		goto rename_retry;
-	if (locked)
-		write_sequnlock(&rename_lock);
-	return;
+	return D_WALK_CONTINUE;
+}
 
-rename_retry:
-	if (locked)
-		goto again;
-	locked = 1;
-	write_seqlock(&rename_lock);
-	goto again;
+void d_genocide(struct dentry *parent)
+{
+	d_walk(parent, parent, d_genocide_kill, NULL);
 }
 
 void d_tmpfile(struct dentry *dentry, struct inode *inode)
-- cgit v1.2.3

From 848ac114e847af3f1f9141c90a39ebe79bdb13b3 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi
Date: Thu, 5 Sep 2013 11:44:36 +0200
Subject: vfs: check submounts and drop atomically

We check submounts before doing d_drop() on a non-empty directory
dentry in NFS (have_submounts()), but we do not exclude a racing mount.

 Process A: have_submounts() -> returns false
 Process B: mount() -> success
 Process A: d_drop()

This patch prepares the ground for the fix by doing the following
operations all under the same rename lock:

 have_submounts()
 shrink_dcache_parent()
 d_drop()

This is actually an optimization since have_submounts() and
shrink_dcache_parent() both traverse the same dentry tree separately.
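[ Editor's illustration, not taken from this series: a sketch of the
  intended caller-side effect, assuming an NFS-style revalidate path that
  wants to unhash a stale directory dentry.  Per the patch below,
  check_submounts_and_drop() returns 0 on success and -EBUSY if a submount
  is found:

	/* before: three separate traversals, racy against a concurrent mount */
	if (have_submounts(dentry))
		return -EBUSY;
	shrink_dcache_parent(dentry);
	d_drop(dentry);
	return 0;

	/* after: checked, pruned and unhashed under one rename_lock sequence */
	return check_submounts_and_drop(dentry); ]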
Signed-off-by: Miklos Szeredi
CC: David Howells
CC: Steven Whitehouse
CC: Trond Myklebust
CC: Greg Kroah-Hartman
Signed-off-by: Al Viro
---
 fs/dcache.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

(limited to 'fs/dcache.c')

diff --git a/fs/dcache.c b/fs/dcache.c
index 043c5b478a77..ce5a7e6d84cd 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1264,6 +1264,71 @@ void shrink_dcache_parent(struct dentry *parent)
 }
 EXPORT_SYMBOL(shrink_dcache_parent);
 
+static enum d_walk_ret check_and_collect(void *_data, struct dentry *dentry)
+{
+	struct select_data *data = _data;
+
+	if (d_mountpoint(dentry)) {
+		data->found = -EBUSY;
+		return D_WALK_QUIT;
+	}
+
+	return select_collect(_data, dentry);
+}
+
+static void check_and_drop(void *_data)
+{
+	struct select_data *data = _data;
+
+	if (d_mountpoint(data->start))
+		data->found = -EBUSY;
+	if (!data->found)
+		__d_drop(data->start);
+}
+
+/**
+ * check_submounts_and_drop - prune dcache, check for submounts and drop
+ *
+ * All done as a single atomic operation relative to has_unlinked_ancestor().
+ * Returns 0 if successfully unhashed @dentry.  If there were submounts then
+ * return -EBUSY.
+ *
+ * @dentry: dentry to prune and drop
+ */
+int check_submounts_and_drop(struct dentry *dentry)
+{
+	int ret = 0;
+
+	/* Negative dentries can be dropped without further checks */
+	if (!dentry->d_inode) {
+		d_drop(dentry);
+		goto out;
+	}
+
+	for (;;) {
+		struct select_data data;
+
+		INIT_LIST_HEAD(&data.dispose);
+		data.start = dentry;
+		data.found = 0;
+
+		d_walk(dentry, &data, check_and_collect, check_and_drop);
+		ret = data.found;
+
+		if (!list_empty(&data.dispose))
+			shrink_dentry_list(&data.dispose);
+
+		if (ret <= 0)
+			break;
+
+		cond_resched();
+	}
+
+out:
+	return ret;
+}
+EXPORT_SYMBOL(check_submounts_and_drop);
+
 /**
  * __d_alloc	-	allocate a dcache entry
  * @sb: filesystem it will belong to
-- cgit v1.2.3

From eed810076685c77dc9a8c5c3593e641c93caed1c Mon Sep 17 00:00:00 2001
From: Miklos Szeredi
Date: Thu, 5 Sep 2013 14:39:11 +0200
Subject: vfs: check unlinked ancestors before mount

We check submounts before doing d_drop() on a non-empty directory
dentry in NFS (have_submounts()), but we do not exclude a racing mount.
Nor do we prevent mounts from being added to the disconnected subtree
using relative paths after the d_drop().

This patch fixes these issues by checking for unlinked (unhashed,
non-root) ancestors before proceeding with the mount.  This is done
with the rename seqlock taken for write and with ->d_lock grabbed on
each ancestor in turn, including our dentry itself.  This ensures that
only one of check_submounts_and_drop() or has_unlinked_ancestor() can
succeed.

Signed-off-by: Miklos Szeredi
Signed-off-by: Al Viro
---
 fs/dcache.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

(limited to 'fs/dcache.c')

diff --git a/fs/dcache.c b/fs/dcache.c
index ce5a7e6d84cd..761e31bacbc2 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1183,6 +1183,39 @@ int have_submounts(struct dentry *parent)
 }
 EXPORT_SYMBOL(have_submounts);
 
+/*
+ * Called by mount code to set a mountpoint and check if the mountpoint is
+ * reachable (e.g. NFS can unhash a directory dentry and then the complete
+ * subtree can become unreachable).
+ *
+ * Only one of check_submounts_and_drop() and d_set_mounted() must succeed.  For
+ * this reason take rename_lock and d_lock on dentry and ancestors.
+ */
+int d_set_mounted(struct dentry *dentry)
+{
+	struct dentry *p;
+	int ret = -ENOENT;
+	write_seqlock(&rename_lock);
+	for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) {
+		/* Need exclusion wrt. check_submounts_and_drop() */
+		spin_lock(&p->d_lock);
+		if (unlikely(d_unhashed(p))) {
+			spin_unlock(&p->d_lock);
+			goto out;
+		}
+		spin_unlock(&p->d_lock);
+	}
+	spin_lock(&dentry->d_lock);
+	if (!d_unlinked(dentry)) {
+		dentry->d_flags |= DCACHE_MOUNTED;
+		ret = 0;
+	}
+	spin_unlock(&dentry->d_lock);
+out:
+	write_sequnlock(&rename_lock);
+	return ret;
+}
+
 /*
  * Search the dentry child list of the specified parent,
  * and move any unused dentries to the end of the unused
-- cgit v1.2.3

From 8aab6a27332bbf2abfcb35224738394e784d940b Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sun, 8 Sep 2013 13:26:18 -0700
Subject: vfs: reorganize dput() memory accesses

This is me being a bit OCD after all the dentry optimization work this
merge window: profiles end up showing 'dput()' as a rather expensive
operation, and there were two unrelated bad reasons for that.

The first reason was reading d_lockref.count for debugging purposes,
which touches the lockref cacheline (for reads) before we really need
to.  More importantly, the debugging test in question is _wrong_, and
has hidden bugs.  It's true that we can only sleep when the count goes
down to zero, but the test as-is hides the much more subtle bug that
happens if we race with somebody else deleting the file.

Anyway we _will_ touch that cacheline, but let's do it for a write and
in the right routine (ie in "lockref_put_or_lock()") which annotates
the costs better.  So remove the misleading debug code.

The other was an unnecessary access to the cacheline that contains the
d_lru list, just to check whether we already were on the LRU list or
not.  This is exactly what we have d_flags for, so that we can avoid
touching extra cache lines for the common case.  So just add another
bit for "is this dentry on the LRU".

Finally, mark the tests properly likely/unlikely, so that the common
fast-paths are dense in the instruction stream.

This makes the profiles look much saner.
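[ Editor's illustration, not part of the original commit: a standalone C
  model of the two ideas -- decide LRU membership from the flags word the
  fast path has already loaded, and tell the compiler which path is hot.
  In the dentry case the flag sits next to the lockref while the list head
  lives on a colder cacheline.  All names here are hypothetical:

	#define likely(x)	__builtin_expect(!!(x), 1)
	#define unlikely(x)	__builtin_expect(!!(x), 0)

	#define ON_LRU 0x1

	struct obj {
		unsigned int flags;	/* hot word, next to lock/count */
		struct obj *lru_next;	/* stand-in for the cold d_lru line */
	};

	static struct obj *lru_head;

	static void lru_add_once(struct obj *o)
	{
		/* common case decided from the hot word alone */
		if (unlikely(!(o->flags & ON_LRU))) {
			o->flags |= ON_LRU;
			o->lru_next = lru_head;	/* cold line touched only here */
			lru_head = o;
		}
	}

	int main(void)
	{
		struct obj o = { 0, 0 };
		lru_add_once(&o);
		lru_add_once(&o);	/* second call never touches the list */
		return 0;
	} ]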
Signed-off-by: Linus Torvalds
---
 fs/dcache.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'fs/dcache.c')

diff --git a/fs/dcache.c b/fs/dcache.c
index 761e31bacbc2..bf3c4f9569eb 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -308,8 +308,9 @@ static void dentry_unlink_inode(struct dentry * dentry)
  */
 static void dentry_lru_add(struct dentry *dentry)
 {
-	if (list_empty(&dentry->d_lru)) {
+	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) {
 		spin_lock(&dcache_lru_lock);
+		dentry->d_flags |= DCACHE_LRU_LIST;
 		list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 		dentry->d_sb->s_nr_dentry_unused++;
 		dentry_stat.nr_unused++;
@@ -320,7 +321,7 @@ static void dentry_lru_add(struct dentry *dentry)
 static void __dentry_lru_del(struct dentry *dentry)
 {
 	list_del_init(&dentry->d_lru);
-	dentry->d_flags &= ~DCACHE_SHRINK_LIST;
+	dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
 	dentry->d_sb->s_nr_dentry_unused--;
 	dentry_stat.nr_unused--;
 }
@@ -341,6 +342,7 @@ static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list)
 {
 	spin_lock(&dcache_lru_lock);
 	if (list_empty(&dentry->d_lru)) {
+		dentry->d_flags |= DCACHE_LRU_LIST;
 		list_add_tail(&dentry->d_lru, list);
 		dentry->d_sb->s_nr_dentry_unused++;
 		dentry_stat.nr_unused++;
@@ -509,24 +511,22 @@ relock:
  */
 void dput(struct dentry *dentry)
 {
-	if (!dentry)
+	if (unlikely(!dentry))
 		return;
 
 repeat:
-	if (dentry->d_lockref.count == 1)
-		might_sleep();
 	if (lockref_put_or_lock(&dentry->d_lockref))
 		return;
 
-	if (dentry->d_flags & DCACHE_OP_DELETE) {
+	/* Unreachable? Get rid of it */
+	if (unlikely(d_unhashed(dentry)))
+		goto kill_it;
+
+	if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) {
 		if (dentry->d_op->d_delete(dentry))
 			goto kill_it;
 	}
 
-	/* Unreachable? Get rid of it */
-	if (d_unhashed(dentry))
-		goto kill_it;
-
 	dentry->d_flags |= DCACHE_REFERENCED;
 	dentry_lru_add(dentry);
-- cgit v1.2.3

From 0d98439ea3c6ffb2af931f6de4480e744634e2c5 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sun, 8 Sep 2013 13:46:52 -0700
Subject: vfs: use lockref "dead" flag to mark unrecoverably dead dentries

This simplifies the RCU to refcounting code in particular.

I was originally intending to leave this for later, but walking through
all the dput() logic (see previous commit), I realized that the dput()
"might_sleep()" check was misleadingly weak.  And I removed it as
misleading, both for performance profiling and for debugging.

However, the might_sleep() debugging case is actually true: the final
dput() can indeed sleep, if the inode of the dentry that you are
releasing ends up sleeping at iput time (see dentry_iput()).  So the
problem with the might_sleep() in dput() wasn't that it wasn't true, it
was that it wasn't actually testing and triggering on the interesting
case.

In particular, just about *any* dput() can indeed sleep, if you happen
to race with another thread deleting the file in question, and you then
lose the race to be the last dput() for that file.  But because it's a
very rare race, the debugging code would never trigger it in practice.

Why is this problematic?  The new d_rcu_to_refcount() (see commit
15570086b590: "vfs: reimplement d_rcu_to_refcount() using
lockref_get_or_lock()") does a dput() for the failure case, and it does
it under the RCU lock.  So potentially sleeping really is a bug.

But there's no way I'm going to fix this with the previous complicated
"lockref_get_or_lock()" interface.
And rather than revert to the old and crufty nested dentry locking code
(which did get this right by delaying the reference count updates until
they were verified to be safe), let's make forward progress.

Cc: Al Viro
Signed-off-by: Linus Torvalds
---
 fs/dcache.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'fs/dcache.c')

diff --git a/fs/dcache.c b/fs/dcache.c
index bf3c4f9569eb..ca8e9cd60f87 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -229,7 +229,7 @@ static void __d_free(struct rcu_head *head)
  */
 static void d_free(struct dentry *dentry)
 {
-	BUG_ON(dentry->d_lockref.count);
+	BUG_ON((int)dentry->d_lockref.count > 0);
 	this_cpu_dec(nr_dentry);
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
@@ -445,7 +445,7 @@ EXPORT_SYMBOL(d_drop);
  * If ref is non-zero, then decrement the refcount too.
  * Returns dentry requiring refcount drop, or NULL if we're done.
  */
-static inline struct dentry *dentry_kill(struct dentry *dentry, int ref)
+static inline struct dentry *dentry_kill(struct dentry *dentry)
 	__releases(dentry->d_lock)
 {
 	struct inode *inode;
@@ -468,8 +468,11 @@ relock:
 		goto relock;
 	}
 
-	if (ref)
-		dentry->d_lockref.count--;
+	/*
+	 * The dentry is now unrecoverably dead to the world.
+	 */
+	lockref_mark_dead(&dentry->d_lockref);
+
 	/*
 	 * inform the fs via d_prune that this dentry is about to be
 	 * unhashed and destroyed.
@@ -535,7 +538,7 @@ repeat:
 	return;
 
 kill_it:
-	dentry = dentry_kill(dentry, 1);
+	dentry = dentry_kill(dentry);
 	if (dentry)
 		goto repeat;
 }
@@ -760,7 +763,7 @@ static void try_prune_one_dentry(struct dentry *dentry)
 {
 	struct dentry *parent;
 
-	parent = dentry_kill(dentry, 0);
+	parent = dentry_kill(dentry);
 	/*
 	 * If dentry_kill returns NULL, we have nothing more to do.
 	 * if it returns the same dentry, trylocks failed. In either
@@ -781,7 +784,7 @@ static void try_prune_one_dentry(struct dentry *dentry)
 	while (dentry) {
 		if (lockref_put_or_lock(&dentry->d_lockref))
 			return;
-		dentry = dentry_kill(dentry, 1);
+		dentry = dentry_kill(dentry);
 	}
 }
-- cgit v1.2.3

From 232d2d60aa5469bb097f55728f65146bd49c1d25 Mon Sep 17 00:00:00 2001
From: Waiman Long
Date: Mon, 9 Sep 2013 12:18:13 -0400
Subject: dcache: Translating dentry into pathname without taking rename_lock

When running AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention.  However, there was still some left:

   8.46%     reaim  [kernel.kallsyms]     [k] _raw_spin_lock
             |--42.21%-- d_path
             |          proc_pid_readlink
             |          SyS_readlinkat
             |          SyS_readlink
             |          system_call
             |          __GI___readlink
             |
             |--40.97%-- sys_getcwd
             |          system_call
             |          __getcwd

The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call.  This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.

The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress.
However, only one thread can take the rename_lock, thus blocking all
the other threads that need it even though the translation process
won't make any change to the dentries.

This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions.  As a result, the code will have to
retry if one or more rename operations had been performed.
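[ Editor's illustration of the resulting caller pattern, assembled from
  the read_seqbegin_or_lock()/read_seqretry_or_unlock() helpers this patch
  adds below -- seq starts even (optimistic, lockless reader) and is bumped
  to odd (writer) after a failed pass:

	int seq = 0;	/* even: start as a lockless reader */
restart:
	read_seqbegin_or_lock(&rename_lock, &seq);
	/* ... walk towards the root, copying names with prepend_name() ... */
	if (read_seqretry_or_unlock(&rename_lock, &seq))
		goto restart;	/* seq is now odd: redo under write_seqlock */ ]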
In addition, the RCU read lock will be taken during the translation
process to make sure that no dentries will go away.  To prevent
live-lock from happening, the code will switch back to taking the
rename_lock if read_seqretry() fails three times.

To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries.  Instead,
it treats the name pointer and length as unreliable and just copies the
string byte-by-byte until it hits a null byte or the end of the string
as specified by the length.  This should avoid stepping into an invalid
memory address.  The error cases are left to be handled by the sequence
number check.

The following re-factorings are also made:

 1. Move prepend('/') into prepend_name() to remove one conditional
    check.
 2. Move the global root check in prepend_path() back to the top of the
    while loop.

With this patch, the _raw_spin_lock will now account for only 1.2% of
the total CPU cycles for the short workload.  This patch also reduces
the distortion that running perf introduces into its own profile, since
the perf command itself can be a heavy user of the d_path() function
depending on the complexity of the workload.

When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%.  With this patch, the spinlock contention caused by
running perf will go away, and we will get a more accurate perf
profile.

Signed-off-by: Waiman Long
Signed-off-by: Al Viro
---
 fs/dcache.c | 196 +++++++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 133 insertions(+), 63 deletions(-)

(limited to 'fs/dcache.c')

diff --git a/fs/dcache.c b/fs/dcache.c
index 761e31bacbc2..38b1b0989a16 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -88,6 +88,44 @@ EXPORT_SYMBOL(rename_lock);
 
 static struct kmem_cache *dentry_cache __read_mostly;
 
+/**
+ * read_seqbegin_or_lock - begin a sequence number check or locking block
+ * lock: sequence lock
+ * seq : sequence number to be checked
+ *
+ * First try it once optimistically without taking the lock. If that fails,
+ * take the lock. The sequence number is also used as a marker for deciding
+ * whether to be a reader (even) or writer (odd).
+ * N.B. seq must be initialized to an even number to begin with.
+ */
+static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
+{
+	if (!(*seq & 1)) {	/* Even */
+		*seq = read_seqbegin(lock);
+		rcu_read_lock();
+	} else			/* Odd */
+		write_seqlock(lock);
+}
+
+/**
+ * read_seqretry_or_unlock - end a seqretry or lock block & return retry status
+ * lock : sequence lock
+ * seq : sequence number
+ * Return: 1 to retry operation again, 0 to continue
+ */
+static inline int read_seqretry_or_unlock(seqlock_t *lock, int *seq)
+{
+	if (!(*seq & 1)) {	/* Even */
+		rcu_read_unlock();
+		if (read_seqretry(lock, *seq)) {
+			(*seq)++;	/* Take writer lock */
+			return 1;
+		}
+	} else			/* Odd */
+		write_sequnlock(lock);
+	return 0;
+}
+
 /*
  * This is the single most critical data structure when it comes
  * to the dcache: the hashtable for lookups. Somebody should try
@@ -2644,9 +2682,39 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen)
 	return 0;
 }
 
+/**
+ * prepend_name - prepend a pathname in front of current buffer pointer
+ * buffer: buffer pointer
+ * buflen: allocated length of the buffer
+ * name:   name string and length qstr structure
+ *
+ * With RCU path tracing, it may race with d_move(). Use ACCESS_ONCE() to
+ * make sure that either the old or the new name pointer and length are
+ * fetched. However, there may be a mismatch between the length and the
+ * pointer. The length cannot be trusted; we need to copy it byte-by-byte
+ * until the length is reached or a null byte is found. It also prepends
+ * "/" at the beginning of the name. The sequence number check at the
+ * caller will retry it again when a d_move() does happen. So any garbage
+ * in the buffer due to a mismatched pointer and length will be discarded.
+ */
 static int prepend_name(char **buffer, int *buflen, struct qstr *name)
 {
-	return prepend(buffer, buflen, name->name, name->len);
+	const char *dname = ACCESS_ONCE(name->name);
+	u32 dlen = ACCESS_ONCE(name->len);
+	char *p;
+
+	if (*buflen < dlen + 1)
+		return -ENAMETOOLONG;
+	*buflen -= dlen + 1;
+	p = *buffer -= dlen + 1;
+	*p++ = '/';
+	while (dlen--) {
+		char c = *dname++;
+		if (!c)
+			break;
+		*p++ = c;
+	}
+	return 0;
 }
 
 /**
@@ -2656,7 +2724,14 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
 * @buffer: pointer to the end of the buffer
 * @buflen: pointer to buffer length
 *
- * Caller holds the rename_lock.
+ * The function tries to write out the pathname without taking any lock other
+ * than the RCU read lock to make sure that dentries won't go away. It only
+ * checks the sequence number of the global rename_lock as any change in the
+ * dentry's d_seq will be preceded by changes in the rename_lock sequence
+ * number. If the sequence number had been changed, it will restart the whole
+ * pathname back-tracing sequence again. It performs a total of 3 trials of
+ * lockless back-tracing sequences before falling back to take the
+ * rename_lock.
 */
 static int prepend_path(const struct path *path,
 			const struct path *root,
@@ -2665,54 +2740,60 @@ static int prepend_path(const struct path *path,
 	struct dentry *dentry = path->dentry;
 	struct vfsmount *vfsmnt = path->mnt;
 	struct mount *mnt = real_mount(vfsmnt);
-	bool slash = false;
 	int error = 0;
+	unsigned seq = 0;
+	char *bptr;
+	int blen;
 
+restart:
+	bptr = *buffer;
+	blen = *buflen;
+	read_seqbegin_or_lock(&rename_lock, &seq);
 	while (dentry != root->dentry || vfsmnt != root->mnt) {
 		struct dentry * parent;
 
 		if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
 			/* Global root? */
-			if (!mnt_has_parent(mnt))
-				goto global_root;
-			dentry = mnt->mnt_mountpoint;
-			mnt = mnt->mnt_parent;
-			vfsmnt = &mnt->mnt;
-			continue;
+			if (mnt_has_parent(mnt)) {
+				dentry = mnt->mnt_mountpoint;
+				mnt = mnt->mnt_parent;
+				vfsmnt = &mnt->mnt;
+				continue;
+			}
+			/*
+			 * Filesystems needing to implement special "root names"
+			 * should do so with ->d_dname()
+			 */
+			if (IS_ROOT(dentry) &&
+			    (dentry->d_name.len != 1 ||
+			     dentry->d_name.name[0] != '/')) {
+				WARN(1, "Root dentry has weird name <%.*s>\n",
+				     (int) dentry->d_name.len,
+				     dentry->d_name.name);
+			}
+			if (!error)
+				error = is_mounted(vfsmnt) ? 1 : 2;
+			break;
 		}
 		parent = dentry->d_parent;
 		prefetch(parent);
-		spin_lock(&dentry->d_lock);
-		error = prepend_name(buffer, buflen, &dentry->d_name);
-		spin_unlock(&dentry->d_lock);
-		if (!error)
-			error = prepend(buffer, buflen, "/", 1);
+		error = prepend_name(&bptr, &blen, &dentry->d_name);
 		if (error)
 			break;
 
-		slash = true;
 		dentry = parent;
 	}
+	if (read_seqretry_or_unlock(&rename_lock, &seq))
+		goto restart;
 
-	if (!error && !slash)
-		error = prepend(buffer, buflen, "/", 1);
-
-	return error;
-
-global_root:
-	/*
-	 * Filesystems needing to implement special "root names"
-	 * should do so with ->d_dname()
-	 */
-	if (IS_ROOT(dentry) &&
-	    (dentry->d_name.len != 1 || dentry->d_name.name[0] != '/')) {
-		WARN(1, "Root dentry has weird name <%.*s>\n",
-		     (int) dentry->d_name.len, dentry->d_name.name);
-	}
-	if (!slash)
-		error = prepend(buffer, buflen, "/", 1);
-	if (!error)
-		error = is_mounted(vfsmnt) ? 1 : 2;
+	if (error >= 0 && bptr == *buffer) {
+		if (--blen < 0)
+			error = -ENAMETOOLONG;
+		else
+			*--bptr = '/';
+	}
+	*buffer = bptr;
+	*buflen = blen;
 	return error;
 }
 
@@ -2741,9 +2822,7 @@ char *__d_path(const struct path *path,
 	prepend(&res, &buflen, "\0", 1);
 	br_read_lock(&vfsmount_lock);
-	write_seqlock(&rename_lock);
 	error = prepend_path(path, root, &res, &buflen);
-	write_sequnlock(&rename_lock);
 	br_read_unlock(&vfsmount_lock);
 
 	if (error < 0)
@@ -2762,9 +2841,7 @@ char *d_absolute_path(const struct path *path,
 	prepend(&res, &buflen, "\0", 1);
 	br_read_lock(&vfsmount_lock);
-	write_seqlock(&rename_lock);
 	error = prepend_path(path, &root, &res, &buflen);
-	write_sequnlock(&rename_lock);
 	br_read_unlock(&vfsmount_lock);
 
 	if (error > 1)
@@ -2830,9 +2907,7 @@ char *d_path(const struct path *path, char *buf, int buflen)
 	get_fs_root(current->fs, &root);
 	br_read_lock(&vfsmount_lock);
-	write_seqlock(&rename_lock);
 	error = path_with_deleted(path, &root, &res, &buflen);
-	write_sequnlock(&rename_lock);
 	br_read_unlock(&vfsmount_lock);
 	if (error < 0)
 		res = ERR_PTR(error);
@@ -2867,10 +2942,10 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
 	char *end = buffer + buflen;
 	/* these dentries are never renamed, so d_lock is not needed */
 	if (prepend(&end, &buflen, " (deleted)", 11) ||
-	    prepend_name(&end, &buflen, &dentry->d_name) ||
+	    prepend(&end, &buflen, dentry->d_name.name, dentry->d_name.len) ||
 	    prepend(&end, &buflen, "/", 1))
 		end = ERR_PTR(-ENAMETOOLONG);
-	return end;
+	return end;
 }
 
 /*
@@ -2878,30 +2953,36 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
  */
 static char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
 {
-	char *end = buf + buflen;
-	char *retval;
+	char *end, *retval;
+	int len, seq = 0;
+	int error = 0;
 
-	prepend(&end, &buflen, "\0", 1);
+restart:
+	end = buf + buflen;
+	len = buflen;
+	prepend(&end, &len, "\0", 1);
 	if (buflen < 1)
 		goto Elong;
 	/* Get '/' right */
 	retval = end-1;
 	*retval = '/';
-
+	read_seqbegin_or_lock(&rename_lock, &seq);
 	while (!IS_ROOT(dentry)) {
 		struct dentry *parent = dentry->d_parent;
 		int error;
 
 		prefetch(parent);
-		spin_lock(&dentry->d_lock);
-		error = prepend_name(&end, &buflen, &dentry->d_name);
-		spin_unlock(&dentry->d_lock);
-		if (error != 0 || prepend(&end, &buflen, "/", 1) != 0)
-			goto Elong;
+		error = prepend_name(&end, &len, &dentry->d_name);
+		if (error)
+			break;
 
 		retval = end;
 		dentry = parent;
 	}
+	if (read_seqretry_or_unlock(&rename_lock, &seq))
+		goto restart;
+	if (error)
+		goto Elong;
 	return retval;
 Elong:
 	return ERR_PTR(-ENAMETOOLONG);
}

char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
{
-	char *retval;
-
-	write_seqlock(&rename_lock);
-	retval = __dentry_path(dentry, buf, buflen);
-	write_sequnlock(&rename_lock);
-
-	return retval;
+	return __dentry_path(dentry, buf, buflen);
 }
 EXPORT_SYMBOL(dentry_path_raw);
 
@@ -2924,7 +2999,6 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
 	char *p = NULL;
 	char *retval;
 
-	write_seqlock(&rename_lock);
 	if (d_unlinked(dentry)) {
 		p = buf + buflen;
 		if (prepend(&p, &buflen, "//deleted", 10) != 0)
@@ -2932,7 +3006,6 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
 		buflen++;
 	}
 	retval = __dentry_path(dentry, buf, buflen);
-	write_sequnlock(&rename_lock);
 	if (!IS_ERR(retval) && p)
 		*p = '/';	/* restore '/' overridden with '\0' */
 	return retval;
@@ -2971,7 +3044,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 
 	error = -ENOENT;
 	br_read_lock(&vfsmount_lock);
-	write_seqlock(&rename_lock);
 	if (!d_unlinked(pwd.dentry)) {
 		unsigned long len;
 		char *cwd = page + PAGE_SIZE;
@@ -2979,7 +3051,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 
 		prepend(&cwd, &buflen, "\0", 1);
 		error = prepend_path(&pwd, &root, &cwd, &buflen);
-		write_sequnlock(&rename_lock);
 		br_read_unlock(&vfsmount_lock);
 
 		if (error < 0)
@@ -3000,7 +3071,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 			error = -EFAULT;
 		}
 	} else {
-		write_sequnlock(&rename_lock);
 		br_read_unlock(&vfsmount_lock);
 	}
-- cgit v1.2.3

From 48f5ec21d9c67e881ff35343988e290ef5cf933f Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Mon, 9 Sep 2013 15:22:25 -0400
Subject: split read_seqretry_or_unlock(), convert d_walk() to resulting primitives

Separate "check if we need to retry" from "unlock if we are done and
had seq_writelock"; that allows us to use these guys in d_walk(), where
we need to recheck every time we ascend back to parent, but do *not*
want to unlock until the very end.  Lift rcu_read_lock/rcu_read_unlock
out into callers.

Signed-off-by: Al Viro
---
 fs/dcache.c | 64 ++++++++++++++++++++++++++++++-------------------------------
 1 file changed, 31 insertions(+), 33 deletions(-)

(limited to 'fs/dcache.c')

diff --git a/fs/dcache.c b/fs/dcache.c
index 38b1b0989a16..b9caf47d5389 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -100,30 +100,21 @@ static struct kmem_cache *dentry_cache __read_mostly;
 */
 static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
 {
-	if (!(*seq & 1)) {	/* Even */
+	if (!(*seq & 1))	/* Even */
 		*seq = read_seqbegin(lock);
-		rcu_read_lock();
-	} else			/* Odd */
+	else			/* Odd */
 		write_seqlock(lock);
 }
 
-/**
- * read_seqretry_or_unlock - end a seqretry or lock block & return retry status
- * lock : sequence lock
- * seq : sequence number
- * Return: 1 to retry operation again, 0 to continue
- */
-static inline int read_seqretry_or_unlock(seqlock_t *lock, int *seq)
+static inline int need_seqretry(seqlock_t *lock, int seq)
 {
-	if (!(*seq & 1)) {	/* Even */
-		rcu_read_unlock();
-		if (read_seqretry(lock, *seq)) {
-			(*seq)++;	/* Take writer lock */
-			return 1;
-		}
-	} else			/* Odd */
+	return !(seq & 1) && read_seqretry(lock, seq);
+}
+
+static inline void done_seqretry(seqlock_t *lock, int seq)
+{
+	if (seq & 1)
 		write_sequnlock(lock);
-	return 0;
 }
 
 /*
@@ -1047,7 +1038,7 @@ void shrink_dcache_for_umount(struct super_block *sb)
 * the parenthood after dropping the lock and check
 * that the sequence number still matches.
 */
-static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq)
+static struct dentry *try_to_ascend(struct dentry *old, unsigned seq)
 {
 	struct dentry *new = old->d_parent;
 
@@ -1061,7 +1052,7 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq
 	 */
 	if (new != old->d_parent ||
 		 (old->d_flags & DCACHE_DENTRY_KILLED) ||
-		 (!locked && read_seqretry(&rename_lock, seq))) {
+		 need_seqretry(&rename_lock, seq)) {
 		spin_unlock(&new->d_lock);
 		new = NULL;
 	}
@@ -1098,13 +1089,12 @@ static void d_walk(struct dentry *parent, void *data,
 {
 	struct dentry *this_parent;
 	struct list_head *next;
-	unsigned seq;
-	int locked = 0;
+	unsigned seq = 0;
 	enum d_walk_ret ret;
 	bool retry = true;
 
-	seq = read_seqbegin(&rename_lock);
 again:
+	read_seqbegin_or_lock(&rename_lock, &seq);
 	this_parent = parent;
 	spin_lock(&this_parent->d_lock);
@@ -1158,13 +1148,13 @@ resume:
 	 */
 	if (this_parent != parent) {
 		struct dentry *child = this_parent;
-		this_parent = try_to_ascend(this_parent, locked, seq);
+		this_parent = try_to_ascend(this_parent, seq);
 		if (!this_parent)
 			goto rename_retry;
 		next = child->d_u.d_child.next;
 		goto resume;
 	}
-	if (!locked && read_seqretry(&rename_lock, seq)) {
+	if (need_seqretry(&rename_lock, seq)) {
 		spin_unlock(&this_parent->d_lock);
 		goto rename_retry;
 	}
@@ -1173,17 +1163,13 @@ resume:
 
 out_unlock:
 	spin_unlock(&this_parent->d_lock);
-	if (locked)
-		write_sequnlock(&rename_lock);
+	done_seqretry(&rename_lock, seq);
 	return;
 
 rename_retry:
 	if (!retry)
 		return;
-	if (locked)
-		goto again;
-	locked = 1;
-	write_seqlock(&rename_lock);
+	seq = 1;
 	goto again;
 }
 
@@ -2745,6 +2731,7 @@ static int prepend_path(const struct path *path,
 	char *bptr;
 	int blen;
 
+	rcu_read_lock();
 restart:
 	bptr = *buffer;
 	blen = *buflen;
@@ -2783,8 +2770,13 @@ restart:
 		dentry = parent;
 	}
-	if (read_seqretry_or_unlock(&rename_lock, &seq))
+	if (!(seq & 1))
+		rcu_read_unlock();
+	if (need_seqretry(&rename_lock, seq)) {
+		seq = 1;
 		goto restart;
+	}
+	done_seqretry(&rename_lock, seq);
 
 	if (error >= 0 && bptr == *buffer) {
 		if (--blen < 0)
@@ -2957,6 +2949,7 @@ static char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
 	int len, seq = 0;
 	int error = 0;
 
+	rcu_read_lock();
 restart:
 	end = buf + buflen;
 	len = buflen;
@@ -2979,8 +2972,13 @@ restart:
 		retval = end;
 		dentry = parent;
 	}
-	if (read_seqretry_or_unlock(&rename_lock, &seq))
+	if (!(seq & 1))
+		rcu_read_unlock();
+	if (need_seqretry(&rename_lock, seq)) {
+		seq = 1;
 		goto restart;
+	}
+	done_seqretry(&rename_lock, seq);
 	if (error)
 		goto Elong;
 	return retval;
-- cgit v1.2.3

From 3942c07ccf98e66b8893f396dca98f5b076f905f Mon Sep 17 00:00:00 2001
From: Glauber Costa
Date: Wed, 28 Aug 2013 10:17:53 +1000
Subject: fs: bump inode and dentry counters to long
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This series reworks our current object cache shrinking infrastructure in
two main ways:

 * Noticing that a lot of users copy and paste their own version of LRU
   lists for objects, we put some effort in providing a generic version.
   It is modeled after the filesystem users: dentries, inodes, and xfs
   (for various tasks), but we expect that other users could benefit in
   the near future with little or no modification.  Let us know if you
   have any issues.

 * The underlying list_lru being proposed automatically and
   transparently keeps the elements in per-node lists, and is able to
   manipulate the node lists individually.
Given this infrastructure, we are able to modify the up-to-now hammer
called shrink_slab to proceed with node-reclaim instead of always
searching memory from all over like it has been doing.  Per-node lru
lists are also expected to lead to less contention in the lru locks on
multi-node scans, since we are now no longer fighting for a global lock.
The locks usually disappear from the profilers with this change.

Although we have no official benchmarks for this version - be our guest
to independently evaluate this - earlier versions of this series were
performance tested (details at
http://permalink.gmane.org/gmane.linux.kernel.mm/100537) yielding no
visible performance regressions while yielding a better qualitative
behavior in NUMA machines.

With this infrastructure in place, we can use the list_lru entry point
to provide memcg isolation and per-memcg targeted reclaim.
Historically, those two pieces of work have been posted together.  This
version presents only the infrastructure work, deferring the memcg work
for a later time, so we can focus on getting this part tested.  You can
see more about the history of such work at
http://lwn.net/Articles/552769/

Dave Chinner (18):
  dcache: convert dentry_stat.nr_unused to per-cpu counters
  dentry: move to per-sb LRU locks
  dcache: remove dentries from LRU before putting on dispose list
  mm: new shrinker API
  shrinker: convert superblock shrinkers to new API
  list: add a new LRU list type
  inode: convert inode lru list to generic lru list code.
  dcache: convert to use new lru list infrastructure
  list_lru: per-node list infrastructure
  shrinker: add node awareness
  fs: convert inode and dentry shrinking to be node aware
  xfs: convert buftarg LRU to generic code
  xfs: rework buffer dispose list tracking
  xfs: convert dquot cache lru to list_lru
  fs: convert fs shrinkers to new scan/count API
  drivers: convert shrinkers to new count/scan API
  shrinker: convert remaining shrinkers to count/scan API
  shrinker: Kill old ->shrink API.

Glauber Costa (7):
  fs: bump inode and dentry counters to long
  super: fix calculation of shrinkable objects for small numbers
  list_lru: per-node API
  vmscan: per-node deferred work
  i915: bail out earlier when shrinker cannot acquire mutex
  hugepage: convert huge zero page shrinker to new shrinker API
  list_lru: dynamically adjust node arrays

This patch:

There are situations in very large machines in which we can have a
large quantity of dirty inodes, unused dentries, etc.  This is
particularly true when umounting a filesystem, where eventually every
live object will be discarded.

Dave Chinner reported a problem with this while experimenting with the
shrinker revamp patchset.  So we believe it is time for a change.  This
patch just moves ints to longs.  Machines where it matters should have
a big long anyway.

Signed-off-by: Glauber Costa
Cc: Dave Chinner
Cc: "Theodore Ts'o"
Cc: Adrian Hunter
Cc: Al Viro
Cc: Artem Bityutskiy
Cc: Arve Hjønnevåg
Cc: Carlos Maiolino
Cc: Christoph Hellwig
Cc: Chuck Lever
Cc: Daniel Vetter
Cc: Dave Chinner
Cc: David Rientjes
Cc: Gleb Natapov
Cc: Greg Thelen
Cc: J. Bruce Fields
Cc: Jan Kara
Cc: Jerome Glisse
Cc: John Stultz
Cc: KAMEZAWA Hiroyuki
Cc: Kent Overstreet
Cc: Kirill A. Shutemov
Cc: Marcelo Tosatti
Cc: Mel Gorman
Cc: Steven Whitehouse
Cc: Thomas Hellstrom
Cc: Trond Myklebust
Signed-off-by: Andrew Morton
Signed-off-by: Al Viro
---
 fs/dcache.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs/dcache.c')

diff --git a/fs/dcache.c b/fs/dcache.c
index 4d9df3c940e6..6ef1c2e1bbc4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -146,13 +146,13 @@ struct dentry_stat_t dentry_stat = {
 	.age_limit = 45,
 };
 
-static DEFINE_PER_CPU(unsigned int, nr_dentry);
+static DEFINE_PER_CPU(long, nr_dentry);
 
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
-static int get_nr_dentry(void)
+static long get_nr_dentry(void)
 {
 	int i;
-	int sum = 0;
+	long sum = 0;
 	for_each_possible_cpu(i)
 		sum += per_cpu(nr_dentry, i);
 	return sum < 0 ? 0 : sum;
@@ -162,7 +162,7 @@ int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
 		   size_t *lenp, loff_t *ppos)
 {
 	dentry_stat.nr_dentry = get_nr_dentry();
-	return proc_dointvec(table, write, buffer, lenp, ppos);
+	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 }
 #endif
-- cgit v1.2.3

From 62d36c77035219ac776d1882ed3a662f2b75f258 Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Wed, 28 Aug 2013 10:17:54 +1000
Subject: dcache: convert dentry_stat.nr_unused to per-cpu counters
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before we split up the dcache_lru_lock, the unused dentry counter needs
to be made independent of the global dcache_lru_lock.  Convert it to
per-cpu counters to do this.

Signed-off-by: Dave Chinner
Signed-off-by: Glauber Costa
Reviewed-by: Christoph Hellwig
Acked-by: Mel Gorman
Cc: "Theodore Ts'o"
Cc: Adrian Hunter
Cc: Al Viro
Cc: Artem Bityutskiy
Cc: Arve Hjønnevåg
Cc: Carlos Maiolino
Cc: Christoph Hellwig
Cc: Chuck Lever
Cc: Daniel Vetter
Cc: David Rientjes
Cc: Gleb Natapov
Cc: Greg Thelen
Cc: J. Bruce Fields
Cc: Jan Kara
Cc: Jerome Glisse
Cc: John Stultz
Cc: KAMEZAWA Hiroyuki
Cc: Kent Overstreet
Cc: Kirill A. Shutemov
Cc: Marcelo Tosatti
Cc: Mel Gorman
Cc: Steven Whitehouse
Cc: Thomas Hellstrom
Cc: Trond Myklebust
Signed-off-by: Andrew Morton
Signed-off-by: Al Viro
---
 fs/dcache.c | 30 +++++++++++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)

(limited to 'fs/dcache.c')

diff --git a/fs/dcache.c b/fs/dcache.c
index 6ef1c2e1bbc4..03161240e744 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -147,8 +147,22 @@ struct dentry_stat_t dentry_stat = {
 };
 
 static DEFINE_PER_CPU(long, nr_dentry);
+static DEFINE_PER_CPU(long, nr_dentry_unused);
 
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
+
+/*
+ * Here we resort to our own counters instead of using generic per-cpu counters
+ * for consistency with what the vfs inode code does. We are expected to harvest
+ * better code and performance by having our own specialized counters.
+ *
+ * Please note that the loop is done over all possible CPUs, not over all online
+ * CPUs. The reason for this is that we don't want to play games with CPUs going
+ * on and off. If one of them goes off, we will just keep their counters.
+ *
+ * glommer: See cffbc8a for details, and if you ever intend to change this,
+ * please update all vfs counters to match.
+ */
 static long get_nr_dentry(void)
 {
 	int i;
@@ -158,10 +172,20 @@ static long get_nr_dentry(void)
 	return sum < 0 ? 0 : sum;
 }
 
+static long get_nr_dentry_unused(void)
+{
+	int i;
+	long sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_dentry_unused, i);
+	return sum < 0 ? 0 : sum;
+}
+
 int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
 		   size_t *lenp, loff_t *ppos)
 {
 	dentry_stat.nr_dentry = get_nr_dentry();
+	dentry_stat.nr_unused = get_nr_dentry_unused();
 	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 }
 #endif
@@ -342,7 +366,7 @@ static void dentry_lru_add(struct dentry *dentry)
 		dentry->d_flags |= DCACHE_LRU_LIST;
 		list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 		dentry->d_sb->s_nr_dentry_unused++;
-		dentry_stat.nr_unused++;
+		this_cpu_inc(nr_dentry_unused);
 		spin_unlock(&dcache_lru_lock);
 	}
 }
@@ -352,7 +376,7 @@ static void __dentry_lru_del(struct dentry *dentry)
 	list_del_init(&dentry->d_lru);
 	dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
 	dentry->d_sb->s_nr_dentry_unused--;
-	dentry_stat.nr_unused--;
+	this_cpu_dec(nr_dentry_unused);
 }
 
 /*
@@ -374,7 +398,7 @@ static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list)
 		dentry->d_flags |= DCACHE_LRU_LIST;
 		list_add_tail(&dentry->d_lru, list);
 		dentry->d_sb->s_nr_dentry_unused++;
-		dentry_stat.nr_unused++;
+		this_cpu_inc(nr_dentry_unused);
 	} else {
 		list_move_tail(&dentry->d_lru, list);
 	}
-- cgit v1.2.3

From 19156840e33a23eeb1a749c0f991dab6588b077d Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Wed, 28 Aug 2013 10:17:55 +1000
Subject: dentry: move to per-sb LRU locks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With the dentry LRUs being per-sb structures, there is no real need for
a global dentry_lru_lock.  The locking can be made more fine-grained by
moving to a per-sb LRU lock, isolating the LRU operations of different
filesystems completely from each other.  The need for this is
independent of any performance consideration that may arise: in the
interest of abstracting the lru operations away, it is mandatory that
each lru works around its own lock instead of a global lock for all of
them.

[glommer@openvz.org: updated changelog ]
Signed-off-by: Dave Chinner
Signed-off-by: Glauber Costa
Reviewed-by: Christoph Hellwig
Acked-by: Mel Gorman
Cc: "Theodore Ts'o"
Cc: Adrian Hunter
Cc: Al Viro
Cc: Artem Bityutskiy
Cc: Arve Hjønnevåg
Cc: Carlos Maiolino
Cc: Christoph Hellwig
Cc: Chuck Lever
Cc: Daniel Vetter
Cc: David Rientjes
Cc: Gleb Natapov
Cc: Greg Thelen
Cc: J. Bruce Fields
Cc: Jan Kara
Cc: Jerome Glisse
Cc: John Stultz
Cc: KAMEZAWA Hiroyuki
Cc: Kent Overstreet
Cc: Kirill A. Shutemov
Cc: Marcelo Tosatti
Cc: Mel Gorman
Cc: Steven Whitehouse
Cc: Thomas Hellstrom
Cc: Trond Myklebust
Signed-off-by: Andrew Morton
Signed-off-by: Al Viro
---
 fs/dcache.c | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

(limited to 'fs/dcache.c')

diff --git a/fs/dcache.c b/fs/dcache.c
index 03161240e744..e989ecb44a65 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -48,7 +48,7 @@
 *   - the dcache hash table
 * s_anon bl list spinlock protects:
 *   - the s_anon list (see __d_drop)
- * dcache_lru_lock protects:
+ * dentry->d_sb->s_dentry_lru_lock protects:
 *   - the dcache lru lists and counters
 * d_lock protects:
 *   - d_flags
@@ -63,7 +63,7 @@
 * Ordering:
 * dentry->d_inode->i_lock
 *   dentry->d_lock
- *     dcache_lru_lock
+ *     dentry->d_sb->s_dentry_lru_lock
 *     dcache_hash_bucket lock
 *     s_anon lock
 *
@@ -81,7 +81,6 @@
 int sysctl_vfs_cache_pressure __read_mostly = 100;
 EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
 
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
 
 EXPORT_SYMBOL(rename_lock);
@@ -362,12 +361,12 @@ static void dentry_unlink_inode(struct dentry * dentry)
 static void dentry_lru_add(struct dentry *dentry)
 {
 	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) {
-		spin_lock(&dcache_lru_lock);
+		spin_lock(&dentry->d_sb->s_dentry_lru_lock);
 		dentry->d_flags |= DCACHE_LRU_LIST;
 		list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 		dentry->d_sb->s_nr_dentry_unused++;
 		this_cpu_inc(nr_dentry_unused);
-		spin_unlock(&dcache_lru_lock);
+		spin_unlock(&dentry->d_sb->s_dentry_lru_lock);
 	}
 }
 
@@ -385,15 +384,15 @@ static void __dentry_lru_del(struct dentry *dentry)
 static void dentry_lru_del(struct dentry *dentry)
 {
 	if (!list_empty(&dentry->d_lru)) {
-		spin_lock(&dcache_lru_lock);
+		spin_lock(&dentry->d_sb->s_dentry_lru_lock);
 		__dentry_lru_del(dentry);
-		spin_unlock(&dcache_lru_lock);
+		spin_unlock(&dentry->d_sb->s_dentry_lru_lock);
 	}
 }
 
 static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list)
 {
-	spin_lock(&dcache_lru_lock);
+	spin_lock(&dentry->d_sb->s_dentry_lru_lock);
 	if (list_empty(&dentry->d_lru)) {
 		dentry->d_flags |= DCACHE_LRU_LIST;
 		list_add_tail(&dentry->d_lru, list);
@@ -402,7 +401,7 @@ static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list)
 	} else {
 		list_move_tail(&dentry->d_lru, list);
 	}
-	spin_unlock(&dcache_lru_lock);
+	spin_unlock(&dentry->d_sb->s_dentry_lru_lock);
 }
 
 /**
@@ -895,14 +894,14 @@ void prune_dcache_sb(struct super_block *sb, int count)
 	LIST_HEAD(tmp);
 
 relock:
-	spin_lock(&dcache_lru_lock);
+	spin_lock(&sb->s_dentry_lru_lock);
 	while (!list_empty(&sb->s_dentry_lru)) {
 		dentry = list_entry(sb->s_dentry_lru.prev,
 				struct dentry, d_lru);
 		BUG_ON(dentry->d_sb != sb);
 
 		if (!spin_trylock(&dentry->d_lock)) {
-			spin_unlock(&dcache_lru_lock);
+			spin_unlock(&sb->s_dentry_lru_lock);
 			cpu_relax();
 			goto relock;
 		}
@@ -918,11 +917,11 @@ relock:
 			if (!--count)
 				break;
 		}
-		cond_resched_lock(&dcache_lru_lock);
+		cond_resched_lock(&sb->s_dentry_lru_lock);
 	}
 	if (!list_empty(&referenced))
 		list_splice(&referenced, &sb->s_dentry_lru);
-	spin_unlock(&dcache_lru_lock);
+	spin_unlock(&sb->s_dentry_lru_lock);
 
 	shrink_dentry_list(&tmp);
 }
@@ -938,14 +937,14 @@ void shrink_dcache_sb(struct super_block *sb)
 {
 	LIST_HEAD(tmp);
 
-	spin_lock(&dcache_lru_lock);
+	spin_lock(&sb->s_dentry_lru_lock);
 	while (!list_empty(&sb->s_dentry_lru)) {
 		list_splice_init(&sb->s_dentry_lru, &tmp);
-		spin_unlock(&dcache_lru_lock);
+		spin_unlock(&sb->s_dentry_lru_lock);
shrink_dentry_list(&tmp); - spin_lock(&dcache_lru_lock); + spin_lock(&sb->s_dentry_lru_lock); } - spin_unlock(&dcache_lru_lock); + spin_unlock(&sb->s_dentry_lru_lock); } EXPORT_SYMBOL(shrink_dcache_sb); -- cgit v1.2.3 From dd1f6b2e43a53ee58eb87d5e623cf44e277d005d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 28 Aug 2013 10:17:55 +1000 Subject: dcache: remove dentries from LRU before putting on dispose list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit One of the big problems with modifying the way the dcache shrinker and LRU implementation works is that the LRU is abused in several ways. One of these is shrink_dentry_list(). Basically, we can move a dentry off the LRU onto a different list without doing any accounting changes, and then use dentry_lru_prune() to remove it from whatever list it is now on to do the LRU accounting at that point. This makes it -really hard- to change the LRU implementation. The use of the per-sb LRU lock serialises movement of the dentries between the different lists and the removal of them, and this is the only reason that it works. If we want to break up the dentry LRU lock and lists into, say, per-node lists, we remove the only serialisation that allows this lru list/dispose list abuse to work. To make this work effectively, the dispose list has to be isolated from the LRU list - dentries have to be removed from the LRU *before* being placed on the dispose list. This means that the LRU accounting and isolation is completed before disposal is started, and that means we can change the LRU implementation freely in future. This means that dentries *must* be marked with DCACHE_SHRINK_LIST when they are placed on the dispose list so that we don't think that parent dentries found in try_prune_one_dentry() are on the LRU when they are actually on the dispose list. This would result in accounting the dentry to the LRU a second time. Hence dentry_lru_del() has to handle the DCACHE_SHRINK_LIST case. Signed-off-by: Dave Chinner Signed-off-by: Glauber Costa Cc: "Theodore Ts'o" Cc: Adrian Hunter Cc: Al Viro Cc: Artem Bityutskiy Cc: Arve Hjønnevåg Cc: Carlos Maiolino Cc: Christoph Hellwig Cc: Chuck Lever Cc: Daniel Vetter Cc: David Rientjes Cc: Gleb Natapov Cc: Greg Thelen Cc: J. Bruce Fields Cc: Jan Kara Cc: Jerome Glisse Cc: John Stultz Cc: KAMEZAWA Hiroyuki Cc: Kent Overstreet Cc: Kirill A. Shutemov Cc: Marcelo Tosatti Cc: Mel Gorman Cc: Steven Whitehouse Cc: Thomas Hellstrom Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/dcache.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 78 insertions(+), 21 deletions(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index e989ecb44a65..509b49410943 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -356,7 +356,7 @@ static void dentry_unlink_inode(struct dentry * dentry) } /* - * dentry_lru_(add|del|prune|move_tail) must be called with d_lock held. + * dentry_lru_(add|del|move_list) must be called with d_lock held. */ static void dentry_lru_add(struct dentry *dentry) { @@ -373,16 +373,26 @@ static void dentry_lru_add(struct dentry *dentry) static void __dentry_lru_del(struct dentry *dentry) { list_del_init(&dentry->d_lru); - dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); + dentry->d_flags &= ~DCACHE_LRU_LIST; dentry->d_sb->s_nr_dentry_unused--; this_cpu_dec(nr_dentry_unused); } /* * Remove a dentry with references from the LRU.
+ * + * If we are on the shrink list, then we can get to try_prune_one_dentry() and + * lose our last reference through the parent walk. In this case, we need to + * remove ourselves from the shrink list, not the LRU. */ static void dentry_lru_del(struct dentry *dentry) { + if (dentry->d_flags & DCACHE_SHRINK_LIST) { + list_del_init(&dentry->d_lru); + dentry->d_flags &= ~DCACHE_SHRINK_LIST; + return; + } + if (!list_empty(&dentry->d_lru)) { spin_lock(&dentry->d_sb->s_dentry_lru_lock); __dentry_lru_del(dentry); @@ -392,14 +402,16 @@ static void dentry_lru_del(struct dentry *dentry) static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list) { + BUG_ON(dentry->d_flags & DCACHE_SHRINK_LIST); + spin_lock(&dentry->d_sb->s_dentry_lru_lock); if (list_empty(&dentry->d_lru)) { dentry->d_flags |= DCACHE_LRU_LIST; list_add_tail(&dentry->d_lru, list); - dentry->d_sb->s_nr_dentry_unused++; - this_cpu_inc(nr_dentry_unused); } else { list_move_tail(&dentry->d_lru, list); + dentry->d_sb->s_nr_dentry_unused--; + this_cpu_dec(nr_dentry_unused); } spin_unlock(&dentry->d_sb->s_dentry_lru_lock); } @@ -497,7 +509,8 @@ EXPORT_SYMBOL(d_drop); * If ref is non-zero, then decrement the refcount too. * Returns dentry requiring refcount drop, or NULL if we're done. */ -static inline struct dentry *dentry_kill(struct dentry *dentry) +static inline struct dentry * +dentry_kill(struct dentry *dentry, int unlock_on_failure) __releases(dentry->d_lock) { struct inode *inode; @@ -506,8 +519,10 @@ static inline struct dentry *dentry_kill(struct dentry *dentry) inode = dentry->d_inode; if (inode && !spin_trylock(&inode->i_lock)) { relock: - spin_unlock(&dentry->d_lock); - cpu_relax(); + if (unlock_on_failure) { + spin_unlock(&dentry->d_lock); + cpu_relax(); + } return dentry; /* try again with same dentry */ } if (IS_ROOT(dentry)) @@ -590,7 +605,7 @@ repeat: return; kill_it: - dentry = dentry_kill(dentry); + dentry = dentry_kill(dentry, 1); if (dentry) goto repeat; } @@ -810,12 +825,12 @@ EXPORT_SYMBOL(d_prune_aliases); * * This may fail if locks cannot be acquired no problem, just try again. */ -static void try_prune_one_dentry(struct dentry *dentry) +static struct dentry * try_prune_one_dentry(struct dentry *dentry) __releases(dentry->d_lock) { struct dentry *parent; - parent = dentry_kill(dentry); + parent = dentry_kill(dentry, 0); /* * If dentry_kill returns NULL, we have nothing more to do. * if it returns the same dentry, trylocks failed. In either @@ -827,17 +842,18 @@ static void try_prune_one_dentry(struct dentry *dentry) * fragmentation. */ if (!parent) - return; + return NULL; if (parent == dentry) - return; + return dentry; /* Prune ancestors. */ dentry = parent; while (dentry) { if (lockref_put_or_lock(&dentry->d_lockref)) - return; - dentry = dentry_kill(dentry); + return NULL; + dentry = dentry_kill(dentry, 1); } + return NULL; } static void shrink_dentry_list(struct list_head *list) @@ -855,22 +871,32 @@ static void shrink_dentry_list(struct list_head *list) continue; } + /* + * The dispose list is isolated and dentries are not accounted + * to the LRU here, so we can simply remove it from the list + * here regardless of whether it is referenced or not. + */ + list_del_init(&dentry->d_lru); + dentry->d_flags &= ~DCACHE_SHRINK_LIST; + /* * We found an inuse dentry which was not removed from - * the LRU because of laziness during lookup. Do not free - * it - just keep it off the LRU list. + * the LRU because of laziness during lookup. Do not free it. 
*/ if (dentry->d_lockref.count) { - dentry_lru_del(dentry); spin_unlock(&dentry->d_lock); continue; } - rcu_read_unlock(); - try_prune_one_dentry(dentry); + dentry = try_prune_one_dentry(dentry); rcu_read_lock(); + if (dentry) { + dentry->d_flags |= DCACHE_SHRINK_LIST; + list_add(&dentry->d_lru, list); + spin_unlock(&dentry->d_lock); + } } rcu_read_unlock(); } @@ -911,8 +937,10 @@ relock: list_move(&dentry->d_lru, &referenced); spin_unlock(&dentry->d_lock); } else { - list_move_tail(&dentry->d_lru, &tmp); + list_move(&dentry->d_lru, &tmp); dentry->d_flags |= DCACHE_SHRINK_LIST; + this_cpu_dec(nr_dentry_unused); + sb->s_nr_dentry_unused--; spin_unlock(&dentry->d_lock); if (!--count) break; @@ -926,6 +954,27 @@ relock: shrink_dentry_list(&tmp); } +/* + * Mark all the dentries as on being the dispose list so we don't think they are + * still on the LRU if we try to kill them from ascending the parent chain in + * try_prune_one_dentry() rather than directly from the dispose list. + */ +static void +shrink_dcache_list( + struct list_head *dispose) +{ + struct dentry *dentry; + + rcu_read_lock(); + list_for_each_entry_rcu(dentry, dispose, d_lru) { + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_SHRINK_LIST; + spin_unlock(&dentry->d_lock); + } + rcu_read_unlock(); + shrink_dentry_list(dispose); +} + /** * shrink_dcache_sb - shrink dcache for a superblock * @sb: superblock @@ -939,9 +988,17 @@ void shrink_dcache_sb(struct super_block *sb) spin_lock(&sb->s_dentry_lru_lock); while (!list_empty(&sb->s_dentry_lru)) { + /* + * account for removal here so we don't need to handle it later + * even though the dentry is no longer on the lru list. + */ list_splice_init(&sb->s_dentry_lru, &tmp); + this_cpu_sub(nr_dentry_unused, sb->s_nr_dentry_unused); + sb->s_nr_dentry_unused = 0; spin_unlock(&sb->s_dentry_lru_lock); - shrink_dentry_list(&tmp); + + shrink_dcache_list(&tmp); + spin_lock(&sb->s_dentry_lru_lock); } spin_unlock(&sb->s_dentry_lru_lock); -- cgit v1.2.3 From 0a234c6dcb79a270803f5c9773ed650b78730962 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 28 Aug 2013 10:17:57 +1000 Subject: shrinker: convert superblock shrinkers to new API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert superblock shrinker to use the new count/scan API, and propagate the API changes through to the filesystem callouts. The filesystem callouts already use a count/scan API, so it's just changing counters to longs to match the VM API. This requires the dentry and inode shrinker callouts to be converted to the count/scan API. This is mainly a mechanical change. [glommer@openvz.org: use mult_frac for fractional proportions, build fixes] Signed-off-by: Dave Chinner Signed-off-by: Glauber Costa Acked-by: Mel Gorman Cc: "Theodore Ts'o" Cc: Adrian Hunter Cc: Al Viro Cc: Artem Bityutskiy Cc: Arve Hjønnevåg Cc: Carlos Maiolino Cc: Christoph Hellwig Cc: Chuck Lever Cc: Daniel Vetter Cc: David Rientjes Cc: Gleb Natapov Cc: Greg Thelen Cc: J. Bruce Fields Cc: Jan Kara Cc: Jerome Glisse Cc: John Stultz Cc: KAMEZAWA Hiroyuki Cc: Kent Overstreet Cc: Kirill A. 
Shutemov Cc: Marcelo Tosatti Cc: Mel Gorman Cc: Steven Whitehouse Cc: Thomas Hellstrom Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/dcache.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 509b49410943..77d466b13fef 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -913,11 +913,12 @@ static void shrink_dentry_list(struct list_head *list) * This function may fail to free any resources if all the dentries are in * use. */ -void prune_dcache_sb(struct super_block *sb, int count) +long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan) { struct dentry *dentry; LIST_HEAD(referenced); LIST_HEAD(tmp); + long freed = 0; relock: spin_lock(&sb->s_dentry_lru_lock); @@ -942,7 +943,8 @@ relock: this_cpu_dec(nr_dentry_unused); sb->s_nr_dentry_unused--; spin_unlock(&dentry->d_lock); - if (!--count) + freed++; + if (!--nr_to_scan) break; } cond_resched_lock(&sb->s_dentry_lru_lock); @@ -952,6 +954,7 @@ relock: spin_unlock(&sb->s_dentry_lru_lock); shrink_dentry_list(&tmp); + return freed; } /* -- cgit v1.2.3 From f604156751db77e08afe47ce29fe8f3d51ad9b04 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 28 Aug 2013 10:18:00 +1000 Subject: dcache: convert to use new lru list infrastructure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [glommer@openvz.org: don't reintroduce double decrement of nr_unused_dentries, adapted for new LRU return codes] Signed-off-by: Dave Chinner Signed-off-by: Glauber Costa Cc: "Theodore Ts'o" Cc: Adrian Hunter Cc: Al Viro Cc: Artem Bityutskiy Cc: Arve Hjønnevåg Cc: Carlos Maiolino Cc: Christoph Hellwig Cc: Chuck Lever Cc: Daniel Vetter Cc: David Rientjes Cc: Gleb Natapov Cc: Greg Thelen Cc: J. Bruce Fields Cc: Jan Kara Cc: Jerome Glisse Cc: John Stultz Cc: KAMEZAWA Hiroyuki Cc: Kent Overstreet Cc: Kirill A. Shutemov Cc: Marcelo Tosatti Cc: Mel Gorman Cc: Steven Whitehouse Cc: Thomas Hellstrom Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/dcache.c | 170 ++++++++++++++++++++++++++++-------------------------------- 1 file changed, 78 insertions(+), 92 deletions(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 77d466b13fef..38a4a03499a2 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "internal.h" #include "mount.h" @@ -356,28 +357,17 @@ static void dentry_unlink_inode(struct dentry * dentry) } /* - * dentry_lru_(add|del|move_list) must be called with d_lock held. + * dentry_lru_(add|del)_list) must be called with d_lock held. */ static void dentry_lru_add(struct dentry *dentry) { if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) { - spin_lock(&dentry->d_sb->s_dentry_lru_lock); + if (list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)) + this_cpu_inc(nr_dentry_unused); dentry->d_flags |= DCACHE_LRU_LIST; - list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); - dentry->d_sb->s_nr_dentry_unused++; - this_cpu_inc(nr_dentry_unused); - spin_unlock(&dentry->d_sb->s_dentry_lru_lock); } } -static void __dentry_lru_del(struct dentry *dentry) -{ - list_del_init(&dentry->d_lru); - dentry->d_flags &= ~DCACHE_LRU_LIST; - dentry->d_sb->s_nr_dentry_unused--; - this_cpu_dec(nr_dentry_unused); -} - /* * Remove a dentry with references from the LRU. 
* @@ -393,27 +383,9 @@ static void dentry_lru_del(struct dentry *dentry) return; } - if (!list_empty(&dentry->d_lru)) { - spin_lock(&dentry->d_sb->s_dentry_lru_lock); - __dentry_lru_del(dentry); - spin_unlock(&dentry->d_sb->s_dentry_lru_lock); - } -} - -static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list) -{ - BUG_ON(dentry->d_flags & DCACHE_SHRINK_LIST); - - spin_lock(&dentry->d_sb->s_dentry_lru_lock); - if (list_empty(&dentry->d_lru)) { - dentry->d_flags |= DCACHE_LRU_LIST; - list_add_tail(&dentry->d_lru, list); - } else { - list_move_tail(&dentry->d_lru, list); - dentry->d_sb->s_nr_dentry_unused--; + if (list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)) this_cpu_dec(nr_dentry_unused); - } - spin_unlock(&dentry->d_sb->s_dentry_lru_lock); + dentry->d_flags &= ~DCACHE_LRU_LIST; } /** @@ -901,12 +873,72 @@ static void shrink_dentry_list(struct list_head *list) rcu_read_unlock(); } +static enum lru_status +dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) +{ + struct list_head *freeable = arg; + struct dentry *dentry = container_of(item, struct dentry, d_lru); + + + /* + * we are inverting the lru lock/dentry->d_lock here, + * so use a trylock. If we fail to get the lock, just skip + * it + */ + if (!spin_trylock(&dentry->d_lock)) + return LRU_SKIP; + + /* + * Referenced dentries are still in use. If they have active + * counts, just remove them from the LRU. Otherwise give them + * another pass through the LRU. + */ + if (dentry->d_lockref.count) { + list_del_init(&dentry->d_lru); + spin_unlock(&dentry->d_lock); + return LRU_REMOVED; + } + + if (dentry->d_flags & DCACHE_REFERENCED) { + dentry->d_flags &= ~DCACHE_REFERENCED; + spin_unlock(&dentry->d_lock); + + /* + * The list move itself will be made by the common LRU code. At + * this point, we've dropped the dentry->d_lock but keep the + * lru lock. This is safe to do, since every list movement is + * protected by the lru lock even if both locks are held. + * + * This is guaranteed by the fact that all LRU management + * functions are intermediated by the LRU API calls like + * list_lru_add and list_lru_del. List movement in this file + * only ever occur through this functions or through callbacks + * like this one, that are called from the LRU API. + * + * The only exceptions to this are functions like + * shrink_dentry_list, and code that first checks for the + * DCACHE_SHRINK_LIST flag. Those are guaranteed to be + * operating only with stack provided lists after they are + * properly isolated from the main list. It is thus, always a + * local access. + */ + return LRU_ROTATE; + } + + dentry->d_flags |= DCACHE_SHRINK_LIST; + list_move_tail(&dentry->d_lru, freeable); + this_cpu_dec(nr_dentry_unused); + spin_unlock(&dentry->d_lock); + + return LRU_REMOVED; +} + /** * prune_dcache_sb - shrink the dcache * @sb: superblock - * @count: number of entries to try to free + * @nr_to_scan : number of entries to try to free * - * Attempt to shrink the superblock dcache LRU by @count entries. This is + * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is * done when we need more memory an called from the superblock shrinker * function. 
* @@ -915,45 +947,12 @@ static void shrink_dentry_list(struct list_head *list) */ long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan) { - struct dentry *dentry; - LIST_HEAD(referenced); - LIST_HEAD(tmp); - long freed = 0; + LIST_HEAD(dispose); + long freed; -relock: - spin_lock(&sb->s_dentry_lru_lock); - while (!list_empty(&sb->s_dentry_lru)) { - dentry = list_entry(sb->s_dentry_lru.prev, - struct dentry, d_lru); - BUG_ON(dentry->d_sb != sb); - - if (!spin_trylock(&dentry->d_lock)) { - spin_unlock(&sb->s_dentry_lru_lock); - cpu_relax(); - goto relock; - } - - if (dentry->d_flags & DCACHE_REFERENCED) { - dentry->d_flags &= ~DCACHE_REFERENCED; - list_move(&dentry->d_lru, &referenced); - spin_unlock(&dentry->d_lock); - } else { - list_move(&dentry->d_lru, &tmp); - dentry->d_flags |= DCACHE_SHRINK_LIST; - this_cpu_dec(nr_dentry_unused); - sb->s_nr_dentry_unused--; - spin_unlock(&dentry->d_lock); - freed++; - if (!--nr_to_scan) - break; - } - cond_resched_lock(&sb->s_dentry_lru_lock); - } - if (!list_empty(&referenced)) - list_splice(&referenced, &sb->s_dentry_lru); - spin_unlock(&sb->s_dentry_lru_lock); - - shrink_dentry_list(&tmp); + freed = list_lru_walk(&sb->s_dentry_lru, dentry_lru_isolate, + &dispose, nr_to_scan); + shrink_dentry_list(&dispose); return freed; } @@ -987,24 +986,10 @@ shrink_dcache_list( */ void shrink_dcache_sb(struct super_block *sb) { - LIST_HEAD(tmp); - - spin_lock(&sb->s_dentry_lru_lock); - while (!list_empty(&sb->s_dentry_lru)) { - /* - * account for removal here so we don't need to handle it later - * even though the dentry is no longer on the lru list. - */ - list_splice_init(&sb->s_dentry_lru, &tmp); - this_cpu_sub(nr_dentry_unused, sb->s_nr_dentry_unused); - sb->s_nr_dentry_unused = 0; - spin_unlock(&sb->s_dentry_lru_lock); + long disposed; - shrink_dcache_list(&tmp); - - spin_lock(&sb->s_dentry_lru_lock); - } - spin_unlock(&sb->s_dentry_lru_lock); + disposed = list_lru_dispose_all(&sb->s_dentry_lru, shrink_dcache_list); + this_cpu_sub(nr_dentry_unused, disposed); } EXPORT_SYMBOL(shrink_dcache_sb); @@ -1366,7 +1351,8 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) if (dentry->d_lockref.count) { dentry_lru_del(dentry); } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { - dentry_lru_move_list(dentry, &data->dispose); + dentry_lru_del(dentry); + list_add_tail(&dentry->d_lru, &data->dispose); dentry->d_flags |= DCACHE_SHRINK_LIST; data->found++; ret = D_WALK_NORETRY; -- cgit v1.2.3 From 4e717f5c1083995c334ced639cc77a75e9972567 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Wed, 28 Aug 2013 10:18:03 +1000 Subject: list_lru: remove special case function list_lru_dispose_all. The list_lru implementation has one function, list_lru_dispose_all, with only one user (the dentry code). At first, such a function appears to make sense because we are really not interested in the result of isolating each dentry separately - all of them are going away anyway. However, its implementation is buggy in the following way: When we call list_lru_dispose_all in fs/dcache.c, we scan all dentries marking them with DCACHE_SHRINK_LIST. However, this is done without the nlru->lock taken. The immediate result of that is that someone else may add or remove the dentry from the LRU at the same time. When list_lru_del happens in that scenario we will see an element that is not yet marked with DCACHE_SHRINK_LIST (even though it will be in the future) and obviously remove it from an lru where the element no longer is.
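To make the interleaving concrete before the consequence is spelled out below, here is a rough sketch of the race just described. This is reconstructed from the description in this message (the list_lru side of list_lru_dispose_all is not shown anywhere in this series), so the call paths are illustrative only; nlru is the list_lru's internal per-node structure:

	/*
	 * CPU0: list_lru_dispose_all()           CPU1: dput() -> dentry_lru_del()
	 * --------------------------------       --------------------------------
	 * takes nlru->lock, splices the LRU
	 * onto a private dispose list and
	 * subtracts its length from
	 * nlru->nr_items, drops nlru->lock
	 *                                         sees DCACHE_SHRINK_LIST still
	 *                                         unset, so calls list_lru_del():
	 *                                           takes nlru->lock,
	 *                                           list_del()s from a list the
	 *                                           dentry is no longer on,
	 *                                           nlru->nr_items--   <- second
	 *                                           decrement for this dentry
	 * only now marks the spliced
	 * dentries DCACHE_SHRINK_LIST,
	 * without nlru->lock
	 */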
Since list_lru_dispose_all will in effect count down nlru's nr_items and list_lru_del will do the same, this will lead to an imbalance. The solution for this would not be so simple: we can obviously just keep the lru_lock taken, but then we have no guarantees that we will be able to acquire the dentry lock (dentry->d_lock). To properly solve this, we need a communication mechanism between the lru and dentry code, so they can coordinate this with each other. Such a mechanism already exists in the form of the list_lru_walk_cb callback. So it is possible to construct a dcache-side prune function that does the right thing only by calling list_lru_walk in a loop until no more dentries are available. With only one user, plus the fact that a sane solution for the problem would involve bouncing between dcache and list_lru anyway, I see little justification to keep the special case list_lru_dispose_all in tree. Signed-off-by: Glauber Costa Cc: Michal Hocko Acked-by: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/dcache.c | 49 +++++++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 20 deletions(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 38a4a03499a2..d74b5bdff7f9 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -956,27 +956,29 @@ long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan) return freed; } -/* - * Mark all the dentries as on being the dispose list so we don't think they are - * still on the LRU if we try to kill them from ascending the parent chain in - * try_prune_one_dentry() rather than directly from the dispose list. - */ -static void -shrink_dcache_list( - struct list_head *dispose) +static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, + spinlock_t *lru_lock, void *arg) { - struct dentry *dentry; + struct list_head *freeable = arg; + struct dentry *dentry = container_of(item, struct dentry, d_lru); - rcu_read_lock(); - list_for_each_entry_rcu(dentry, dispose, d_lru) { - spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_SHRINK_LIST; - spin_unlock(&dentry->d_lock); - } - rcu_read_unlock(); - shrink_dentry_list(dispose); + /* + * we are inverting the lru lock/dentry->d_lock here, + * so use a trylock.
If we fail to get the lock, just skip + it + */ + if (!spin_trylock(&dentry->d_lock)) + return LRU_SKIP; + + dentry->d_flags |= DCACHE_SHRINK_LIST; + list_move_tail(&dentry->d_lru, freeable); + this_cpu_dec(nr_dentry_unused); + spin_unlock(&dentry->d_lock); + + return LRU_REMOVED; } + /** * shrink_dcache_sb - shrink dcache for a superblock * @sb: superblock @@ -986,10 +988,17 @@ shrink_dcache_list( */ void shrink_dcache_sb(struct super_block *sb) { - long disposed; + long freed; + + do { + LIST_HEAD(dispose); + + freed = list_lru_walk(&sb->s_dentry_lru, + dentry_lru_isolate_shrink, &dispose, UINT_MAX); - disposed = list_lru_dispose_all(&sb->s_dentry_lru, shrink_dcache_list); - this_cpu_sub(nr_dentry_unused, disposed); + this_cpu_sub(nr_dentry_unused, freed); + shrink_dentry_list(&dispose); + } while (freed > 0); } EXPORT_SYMBOL(shrink_dcache_sb); -- cgit v1.2.3 From 9b17c62382dd2e7507984b9890bf44e070cdd8bb Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 28 Aug 2013 10:18:05 +1000 Subject: fs: convert inode and dentry shrinking to be node aware MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that the shrinker is passing a node in the scan control structure, we can pass this to the generic LRU list code to isolate reclaim to the lists on matching nodes. Signed-off-by: Dave Chinner Signed-off-by: Glauber Costa Acked-by: Mel Gorman Cc: "Theodore Ts'o" Cc: Adrian Hunter Cc: Al Viro Cc: Artem Bityutskiy Cc: Arve Hjønnevåg Cc: Carlos Maiolino Cc: Christoph Hellwig Cc: Chuck Lever Cc: Daniel Vetter Cc: David Rientjes Cc: Gleb Natapov Cc: Greg Thelen Cc: J. Bruce Fields Cc: Jan Kara Cc: Jerome Glisse Cc: John Stultz Cc: KAMEZAWA Hiroyuki Cc: Kent Overstreet Cc: Kirill A. Shutemov Cc: Marcelo Tosatti Cc: Mel Gorman Cc: Steven Whitehouse Cc: Thomas Hellstrom Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/dcache.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index d74b5bdff7f9..c932ed32c77b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -937,6 +937,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) * prune_dcache_sb - shrink the dcache * @sb: superblock * @nr_to_scan : number of entries to try to free + * @nid: which node to scan for freeable entities * * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is * done when we need more memory an called from the superblock shrinker @@ -945,13 +946,14 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) * This function may fail to free any resources if all the dentries are in * use.
*/ -long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan) +long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, + int nid) { LIST_HEAD(dispose); long freed; - freed = list_lru_walk(&sb->s_dentry_lru, dentry_lru_isolate, - &dispose, nr_to_scan); + freed = list_lru_walk_node(&sb->s_dentry_lru, nid, dentry_lru_isolate, + &dispose, &nr_to_scan); shrink_dentry_list(&dispose); return freed; } -- cgit v1.2.3 From 1812997720ab90d029548778c55d7315555e1fef Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 12 Sep 2013 10:55:35 -0400 Subject: dcache: get/release read lock in read_seqbegin_or_lock() & friend This patch modifies read_seqbegin_or_lock() and need_seqretry() to use newly introduced read_seqlock_excl() and read_sequnlock_excl() primitives so that they won't change the sequence number even if they fall back to take the lock. This is OK as no change to the protected data structure is being made. It will prevent one fallback to lock taking from cascading into a series of lock takings, reducing performance because of the sequence number change. It will also allow other sequence readers to go forward while an exclusive reader lock is taken. This patch also updates some of the inaccurate comments in the code. Signed-off-by: Waiman Long To: Alexander Viro Signed-off-by: Linus Torvalds --- fs/dcache.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 4d9df3c940e6..f3dcc6351a16 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -90,8 +90,8 @@ static struct kmem_cache *dentry_cache __read_mostly; /** * read_seqbegin_or_lock - begin a sequence number check or locking block - * lock: sequence lock - * seq : sequence number to be checked + * @lock: sequence lock + * @seq : sequence number to be checked * * First try it once optimistically without taking the lock. If that fails, * take the lock. The sequence number is also used as a marker for deciding @@ -103,7 +103,7 @@ static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq) { if (!(*seq & 1)) /* Even */ *seq = read_seqbegin(lock); else /* Odd */ - write_seqlock(lock); + read_seqlock_excl(lock); } static inline int need_seqretry(seqlock_t *lock, int seq) { @@ -114,7 +114,7 @@ static inline void done_seqretry(seqlock_t *lock, int seq) { if (seq & 1) - write_sequnlock(lock); + read_sequnlock_excl(lock); } /* @@ -2673,9 +2673,9 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen) /** * prepend_name - prepend a pathname in front of current buffer pointer - * buffer: buffer pointer - * buflen: allocated length of the buffer - * name: name string and length qstr structure + * @buffer: buffer pointer + * @buflen: allocated length of the buffer + * @name: name string and length qstr structure * * With RCU path tracing, it may race with d_move(). Use ACCESS_ONCE() to * make sure that either the old or the new name pointer and length are @@ -2713,14 +2713,15 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name) * @buffer: pointer to the end of the buffer * @buflen: pointer to buffer length * - * The function tries to write out the pathname without taking any lock other - * than the RCU read lock to make sure that dentries won't go away. It only - * checks the sequence number of the global rename_lock as any change in the - * dentry's d_seq will be preceded by changes in the rename_lock sequence - * number.
If the sequence number had been change, it will restart the whole - * pathname back-tracing sequence again. It performs a total of 3 trials of - * lockless back-tracing sequences before falling back to take the - * rename_lock. + * The function will first try to write out the pathname without taking any + * lock other than the RCU read lock to make sure that dentries won't go away. + * It only checks the sequence number of the global rename_lock as any change + * in the dentry's d_seq will be preceded by changes in the rename_lock + * sequence number. If the sequence number had been changed, it will restart + * the whole pathname back-tracing sequence again by taking the rename_lock. + * In this case, there is no need to take the RCU read lock as the recursive + * parent pointer references will keep the dentry chain alive as long as no + * rename operation is performed. */ static int prepend_path(const struct path *path, const struct path *root, -- cgit v1.2.3 From 5762482f5496cb1dd86acd2aace3ea25d1404e1f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 12 Sep 2013 10:12:47 -0700 Subject: vfs: move get_fs_root_and_pwd() to single caller Let's not pollute the include files with inline functions that are only used in a single place. Especially not if we decide we might want to change the semantics of said function to make it more efficient.. Signed-off-by: Linus Torvalds --- fs/dcache.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index f3dcc6351a16..4df68e27cbc7 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3015,6 +3015,17 @@ Elong: return ERR_PTR(-ENAMETOOLONG); } +static inline void get_fs_root_and_pwd(struct fs_struct *fs, struct path *root, + struct path *pwd) +{ + spin_lock(&fs->lock); + *root = fs->root; + path_get(root); + *pwd = fs->pwd; + path_get(pwd); + spin_unlock(&fs->lock); +} + /* * NOTE! The user-level library version returns a * character pointer. The kernel system call just -- cgit v1.2.3 From 8b19e34188a32d63a7da94ea8a3f5e39b0c66050 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 12 Sep 2013 10:35:47 -0700 Subject: vfs: make getcwd() get the root and pwd path under rcu This allows us to skip all the crazy spinlocks and reference count updates, and instead use the fs sequence read-lock to get an atomic snapshot of the root and cwd information. We might want to make the rule that "prepend_path()" is always called with the RCU lock held, but the RCU lock nests fine and this is the minimal fix. 
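The lockless snapshot at the heart of this change is the standard seqcount read loop: sample the sequence count, copy the protected fields, and redo the copy if a writer slipped in between. As a minimal reference for the pattern (the fs_struct field names match the patch that follows):

	unsigned seq;

	do {
		seq = read_seqcount_begin(&fs->seq);	/* begin read section */
		*root = fs->root;			/* copy protected data */
		*pwd = fs->pwd;
	} while (read_seqcount_retry(&fs->seq, seq));	/* writer raced: retry */

Readers take no locks and never block the writer; they only pay an occasional retry when a write really did overlap the copy.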
Signed-off-by: Linus Torvalds --- fs/dcache.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 4df68e27cbc7..99d4d7226203 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3015,15 +3015,16 @@ Elong: return ERR_PTR(-ENAMETOOLONG); } -static inline void get_fs_root_and_pwd(struct fs_struct *fs, struct path *root, - struct path *pwd) +static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root, + struct path *pwd) { - spin_lock(&fs->lock); - *root = fs->root; - path_get(root); - *pwd = fs->pwd; - path_get(pwd); - spin_unlock(&fs->lock); + unsigned seq; + + do { + seq = read_seqcount_begin(&fs->seq); + *root = fs->root; + *pwd = fs->pwd; + } while (read_seqcount_retry(&fs->seq, seq)); } /* @@ -3053,7 +3054,8 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) if (!page) return -ENOMEM; - get_fs_root_and_pwd(current->fs, &root, &pwd); + rcu_read_lock(); + get_fs_root_and_pwd_rcu(current->fs, &root, &pwd); error = -ENOENT; br_read_lock(&vfsmount_lock); @@ -3088,8 +3090,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) } out: - path_put(&pwd); - path_put(&root); + rcu_read_unlock(); free_page((unsigned long) page); return error; } -- cgit v1.2.3 From ff812d724254b95df76b7775d1359d856927a840 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 12 Sep 2013 11:57:01 -0700 Subject: vfs: don't copy things to user space holding the rcu readlock Oops. That wasn't very smart. We don't actually need the RCU lock any more by the time we copy the cwd string to user space, but I had stupidly surrounded the whole thing with it. Introduced by commit 8b19e34188a3 ("vfs: make getcwd() get the root and pwd path under rcu") Is-a-big-hairy-idiot: Linus Torvalds --- fs/dcache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 99d4d7226203..29d58212aaf0 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3067,6 +3067,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) prepend(&cwd, &buflen, "\0", 1); error = prepend_path(&pwd, &root, &cwd, &buflen); br_read_unlock(&vfsmount_lock); + rcu_read_unlock(); if (error < 0) goto out; @@ -3087,10 +3088,10 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) } } else { br_read_unlock(&vfsmount_lock); + rcu_read_unlock(); } out: - rcu_read_unlock(); free_page((unsigned long) page); return error; } -- cgit v1.2.3 From 3272c544da48f8915a0e34189182aed029bd0f2b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 12 Sep 2013 12:40:15 -0700 Subject: vfs: use __getname/__putname for getcwd() system call It's a pathname. It should use the pathname allocators and deallocators, and PATH_MAX instead of PAGE_SIZE. Never mind that the two are commonly the same. With this, the allocations scale up nicely too, and I can do getcwd() system calls at a rate of about 300M/s, with no lock contention anywhere. Of course, nobody sane does that, especially since getcwd() is traditionally a very slow operation in Unix. But this was also the simplest way to benchmark the prepend_path() improvements by Waiman, and once I saw the profiles I couldn't leave it well enough alone. But apart from being a performance improvement (from using per-cpu slab allocators instead of the raw page allocator), it's actually a valid and real cleanup.
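The buffer handling being tidied up here is the classic build-the-path-backwards trick: start at page + PATH_MAX, prepend components right to left, and the finished string ends flush with the end of the buffer, so the tail pointer can be handed straight to copy_to_user(). For reference, the prepend() helper this relies on looks like this in the fs/dcache.c of this era (only its hunk header appears in the diffs above; the body below is a reconstruction):

	static int prepend(char **buffer, int *buflen, const char *str, int namelen)
	{
		*buflen -= namelen;
		if (*buflen < 0)
			return -ENAMETOOLONG;	/* ran out of buffer */
		*buffer -= namelen;		/* move the write cursor left */
		memcpy(*buffer, str, namelen);
		return 0;
	}

getcwd() seeds it with prepend(&cwd, &buflen, "\0", 1), as shown in the diff below, so the result is already NUL-terminated before prepend_path() fills in the path components.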
Signed-off-by: Linus "OCD" Torvalds --- fs/dcache.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 29d58212aaf0..91e551b5af59 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3049,7 +3049,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) { int error; struct path pwd, root; - char *page = (char *) __get_free_page(GFP_USER); + char *page = __getname(); if (!page) return -ENOMEM; @@ -3061,8 +3061,8 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) br_read_lock(&vfsmount_lock); if (!d_unlinked(pwd.dentry)) { unsigned long len; - char *cwd = page + PAGE_SIZE; - int buflen = PAGE_SIZE; + char *cwd = page + PATH_MAX; + int buflen = PATH_MAX; prepend(&cwd, &buflen, "\0", 1); error = prepend_path(&pwd, &root, &cwd, &buflen); @@ -3080,7 +3080,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) } error = -ERANGE; - len = PAGE_SIZE + page - cwd; + len = PATH_MAX + page - cwd; if (len <= size) { error = len; if (copy_to_user(buf, cwd, len)) @@ -3092,7 +3092,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) } out: - free_page((unsigned long) page); + __putname(page); return error; } -- cgit v1.2.3 From 68f0d9d92e5430e250f2bd2a1e7a350e880d776a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 12 Sep 2013 13:24:55 -0700 Subject: vfs: make d_path() get the root path under RCU This avoids the spinlocks and refcounts in the d_path() sequence too (used by /proc and various other entities). See commit 8b19e34188a3 for the equivalent getcwd() system call path. And unlike getcwd(), d_path() doesn't copy the result to user space, so I don't need to fear _that_ particular bug happening again. Signed-off-by: Linus Torvalds --- fs/dcache.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 91e551b5af59..dddc67fed732 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2869,6 +2869,16 @@ static int prepend_unreachable(char **buffer, int *buflen) return prepend(buffer, buflen, "(unreachable)", 13); } +static void get_fs_root_rcu(struct fs_struct *fs, struct path *root) +{ + unsigned seq; + + do { + seq = read_seqcount_begin(&fs->seq); + *root = fs->root; + } while (read_seqcount_retry(&fs->seq, seq)); +} + /** * d_path - return the path of a dentry * @path: path to report @@ -2901,13 +2911,15 @@ char *d_path(const struct path *path, char *buf, int buflen) if (path->dentry->d_op && path->dentry->d_op->d_dname) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); - get_fs_root(current->fs, &root); + rcu_read_lock(); + get_fs_root_rcu(current->fs, &root); br_read_lock(&vfsmount_lock); error = path_with_deleted(path, &root, &res, &buflen); br_read_unlock(&vfsmount_lock); + rcu_read_unlock(); + if (error < 0) res = ERR_PTR(error); - path_put(&root); return res; } EXPORT_SYMBOL(d_path); -- cgit v1.2.3 From 89dc77bcdabf42ec99553f5837aa4bb8255a088c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 13 Sep 2013 22:55:10 -0400 Subject: vfs: fix dentry LRU list handling and nr_dentry_unused accounting The LRU list changes interacted badly with our nr_dentry_unused accounting, and even worse with the new DCACHE_LRU_LIST bit logic. 
This introduces helper functions to make sure everything follows the proper dcache d_lru list rules: the dentry cache is complicated by the fact that some of the hotpaths don't even want to look at the LRU list at all, and the fact that we use the same list entry in the dentry for both the LRU list and for our temporary shrinking lists when removing things from the LRU. The helper functions temporarily have some extra sanity checking for the flag bits that have to match the current LRU state of the dentry. We'll remove that before the final 3.12 release, but considering how easy it is to get wrong, this first cleanup version has some very particular sanity checking. Acked-by: Al Viro Signed-off-by: Linus Torvalds --- fs/dcache.c | 128 +++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 101 insertions(+), 27 deletions(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 1bd4614ce93b..62538e705c9b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -356,16 +356,81 @@ static void dentry_unlink_inode(struct dentry * dentry) iput(inode); } +/* + * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry + * is in use - which includes both the "real" per-superblock + * LRU list _and_ the DCACHE_SHRINK_LIST use. + * + * The DCACHE_SHRINK_LIST bit is set whenever the dentry is + * on the shrink list (ie not on the superblock LRU list). + * + * The per-cpu "nr_dentry_unused" counters are updated with + * the DCACHE_LRU_LIST bit. + * + * These helper functions make sure we always follow the + * rules. d_lock must be held by the caller. + */ +#define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x)) +static void d_lru_add(struct dentry *dentry) +{ + D_FLAG_VERIFY(dentry, 0); + dentry->d_flags |= DCACHE_LRU_LIST; + this_cpu_inc(nr_dentry_unused); + WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); +} + +static void d_lru_del(struct dentry *dentry) +{ + D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); + dentry->d_flags &= ~DCACHE_LRU_LIST; + this_cpu_dec(nr_dentry_unused); + WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); +} + +static void d_shrink_del(struct dentry *dentry) +{ + D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); + list_del_init(&dentry->d_lru); + dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); + this_cpu_dec(nr_dentry_unused); +} + +static void d_shrink_add(struct dentry *dentry, struct list_head *list) +{ + D_FLAG_VERIFY(dentry, 0); + list_add(&dentry->d_lru, list); + dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST; + this_cpu_inc(nr_dentry_unused); +} + +/* + * These can only be called under the global LRU lock, ie during the + * callback for freeing the LRU list. "isolate" removes it from the + * LRU lists entirely, while shrink_move moves it to the indicated + * private list. + */ +static void d_lru_isolate(struct dentry *dentry) +{ + D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); + dentry->d_flags &= ~DCACHE_LRU_LIST; + this_cpu_dec(nr_dentry_unused); + list_del_init(&dentry->d_lru); +} + +static void d_lru_shrink_move(struct dentry *dentry, struct list_head *list) +{ + D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); + dentry->d_flags |= DCACHE_SHRINK_LIST; + list_move_tail(&dentry->d_lru, list); +} + /* * dentry_lru_(add|del)_list) must be called with d_lock held. 
*/ static void dentry_lru_add(struct dentry *dentry) { - if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) { - if (list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)) - this_cpu_inc(nr_dentry_unused); - dentry->d_flags |= DCACHE_LRU_LIST; - } + if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) + d_lru_add(dentry); } /* @@ -377,15 +442,11 @@ static void dentry_lru_add(struct dentry *dentry) */ static void dentry_lru_del(struct dentry *dentry) { - if (dentry->d_flags & DCACHE_SHRINK_LIST) { - list_del_init(&dentry->d_lru); - dentry->d_flags &= ~DCACHE_SHRINK_LIST; - return; + if (dentry->d_flags & DCACHE_LRU_LIST) { + if (dentry->d_flags & DCACHE_SHRINK_LIST) + return d_shrink_del(dentry); + d_lru_del(dentry); } - - if (list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)) - this_cpu_dec(nr_dentry_unused); - dentry->d_flags &= ~DCACHE_LRU_LIST; } /** @@ -837,6 +898,12 @@ static void shrink_dentry_list(struct list_head *list) dentry = list_entry_rcu(list->prev, struct dentry, d_lru); if (&dentry->d_lru == list) break; /* empty */ + + /* + * Get the dentry lock, and re-verify that the dentry is + * this on the shrinking list. If it is, we know that + * DCACHE_SHRINK_LIST and DCACHE_LRU_LIST are set. + */ spin_lock(&dentry->d_lock); if (dentry != list_entry(list->prev, struct dentry, d_lru)) { spin_unlock(&dentry->d_lock); @@ -848,8 +915,7 @@ static void shrink_dentry_list(struct list_head *list) * to the LRU here, so we can simply remove it from the list * here regardless of whether it is referenced or not. */ - list_del_init(&dentry->d_lru); - dentry->d_flags &= ~DCACHE_SHRINK_LIST; + d_shrink_del(dentry); /* * We found an inuse dentry which was not removed from @@ -861,12 +927,20 @@ static void shrink_dentry_list(struct list_head *list) } rcu_read_unlock(); + /* + * If 'try_to_prune()' returns a dentry, it will + * be the same one we passed in, and d_lock will + * have been held the whole time, so it will not + * have been added to any other lists. We failed + * to get the inode lock. + * + * We just add it back to the shrink list. + */ dentry = try_prune_one_dentry(dentry); rcu_read_lock(); if (dentry) { - dentry->d_flags |= DCACHE_SHRINK_LIST; - list_add(&dentry->d_lru, list); + d_shrink_add(dentry, list); spin_unlock(&dentry->d_lock); } } @@ -894,7 +968,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) * another pass through the LRU. 
*/ if (dentry->d_lockref.count) { - list_del_init(&dentry->d_lru); + d_lru_isolate(dentry); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } @@ -925,9 +999,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) return LRU_ROTATE; } - dentry->d_flags |= DCACHE_SHRINK_LIST; - list_move_tail(&dentry->d_lru, freeable); - this_cpu_dec(nr_dentry_unused); + d_lru_shrink_move(dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; @@ -972,9 +1044,7 @@ static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, if (!spin_trylock(&dentry->d_lock)) return LRU_SKIP; - dentry->d_flags |= DCACHE_SHRINK_LIST; - list_move_tail(&dentry->d_lru, freeable); - this_cpu_dec(nr_dentry_unused); + d_lru_shrink_move(dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; @@ -1362,9 +1432,13 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) if (dentry->d_lockref.count) { dentry_lru_del(dentry); } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { - dentry_lru_del(dentry); - list_add_tail(&dentry->d_lru, &data->dispose); - dentry->d_flags |= DCACHE_SHRINK_LIST; + /* + * We can't use d_lru_shrink_move() because we + * need to get the global LRU lock and do the + * RLU accounting. + */ + d_lru_del(dentry); + d_shrink_add(dentry, &data->dispose); data->found++; ret = D_WALK_NORETRY; } -- cgit v1.2.3 From 05a8252bdefe9f2a8931c720afe6200671d631a6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 15 Sep 2013 07:11:01 -0400 Subject: vfs: fix typo in comment in recent dentry work Sedat points out that I transposed some letters in "LRU" and wrote "RLU" instead in one of the new comments explaining the flow. Let's just fix it. Reported-by: Sedat Dilek Signed-off-by: Linus Torvalds --- fs/dcache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 62538e705c9b..41000305d716 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1435,7 +1435,7 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) /* * We can't use d_lru_shrink_move() because we * need to get the global LRU lock and do the - * RLU accounting. + * LRU accounting. */ d_lru_del(dentry); d_shrink_add(dentry, &data->dispose); -- cgit v1.2.3
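One closing reference for the accounting that runs through this entire series: nr_dentry and nr_dentry_unused both use the per-cpu counter pattern introduced in the first patches above, summed over all possible CPUs and clamped at zero, since the increment and decrement for a given dentry may land on different CPUs. A minimal sketch of the read side, using the names from those patches:

	static DEFINE_PER_CPU(long, nr_dentry_unused);

	static long get_nr_dentry_unused(void)
	{
		int i;
		long sum = 0;

		/* all *possible* CPUs, so counts parked on offlined CPUs survive */
		for_each_possible_cpu(i)
			sum += per_cpu(nr_dentry_unused, i);

		/* a single CPU's count - and transiently the sum - can go negative */
		return sum < 0 ? 0 : sum;
	}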