From 7eaceaccab5f40bbfda044629a6298616aeaed50 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Thu, 10 Mar 2011 08:52:07 +0100
Subject: block: remove per-queue plugging

Code has been converted over to the new explicit on-stack plugging,
and delay users have been converted to use the new API for that.
So lets kill off the old plugging along with aops->sync_page().

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 mm/filemap.c | 67 ++++++------------------------------------------------------
 1 file changed, 6 insertions(+), 61 deletions(-)

(limited to 'mm/filemap.c')

diff --git a/mm/filemap.c b/mm/filemap.c
index 83a45d35468b..380776c2a9ac 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -155,45 +155,15 @@ void remove_from_page_cache(struct page *page)
 }
 EXPORT_SYMBOL(remove_from_page_cache);
 
-static int sync_page(void *word)
+static int sleep_on_page(void *word)
 {
-	struct address_space *mapping;
-	struct page *page;
-
-	page = container_of((unsigned long *)word, struct page, flags);
-
-	/*
-	 * page_mapping() is being called without PG_locked held.
-	 * Some knowledge of the state and use of the page is used to
-	 * reduce the requirements down to a memory barrier.
-	 * The danger here is of a stale page_mapping() return value
-	 * indicating a struct address_space different from the one it's
-	 * associated with when it is associated with one.
-	 * After smp_mb(), it's either the correct page_mapping() for
-	 * the page, or an old page_mapping() and the page's own
-	 * page_mapping() has gone NULL.
-	 * The ->sync_page() address_space operation must tolerate
-	 * page_mapping() going NULL. By an amazing coincidence,
-	 * this comes about because none of the users of the page
-	 * in the ->sync_page() methods make essential use of the
-	 * page_mapping(), merely passing the page down to the backing
-	 * device's unplug functions when it's non-NULL, which in turn
-	 * ignore it for all cases but swap, where only page_private(page) is
-	 * of interest. When page_mapping() does go NULL, the entire
-	 * call stack gracefully ignores the page and returns.
-	 * -- wli
-	 */
-	smp_mb();
-	mapping = page_mapping(page);
-	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
-		mapping->a_ops->sync_page(page);
 	io_schedule();
 	return 0;
 }
 
-static int sync_page_killable(void *word)
+static int sleep_on_page_killable(void *word)
 {
-	sync_page(word);
+	sleep_on_page(word);
 	return fatal_signal_pending(current) ? -EINTR : 0;
 }
 
@@ -479,12 +449,6 @@ struct page *__page_cache_alloc(gfp_t gfp)
 EXPORT_SYMBOL(__page_cache_alloc);
 #endif
 
-static int __sleep_on_page_lock(void *word)
-{
-	io_schedule();
-	return 0;
-}
-
 /*
  * In order to wait for pages to become available there must be
  * waitqueues associated with pages. By using a hash table of
@@ -512,7 +476,7 @@ void wait_on_page_bit(struct page *page, int bit_nr)
 	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
 
 	if (test_bit(bit_nr, &page->flags))
-		__wait_on_bit(page_waitqueue(page), &wait, sync_page,
+		__wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
 							TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(wait_on_page_bit);
@@ -576,17 +540,12 @@ EXPORT_SYMBOL(end_page_writeback);
 /**
  * __lock_page - get a lock on the page, assuming we need to sleep to get it
  * @page: the page to lock
- *
- * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
- * random driver's requestfn sets TASK_RUNNING, we could busywait.  However
- * chances are that on the second loop, the block layer's plug list is empty,
- * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
  */
 void __lock_page(struct page *page)
 {
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
-	__wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
+	__wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page,
 							TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(__lock_page);
@@ -596,24 +555,10 @@ int __lock_page_killable(struct page *page)
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
 	return __wait_on_bit_lock(page_waitqueue(page), &wait,
-					sync_page_killable, TASK_KILLABLE);
+					sleep_on_page_killable, TASK_KILLABLE);
 }
 EXPORT_SYMBOL_GPL(__lock_page_killable);
 
-/**
- * __lock_page_nosync - get a lock on the page, without calling sync_page()
- * @page: the page to lock
- *
- * Variant of lock_page that does not require the caller to hold a reference
- * on the page's mapping.
- */
-void __lock_page_nosync(struct page *page)
-{
-	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
-	__wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
-							TASK_UNINTERRUPTIBLE);
-}
-
 int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 			 unsigned int flags)
 {
-- 
cgit v1.2.3


From 55602dd66f5353981b53f734e26c307f310ced37 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Thu, 24 Jun 2010 15:05:37 +0200
Subject: fs: make generic file read/write functions plug

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 mm/filemap.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'mm/filemap.c')

diff --git a/mm/filemap.c b/mm/filemap.c
index 380776c2a9ac..f9a29c87a2cf 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1243,12 +1243,15 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	unsigned long seg = 0;
 	size_t count;
 	loff_t *ppos = &iocb->ki_pos;
+	struct blk_plug plug;
 
 	count = 0;
 	retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
 	if (retval)
 		return retval;
 
+	blk_start_plug(&plug);
+
 	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
 	if (filp->f_flags & O_DIRECT) {
 		loff_t size;
@@ -1321,6 +1324,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 			break;
 	}
 out:
+	blk_finish_plug(&plug);
 	return retval;
 }
 EXPORT_SYMBOL(generic_file_aio_read);
@@ -2432,11 +2436,13 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
+	struct blk_plug plug;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
 
 	mutex_lock(&inode->i_mutex);
+	blk_start_plug(&plug);
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
 
@@ -2447,6 +2453,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
+	blk_finish_plug(&plug);
 	return ret;
 }
 EXPORT_SYMBOL(generic_file_aio_write);
-- 
cgit v1.2.3


From 318b275fbca1ab9ec0862de71420e0e92c3d1aa7 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 22 Mar 2011 16:30:51 -0700
Subject: mm: allow GUP to fail instead of waiting on a page

GUP user may want to try to acquire a reference to a page if it is already
in memory, but not if IO, to bring it in, is needed.  For example KVM may
tell vcpu to schedule another guest process if current one is trying to
access swapped out page.  Meanwhile, the page will be swapped in and the
guest process, that depends on it, will be able to run again.

This patch adds FAULT_FLAG_RETRY_NOWAIT (suggested by Linus) and
FOLL_NOWAIT follow_page flags.  FAULT_FLAG_RETRY_NOWAIT, when used in
conjunction with VM_FAULT_ALLOW_RETRY, indicates to handle_mm_fault that
it shouldn't drop mmap_sem and wait on a page, but return VM_FAULT_RETRY
instead.

[akpm@linux-foundation.org: improve FOLL_NOWAIT comment]
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Hugh Dickins <hughd@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/filemap.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'mm/filemap.c')

diff --git a/mm/filemap.c b/mm/filemap.c
index 83a45d35468b..312b6eb78430 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -621,8 +621,10 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 		__lock_page(page);
 		return 1;
 	} else {
-		up_read(&mm->mmap_sem);
-		wait_on_page_locked(page);
+		if (!(flags & FAULT_FLAG_RETRY_NOWAIT)) {
+			up_read(&mm->mmap_sem);
+			wait_on_page_locked(page);
+		}
 		return 0;
 	}
 }
-- 
cgit v1.2.3


From ef6a3c63112e865d632ff7c478ba7c7160cad0d1 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Tue, 22 Mar 2011 16:30:52 -0700
Subject: mm: add replace_page_cache_page() function

This function basically does:

     remove_from_page_cache(old);
     page_cache_release(old);
     add_to_page_cache_locked(new);

Except it does this atomically, so there's no possibility for the "add" to
fail because of a race.

If memory cgroups are enabled, then the memory cgroup charge is also moved
from the old page to the new.

This function is currently used by fuse to move pages into the page cache
on read, instead of copying the page contents.

[minchan.kim@gmail.com: add freepage() hook to replace_page_cache_page()]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/filemap.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

(limited to 'mm/filemap.c')

diff --git a/mm/filemap.c b/mm/filemap.c
index 312b6eb78430..c1459f2cdb5e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -386,6 +386,76 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 }
 EXPORT_SYMBOL(filemap_write_and_wait_range);
 
+/**
+ * replace_page_cache_page - replace a pagecache page with a new one
+ * @old:	page to be replaced
+ * @new:	page to replace with
+ * @gfp_mask:	allocation mode
+ *
+ * This function replaces a page in the pagecache with a new one.  On
+ * success it acquires the pagecache reference for the new page and
+ * drops it for the old page.  Both the old and new pages must be
+ * locked.  This function does not add the new page to the LRU, the
+ * caller must do that.
+ *
+ * The remove + add is atomic.  The only way this function can fail is
+ * memory allocation failure.
+ */
+int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
+{
+	int error;
+	struct mem_cgroup *memcg = NULL;
+
+	VM_BUG_ON(!PageLocked(old));
+	VM_BUG_ON(!PageLocked(new));
+	VM_BUG_ON(new->mapping);
+
+	/*
+	 * This is not page migration, but prepare_migration and
+	 * end_migration does enough work for charge replacement.
+	 *
+	 * In the longer term we probably want a specialized function
+	 * for moving the charge from old to new in a more efficient
+	 * manner.
+	 */
+	error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
+	if (error)
+		return error;
+
+	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+	if (!error) {
+		struct address_space *mapping = old->mapping;
+		void (*freepage)(struct page *);
+
+		pgoff_t offset = old->index;
+		freepage = mapping->a_ops->freepage;
+
+		page_cache_get(new);
+		new->mapping = mapping;
+		new->index = offset;
+
+		spin_lock_irq(&mapping->tree_lock);
+		__remove_from_page_cache(old);
+		error = radix_tree_insert(&mapping->page_tree, offset, new);
+		BUG_ON(error);
+		mapping->nrpages++;
+		__inc_zone_page_state(new, NR_FILE_PAGES);
+		if (PageSwapBacked(new))
+			__inc_zone_page_state(new, NR_SHMEM);
+		spin_unlock_irq(&mapping->tree_lock);
+		radix_tree_preload_end();
+		if (freepage)
+			freepage(old);
+		page_cache_release(old);
+		mem_cgroup_end_migration(memcg, old, new, true);
+	} else {
+		mem_cgroup_end_migration(memcg, old, new, false);
+	}
+
+	return error;
+}
+EXPORT_SYMBOL_GPL(replace_page_cache_page);
+
 /**
  * add_to_page_cache_locked - add a locked page to the pagecache
  * @page:	page to add
-- 
cgit v1.2.3


From 97cecb5a254fec22d28ef32235d888bfbfd7c783 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan.kim@gmail.com>
Date: Tue, 22 Mar 2011 16:30:53 -0700
Subject: mm: introduce delete_from_page_cache()

Presently we increase the page refcount in add_to_page_cache() but don't
decrease it in remove_from_page_cache().  Such asymmetry adds confusion,
requiring that callers notice it and a comment explaining why they release
a page reference.  It's not a good API.

A long time ago, Hugh tried it (http://lkml.org/lkml/2004/10/24/140) but
gave up because reiser4's drop_page() had to unlock the page between
removing it from page cache and doing the page_cache_release().  But now
the situation is changed.  I think at least things in current mainline
don't have any obstacles.  The problem is for out-of-mainline filesystems
- if they have done such things as reiser4, this patch could be a problem
but they will discover this at compile time since we remove
remove_from_page_cache().

This patch:

This function works as just wrapper remove_from_page_cache().  The
difference is that it decreases page references in itself.  So caller have
to make sure it has a page reference before calling.

This patch is ready for removing remove_from_page_cache().

Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Hellwig <hch@infradead.org>
Acked-by: Hugh Dickins <hughd@google.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Edward Shishkin <edward.shishkin@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/filemap.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'mm/filemap.c')

diff --git a/mm/filemap.c b/mm/filemap.c
index c1459f2cdb5e..e7b59785ceb9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -155,6 +155,22 @@ void remove_from_page_cache(struct page *page)
 }
 EXPORT_SYMBOL(remove_from_page_cache);
 
+/**
+ * delete_from_page_cache - delete page from page cache
+ * @page: the page which the kernel is trying to remove from page cache
+ *
+ * This must be called only on pages that have
+ * been verified to be in the page cache and locked.
+ * It will never put the page into the free list,
+ * the caller has a reference on the page.
+ */
+void delete_from_page_cache(struct page *page)
+{
+	remove_from_page_cache(page);
+	page_cache_release(page);
+}
+EXPORT_SYMBOL(delete_from_page_cache);
+
 static int sync_page(void *word)
 {
 	struct address_space *mapping;
-- 
cgit v1.2.3


From 702cfbf93aaf3a091b0c64c8766c1ade0a820c38 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan.kim@gmail.com>
Date: Tue, 22 Mar 2011 16:32:43 -0700
Subject: mm: goodbye remove_from_page_cache()

Now delete_from_page_cache() replaces remove_from_page_cache().  So we
remove remove_from_page_cache so fs or something out of mainline will
notice it when compile time and can fix it.

Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Hellwig <hch@infradead.org>
Acked-by: Hugh Dickins <hughd@google.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/filemap.c | 25 +++++++++----------------
 1 file changed, 9 insertions(+), 16 deletions(-)

(limited to 'mm/filemap.c')

diff --git a/mm/filemap.c b/mm/filemap.c
index e7b59785ceb9..1cfb8fd84b27 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -137,7 +137,15 @@ void __remove_from_page_cache(struct page *page)
 	}
 }
 
-void remove_from_page_cache(struct page *page)
+/**
+ * delete_from_page_cache - delete page from page cache
+ * @page: the page which the kernel is trying to remove from page cache
+ *
+ * This must be called only on pages that have been verified to be in the page
+ * cache and locked.  It will never put the page into the free list, the caller
+ * has a reference on the page.
+ */
+void delete_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 	void (*freepage)(struct page *);
@@ -152,21 +160,6 @@ void remove_from_page_cache(struct page *page)
 
 	if (freepage)
 		freepage(page);
-}
-EXPORT_SYMBOL(remove_from_page_cache);
-
-/**
- * delete_from_page_cache - delete page from page cache
- * @page: the page which the kernel is trying to remove from page cache
- *
- * This must be called only on pages that have
- * been verified to be in the page cache and locked.
- * It will never put the page into the free list,
- * the caller has a reference on the page.
- */
-void delete_from_page_cache(struct page *page)
-{
-	remove_from_page_cache(page);
 	page_cache_release(page);
 }
 EXPORT_SYMBOL(delete_from_page_cache);
-- 
cgit v1.2.3


From e64a782fec684c29a8204c51b3cb554dce588592 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan.kim@gmail.com>
Date: Tue, 22 Mar 2011 16:32:44 -0700
Subject: mm: change __remove_from_page_cache()

Now we renamed remove_from_page_cache with delete_from_page_cache.  As
consistency of __remove_from_swap_cache and remove_from_swap_cache, we
change internal page cache handling function name, too.

Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Hellwig <hch@infradead.org>
Acked-by: Hugh Dickins <hughd@google.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/filemap.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'mm/filemap.c')

diff --git a/mm/filemap.c b/mm/filemap.c
index 1cfb8fd84b27..cb476e70cf19 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -108,11 +108,11 @@
  */
 
 /*
- * Remove a page from the page cache and free it. Caller has to make
+ * Delete a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
  * is safe.  The caller must hold the mapping's tree_lock.
  */
-void __remove_from_page_cache(struct page *page)
+void __delete_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
@@ -154,7 +154,7 @@ void delete_from_page_cache(struct page *page)
 
 	freepage = mapping->a_ops->freepage;
 	spin_lock_irq(&mapping->tree_lock);
-	__remove_from_page_cache(page);
+	__delete_from_page_cache(page);
 	spin_unlock_irq(&mapping->tree_lock);
 	mem_cgroup_uncharge_cache_page(page);
 
@@ -444,7 +444,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 		new->index = offset;
 
 		spin_lock_irq(&mapping->tree_lock);
-		__remove_from_page_cache(old);
+		__delete_from_page_cache(old);
 		error = radix_tree_insert(&mapping->page_tree, offset, new);
 		BUG_ON(error);
 		mapping->nrpages++;
-- 
cgit v1.2.3


From 9d8aa4ea855e0d64bb6926acb5618e6d1e2ed344 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 22 Mar 2011 16:33:06 -0700
Subject: mm: remove worrying dead code from find_get_pages()

The radix_tree_deref_retry() case in find_get_pages() has a strange little
excrescence, not seen in the other gang lookups: it looks like the start
of an abandoned attempt to guarantee forward progress in a case that
cannot arise.

ret should always be 0 here: if it isn't, then going back to restart will
leak references to pages already gotten.  There used to be a comment
saying nr_found is necessarily 1 here: that's not quite true, but the
radix_tree_deref_retry() case is peculiar to the entry at index 0, when we
race with it being moved out of the radix_tree root or back.

Remove the worrisome two lines, add a brief comment here and in
find_get_pages_contig() and find_get_pages_tag(), and a WARN_ON in
find_get_pages() should it ever be seen elsewhere than at 0.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Salman Qazi <sqazi@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/filemap.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'mm/filemap.c')

diff --git a/mm/filemap.c b/mm/filemap.c
index cb476e70cf19..a29318147365 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -863,9 +863,13 @@ repeat:
 		page = radix_tree_deref_slot((void **)pages[i]);
 		if (unlikely(!page))
 			continue;
+
+		/*
+		 * This can only trigger when the entry at index 0 moves out
+		 * of or back to the root: none yet gotten, safe to restart.
+		 */
 		if (radix_tree_deref_retry(page)) {
-			if (ret)
-				start = pages[ret-1]->index;
+			WARN_ON(start | i);
 			goto restart;
 		}
 
@@ -915,6 +919,11 @@ repeat:
 		page = radix_tree_deref_slot((void **)pages[i]);
 		if (unlikely(!page))
 			continue;
+
+		/*
+		 * This can only trigger when the entry at index 0 moves out
+		 * of or back to the root: none yet gotten, safe to restart.
+		 */
 		if (radix_tree_deref_retry(page))
 			goto restart;
 
@@ -975,6 +984,11 @@ repeat:
 		page = radix_tree_deref_slot((void **)pages[i]);
 		if (unlikely(!page))
 			continue;
+
+		/*
+		 * This can only trigger when the entry at index 0 moves out
+		 * of or back to the root: none yet gotten, safe to restart.
+		 */
 		if (radix_tree_deref_retry(page))
 			goto restart;
 
-- 
cgit v1.2.3


From 5b280c0cc70062967bb9d630b216375b18db3a0b Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 22 Mar 2011 16:33:07 -0700
Subject: mm: don't return 0 too early from find_get_pages()

Callers of find_get_pages(), or its wrapper pagevec_lookup() - notably
truncate_inode_pages_range() - stop looking further when it returns 0.

But if an interrupt comes just after its radix_tree_gang_lookup_slot(),
especially if we have preemptible RCU enabled, isn't it conceivable that
all 14 pages returned could be removed from the page cache by
shrink_page_list(), before find_get_pages() gets to process them?  So
causing it to return 0 although there may be plenty more pages beyond.

Make find_get_pages() and find_get_pages_tag() check for this unlikely
case, and restart should it occur; but callers of find_get_pages_contig()
have no such expectation, it's okay for that to return 0 early.

I have not seen this in practice, just worried by the possibility.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Salman Qazi <sqazi@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/filemap.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'mm/filemap.c')

diff --git a/mm/filemap.c b/mm/filemap.c
index a29318147365..f807afda86f2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -885,6 +885,13 @@ repeat:
 		pages[ret] = page;
 		ret++;
 	}
+
+	/*
+	 * If all entries were removed before we could secure them,
+	 * try again, because callers stop trying once 0 is returned.
+	 */
+	if (unlikely(!ret && nr_found))
+		goto restart;
 	rcu_read_unlock();
 	return ret;
 }
@@ -1004,6 +1011,13 @@ repeat:
 		pages[ret] = page;
 		ret++;
 	}
+
+	/*
+	 * If all entries were removed before we could secure them,
+	 * try again, because callers stop trying once 0 is returned.
+	 */
+	if (unlikely(!ret && nr_found))
+		goto restart;
 	rcu_read_unlock();
 
 	if (ret)
-- 
cgit v1.2.3


From 250df6ed274d767da844a5d9f05720b804240197 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 22 Mar 2011 22:23:36 +1100
Subject: fs: protect inode->i_state with inode->i_lock

Protect inode state transitions and validity checks with the
inode->i_lock. This enables us to make inode state transitions
independently of the inode_lock and is the first step to peeling
away the inode_lock from the code.

This requires that __iget() is done atomically with i_state checks
during list traversals so that we don't race with another thread
marking the inode I_FREEING between the state check and grabbing the
reference.

Also remove the unlock_new_inode() memory barrier optimisation
required to avoid taking the inode_lock when clearing I_NEW.
Simplify the code by simply taking the inode->i_lock around the
state change and wakeup. Because the wakeup is no longer tricky,
remove the wake_up_inode() function and open code the wakeup where
necessary.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/filemap.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'mm/filemap.c')

diff --git a/mm/filemap.c b/mm/filemap.c
index f807afda86f2..499e9aa91450 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -99,7 +99,9 @@
  *    ->private_lock		(page_remove_rmap->set_page_dirty)
  *    ->tree_lock		(page_remove_rmap->set_page_dirty)
  *    ->inode_lock		(page_remove_rmap->set_page_dirty)
+ *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
  *    ->inode_lock		(zap_pte_range->set_page_dirty)
+ *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
  *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
  *
  *  (code doesn't rely on that order, so you could switch it around)
-- 
cgit v1.2.3


From a66979abad090b2765a6c6790c9fdeab996833f2 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 22 Mar 2011 22:23:41 +1100
Subject: fs: move i_wb_list out from under inode_lock

Protect the inode writeback list with a new global lock
inode_wb_list_lock and use it to protect the list manipulations and
traversals. This lock replaces the inode_lock as the inodes on the
list can be validity checked while holding the inode->i_lock and
hence the inode_lock is no longer needed to protect the list.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/filemap.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'mm/filemap.c')

diff --git a/mm/filemap.c b/mm/filemap.c
index 499e9aa91450..d8b34d1a1071 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -80,8 +80,8 @@
  *  ->i_mutex
  *    ->i_alloc_sem             (various)
  *
- *  ->inode_lock
- *    ->sb_lock			(fs/fs-writeback.c)
+ *  inode_wb_list_lock
+ *    sb_lock			(fs/fs-writeback.c)
  *    ->mapping->tree_lock	(__sync_single_inode)
  *
  *  ->i_mmap_lock
@@ -98,9 +98,9 @@
  *    ->zone.lru_lock		(check_pte_range->isolate_lru_page)
  *    ->private_lock		(page_remove_rmap->set_page_dirty)
  *    ->tree_lock		(page_remove_rmap->set_page_dirty)
- *    ->inode_lock		(page_remove_rmap->set_page_dirty)
+ *    inode_wb_list_lock	(page_remove_rmap->set_page_dirty)
  *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
- *    ->inode_lock		(zap_pte_range->set_page_dirty)
+ *    inode_wb_list_lock	(zap_pte_range->set_page_dirty)
  *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
  *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
  *
-- 
cgit v1.2.3