From a482289d46587ffcda4c85aab109fb74910d7a48 Mon Sep 17 00:00:00 2001
From: Nick Piggin
Date: Wed, 22 Mar 2006 00:08:08 -0800
Subject: [PATCH] hugepage allocator cleanup

Insert "fresh" huge pages into the hugepage allocator by the same means
as they are freed back into it.  This reduces code size and allows
enqueue_huge_page to be inlined into the hugepage free fastpath.

Eliminate occurrences of hugepages on the free list with non-zero
refcount.  This can allow stricter refcount checks in future.  Also
required for lockless pagecache.

Signed-off-by: Nick Piggin

"This patch also eliminates a leak "cleaned up" by re-clobbering the
refcount on every allocation from the hugepage freelists.  With respect
to the lockless pagecache, the crucial aspect is to eliminate
unconditional set_page_count() to 0 on pages with potentially nonzero
refcounts, though closer inspection suggests the assignments removed
are entirely spurious."

Acked-by: William Irwin
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 508707704d2c..39d49ecea8e8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -64,7 +64,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 	return page;
 }
 
-static struct page *alloc_fresh_huge_page(void)
+static int alloc_fresh_huge_page(void)
 {
 	static int nid = 0;
 	struct page *page;
@@ -72,12 +72,15 @@ static struct page *alloc_fresh_huge_page(void)
 					HUGETLB_PAGE_ORDER);
 	nid = (nid + 1) % num_online_nodes();
 	if (page) {
+		page[1].lru.next = (void *)free_huge_page;	/* dtor */
 		spin_lock(&hugetlb_lock);
 		nr_huge_pages++;
 		nr_huge_pages_node[page_to_nid(page)]++;
 		spin_unlock(&hugetlb_lock);
+		put_page(page); /* free it into the hugepage allocator */
+		return 1;
 	}
-	return page;
+	return 0;
 }
 
 void free_huge_page(struct page *page)
@@ -85,7 +88,6 @@ void free_huge_page(struct page *page)
 	BUG_ON(page_count(page));
 
 	INIT_LIST_HEAD(&page->lru);
-	page[1].lru.next = NULL;	/* reset dtor */
 
 	spin_lock(&hugetlb_lock);
 	enqueue_huge_page(page);
@@ -105,7 +107,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 	}
 	spin_unlock(&hugetlb_lock);
 	set_page_count(page, 1);
-	page[1].lru.next = (void *)free_huge_page;	/* set dtor */
 	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
 		clear_user_highpage(&page[i], addr);
 	return page;
@@ -114,7 +115,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 static int __init hugetlb_init(void)
 {
 	unsigned long i;
-	struct page *page;
 
 	if (HPAGE_SHIFT == 0)
 		return 0;
@@ -123,12 +123,8 @@ static int __init hugetlb_init(void)
 		INIT_LIST_HEAD(&hugepage_freelists[i]);
 
 	for (i = 0; i < max_huge_pages; ++i) {
-		page = alloc_fresh_huge_page();
-		if (!page)
+		if (!alloc_fresh_huge_page())
 			break;
-		spin_lock(&hugetlb_lock);
-		enqueue_huge_page(page);
-		spin_unlock(&hugetlb_lock);
 	}
 	max_huge_pages = free_huge_pages = nr_huge_pages = i;
 	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
@@ -154,8 +150,8 @@ static void update_and_free_page(struct page *page)
 		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
 				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
 				1 << PG_private | 1<< PG_writeback);
-		set_page_count(&page[i], 0);
 	}
+	page[1].lru.next = NULL;
 	set_page_count(page, 1);
 	__free_pages(page, HUGETLB_PAGE_ORDER);
 }
@@ -188,12 +184,8 @@ static inline void try_to_free_low(unsigned long count)
 static unsigned long
 set_max_huge_pages(unsigned long count)
 {
 	while (count > nr_huge_pages) {
-		struct page *page = alloc_fresh_huge_page();
-		if (!page)
+		if (!alloc_fresh_huge_page())
 			return nr_huge_pages;
-		spin_lock(&hugetlb_lock);
-		enqueue_huge_page(page);
-		spin_unlock(&hugetlb_lock);
 	}
 	if (count >= nr_huge_pages)
 		return nr_huge_pages;
--
cgit v1.2.3
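The convention the first hunk relies on deserves a word: for a compound page, the kernel stashes a destructor pointer in the otherwise-unused page[1].lru.next, and the final put_page() invokes it, which is why allocating "fresh" pages can now share the free path. Below is a minimal userspace model of that mechanism; the struct, put_page(), and the two-element array are simplified stand-ins for illustration, not the kernel's definitions:

    #include <stdio.h>

    /* Simplified stand-in for the kernel's struct page (illustration only). */
    struct page {
        int _count;                            /* reference count */
        void (*compound_dtor)(struct page *);  /* kernel keeps this in page[1].lru.next */
    };

    static void free_huge_page(struct page *head)
    {
        printf("returning hugepage %p to the freelist\n", (void *)head);
    }

    /* Model of put_page() on a compound head: run the dtor at refcount 0. */
    static void put_page(struct page *head)
    {
        if (--head->_count == 0 && head[1].compound_dtor)
            head[1].compound_dtor(head);
    }

    int main(void)
    {
        struct page hpage[2] = { { ._count = 1 }, { 0 } };

        hpage[1].compound_dtor = free_huge_page; /* what the patch sets at allocation */
        put_page(hpage);                         /* frees straight into the hugepage pool */
        return 0;
    }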
From 7835e98b2e3c66dba79cb0ff8ebb90a2fe030c29 Mon Sep 17 00:00:00 2001
From: Nick Piggin
Date: Wed, 22 Mar 2006 00:08:40 -0800
Subject: [PATCH] remove set_page_count() outside mm/

set_page_count usage outside mm/ is limited to setting the refcount to
1.  Remove set_page_count from outside mm/, and replace those users
with init_page_count() and set_page_refcounted().

This allows more debug checking, and tighter control on how code is
allowed to play around with page->_count.

Signed-off-by: Nick Piggin
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 39d49ecea8e8..20117a4b8ab6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -18,6 +18,7 @@
 #include <asm/pgtable.h>
 
 #include <linux/hugetlb.h>
+#include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
 static unsigned long nr_huge_pages, free_huge_pages;
@@ -106,7 +107,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 		return NULL;
 	}
 	spin_unlock(&hugetlb_lock);
-	set_page_count(page, 1);
+	set_page_refcounted(page);
 	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
 		clear_user_highpage(&page[i], addr);
 	return page;
@@ -152,7 +153,7 @@ static void update_and_free_page(struct page *page)
 				1 << PG_private | 1<< PG_writeback);
 	}
 	page[1].lru.next = NULL;
-	set_page_count(page, 1);
+	set_page_refcounted(page);
 	__free_pages(page, HUGETLB_PAGE_ORDER);
 }
--
cgit v1.2.3
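For reference, the helper this patch introduces lives in mm/internal.h; in this era of the tree it looks roughly like the following (quoted from memory, so treat it as approximate rather than authoritative):

    /*
     * Turn a non-refcounted page (->_count == 0) into refcounted with
     * a count of one.
     */
    static inline void set_page_refcounted(struct page *page)
    {
        VM_BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page);
        VM_BUG_ON(page_count(page));
        set_page_count(page, 1);
    }

This is where the extra debug checking comes from: the VM_BUG_ONs catch anyone setting a refcount on a page that already has one, or on a compound tail page.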
Miller" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: William Lee Irwin III Signed-off-by: Ken Chen Signed-off-by: Nishanth Aravamudan Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 20117a4b8ab6..783098f6cf8e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -565,3 +565,32 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, return i; } + +void hugetlb_change_protection(struct vm_area_struct *vma, + unsigned long address, unsigned long end, pgprot_t newprot) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long start = address; + pte_t *ptep; + pte_t pte; + + BUG_ON(address >= end); + flush_cache_range(vma, address, end); + + spin_lock(&mm->page_table_lock); + for (; address < end; address += HPAGE_SIZE) { + ptep = huge_pte_offset(mm, address); + if (!ptep) + continue; + if (!pte_none(*ptep)) { + pte = huge_ptep_get_and_clear(mm, address, ptep); + pte = pte_mkhuge(pte_modify(pte, newprot)); + set_huge_pte_at(mm, address, ptep, pte); + lazy_mmu_prot_update(pte); + } + } + spin_unlock(&mm->page_table_lock); + + flush_tlb_range(vma, start, end); +} + -- cgit v1.2.3 From 79ac6ba40eb8d70f0d204e98ae9b63280ad1018c Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:08:51 -0800 Subject: [PATCH] hugepage: Small fixes to hugepage clear/copy path Move the loops used in mm/hugetlb.c to clear and copy hugepages to their own functions for clarity. As we do so, we add some checks of need_resched - we are, after all copying megabytes of memory here. We also add might_sleep() accordingly. We generally dropped locks around the clear and copy, already but not everyone has PREEMPT enabled, so we should still be checking explicitly. For this to work, we need to remove the clear_huge_page() from alloc_huge_page(), which is called with the page_table_lock held in the COW path. We move the clear_huge_page() to just after the alloc_huge_page() in the hugepage no-page path. In the COW path, the new page is about to be copied over, so clearing it was just a waste of time anyway. So as a side effect we also fix the fact that we held the page_table_lock for far too long in this path by calling alloc_huge_page() under it. It causes no regressions on the libhugetlbfs testsuite (ppc64, POWER5). 
From 79ac6ba40eb8d70f0d204e98ae9b63280ad1018c Mon Sep 17 00:00:00 2001
From: David Gibson
Date: Wed, 22 Mar 2006 00:08:51 -0800
Subject: [PATCH] hugepage: Small fixes to hugepage clear/copy path

Move the loops used in mm/hugetlb.c to clear and copy hugepages to
their own functions for clarity.  As we do so, we add some checks of
need_resched - we are, after all, copying megabytes of memory here.
We also add might_sleep() accordingly.  We generally dropped the locks
around the clear and copy already, but not everyone has PREEMPT
enabled, so we should still be checking explicitly.

For this to work, we need to remove the clear_huge_page() from
alloc_huge_page(), which is called with the page_table_lock held in
the COW path.  We move the clear_huge_page() to just after the
alloc_huge_page() in the hugepage no-page path.  In the COW path, the
new page is about to be copied over, so clearing it was just a waste
of time anyway.  So as a side effect we also fix the fact that we held
the page_table_lock for far too long in this path by calling
alloc_huge_page() under it.

It causes no regressions on the libhugetlbfs testsuite (ppc64, POWER5).

Signed-off-by: David Gibson
Cc: William Lee Irwin III
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 783098f6cf8e..41b1038f76da 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -27,6 +27,29 @@ static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
 static unsigned int free_huge_pages_node[MAX_NUMNODES];
 
+static void clear_huge_page(struct page *page, unsigned long addr)
+{
+	int i;
+
+	might_sleep();
+	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
+		cond_resched();
+		clear_user_highpage(page + i, addr);
+	}
+}
+
+static void copy_huge_page(struct page *dst, struct page *src,
+			   unsigned long addr)
+{
+	int i;
+
+	might_sleep();
+	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
+		cond_resched();
+		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE);
+	}
+}
+
 /*
  * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
  */
@@ -98,7 +121,6 @@ void free_huge_page(struct page *page)
 struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 {
 	struct page *page;
-	int i;
 
 	spin_lock(&hugetlb_lock);
 	page = dequeue_huge_page(vma, addr);
@@ -108,8 +130,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 	}
 	spin_unlock(&hugetlb_lock);
 	set_page_refcounted(page);
-	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
-		clear_user_highpage(&page[i], addr);
 	return page;
 }
 
@@ -367,7 +387,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pte_t *ptep, pte_t pte)
 {
 	struct page *old_page, *new_page;
-	int i, avoidcopy;
+	int avoidcopy;
 
 	old_page = pte_page(pte);
 
@@ -388,9 +408,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	spin_unlock(&mm->page_table_lock);
-	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
-		copy_user_highpage(new_page + i, old_page + i,
-				   address + i*PAGE_SIZE);
+	copy_huge_page(new_page, old_page, address);
 	spin_lock(&mm->page_table_lock);
 
 	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
@@ -435,6 +453,7 @@ retry:
 			ret = VM_FAULT_OOM;
 			goto out;
 		}
+		clear_huge_page(page, address);
 
 		if (vma->vm_flags & VM_SHARED) {
 			int err;
--
cgit v1.2.3
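The chunk-then-yield shape of clear_huge_page()/copy_huge_page() is a general pattern for long operations that must not hog a non-preemptible CPU. A userspace analog of the same shape, with sched_yield() standing in for cond_resched() and an ordinary heap buffer standing in for the hugepage (sizes invented for illustration):

    #include <sched.h>
    #include <stdlib.h>
    #include <string.h>

    #define PAGE_SIZE  4096UL
    #define HPAGE_SIZE (2UL * 1024 * 1024)   /* assuming 2 MB hugepages */

    /* Analog of clear_huge_page(): clear in page-sized chunks, offering
     * the scheduler a chance to run between chunks. */
    static void clear_huge(char *buf)
    {
        for (unsigned long i = 0; i < HPAGE_SIZE / PAGE_SIZE; i++) {
            sched_yield();           /* stand-in for cond_resched() */
            memset(buf + i * PAGE_SIZE, 0, PAGE_SIZE);
        }
    }

    int main(void)
    {
        char *buf = malloc(HPAGE_SIZE);
        if (!buf)
            return 1;
        clear_huge(buf);
        free(buf);
        return 0;
    }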
From 3935baa9bcda3ccaee4f7849f5157d316e34412e Mon Sep 17 00:00:00 2001
From: David Gibson
Date: Wed, 22 Mar 2006 00:08:53 -0800
Subject: [PATCH] hugepage: serialize hugepage allocation and instantiation

Currently, no lock or mutex is held between allocating a hugepage and
inserting it into the pagetables / page cache.  When we do go to insert
the page into pagetables or page cache, we recheck and may free the
newly allocated hugepage.  However, since the number of hugepages in
the system is strictly limited, and it's usual to want to use all of
them, this can still lead to spurious allocation failures.

For example, suppose two processes are both mapping (MAP_SHARED) the
same hugepage file, large enough to consume the entire available
hugepage pool.  If they race instantiating the last page in the
mapping, they will both attempt to allocate the last available
hugepage.  One will fail, of course, returning OOM from the fault and
thus causing the process to be killed, despite the fact that the
entire mapping can, in fact, be instantiated.

The patch fixes this race by the simple method of adding a (sleeping)
mutex to serialize the hugepage fault path between allocation and
insertion into pagetables and/or page cache.  It would be possible to
avoid the serialization by catching the allocation failures, waiting
on some condition, then rechecking to see if someone else has
instantiated the page for us.  Given the likely frequency of hugepage
instantiations, it seems very doubtful it's worth the extra complexity.

This patch causes no regression on the libhugetlbfs testsuite, and one
test, which can trigger this race, now passes where it previously
failed.

Actually, the test still sometimes fails, though less often and only
as a shmat() failure, rather than processes getting OOM killed by the
VM.  The dodgy heuristic tests in fs/hugetlbfs/inode.c for whether
there's enough hugepage space aren't protected by the new mutex, and
would be ugly to do so, so there's still a race there.  Another patch
to replace those tests with something saner, for this reason as well
as others, is coming...

Signed-off-by: David Gibson
Cc: William Lee Irwin III
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 41b1038f76da..d5987a87bbe5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -13,6 +13,7 @@
 #include <linux/pagemap.h>
 #include <linux/mempolicy.h>
 #include <linux/cpuset.h>
+#include <linux/mutex.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -26,6 +27,10 @@ unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
 static unsigned int free_huge_pages_node[MAX_NUMNODES];
+/*
+ * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
+ */
+static DEFINE_SPINLOCK(hugetlb_lock);
 
 static void clear_huge_page(struct page *page, unsigned long addr)
 {
@@ -50,11 +55,6 @@ static void copy_huge_page(struct page *dst, struct page *src,
 	}
 }
 
-/*
- * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
- */
-static DEFINE_SPINLOCK(hugetlb_lock);
-
 static void enqueue_huge_page(struct page *page)
 {
 	int nid = page_to_nid(page);
@@ -508,14 +508,24 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t *ptep;
 	pte_t entry;
 	int ret;
+	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
 
 	ptep = huge_pte_alloc(mm, address);
 	if (!ptep)
 		return VM_FAULT_OOM;
 
+	/*
+	 * Serialize hugepage allocation and instantiation, so that we don't
+	 * get spurious allocation failures if two CPUs race to instantiate
+	 * the same page in the page cache.
+	 */
+	mutex_lock(&hugetlb_instantiation_mutex);
 	entry = *ptep;
-	if (pte_none(entry))
-		return hugetlb_no_page(mm, vma, address, ptep, write_access);
+	if (pte_none(entry)) {
+		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
+		mutex_unlock(&hugetlb_instantiation_mutex);
+		return ret;
+	}
 
 	ret = VM_FAULT_MINOR;
@@ -525,6 +535,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (write_access && !pte_write(entry))
 		ret = hugetlb_cow(mm, vma, address, ptep, entry);
 	spin_unlock(&mm->page_table_lock);
+	mutex_unlock(&hugetlb_instantiation_mutex);
 
 	return ret;
 }
--
cgit v1.2.3
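The race the mutex closes, and why plain serialization suffices, can be modeled with pthreads: two faulting threads, one hugepage left. Every name below (free_pool, instantiated, fault, pool_lock) is invented for illustration; this models the logic, it is not kernel code:

    #include <pthread.h>
    #include <stdio.h>

    static int free_pool = 1;        /* one hugepage left */
    static int instantiated = 0;     /* has the page been inserted yet? */
    static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t instantiation_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* Model of the fixed fault path: allocation and insertion happen under
     * one mutex, so the losing thread sees the winner's page and never
     * needs a second hugepage of its own. */
    static void *fault(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&instantiation_mutex);
        if (!instantiated) {
            pthread_mutex_lock(&pool_lock);
            int got = (free_pool > 0) ? free_pool-- : 0;
            pthread_mutex_unlock(&pool_lock);
            if (got) {
                instantiated = 1;
                puts("instantiated the last hugepage");
            } else {
                puts("spurious OOM (cannot happen with the mutex held)");
            }
        } else {
            puts("reused the already-instantiated page");
        }
        pthread_mutex_unlock(&instantiation_mutex);
        return NULL;
    }

    int main(void)
    {
        pthread_t a, b;
        pthread_create(&a, NULL, fault, NULL);
        pthread_create(&b, NULL, fault, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
    }

Without the outer mutex, both threads could pass the !instantiated check concurrently and both try to draw from the pool; one would fail spuriously, which is exactly the OOM kill described above.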
From b45b5bd65f668a665db40d093e4e1fe563533608 Mon Sep 17 00:00:00 2001
From: David Gibson
Date: Wed, 22 Mar 2006 00:08:55 -0800
Subject: [PATCH] hugepage: Strict page reservation for hugepage inodes

These days, hugepages are demand-allocated at first fault time.
There's a somewhat dubious (and racy) heuristic when making a new
mmap() to check if there are enough available hugepages to fully
satisfy that mapping.

A particularly obvious case where the heuristic breaks down is where a
process maps its hugepages not as a single chunk, but as a bunch of
individually mmap()ed (or shmat()ed) blocks without touching and
instantiating the pages in between allocations.  In this case the size
of each block is compared against the total number of available
hugepages.  It's thus easy for the process to become overcommitted,
because each block mapping will succeed, although the total number of
hugepages required by all blocks exceeds the number available.  In
particular, this defeats a program which would detect a mapping
failure and adjust its hugepage usage downward accordingly.

The patch below addresses this problem, by strictly reserving a number
of physical hugepages for hugepage inodes which have been mapped, but
not instantiated.  MAP_SHARED mappings are thus "safe" - they will
fail on mmap(), not later with an OOM SIGKILL.  MAP_PRIVATE mappings
can still trigger an OOM.  (Actually SHARED mappings can technically
still OOM, but only if the sysadmin explicitly reduces the hugepage
pool between mapping and instantiation)

This patch appears to address the problem at hand - it allows DB2 to
start correctly, for instance, which previously suffered the failure
described above.  This patch causes no regressions on the
libhugetlbfs testsuite, and makes a test (designed to catch this
problem) pass which previously failed (ppc64, POWER5).

Signed-off-by: David Gibson
Cc: William Lee Irwin III
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c | 136 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 126 insertions(+), 10 deletions(-)

(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d5987a87bbe5..27fad5d9bcf6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,7 +22,7 @@
 #include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
-static unsigned long nr_huge_pages, free_huge_pages;
+static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages;
 unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
@@ -120,17 +120,136 @@ void free_huge_page(struct page *page)
 
 struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 {
+	struct inode *inode = vma->vm_file->f_dentry->d_inode;
 	struct page *page;
+	int use_reserve = 0;
+	unsigned long idx;
 
 	spin_lock(&hugetlb_lock);
-	page = dequeue_huge_page(vma, addr);
-	if (!page) {
-		spin_unlock(&hugetlb_lock);
-		return NULL;
+
+	if (vma->vm_flags & VM_MAYSHARE) {
+
+		/* idx = radix tree index, i.e. offset into file in
+		 * HPAGE_SIZE units */
+		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+		/* The hugetlbfs specific inode info stores the number
+		 * of "guaranteed available" (huge) pages.  That is,
+		 * the first 'prereserved_hpages' pages of the inode
+		 * are either already instantiated, or have been
+		 * pre-reserved (by hugetlb_reserve_for_inode()). Here
+		 * we're in the process of instantiating the page, so
+		 * we use this to determine whether to draw from the
+		 * pre-reserved pool or the truly free pool. */
+		if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
+			use_reserve = 1;
+	}
+
+	if (!use_reserve) {
+		if (free_huge_pages <= reserved_huge_pages)
+			goto fail;
+	} else {
+		BUG_ON(reserved_huge_pages == 0);
+		reserved_huge_pages--;
 	}
+
+	page = dequeue_huge_page(vma, addr);
+	if (!page)
+		goto fail;
+
 	spin_unlock(&hugetlb_lock);
 	set_page_refcounted(page);
 	return page;
+
+ fail:
+	WARN_ON(use_reserve);	/* reserved allocations shouldn't fail */
+	spin_unlock(&hugetlb_lock);
+	return NULL;
+}
+
+/* hugetlb_extend_reservation()
+ *
+ * Ensure that at least 'atleast' hugepages are, and will remain,
+ * available to instantiate the first 'atleast' pages of the given
+ * inode.  If the inode doesn't already have this many pages reserved
+ * or instantiated, set aside some hugepages in the reserved pool to
+ * satisfy later faults (or fail now if there aren't enough, rather
+ * than getting the SIGBUS later).
+ */
+int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
+			       unsigned long atleast)
+{
+	struct inode *inode = &info->vfs_inode;
+	unsigned long change_in_reserve = 0;
+	int ret = 0;
+
+	spin_lock(&hugetlb_lock);
+	read_lock_irq(&inode->i_mapping->tree_lock);
+
+	if (info->prereserved_hpages >= atleast)
+		goto out;
+
+	/* Because we always call this on shared mappings, none of the
+	 * pages beyond info->prereserved_hpages can have been
+	 * instantiated, so we need to reserve all of them now. */
+	change_in_reserve = atleast - info->prereserved_hpages;
+
+	if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	reserved_huge_pages += change_in_reserve;
+	info->prereserved_hpages = atleast;
+
+ out:
+	read_unlock_irq(&inode->i_mapping->tree_lock);
+	spin_unlock(&hugetlb_lock);
+
+	return ret;
+}
+
+/* hugetlb_truncate_reservation()
+ *
+ * This returns pages reserved for the given inode to the general free
+ * hugepage pool.  If the inode has any pages prereserved, but not
+ * instantiated, beyond offset (atmost << HPAGE_SIZE), then release
+ * them.
+ */
+void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
+				  unsigned long atmost)
+{
+	struct inode *inode = &info->vfs_inode;
+	struct address_space *mapping = inode->i_mapping;
+	unsigned long idx;
+	unsigned long change_in_reserve = 0;
+	struct page *page;
+
+	spin_lock(&hugetlb_lock);
+	read_lock_irq(&inode->i_mapping->tree_lock);
+
+	if (info->prereserved_hpages <= atmost)
+		goto out;
+
+	/* Count pages which were reserved, but not instantiated, and
+	 * which we can now release. */
+	for (idx = atmost; idx < info->prereserved_hpages; idx++) {
+		page = radix_tree_lookup(&mapping->page_tree, idx);
+		if (!page)
+			/* Pages which are already instantiated can't
+			 * be unreserved (and in fact have already
+			 * been removed from the reserved pool) */
+			change_in_reserve++;
+	}
+
+	BUG_ON(reserved_huge_pages < change_in_reserve);
+	reserved_huge_pages -= change_in_reserve;
+	info->prereserved_hpages = atmost;
+
+ out:
+	read_unlock_irq(&inode->i_mapping->tree_lock);
+	spin_unlock(&hugetlb_lock);
 }
 
 static int __init hugetlb_init(void)
@@ -238,9 +357,11 @@ int hugetlb_report_meminfo(char *buf)
 	return sprintf(buf,
 			"HugePages_Total: %5lu\n"
 			"HugePages_Free:  %5lu\n"
+			"HugePages_Rsvd:  %5lu\n"
 			"Hugepagesize:    %5lu kB\n",
 			nr_huge_pages,
 			free_huge_pages,
+			reserved_huge_pages,
 			HPAGE_SIZE/1024);
 }
@@ -253,11 +374,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
 		nid, free_huge_pages_node[nid]);
 }
 
-int is_hugepage_mem_enough(size_t size)
-{
-	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
-}
-
 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
 unsigned long hugetlb_total_pages(void)
 {
--
cgit v1.2.3

From 27a85ef1b81300cfff06b4c8037e9914dfb09acc Mon Sep 17 00:00:00 2001
From: David Gibson
Date: Wed, 22 Mar 2006 00:08:56 -0800
Subject: [PATCH] hugepage: Make {alloc,free}_huge_page() local

Originally, mm/hugetlb.c just handled the hugepage physical allocation
path and its {alloc,free}_huge_page() functions were used from the
arch specific hugepage code.  These days those functions are only used
within mm/hugetlb.c itself.  Therefore, this patch makes them static
and removes their prototypes from hugetlb.h.  This requires a small
rearrangement of code in mm/hugetlb.c to avoid a forward declaration.

This patch causes no regressions on the libhugetlbfs testsuite (ppc64,
POWER5).

Signed-off-by: David Gibson
Cc: William Lee Irwin III
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 27fad5d9bcf6..075877b1cbc0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -88,6 +88,17 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 	return page;
 }
 
+static void free_huge_page(struct page *page)
+{
+	BUG_ON(page_count(page));
+
+	INIT_LIST_HEAD(&page->lru);
+
+	spin_lock(&hugetlb_lock);
+	enqueue_huge_page(page);
+	spin_unlock(&hugetlb_lock);
+}
+
 static int alloc_fresh_huge_page(void)
 {
 	static int nid = 0;
@@ -107,18 +118,8 @@ static int alloc_fresh_huge_page(void)
 	return 0;
 }
 
-void free_huge_page(struct page *page)
-{
-	BUG_ON(page_count(page));
-
-	INIT_LIST_HEAD(&page->lru);
-
-	spin_lock(&hugetlb_lock);
-	enqueue_huge_page(page);
-	spin_unlock(&hugetlb_lock);
-}
-
-struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
+static struct page *alloc_huge_page(struct vm_area_struct *vma,
+				    unsigned long addr)
 {
 	struct inode *inode = vma->vm_file->f_dentry->d_inode;
 	struct page *page;
--
cgit v1.2.3

From d5d4b0aa4e1430d73050babba999365593bdb9d2 Mon Sep 17 00:00:00 2001
From: "Chen, Kenneth W"
Date: Wed, 22 Mar 2006 00:09:03 -0800
Subject: [PATCH] optimize follow_hugetlb_page

follow_hugetlb_page() walks a range of user virtual addresses and then
fills a list of struct page * into an array that is passed from the
argument list.  It also gets a reference count via get_page().
For a compound page, get_page() actually traverses back to the head
page via the page_private() macro and then adds a reference count to
the head page.  Since we are doing a virt-to-pte lookup, the kernel
already has a struct page pointer to the head page.  So instead of
traversing into the small-unit page struct and then following a link
back to the head page, optimize by incrementing the reference count
directly on the head page.

The benefit is that we don't take a cache miss on accessing the page
struct for the corresponding user address and, more importantly, we
don't pollute the cache with a "not very useful" round trip of pointer
chasing.  This adds a moderate performance gain on an I/O-intensive
database transaction workload.

Signed-off-by: Ken Chen
Cc: David Gibson
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 075877b1cbc0..06699d871a8e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -661,10 +661,10 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			struct page **pages, struct vm_area_struct **vmas,
 			unsigned long *position, int *length, int i)
 {
-	unsigned long vpfn, vaddr = *position;
+	unsigned long pfn_offset;
+	unsigned long vaddr = *position;
 	int remainder = *length;
 
-	vpfn = vaddr/PAGE_SIZE;
 	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
 		pte_t *pte;
@@ -692,19 +692,28 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			break;
 		}
 
-		if (pages) {
-			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
-			get_page(page);
-			pages[i] = page;
-		}
+		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
+		page = pte_page(*pte);
+same_page:
+		get_page(page);
+		if (pages)
+			pages[i] = page + pfn_offset;
 
 		if (vmas)
 			vmas[i] = vma;
 
 		vaddr += PAGE_SIZE;
-		++vpfn;
+		++pfn_offset;
 		--remainder;
 		++i;
+		if (vaddr < vma->vm_end && remainder &&
+				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
+			/*
+			 * We use pfn_offset to avoid touching the pageframes
+			 * of this compound page.
+			 */
+			goto same_page;
+		}
 	}
 	spin_unlock(&mm->page_table_lock);
 	*length = remainder;
--
cgit v1.2.3
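The pfn_offset computation above is plain mask-and-shift arithmetic. A standalone check of the same expression, assuming 4 KB base pages and 2 MB hugepages (the constants are illustrative; the kernel gets them from the architecture):

    #include <stdio.h>

    #define PAGE_SHIFT  12
    #define HPAGE_SHIFT 21                         /* assuming 2 MB hugepages */
    #define HPAGE_MASK  (~((1UL << HPAGE_SHIFT) - 1))

    int main(void)
    {
        /* An address 5 pages (plus a few bytes) into a hugepage. */
        unsigned long vaddr = 0x2000000UL + 5 * (1UL << PAGE_SHIFT) + 123;

        /* Offset of the small page within its hugepage, as computed above. */
        unsigned long pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
        printf("vaddr %#lx -> subpage %lu of its hugepage\n", vaddr, pfn_offset);
        return 0;   /* prints subpage 5: pages[i] = head + 5, refcount on head */
    }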
From fdb7cc59084ba7eef935e4e40aaaf538ee34c625 Mon Sep 17 00:00:00 2001
From: Paul Jackson
Date: Wed, 22 Mar 2006 00:09:10 -0800
Subject: [PATCH] mm: hugetlb alloc_fresh_huge_page bogus node loop fix

Fix bogus node loop in hugetlb.c alloc_fresh_huge_page(), which was
assuming that nodes are numbered contiguously from 0 to
num_online_nodes().  Once the hotplug folks get this far, that will be
false.

Signed-off-by: Paul Jackson
Acked-by: Christoph Lameter
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 06699d871a8e..ebad6bbb3501 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -105,7 +105,9 @@ static int alloc_fresh_huge_page(void)
 	struct page *page;
 	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
 					HUGETLB_PAGE_ORDER);
-	nid = (nid + 1) % num_online_nodes();
+	nid = next_node(nid, node_online_map);
+	if (nid == MAX_NUMNODES)
+		nid = first_node(node_online_map);
 	if (page) {
 		page[1].lru.next = (void *)free_huge_page;	/* dtor */
 		spin_lock(&hugetlb_lock);
--
cgit v1.2.3
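The next_node()/first_node() round-robin this fix introduces is easy to model outside the kernel. A sketch with an invented sparse online map (nodes 0, 2, and 5 online out of 8), showing how the cursor skips holes and wraps instead of assuming contiguous numbering:

    #include <stdio.h>

    /* Userspace model of the fix: round-robin over a sparse set of online
     * nodes rather than computing (nid + 1) % num_online_nodes(). */
    #define MAX_NUMNODES 8
    static const int online[MAX_NUMNODES] = { 1, 0, 1, 0, 0, 1, 0, 0 };

    static int next_node(int nid)
    {
        for (int n = nid + 1; n < MAX_NUMNODES; n++)
            if (online[n])
                return n;
        return MAX_NUMNODES;            /* past the end; caller wraps */
    }

    static int first_node(void)
    {
        return next_node(-1);
    }

    int main(void)
    {
        int nid = first_node();
        for (int i = 0; i < 6; i++) {   /* visits 0, 2, 5, 0, 2, 5 */
            printf("allocate from node %d\n", nid);
            nid = next_node(nid);
            if (nid == MAX_NUMNODES)
                nid = first_node();
        }
        return 0;
    }

With the old modulo arithmetic and this map, num_online_nodes() would be 3, so the cursor would cycle through nids 0, 1, and 2 forever, repeatedly asking offline node 1 for memory and never reaching node 5.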