15 files changed, 364 insertions, 289 deletions
diff --git a/arch/powerpc/mm/4xx_mmu.c b/arch/powerpc/mm/4xx_mmu.c
index b7bcbc232f39..4d006aa1a0d1 100644
--- a/arch/powerpc/mm/4xx_mmu.c
+++ b/arch/powerpc/mm/4xx_mmu.c
@@ -110,13 +110,11 @@ unsigned long __init mmu_mapin_ram(void)
 		pmd_t *pmdp;
 		unsigned long val = p | _PMD_SIZE_16M | _PAGE_HWEXEC | _PAGE_HWWRITE;
 
-		spin_lock(&init_mm.page_table_lock);
 		pmdp = pmd_offset(pgd_offset_k(v), v);
 		pmd_val(*pmdp++) = val;
 		pmd_val(*pmdp++) = val;
 		pmd_val(*pmdp++) = val;
 		pmd_val(*pmdp++) = val;
-		spin_unlock(&init_mm.page_table_lock);
 
 		v += LARGE_PAGE_SIZE_16M;
 		p += LARGE_PAGE_SIZE_16M;
@@ -127,10 +125,8 @@ unsigned long __init mmu_mapin_ram(void)
 		pmd_t *pmdp;
 		unsigned long val = p | _PMD_SIZE_4M | _PAGE_HWEXEC | _PAGE_HWWRITE;
 
-		spin_lock(&init_mm.page_table_lock);
 		pmdp = pmd_offset(pgd_offset_k(v), v);
 		pmd_val(*pmdp) = val;
-		spin_unlock(&init_mm.page_table_lock);
 
 		v += LARGE_PAGE_SIZE_4M;
 		p += LARGE_PAGE_SIZE_4M;
diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c
index af9ca0eb6d55..5d581bb3aa12 100644
--- a/arch/powerpc/mm/fsl_booke_mmu.c
+++ b/arch/powerpc/mm/fsl_booke_mmu.c
@@ -1,5 +1,5 @@
 /*
- * Modifications by Kumar Gala (kumar.gala@freescale.com) to support
+ * Modifications by Kumar Gala (galak@kernel.crashing.org) to support
  * E500 Book E processors.
  *
  * Copyright 2004 Freescale Semiconductor, Inc
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 22e474876133..a606504678bd 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -84,10 +84,11 @@
 extern unsigned long dart_tablebase;
 #endif /* CONFIG_U3_DART */
 
+static unsigned long _SDR1;
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
+
 hpte_t *htab_address;
 unsigned long htab_hash_mask;
-unsigned long _SDR1;
-struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
 int mmu_linear_psize = MMU_PAGE_4K;
 int mmu_virtual_psize = MMU_PAGE_4K;
 #ifdef CONFIG_HUGETLB_PAGE
@@ -165,7 +166,7 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
 		 * normal insert callback here.
 		 */
 #ifdef CONFIG_PPC_ISERIES
-		if (systemcfg->platform == PLATFORM_ISERIES_LPAR)
+		if (_machine == PLATFORM_ISERIES_LPAR)
 			ret = iSeries_hpte_insert(hpteg, va,
 						  virt_to_abs(paddr),
 						  tmp_mode,
@@ -174,7 +175,7 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
 		else
 #endif
 #ifdef CONFIG_PPC_PSERIES
-		if (systemcfg->platform & PLATFORM_LPAR)
+		if (_machine & PLATFORM_LPAR)
 			ret = pSeries_lpar_hpte_insert(hpteg, va,
 						       virt_to_abs(paddr),
 						       tmp_mode,
@@ -293,7 +294,7 @@ static void __init htab_init_page_sizes(void)
 	 * Not in the device-tree, let's fallback on known size
 	 * list for 16M capable GP & GR
 	 */
-	if ((systemcfg->platform != PLATFORM_ISERIES_LPAR) &&
+	if ((_machine != PLATFORM_ISERIES_LPAR) &&
 	    cpu_has_feature(CPU_FTR_16M_PAGE))
 		memcpy(mmu_psize_defs, mmu_psize_defaults_gp,
 		       sizeof(mmu_psize_defaults_gp));
@@ -364,7 +365,7 @@ static int __init htab_dt_scan_pftsize(unsigned long node,
 
 static unsigned long __init htab_get_table_size(void)
 {
-	unsigned long rnd_mem_size, pteg_count;
+	unsigned long mem_size, rnd_mem_size, pteg_count;
 
 	/* If hash size isn't already provided by the platform, we try to
 	 * retreive it from the device-tree. If it's not there neither, we
@@ -376,8 +377,9 @@ static unsigned long __init htab_get_table_size(void)
 		return 1UL << ppc64_pft_size;
 
 	/* round mem_size up to next power of 2 */
-	rnd_mem_size = 1UL << __ilog2(systemcfg->physicalMemorySize);
-	if (rnd_mem_size < systemcfg->physicalMemorySize)
+	mem_size = lmb_phys_mem_size();
+	rnd_mem_size = 1UL << __ilog2(mem_size);
+	if (rnd_mem_size < mem_size)
 		rnd_mem_size <<= 1;
 
 	/* # pages / 2 */
@@ -386,6 +388,15 @@ static unsigned long __init htab_get_table_size(void)
 	return pteg_count << 7;
 }
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+void create_section_mapping(unsigned long start, unsigned long end)
+{	/* bolts start..end with the flags htab_initialize() uses for mode_rw */
+		BUG_ON(htab_bolt_mapping(start, end, start,
+			_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX,
+			mmu_linear_psize));	/* non-zero return is fatal */
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
 void __init htab_initialize(void)
 {
 	unsigned long table, htab_size_bytes;
@@ -410,7 +421,7 @@ void __init htab_initialize(void)
 
 	htab_hash_mask = pteg_count - 1;
 
-	if (systemcfg->platform & PLATFORM_LPAR) {
+	if (platform_is_lpar()) {
 		/* Using a hypervisor which owns the htab */
 		htab_address = NULL;
 		_SDR1 = 0; 
@@ -431,6 +442,9 @@ void __init htab_initialize(void)
 
 		/* Initialize the HPT with no entries */
 		memset((void *)table, 0, htab_size_bytes);
+
+		/* Set SDR1 */
+		mtspr(SPRN_SDR1, _SDR1);
 	}
 
 	mode_rw = _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX;
@@ -500,6 +514,12 @@ void __init htab_initialize(void)
 #undef KB
 #undef MB
 
+void htab_initialize_secondary(void)
+{	/* NOTE(review): presumably run on each secondary CPU - confirm */
+	if (!platform_is_lpar())	/* LPAR: hypervisor owns the htab, _SDR1 is 0 */
+		mtspr(SPRN_SDR1, _SDR1);	/* same _SDR1 htab_initialize() loads */
+}
+
 /*
  * Called by asm hashtable.S for doing lazy icache flush
  */
@@ -581,7 +601,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 	/* Handle hugepage regions */
 	if (unlikely(in_hugepage_area(mm->context, ea))) {
 		DBG_LOW(" -> huge page !\n");
-		return hash_huge_page(mm, access, ea, vsid, local);
+		return hash_huge_page(mm, access, ea, vsid, local, trap);
 	}
 
 	/* Get PTE and page size from page tables */
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 426c269e552e..54131b877da3 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -148,43 +148,63 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
 	return 0;
 }
 
+struct slb_flush_info {	/* argument block for flush_{low,high}_segments() */
+	struct mm_struct *mm;	/* mm whose hugepage areas were just opened */
+	u16 newareas;		/* bitmask: bit i set = area i newly opened */
+};
+
 static void flush_low_segments(void *parm)
 {
-	u16 areas = (unsigned long) parm;
+	struct slb_flush_info *fi = parm;	/* from open_low_hpage_areas() */
 	unsigned long i;
 
-	asm volatile("isync" : : : "memory");
+	BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_LOW_AREAS);	/* 1 bit/area */
+
+	if (current->active_mm != fi->mm)
+		return;
+
+	/* Only need to do anything if this CPU is working in the same
+	 * mm as the one which has changed */
 
-	BUILD_BUG_ON((sizeof(areas)*8) != NUM_LOW_AREAS);
+	/* update the paca copy of the context struct */
+	get_paca()->context = current->active_mm->context;
 
+	asm volatile("isync" : : : "memory");
 	for (i = 0; i < NUM_LOW_AREAS; i++) {
-		if (! (areas & (1U << i)))
+		if (! (fi->newareas & (1U << i)))	/* area i was not newly opened */
 			continue;
 		asm volatile("slbie %0"
 			     : : "r" ((i << SID_SHIFT) | SLBIE_C));
 	}
-
 	asm volatile("isync" : : : "memory");
 }
 
 static void flush_high_segments(void *parm)
 {
-	u16 areas = (unsigned long) parm;
+	struct slb_flush_info *fi = parm;	/* from open_high_hpage_areas() */
 	unsigned long i, j;
 
-	asm volatile("isync" : : : "memory");
 
-	BUILD_BUG_ON((sizeof(areas)*8) != NUM_HIGH_AREAS);
+	BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_HIGH_AREAS);	/* 1 bit/area */
 
+	if (current->active_mm != fi->mm)
+		return;
+
+	/* Only need to do anything if this CPU is working in the same
+	 * mm as the one which has changed */
+
+	/* update the paca copy of the context struct */
+	get_paca()->context = current->active_mm->context;
+
+	asm volatile("isync" : : : "memory");
 	for (i = 0; i < NUM_HIGH_AREAS; i++) {
-		if (! (areas & (1U << i)))
+		if (! (fi->newareas & (1U << i)))	/* area i was not newly opened */
 			continue;
 		for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
 			asm volatile("slbie %0"
 				     :: "r" (((i << HTLB_AREA_SHIFT)
-					     + (j << SID_SHIFT)) | SLBIE_C));
+					      + (j << SID_SHIFT)) | SLBIE_C));
 	}
-
 	asm volatile("isync" : : : "memory");
 }
 
@@ -229,6 +249,7 @@ static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
 static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
 {
 	unsigned long i;
+	struct slb_flush_info fi;
 
 	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
 	BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);
@@ -244,19 +265,20 @@ static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
 
 	mm->context.low_htlb_areas |= newareas;
 
-	/* update the paca copy of the context struct */
-	get_paca()->context = mm->context;
-
 	/* the context change must make it to memory before the flush,
 	 * so that further SLB misses do the right thing. */
 	mb();
-	on_each_cpu(flush_low_segments, (void *)(unsigned long)newareas, 0, 1);
+
+	fi.mm = mm;
+	fi.newareas = newareas;
+	on_each_cpu(flush_low_segments, &fi, 0, 1);
 
 	return 0;
 }
 
 static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
 {
+	struct slb_flush_info fi;
 	unsigned long i;
 
 	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
@@ -280,22 +302,25 @@ static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
 	/* the context change must make it to memory before the flush,
 	 * so that further SLB misses do the right thing. */
 	mb();
-	on_each_cpu(flush_high_segments, (void *)(unsigned long)newareas, 0, 1);
+
+	fi.mm = mm;
+	fi.newareas = newareas;
+	on_each_cpu(flush_high_segments, &fi, 0, 1);
 
 	return 0;
 }
 
 int prepare_hugepage_range(unsigned long addr, unsigned long len)
 {
-	int err;
+	int err = 0;
 
 	if ( (addr+len) < addr )
 		return -EINVAL;
 
-	if ((addr + len) < 0x100000000UL)
+	if (addr < 0x100000000UL)
 		err = open_low_hpage_areas(current->mm,
 					  LOW_ESID_MASK(addr, len));
-	else
+	if ((addr + len) > 0x100000000UL)
 		err = open_high_hpage_areas(current->mm,
 					    HTLB_AREA_MASK(addr, len));
 	if (err) {
@@ -639,8 +664,36 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	return -ENOMEM;
 }
 
+/* Lazy icache flush for a huge page; called from hash_huge_page().
+ * Returns rflags, with HPTE_R_N added when the flush is deferred
+ * (page not flushed yet and trap != 0x400). */
+static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
+						  pte_t pte, int trap)
+{
+	struct page *page;
+	int i;
+
+	if (!pfn_valid(pte_pfn(pte)))	/* pfn not valid: rflags unchanged */
+		return rflags;
+
+	page = pte_page(pte);
+
+	/* PG_arch_1 clear: page not flushed yet (reserved pages are skipped) */
+	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
+		if (trap == 0x400) {	/* NOTE(review): presumably instruction fault - confirm */
+			for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++)	/* each base page */
+				__flush_dcache_icache(page_address(page+i));
+			set_bit(PG_arch_1, &page->flags);	/* flushed: skip next time */
+		} else {
+			rflags |= HPTE_R_N;	/* defer the flush: map no-execute for now */
+		}
+	}
+	return rflags;
+}
+
 int hash_huge_page(struct mm_struct *mm, unsigned long access,
-		   unsigned long ea, unsigned long vsid, int local)
+		   unsigned long ea, unsigned long vsid, int local,
+		   unsigned long trap)
 {
 	pte_t *ptep;
 	unsigned long old_pte, new_pte;
@@ -691,6 +744,11 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access,
 	rflags = 0x2 | (!(new_pte & _PAGE_RW));
  	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
 	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
+	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+		/* No CPU has hugepages but lacks no execute, so we
+		 * don't need to worry about that case */
+		rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
+						       trap);
 
 	/* Check if pte already has an hpte (case 2) */
 	if (unlikely(old_pte & _PAGE_HASHPTE)) {
@@ -703,7 +761,8 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access,
 		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 		slot += (old_pte & _PAGE_F_GIX) >> 12;
 
-		if (ppc_md.hpte_updatepp(slot, rflags, va, 1, local) == -1)
+		if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize,
+					 local) == -1)
 			old_pte &= ~_PAGE_HPTEFLAGS;
 	}
 
@@ -754,9 +813,7 @@ repeat:
 	}
 
 	/*
-	 * No need to use ldarx/stdcx here because all who
-	 * might be updating the pte will hold the
-	 * page_table_lock
+	 * No need to use ldarx/stdcx here
 	 */
 	*ptep = __pte(new_pte & ~_PAGE_BUSY);
 
diff --git a/arch/powerpc/mm/imalloc.c b/arch/powerpc/mm/imalloc.c
index f4ca29cf5364..f9587bcc6a48 100644
--- a/arch/powerpc/mm/imalloc.c
+++ b/arch/powerpc/mm/imalloc.c
@@ -14,9 +14,10 @@
 #include <asm/pgalloc.h>
 #include <asm/pgtable.h>
 #include <asm/semaphore.h>
-#include <asm/imalloc.h>
 #include <asm/cacheflush.h>
 
+#include "mmu_decl.h"
+
 static DECLARE_MUTEX(imlist_sem);
 struct vm_struct * imlist = NULL;
 
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 4612a79dfb6e..7d4b8b5f0606 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -84,9 +84,6 @@ void MMU_init(void);
 /* XXX should be in current.h  -- paulus */
 extern struct task_struct *current_set[NR_CPUS];
 
-char *klimit = _end;
-struct device_node *memory_node;
-
 extern int init_bootmem_done;
 
 /*
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index ce974c83d88a..81cfb0c2ec58 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -20,6 +20,8 @@
  *
  */
 
+#undef DEBUG
+
 #include <linux/config.h>
 #include <linux/signal.h>
 #include <linux/sched.h>
@@ -62,7 +64,14 @@
 #include <asm/iommu.h>
 #include <asm/abs_addr.h>
 #include <asm/vdso.h>
-#include <asm/imalloc.h>
+
+#include "mmu_decl.h"
+
+#ifdef DEBUG
+#define DBG(fmt...) printk(fmt)
+#else
+#define DBG(fmt...)
+#endif
 
 #if PGTABLE_RANGE > USER_VSID_RANGE
 #warning Limited user VSID range means pagetable space is wasted
@@ -72,8 +81,6 @@
 #warning TASK_SIZE is smaller than it needs to be.
 #endif
 
-unsigned long klimit = (unsigned long)_end;
-
 /* max amount of RAM to use */
 unsigned long __max_memory;
 
@@ -188,14 +195,14 @@ static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
 }
 
 #ifdef CONFIG_PPC_64K_PAGES
-static const int pgtable_cache_size[2] = {
-	PTE_TABLE_SIZE, PGD_TABLE_SIZE
+static const unsigned int pgtable_cache_size[3] = {
+	PTE_TABLE_SIZE, PMD_TABLE_SIZE, PGD_TABLE_SIZE
 };
 static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
-	"pte_pmd_cache", "pgd_cache",
+	"pte_pmd_cache", "pmd_cache", "pgd_cache",
 };
 #else
-static const int pgtable_cache_size[2] = {
+static const unsigned int pgtable_cache_size[2] = {
 	PTE_TABLE_SIZE, PMD_TABLE_SIZE
 };
 static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
@@ -213,6 +220,8 @@ void pgtable_cache_init(void)
 		int size = pgtable_cache_size[i];
 		const char *name = pgtable_cache_name[i];
 
+		DBG("Allocating page table cache %s (#%d) "
+		    "for size: %08x...\n", name, i, size);
 		pgtable_cache[i] = kmem_cache_create(name,
 						     size, size,
 						     SLAB_HWCACHE_ALIGN |
diff --git a/arch/powerpc/mm/lmb.c b/arch/powerpc/mm/lmb.c
index 9b5aa6808eb8..9584608fd768 100644
--- a/arch/powerpc/mm/lmb.c
+++ b/arch/powerpc/mm/lmb.c
@@ -22,35 +22,38 @@
 #include "mmu_decl.h"		/* for __max_low_memory */
 #endif
 
-struct lmb lmb;
-
 #undef DEBUG
 
+#ifdef DEBUG
+#include <asm/udbg.h>
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+struct lmb lmb;
+
 void lmb_dump_all(void)
 {
 #ifdef DEBUG
 	unsigned long i;
 
-	udbg_printf("lmb_dump_all:\n");
-	udbg_printf("    memory.cnt		  = 0x%lx\n",
-		    lmb.memory.cnt);
-	udbg_printf("    memory.size		  = 0x%lx\n",
-		    lmb.memory.size);
+	DBG("lmb_dump_all:\n");
+	DBG("    memory.cnt		  = 0x%lx\n", lmb.memory.cnt);
+	DBG("    memory.size		  = 0x%lx\n", lmb.memory.size);
 	for (i=0; i < lmb.memory.cnt ;i++) {
-		udbg_printf("    memory.region[0x%x].base       = 0x%lx\n",
+		DBG("    memory.region[0x%x].base       = 0x%lx\n",
 			    i, lmb.memory.region[i].base);
-		udbg_printf("		      .size     = 0x%lx\n",
+		DBG("		      .size     = 0x%lx\n",
 			    lmb.memory.region[i].size);
 	}
 
-	udbg_printf("\n    reserved.cnt	  = 0x%lx\n",
-		    lmb.reserved.cnt);
-	udbg_printf("    reserved.size	  = 0x%lx\n",
-		    lmb.reserved.size);
+	DBG("\n    reserved.cnt	  = 0x%lx\n", lmb.reserved.cnt);
+	DBG("    reserved.size	  = 0x%lx\n", lmb.reserved.size);
 	for (i=0; i < lmb.reserved.cnt ;i++) {
-		udbg_printf("    reserved.region[0x%x].base       = 0x%lx\n",
+		DBG("    reserved.region[0x%x].base       = 0x%lx\n",
 			    i, lmb.reserved.region[i].base);
-		udbg_printf("		      .size     = 0x%lx\n",
+		DBG("		      .size     = 0x%lx\n",
 			    lmb.reserved.region[i].size);
 	}
 #endif /* DEBUG */
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 6f55efd9be95..ed6ed2e30dac 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -46,9 +46,7 @@
 #include <asm/prom.h>
 #include <asm/lmb.h>
 #include <asm/sections.h>
-#ifdef CONFIG_PPC64
 #include <asm/vdso.h>
-#endif
 
 #include "mmu_decl.h"
 
@@ -110,6 +108,7 @@ EXPORT_SYMBOL(phys_mem_access_prot);
 void online_page(struct page *page)
 {
 	ClearPageReserved(page);
+	set_page_count(page, 0);
 	free_cold_page(page);
 	totalram_pages++;
 	num_physpages++;
@@ -127,6 +126,9 @@ int __devinit add_memory(u64 start, u64 size)
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 
+	start += KERNELBASE;
+	create_section_mapping(start, start + size);
+
 	/* this should work for most non-highmem platforms */
 	zone = pgdata->node_zones;
 
@@ -198,6 +200,8 @@ void show_mem(void)
 		unsigned long flags;
 		pgdat_resize_lock(pgdat, &flags);
 		for (i = 0; i < pgdat->node_spanned_pages; i++) {
+			if (!pfn_valid(pgdat->node_start_pfn + i))
+				continue;
 			page = pgdat_page_nr(pgdat, i);
 			total++;
 			if (PageHighMem(page))
@@ -334,7 +338,7 @@ void __init mem_init(void)
 	struct page *page;
 	unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize;
 
-	num_physpages = max_pfn;	/* RAM is assumed contiguous */
+	num_physpages = lmb.memory.size >> PAGE_SHIFT;
 	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
 
 #ifdef CONFIG_NEED_MULTIPLE_NODES
@@ -346,11 +350,13 @@ void __init mem_init(void)
 		}
 	}
 #else
-	max_mapnr = num_physpages;
+	max_mapnr = max_pfn;
 	totalram_pages += free_all_bootmem();
 #endif
 	for_each_pgdat(pgdat) {
 		for (i = 0; i < pgdat->node_spanned_pages; i++) {
+			if (!pfn_valid(pgdat->node_start_pfn + i))
+				continue;
 			page = pgdat_page_nr(pgdat, i);
 			if (PageReserved(page))
 				reservedpages++;
@@ -393,10 +399,8 @@ void __init mem_init(void)
 
 	mem_init_done = 1;
 
-#ifdef CONFIG_PPC64
 	/* Initialize the vDSO */
 	vdso_init();
-#endif
 }
 
 /*
@@ -491,7 +495,7 @@ EXPORT_SYMBOL(flush_icache_user_range);
  * We use it to preload an HPTE into the hash table corresponding to
  * the updated linux PTE.
  * 
- * This must always be called with the mm->page_table_lock held
+ * This must always be called with the pte lock held.
  */
 void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 		      pte_t pte)
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index a4d7a327c0e5..bea2d21ac6f7 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -33,7 +33,6 @@ extern void invalidate_tlbcam_entry(int index);
 
 extern int __map_without_bats;
 extern unsigned long ioremap_base;
-extern unsigned long ioremap_bot;
 extern unsigned int rtas_data, rtas_size;
 
 extern PTE *Hash, *Hash_end;
@@ -42,6 +41,7 @@ extern unsigned long Hash_size, Hash_mask;
 extern unsigned int num_tlbcam_entries;
 #endif
 
+extern unsigned long ioremap_bot;
 extern unsigned long __max_low_memory;
 extern unsigned long __initial_memory_limit;
 extern unsigned long total_memory;
@@ -84,4 +84,16 @@ static inline void flush_HPTE(unsigned context, unsigned long va,
 	else
 		_tlbie(va);
 }
+#else /* CONFIG_PPC64 */
+/* imalloc region types */
+#define IM_REGION_UNUSED	0x1
+#define IM_REGION_SUBSET	0x2
+#define IM_REGION_EXISTS	0x4
+#define IM_REGION_OVERLAP	0x8
+#define IM_REGION_SUPERSET	0x10
+
+extern struct vm_struct * im_get_free_area(unsigned long size);
+extern struct vm_struct * im_get_area(unsigned long v_addr, unsigned long size,
+				      int region_type);
+extern void im_free(void *addr);
 #endif
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index da09ba03c424..ba7a3055a9fc 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -17,9 +17,8 @@
 #include <linux/nodemask.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
+#include <asm/sparsemem.h>
 #include <asm/lmb.h>
-#include <asm/machdep.h>
-#include <asm/abs_addr.h>
 #include <asm/system.h>
 #include <asm/smp.h>
 
@@ -28,45 +27,113 @@ static int numa_enabled = 1;
 static int numa_debug;
 #define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }
 
-#ifdef DEBUG_NUMA
-#define ARRAY_INITIALISER -1
-#else
-#define ARRAY_INITIALISER 0
-#endif
-
-int numa_cpu_lookup_table[NR_CPUS] = { [ 0 ... (NR_CPUS - 1)] =
-	ARRAY_INITIALISER};
-char *numa_memory_lookup_table;
+int numa_cpu_lookup_table[NR_CPUS];
 cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
-int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES -1)] = 0};
-
 struct pglist_data *node_data[MAX_NUMNODES];
-bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
+
+EXPORT_SYMBOL(numa_cpu_lookup_table);
+EXPORT_SYMBOL(numa_cpumask_lookup_table);
+EXPORT_SYMBOL(node_data);
+
+static bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
 static int min_common_depth;
 
 /*
- * We need somewhere to store start/span for each node until we have
+ * We need somewhere to store start/end/node for each region until we have
  * allocated the real node_data structures.
  */
+#define MAX_REGIONS	(MAX_LMB_REGIONS*2)
 static struct {
-	unsigned long node_start_pfn;
-	unsigned long node_end_pfn;
-	unsigned long node_present_pages;
-} init_node_data[MAX_NUMNODES] __initdata;
+	unsigned long start_pfn;
+	unsigned long end_pfn;
+	int nid;
+} init_node_data[MAX_REGIONS] __initdata;
 
-EXPORT_SYMBOL(node_data);
-EXPORT_SYMBOL(numa_cpu_lookup_table);
-EXPORT_SYMBOL(numa_memory_lookup_table);
-EXPORT_SYMBOL(numa_cpumask_lookup_table);
-EXPORT_SYMBOL(nr_cpus_in_node);
+int __init early_pfn_to_nid(unsigned long pfn)
+{
+	unsigned int i;
+	/* Find the region recorded by add_region() that contains pfn. */
+	for (i = 0; init_node_data[i].end_pfn; i++) {	/* end_pfn == 0 ends table */
+		unsigned long start_pfn = init_node_data[i].start_pfn;
+		unsigned long end_pfn = init_node_data[i].end_pfn;
+
+		if ((start_pfn <= pfn) && (pfn < end_pfn))
+			return init_node_data[i].nid;
+	}
+	/* pfn is not inside any recorded region */
+	return -1;
+}
+
+void __init add_region(unsigned int nid, unsigned long start_pfn,
+		       unsigned long pages)
+{
+	unsigned int i;
+	/* Record [start_pfn, start_pfn + pages) as belonging to node nid. */
+	dbg("add_region nid %d start_pfn 0x%lx pages 0x%lx\n",
+		nid, start_pfn, pages);
+
+	for (i = 0; init_node_data[i].end_pfn; i++) {
+		if (init_node_data[i].nid != nid)	/* only merge within one node */
+			continue;
+		if (init_node_data[i].end_pfn == start_pfn) {	/* extend entry up */
+			init_node_data[i].end_pfn += pages;
+			return;
+		}
+		if (init_node_data[i].start_pfn == (start_pfn + pages)) {
+			init_node_data[i].start_pfn -= pages;	/* extend entry down */
+			return;
+		}
+	}
+
+	/*
+	 * Leave last entry NULL so we don't iterate off the end (we use
+	 * entry.end_pfn to terminate the walk).
+	 */
+	if (i >= (MAX_REGIONS - 1)) {
+		printk(KERN_ERR "WARNING: too many memory regions in "
+				"numa code, truncating\n");
+		return;
+	}
+
+	init_node_data[i].start_pfn = start_pfn;	/* no neighbour: new entry */
+	init_node_data[i].end_pfn = start_pfn + pages;
+	init_node_data[i].nid = nid;
+}
+
+/* Report nid's pfn span and page total; assumes regions do not overlap */
+void __init get_region(unsigned int nid, unsigned long *start_pfn,
+		       unsigned long *end_pfn, unsigned long *pages_present)
+{
+	unsigned int i;
+	/* -1UL means "no region seen yet" for the minimum search below */
+	*start_pfn = -1UL;
+	*end_pfn = *pages_present = 0;
+
+	for (i = 0; init_node_data[i].end_pfn; i++) {
+		if (init_node_data[i].nid != nid)
+			continue;
+
+		*pages_present += init_node_data[i].end_pfn -
+			init_node_data[i].start_pfn;
+
+		if (init_node_data[i].start_pfn < *start_pfn)	/* lowest start */
+			*start_pfn = init_node_data[i].start_pfn;
+
+		if (init_node_data[i].end_pfn > *end_pfn)	/* highest end */
+			*end_pfn = init_node_data[i].end_pfn;
+	}
+
+	/* We didn't find a matching region, return start/end as 0 */
+	if (*start_pfn == -1UL)
+		*start_pfn = 0;
+}
 
 static inline void map_cpu_to_node(int cpu, int node)
 {
 	numa_cpu_lookup_table[cpu] = node;
-	if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) {
+	/* mirror the mapping in the per-node cpumask if not already set */
+	if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node])))
 		cpu_set(cpu, numa_cpumask_lookup_table[node]);
-		nr_cpus_in_node[node]++;
-	}
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -78,7 +145,6 @@ static void unmap_cpu_from_node(unsigned long cpu)
 
 	if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
 		cpu_clear(cpu, numa_cpumask_lookup_table[node]);
-		nr_cpus_in_node[node]--;
 	} else {
 		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
 		       cpu, node);
@@ -86,7 +152,7 @@ static void unmap_cpu_from_node(unsigned long cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static struct device_node * __devinit find_cpu_node(unsigned int cpu)
+static struct device_node *find_cpu_node(unsigned int cpu)
 {
 	unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
 	struct device_node *cpu_node = NULL;
@@ -213,7 +279,7 @@ static int __init get_mem_size_cells(void)
 	return rc;
 }
 
-static unsigned long read_n_cells(int n, unsigned int **buf)
+static unsigned long __init read_n_cells(int n, unsigned int **buf)
 {
 	unsigned long result = 0;
 
@@ -295,7 +361,8 @@ static int cpu_numa_callback(struct notifier_block *nfb,
  * or zero. If the returned value of size is 0 the region should be
  * discarded as it lies wholy above the memory limit.
  */
-static unsigned long __init numa_enforce_memory_limit(unsigned long start, unsigned long size)
+static unsigned long __init numa_enforce_memory_limit(unsigned long start,
+						      unsigned long size)
 {
 	/*
 	 * We use lmb_end_of_DRAM() in here instead of memory_limit because
@@ -320,8 +387,7 @@ static int __init parse_numa_properties(void)
 	struct device_node *cpu = NULL;
 	struct device_node *memory = NULL;
 	int addr_cells, size_cells;
-	int max_domain = 0;
-	long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
+	int max_domain;
 	unsigned long i;
 
 	if (numa_enabled == 0) {
@@ -329,13 +395,6 @@ static int __init parse_numa_properties(void)
 		return -1;
 	}
 
-	numa_memory_lookup_table =
-		(char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
-	memset(numa_memory_lookup_table, 0, entries * sizeof(char));
-
-	for (i = 0; i < entries ; i++)
-		numa_memory_lookup_table[i] = ARRAY_INITIALISER;
-
 	min_common_depth = find_min_common_depth();
 
 	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
@@ -387,9 +446,6 @@ new_range:
 		start = read_n_cells(addr_cells, &memcell_buf);
 		size = read_n_cells(size_cells, &memcell_buf);
 
-		start = _ALIGN_DOWN(start, MEMORY_INCREMENT);
-		size = _ALIGN_UP(size, MEMORY_INCREMENT);
-
 		numa_domain = of_node_numa_domain(memory);
 
 		if (numa_domain >= MAX_NUMNODES) {
@@ -403,44 +459,15 @@ new_range:
 		if (max_domain < numa_domain)
 			max_domain = numa_domain;
 
-		if (! (size = numa_enforce_memory_limit(start, size))) {
+		if (!(size = numa_enforce_memory_limit(start, size))) {
 			if (--ranges)
 				goto new_range;
 			else
 				continue;
 		}
 
-		/*
-		 * Initialize new node struct, or add to an existing one.
-		 */
-		if (init_node_data[numa_domain].node_end_pfn) {
-			if ((start / PAGE_SIZE) <
-			    init_node_data[numa_domain].node_start_pfn)
-				init_node_data[numa_domain].node_start_pfn =
-					start / PAGE_SIZE;
-			if (((start / PAGE_SIZE) + (size / PAGE_SIZE)) >
-			    init_node_data[numa_domain].node_end_pfn)
-				init_node_data[numa_domain].node_end_pfn =
-					(start / PAGE_SIZE) +
-					(size / PAGE_SIZE);
-
-			init_node_data[numa_domain].node_present_pages +=
-				size / PAGE_SIZE;
-		} else {
-			node_set_online(numa_domain);
-
-			init_node_data[numa_domain].node_start_pfn =
-				start / PAGE_SIZE;
-			init_node_data[numa_domain].node_end_pfn =
-				init_node_data[numa_domain].node_start_pfn +
-				size / PAGE_SIZE;
-			init_node_data[numa_domain].node_present_pages =
-				size / PAGE_SIZE;
-		}
-
-		for (i = start ; i < (start+size); i += MEMORY_INCREMENT)
-			numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
-				numa_domain;
+		add_region(numa_domain, start >> PAGE_SHIFT,
+			   size >> PAGE_SHIFT);
 
 		if (--ranges)
 			goto new_range;
@@ -456,32 +483,18 @@ static void __init setup_nonnuma(void)
 {
 	unsigned long top_of_ram = lmb_end_of_DRAM();
 	unsigned long total_ram = lmb_phys_mem_size();
-	unsigned long i;
+	unsigned int i;
 
 	printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
 	       top_of_ram, total_ram);
 	printk(KERN_INFO "Memory hole size: %ldMB\n",
 	       (top_of_ram - total_ram) >> 20);
 
-	if (!numa_memory_lookup_table) {
-		long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT;
-		numa_memory_lookup_table =
-			(char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
-		memset(numa_memory_lookup_table, 0, entries * sizeof(char));
-		for (i = 0; i < entries ; i++)
-			numa_memory_lookup_table[i] = ARRAY_INITIALISER;
-	}
-
 	map_cpu_to_node(boot_cpuid, 0);
-
+	for (i = 0; i < lmb.memory.cnt; ++i)
+		add_region(0, lmb.memory.region[i].base >> PAGE_SHIFT,
+			   lmb_size_pages(&lmb.memory, i));
 	node_set_online(0);
-
-	init_node_data[0].node_start_pfn = 0;
-	init_node_data[0].node_end_pfn = lmb_end_of_DRAM() / PAGE_SIZE;
-	init_node_data[0].node_present_pages = total_ram / PAGE_SIZE;
-
-	for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT)
-		numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0;
 }
 
 static void __init dump_numa_topology(void)
@@ -499,8 +512,9 @@ static void __init dump_numa_topology(void)
 
 		count = 0;
 
-		for (i = 0; i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT) {
-			if (numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] == node) {
+		for (i = 0; i < lmb_end_of_DRAM();
+		     i += (1 << SECTION_SIZE_BITS)) {
+			if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
 				if (count == 0)
 					printk(" 0x%lx", i);
 				++count;
@@ -525,10 +539,12 @@ static void __init dump_numa_topology(void)
  *
  * Returns the physical address of the memory.
  */
-static unsigned long careful_allocation(int nid, unsigned long size,
-					unsigned long align, unsigned long end)
+static void __init *careful_allocation(int nid, unsigned long size,
+				       unsigned long align,
+				       unsigned long end_pfn)
 {
-	unsigned long ret = lmb_alloc_base(size, align, end);
+	int new_nid;
+	unsigned long ret = lmb_alloc_base(size, align, end_pfn << PAGE_SHIFT);
 
 	/* retry over all memory */
 	if (!ret)
@@ -542,28 +558,27 @@ static unsigned long careful_allocation(int nid, unsigned long size,
 	 * If the memory came from a previously allocated node, we must
 	 * retry with the bootmem allocator.
 	 */
-	if (pa_to_nid(ret) < nid) {
-		nid = pa_to_nid(ret);
-		ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(nid),
+	new_nid = early_pfn_to_nid(ret >> PAGE_SHIFT);
+	if (new_nid < nid) {
+		ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(new_nid),
 				size, align, 0);
 
 		if (!ret)
 			panic("numa.c: cannot allocate %lu bytes on node %d",
-			      size, nid);
+			      size, new_nid);
 
-		ret = virt_to_abs(ret);
+		ret = __pa(ret);
 
 		dbg("alloc_bootmem %lx %lx\n", ret, size);
 	}
 
-	return ret;
+	return (void *)ret;
 }
 
 void __init do_init_bootmem(void)
 {
 	int nid;
-	int addr_cells, size_cells;
-	struct device_node *memory = NULL;
+	unsigned int i;
 	static struct notifier_block ppc64_numa_nb = {
 		.notifier_call = cpu_numa_callback,
 		.priority = 1 /* Must run before sched domains notifier. */
@@ -581,99 +596,66 @@ void __init do_init_bootmem(void)
 	register_cpu_notifier(&ppc64_numa_nb);
 
 	for_each_online_node(nid) {
-		unsigned long start_paddr, end_paddr;
-		int i;
+		unsigned long start_pfn, end_pfn, pages_present;
 		unsigned long bootmem_paddr;
 		unsigned long bootmap_pages;
 
-		start_paddr = init_node_data[nid].node_start_pfn * PAGE_SIZE;
-		end_paddr = init_node_data[nid].node_end_pfn * PAGE_SIZE;
+		get_region(nid, &start_pfn, &end_pfn, &pages_present);
 
 		/* Allocate the node structure node local if possible */
-		NODE_DATA(nid) = (struct pglist_data *)careful_allocation(nid,
+		NODE_DATA(nid) = careful_allocation(nid,
 					sizeof(struct pglist_data),
-					SMP_CACHE_BYTES, end_paddr);
-		NODE_DATA(nid) = abs_to_virt(NODE_DATA(nid));
+					SMP_CACHE_BYTES, end_pfn);
+		NODE_DATA(nid) = __va(NODE_DATA(nid));
 		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
 
   		dbg("node %d\n", nid);
 		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
 
 		NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
-		NODE_DATA(nid)->node_start_pfn =
-			init_node_data[nid].node_start_pfn;
-		NODE_DATA(nid)->node_spanned_pages =
-			end_paddr - start_paddr;
+		NODE_DATA(nid)->node_start_pfn = start_pfn;
+		NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
 
 		if (NODE_DATA(nid)->node_spanned_pages == 0)
   			continue;
 
-  		dbg("start_paddr = %lx\n", start_paddr);
-  		dbg("end_paddr = %lx\n", end_paddr);
+  		dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT);
+  		dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);
 
-		bootmap_pages = bootmem_bootmap_pages((end_paddr - start_paddr) >> PAGE_SHIFT);
+		bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
+		bootmem_paddr = (unsigned long)careful_allocation(nid,
+					bootmap_pages << PAGE_SHIFT,
+					PAGE_SIZE, end_pfn);
+		memset(__va(bootmem_paddr), 0, bootmap_pages << PAGE_SHIFT);
 
-		bootmem_paddr = careful_allocation(nid,
-				bootmap_pages << PAGE_SHIFT,
-				PAGE_SIZE, end_paddr);
-		memset(abs_to_virt(bootmem_paddr), 0,
-		       bootmap_pages << PAGE_SHIFT);
 		dbg("bootmap_paddr = %lx\n", bootmem_paddr);
 
 		init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
-				  start_paddr >> PAGE_SHIFT,
-				  end_paddr >> PAGE_SHIFT);
+				  start_pfn, end_pfn);
 
-		/*
-		 * We need to do another scan of all memory sections to
-		 * associate memory with the correct node.
-		 */
-		addr_cells = get_mem_addr_cells();
-		size_cells = get_mem_size_cells();
-		memory = NULL;
-		while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
-			unsigned long mem_start, mem_size;
-			int numa_domain, ranges;
-			unsigned int *memcell_buf;
-			unsigned int len;
-
-			memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
-			if (!memcell_buf || len <= 0)
-				continue;
+		/* Free this node's init_node_data[] regions into bootmem */
+		for (i = 0; init_node_data[i].end_pfn; i++) {
+			unsigned long start, end;
 
-			ranges = memory->n_addrs;	/* ranges in cell */
-new_range:
-			mem_start = read_n_cells(addr_cells, &memcell_buf);
-			mem_size = read_n_cells(size_cells, &memcell_buf);
-			if (numa_enabled) {
-				numa_domain = of_node_numa_domain(memory);
-				if (numa_domain  >= MAX_NUMNODES)
-					numa_domain = 0;
-			} else
-				numa_domain =  0;
-
-			if (numa_domain != nid)
+			if (init_node_data[i].nid != nid)
 				continue;
 
-			mem_size = numa_enforce_memory_limit(mem_start, mem_size);
-  			if (mem_size) {
-  				dbg("free_bootmem %lx %lx\n", mem_start, mem_size);
-  				free_bootmem_node(NODE_DATA(nid), mem_start, mem_size);
-			}
+			start = init_node_data[i].start_pfn << PAGE_SHIFT;
+			end = init_node_data[i].end_pfn << PAGE_SHIFT;
 
-			if (--ranges)		/* process all ranges in cell */
-				goto new_range;
+			dbg("free_bootmem %lx %lx\n", start, end - start);
+  			free_bootmem_node(NODE_DATA(nid), start, end - start);
 		}
 
-		/*
-		 * Mark reserved regions on this node
-		 */
+		/* Mark reserved regions on this node */
 		for (i = 0; i < lmb.reserved.cnt; i++) {
 			unsigned long physbase = lmb.reserved.region[i].base;
 			unsigned long size = lmb.reserved.region[i].size;
+			unsigned long start_paddr = start_pfn << PAGE_SHIFT;
+			unsigned long end_paddr = end_pfn << PAGE_SHIFT;
 
-			if (pa_to_nid(physbase) != nid &&
-			    pa_to_nid(physbase+size-1) != nid)
+			if (early_pfn_to_nid(physbase >> PAGE_SHIFT) != nid &&
+			    early_pfn_to_nid((physbase+size-1) >> PAGE_SHIFT) != nid)
 				continue;
 
 			if (physbase < end_paddr &&
@@ -693,46 +675,19 @@ new_range:
 						     size);
 			}
 		}
-		/*
-		 * This loop may look famaliar, but we have to do it again
-		 * after marking our reserved memory to mark memory present
-		 * for sparsemem.
-		 */
-		addr_cells = get_mem_addr_cells();
-		size_cells = get_mem_size_cells();
-		memory = NULL;
-		while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
-			unsigned long mem_start, mem_size;
-			int numa_domain, ranges;
-			unsigned int *memcell_buf;
-			unsigned int len;
-
-			memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
-			if (!memcell_buf || len <= 0)
-				continue;
 
-			ranges = memory->n_addrs;	/* ranges in cell */
-new_range2:
-			mem_start = read_n_cells(addr_cells, &memcell_buf);
-			mem_size = read_n_cells(size_cells, &memcell_buf);
-			if (numa_enabled) {
-				numa_domain = of_node_numa_domain(memory);
-				if (numa_domain  >= MAX_NUMNODES)
-					numa_domain = 0;
-			} else
-				numa_domain =  0;
-
-			if (numa_domain != nid)
+		/* Mark this node's regions present for sparsemem */
+		for (i = 0; init_node_data[i].end_pfn; i++) {
+			unsigned long start, end;
+
+			if (init_node_data[i].nid != nid)
 				continue;
 
-			mem_size = numa_enforce_memory_limit(mem_start, mem_size);
-			memory_present(numa_domain, mem_start >> PAGE_SHIFT,
-				       (mem_start + mem_size) >> PAGE_SHIFT);
+			start = init_node_data[i].start_pfn;
+			end = init_node_data[i].end_pfn;
 
-			if (--ranges)		/* process all ranges in cell */
-				goto new_range2;
+			memory_present(nid, start, end);
 		}
-
 	}
 }
 
@@ -746,21 +701,18 @@ void __init paging_init(void)
 	memset(zholes_size, 0, sizeof(zholes_size));
 
 	for_each_online_node(nid) {
-		unsigned long start_pfn;
-		unsigned long end_pfn;
+		unsigned long start_pfn, end_pfn, pages_present;
 
-		start_pfn = init_node_data[nid].node_start_pfn;
-		end_pfn = init_node_data[nid].node_end_pfn;
+		get_region(nid, &start_pfn, &end_pfn, &pages_present);
 
 		zones_size[ZONE_DMA] = end_pfn - start_pfn;
-		zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] -
-			init_node_data[nid].node_present_pages;
+		zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] - pages_present;
 
 		dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid,
 		    zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]);
 
-		free_area_init_node(nid, NODE_DATA(nid), zones_size,
-							start_pfn, zholes_size);
+		free_area_init_node(nid, NODE_DATA(nid), zones_size, start_pfn,
+				    zholes_size);
 	}
 }
 
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 900842451bd3..2ffca63602c5 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -64,7 +64,8 @@
 #include <asm/iommu.h>
 #include <asm/abs_addr.h>
 #include <asm/vdso.h>
-#include <asm/imalloc.h>
+
+#include "mmu_decl.h"
 
 unsigned long ioremap_bot = IMALLOC_BASE;
 static unsigned long phbs_io_bot = PHBS_IO_BASE;
@@ -122,8 +123,11 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
 		 *
 		 */
 		if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
-				      mmu_virtual_psize))
-			panic("Can't map bolted IO mapping");
+				      mmu_virtual_psize)) {
+			printk(KERN_ERR "Failed to do bolted mapping IO "
+			       "memory at %016lx !\n", pa);
+			return -ENOMEM;
+		}
 	}
 	return 0;
 }
diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c
index fa325dbf98fc..51e7951414e5 100644
--- a/arch/powerpc/mm/stab.c
+++ b/arch/powerpc/mm/stab.c
@@ -20,6 +20,7 @@
 #include <asm/cputable.h>
 #include <asm/lmb.h>
 #include <asm/abs_addr.h>
+#include <asm/firmware.h>
 
 struct stab_entry {
 	unsigned long esid_data;
@@ -256,7 +257,7 @@ void stabs_alloc(void)
 
 		paca[cpu].stab_addr = newstab;
 		paca[cpu].stab_real = virt_to_abs(newstab);
-		printk(KERN_DEBUG "Segment table for CPU %d at 0x%lx "
+		printk(KERN_INFO "Segment table for CPU %d at 0x%lx "
 		       "virtual, 0x%lx absolute\n",
 		       cpu, paca[cpu].stab_addr, paca[cpu].stab_real);
 	}
@@ -270,10 +271,23 @@ void stabs_alloc(void)
 void stab_initialize(unsigned long stab)
 {
 	unsigned long vsid = get_kernel_vsid(KERNELBASE);
+	unsigned long stabreal;
 
 	asm volatile("isync; slbia; isync":::"memory");
 	make_ste(stab, GET_ESID(KERNELBASE), vsid);
 
 	/* Order update */
 	asm volatile("sync":::"memory");
+
+	/* Set ASR to the paca's stab_real with the low bit set */
+	stabreal = get_paca()->stab_real | 0x1ul;
+
+#ifdef CONFIG_PPC_ISERIES
+	if (firmware_has_feature(FW_FEATURE_ISERIES)) {
+		HvCall1(HvCallBaseSetASR, stabreal);
+		return;
+	}
+#endif /* CONFIG_PPC_ISERIES */
+
+	mtspr(SPRN_ASR, stabreal);
 }
diff --git a/arch/powerpc/mm/tlb_32.c b/arch/powerpc/mm/tlb_32.c
index 6c3dc3c44c86..ad580f3742e5 100644
--- a/arch/powerpc/mm/tlb_32.c
+++ b/arch/powerpc/mm/tlb_32.c
@@ -149,6 +149,12 @@ void flush_tlb_mm(struct mm_struct *mm)
 		return;
 	}
 
+	/*
+	 * It is safe to go down the mm's list of vmas when called
+	 * from dup_mmap, holding mmap_sem.  It would also be safe from
+	 * unmap_region or exit_mmap, but not from vmtruncate on SMP -
+	 * but it seems dup_mmap is the only SMP case which gets here.
+	 */
 	for (mp = mm->mmap; mp != NULL; mp = mp->vm_next)
 		flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
 	FINISH_FLUSH;
diff --git a/arch/powerpc/mm/tlb_64.c b/arch/powerpc/mm/tlb_64.c
index 53e31b834ace..859d29a0cac5 100644
--- a/arch/powerpc/mm/tlb_64.c
+++ b/arch/powerpc/mm/tlb_64.c
@@ -95,7 +95,7 @@ static void pte_free_submit(struct pte_freelist_batch *batch)
 
 void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
 {
-	/* This is safe as we are holding page_table_lock */
+	/* This is safe since tlb_gather_mmu has disabled preemption */
         cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id());
 	struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
 
@@ -206,7 +206,7 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
 
 void pte_free_finish(void)
 {
-	/* This is safe as we are holding page_table_lock */
+	/* This is safe since tlb_gather_mmu has disabled preemption */
 	struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
 
 	if (*batchp == NULL)