summaryrefslogtreecommitdiff
path: root/mm/huge_memory.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--mm/huge_memory.c566
1 files changed, 352 insertions, 214 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a92012a71702..33a5dc492810 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -27,11 +27,12 @@
#include "internal.h"
/*
- * By default transparent hugepage support is enabled for all mappings
- * and khugepaged scans all mappings. Defrag is only invoked by
- * khugepaged hugepage allocations and by page faults inside
- * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
- * allocations.
+ * By default transparent hugepage support is disabled in order to avoid
+ * the risk of increasing the memory footprint of applications without a
+ * guaranteed benefit. When transparent hugepage support is enabled, it is
+ * enabled for all mappings, and khugepaged scans all mappings.
+ * Defrag is invoked by khugepaged hugepage allocations and by page faults
+ * for all hugepage allocations.
*/
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
@@ -211,24 +212,29 @@ static void put_huge_zero_page(void)
BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}
-static int shrink_huge_zero_page(struct shrinker *shrink,
- struct shrink_control *sc)
+static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
+ struct shrink_control *sc)
{
- if (!sc->nr_to_scan)
- /* we can free zero page only if last reference remains */
- return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+ /* we can free zero page only if last reference remains */
+ return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+}
+static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
struct page *zero_page = xchg(&huge_zero_page, NULL);
BUG_ON(zero_page == NULL);
__free_page(zero_page);
+ return HPAGE_PMD_NR;
}
return 0;
}
static struct shrinker huge_zero_page_shrinker = {
- .shrink = shrink_huge_zero_page,
+ .count_objects = shrink_huge_zero_page_count,
+ .scan_objects = shrink_huge_zero_page_scan,
.seeks = DEFAULT_SEEKS,
};
@@ -417,7 +423,7 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
unsigned long msecs;
int err;
- err = strict_strtoul(buf, 10, &msecs);
+ err = kstrtoul(buf, 10, &msecs);
if (err || msecs > UINT_MAX)
return -EINVAL;
@@ -444,7 +450,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
unsigned long msecs;
int err;
- err = strict_strtoul(buf, 10, &msecs);
+ err = kstrtoul(buf, 10, &msecs);
if (err || msecs > UINT_MAX)
return -EINVAL;
@@ -470,7 +476,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj,
int err;
unsigned long pages;
- err = strict_strtoul(buf, 10, &pages);
+ err = kstrtoul(buf, 10, &pages);
if (err || !pages || pages > UINT_MAX)
return -EINVAL;
@@ -538,7 +544,7 @@ static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
int err;
unsigned long max_ptes_none;
- err = strict_strtoul(buf, 10, &max_ptes_none);
+ err = kstrtoul(buf, 10, &max_ptes_none);
if (err || max_ptes_none > HPAGE_PMD_NR-1)
return -EINVAL;
@@ -690,11 +696,10 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
return pmd;
}
-static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma)
+static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
{
pmd_t entry;
- entry = mk_pmd(page, vma->vm_page_prot);
- entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = mk_pmd(page, prot);
entry = pmd_mkhuge(entry);
return entry;
}
@@ -705,6 +710,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct page *page)
{
pgtable_t pgtable;
+ spinlock_t *ptl;
VM_BUG_ON(!PageCompound(page));
pgtable = pte_alloc_one(mm, haddr);
@@ -719,21 +725,22 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
*/
__SetPageUptodate(page);
- spin_lock(&mm->page_table_lock);
+ ptl = pmd_lock(mm, pmd);
if (unlikely(!pmd_none(*pmd))) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mem_cgroup_uncharge_page(page);
put_page(page);
pte_free(mm, pgtable);
} else {
pmd_t entry;
- entry = mk_huge_pmd(page, vma);
+ entry = mk_huge_pmd(page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
page_add_new_anon_rmap(page, vma, haddr);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
- mm->nr_ptes++;
- spin_unlock(&mm->page_table_lock);
+ atomic_long_inc(&mm->nr_ptes);
+ spin_unlock(ptl);
}
return 0;
@@ -753,14 +760,7 @@ static inline struct page *alloc_hugepage_vma(int defrag,
HPAGE_PMD_ORDER, vma, haddr, nd);
}
-#ifndef CONFIG_NUMA
-static inline struct page *alloc_hugepage(int defrag)
-{
- return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
- HPAGE_PMD_ORDER);
-}
-#endif
-
+/* Caller must hold page table lock. */
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
struct page *zero_page)
@@ -773,7 +773,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
entry = pmd_mkhuge(entry);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
- mm->nr_ptes++;
+ atomic_long_inc(&mm->nr_ptes);
return true;
}
@@ -783,83 +783,65 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct page *page;
unsigned long haddr = address & HPAGE_PMD_MASK;
- pte_t *pte;
- if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
- if (unlikely(anon_vma_prepare(vma)))
- return VM_FAULT_OOM;
- if (unlikely(khugepaged_enter(vma)))
+ if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+ return VM_FAULT_FALLBACK;
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
+ if (unlikely(khugepaged_enter(vma)))
+ return VM_FAULT_OOM;
+ if (!(flags & FAULT_FLAG_WRITE) &&
+ transparent_hugepage_use_zero_page()) {
+ spinlock_t *ptl;
+ pgtable_t pgtable;
+ struct page *zero_page;
+ bool set;
+ pgtable = pte_alloc_one(mm, haddr);
+ if (unlikely(!pgtable))
return VM_FAULT_OOM;
- if (!(flags & FAULT_FLAG_WRITE) &&
- transparent_hugepage_use_zero_page()) {
- pgtable_t pgtable;
- struct page *zero_page;
- bool set;
- pgtable = pte_alloc_one(mm, haddr);
- if (unlikely(!pgtable))
- return VM_FAULT_OOM;
- zero_page = get_huge_zero_page();
- if (unlikely(!zero_page)) {
- pte_free(mm, pgtable);
- count_vm_event(THP_FAULT_FALLBACK);
- goto out;
- }
- spin_lock(&mm->page_table_lock);
- set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
- zero_page);
- spin_unlock(&mm->page_table_lock);
- if (!set) {
- pte_free(mm, pgtable);
- put_huge_zero_page();
- }
- return 0;
- }
- page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
- vma, haddr, numa_node_id(), 0);
- if (unlikely(!page)) {
+ zero_page = get_huge_zero_page();
+ if (unlikely(!zero_page)) {
+ pte_free(mm, pgtable);
count_vm_event(THP_FAULT_FALLBACK);
- goto out;
- }
- count_vm_event(THP_FAULT_ALLOC);
- if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
- put_page(page);
- goto out;
+ return VM_FAULT_FALLBACK;
}
- if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd,
- page))) {
- mem_cgroup_uncharge_page(page);
- put_page(page);
- goto out;
+ ptl = pmd_lock(mm, pmd);
+ set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+ zero_page);
+ spin_unlock(ptl);
+ if (!set) {
+ pte_free(mm, pgtable);
+ put_huge_zero_page();
}
-
return 0;
}
-out:
- /*
- * Use __pte_alloc instead of pte_alloc_map, because we can't
- * run pte_offset_map on the pmd, if an huge pmd could
- * materialize from under us from a different thread.
- */
- if (unlikely(pmd_none(*pmd)) &&
- unlikely(__pte_alloc(mm, vma, pmd, address)))
- return VM_FAULT_OOM;
- /* if an huge pmd materialized from under us just retry later */
- if (unlikely(pmd_trans_huge(*pmd)))
- return 0;
- /*
- * A regular pmd is established and it can't morph into a huge pmd
- * from under us anymore at this point because we hold the mmap_sem
- * read mode and khugepaged takes it in write mode. So now it's
- * safe to run pte_offset_map().
- */
- pte = pte_offset_map(pmd, address);
- return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+ page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+ vma, haddr, numa_node_id(), 0);
+ if (unlikely(!page)) {
+ count_vm_event(THP_FAULT_FALLBACK);
+ return VM_FAULT_FALLBACK;
+ }
+ if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
+ put_page(page);
+ count_vm_event(THP_FAULT_FALLBACK);
+ return VM_FAULT_FALLBACK;
+ }
+ if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) {
+ mem_cgroup_uncharge_page(page);
+ put_page(page);
+ count_vm_event(THP_FAULT_FALLBACK);
+ return VM_FAULT_FALLBACK;
+ }
+
+ count_vm_event(THP_FAULT_ALLOC);
+ return 0;
}
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *vma)
{
+ spinlock_t *dst_ptl, *src_ptl;
struct page *src_page;
pmd_t pmd;
pgtable_t pgtable;
@@ -870,8 +852,9 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (unlikely(!pgtable))
goto out;
- spin_lock(&dst_mm->page_table_lock);
- spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
+ dst_ptl = pmd_lock(dst_mm, dst_pmd);
+ src_ptl = pmd_lockptr(src_mm, src_pmd);
+ spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
ret = -EAGAIN;
pmd = *src_pmd;
@@ -880,7 +863,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
goto out_unlock;
}
/*
- * mm->page_table_lock is enough to be sure that huge zero pmd is not
+ * When page table lock is held, the huge zero pmd should not be
* under splitting since we don't split the page itself, only pmd to
* a page table.
*/
@@ -901,8 +884,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
}
if (unlikely(pmd_trans_splitting(pmd))) {
/* split huge page running from under us */
- spin_unlock(&src_mm->page_table_lock);
- spin_unlock(&dst_mm->page_table_lock);
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
pte_free(dst_mm, pgtable);
wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
@@ -918,12 +901,12 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd = pmd_mkold(pmd_wrprotect(pmd));
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
- dst_mm->nr_ptes++;
+ atomic_long_inc(&dst_mm->nr_ptes);
ret = 0;
out_unlock:
- spin_unlock(&src_mm->page_table_lock);
- spin_unlock(&dst_mm->page_table_lock);
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
out:
return ret;
}
@@ -934,10 +917,11 @@ void huge_pmd_set_accessed(struct mm_struct *mm,
pmd_t *pmd, pmd_t orig_pmd,
int dirty)
{
+ spinlock_t *ptl;
pmd_t entry;
unsigned long haddr;
- spin_lock(&mm->page_table_lock);
+ ptl = pmd_lock(mm, pmd);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto unlock;
@@ -947,13 +931,14 @@ void huge_pmd_set_accessed(struct mm_struct *mm,
update_mmu_cache_pmd(vma, address, pmd);
unlock:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
}
static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
{
+ spinlock_t *ptl;
pgtable_t pgtable;
pmd_t _pmd;
struct page *page;
@@ -980,7 +965,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(&mm->page_table_lock);
+ ptl = pmd_lock(mm, pmd);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_free_page;
@@ -1007,7 +992,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
}
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
put_huge_zero_page();
inc_mm_counter(mm, MM_ANONPAGES);
@@ -1017,7 +1002,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
out:
return ret;
out_free_page:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
mem_cgroup_uncharge_page(page);
put_page(page);
@@ -1031,6 +1016,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
struct page *page,
unsigned long haddr)
{
+ spinlock_t *ptl;
pgtable_t pgtable;
pmd_t _pmd;
int ret = 0, i;
@@ -1077,7 +1063,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(&mm->page_table_lock);
+ ptl = pmd_lock(mm, pmd);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_free_pages;
VM_BUG_ON(!PageHead(page));
@@ -1103,7 +1089,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
page_remove_rmap(page);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -1114,7 +1100,7 @@ out:
return ret;
out_free_pages:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
mem_cgroup_uncharge_start();
for (i = 0; i < HPAGE_PMD_NR; i++) {
@@ -1129,17 +1115,19 @@ out_free_pages:
int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
{
+ spinlock_t *ptl;
int ret = 0;
struct page *page = NULL, *new_page;
unsigned long haddr;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ ptl = pmd_lockptr(mm, pmd);
VM_BUG_ON(!vma->anon_vma);
haddr = address & HPAGE_PMD_MASK;
if (is_huge_zero_pmd(orig_pmd))
goto alloc;
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_unlock;
@@ -1155,7 +1143,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_unlock;
}
get_page(page);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow())
@@ -1165,7 +1153,6 @@ alloc:
new_page = NULL;
if (unlikely(!new_page)) {
- count_vm_event(THP_FAULT_FALLBACK);
if (is_huge_zero_pmd(orig_pmd)) {
ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
address, pmd, orig_pmd, haddr);
@@ -1176,9 +1163,9 @@ alloc:
split_huge_page(page);
put_page(page);
}
+ count_vm_event(THP_FAULT_FALLBACK);
goto out;
}
- count_vm_event(THP_FAULT_ALLOC);
if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
put_page(new_page);
@@ -1186,10 +1173,13 @@ alloc:
split_huge_page(page);
put_page(page);
}
+ count_vm_event(THP_FAULT_FALLBACK);
ret |= VM_FAULT_OOM;
goto out;
}
+ count_vm_event(THP_FAULT_ALLOC);
+
if (is_huge_zero_pmd(orig_pmd))
clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
else
@@ -1200,17 +1190,18 @@ alloc:
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
if (page)
put_page(page);
if (unlikely(!pmd_same(*pmd, orig_pmd))) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mem_cgroup_uncharge_page(new_page);
put_page(new_page);
goto out_mn;
} else {
pmd_t entry;
- entry = mk_huge_pmd(new_page, vma);
+ entry = mk_huge_pmd(new_page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
pmdp_clear_flush(vma, haddr, pmd);
page_add_new_anon_rmap(new_page, vma, haddr);
set_pmd_at(mm, haddr, pmd, entry);
@@ -1225,13 +1216,13 @@ alloc:
}
ret |= VM_FAULT_WRITE;
}
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
out_mn:
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
return ret;
out_unlock:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
return ret;
}
@@ -1243,7 +1234,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
struct page *page = NULL;
- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(pmd_lockptr(mm, pmd));
if (flags & FOLL_WRITE && !pmd_write(*pmd))
goto out;
@@ -1290,73 +1281,116 @@ out:
int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, pmd_t *pmdp)
{
+ spinlock_t *ptl;
+ struct anon_vma *anon_vma = NULL;
struct page *page;
unsigned long haddr = addr & HPAGE_PMD_MASK;
- int target_nid;
- int current_nid = -1;
- bool migrated;
+ int page_nid = -1, this_nid = numa_node_id();
+ int target_nid, last_cpupid = -1;
+ bool page_locked;
+ bool migrated = false;
+ int flags = 0;
- spin_lock(&mm->page_table_lock);
+ ptl = pmd_lock(mm, pmdp);
if (unlikely(!pmd_same(pmd, *pmdp)))
goto out_unlock;
page = pmd_page(pmd);
- get_page(page);
- current_nid = page_to_nid(page);
+ BUG_ON(is_huge_zero_page(page));
+ page_nid = page_to_nid(page);
+ last_cpupid = page_cpupid_last(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
- if (current_nid == numa_node_id())
+ if (page_nid == this_nid) {
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+ flags |= TNF_FAULT_LOCAL;
+ }
+ /*
+ * Avoid grouping on DSO/COW pages in specific and RO pages
+ * in general, RO pages shouldn't hurt as much anyway since
+ * they can be in shared cache state.
+ */
+ if (!pmd_write(pmd))
+ flags |= TNF_NO_GROUP;
+
+ /*
+ * Acquire the page lock to serialise THP migrations but avoid dropping
+ * page_table_lock if at all possible
+ */
+ page_locked = trylock_page(page);
target_nid = mpol_misplaced(page, vma, haddr);
if (target_nid == -1) {
- put_page(page);
- goto clear_pmdnuma;
+ /* If the page was locked, there are no parallel migrations */
+ if (page_locked)
+ goto clear_pmdnuma;
+
+ /*
+ * Otherwise wait for potential migrations and retry. We do
+ * relock and check_same as the page may no longer be mapped.
+ * As the fault is being retried, do not account for it.
+ */
+ spin_unlock(ptl);
+ wait_on_page_locked(page);
+ page_nid = -1;
+ goto out;
}
- /* Acquire the page lock to serialise THP migrations */
- spin_unlock(&mm->page_table_lock);
- lock_page(page);
+ /* Page is misplaced, serialise migrations and parallel THP splits */
+ get_page(page);
+ spin_unlock(ptl);
+ if (!page_locked)
+ lock_page(page);
+ anon_vma = page_lock_anon_vma_read(page);
- /* Confirm the PTE did not while locked */
- spin_lock(&mm->page_table_lock);
+ /* Confirm the PMD did not change while page_table_lock was released */
+ spin_lock(ptl);
if (unlikely(!pmd_same(pmd, *pmdp))) {
unlock_page(page);
put_page(page);
+ page_nid = -1;
goto out_unlock;
}
- spin_unlock(&mm->page_table_lock);
- /* Migrate the THP to the requested node */
+ /*
+ * Migrate the THP to the requested node, returns with page unlocked
+ * and pmd_numa cleared.
+ */
+ spin_unlock(ptl);
migrated = migrate_misplaced_transhuge_page(mm, vma,
pmdp, pmd, addr, page, target_nid);
- if (!migrated)
- goto check_same;
-
- task_numa_fault(target_nid, HPAGE_PMD_NR, true);
- return 0;
+ if (migrated) {
+ flags |= TNF_MIGRATED;
+ page_nid = target_nid;
+ }
-check_same:
- spin_lock(&mm->page_table_lock);
- if (unlikely(!pmd_same(pmd, *pmdp)))
- goto out_unlock;
+ goto out;
clear_pmdnuma:
+ BUG_ON(!PageLocked(page));
pmd = pmd_mknonnuma(pmd);
set_pmd_at(mm, haddr, pmdp, pmd);
VM_BUG_ON(pmd_numa(*pmdp));
update_mmu_cache_pmd(vma, addr, pmdp);
+ unlock_page(page);
out_unlock:
- spin_unlock(&mm->page_table_lock);
- if (current_nid != -1)
- task_numa_fault(current_nid, HPAGE_PMD_NR, false);
+ spin_unlock(ptl);
+
+out:
+ if (anon_vma)
+ page_unlock_anon_vma_read(anon_vma);
+
+ if (page_nid != -1)
+ task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);
+
return 0;
}
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
+ spinlock_t *ptl;
int ret = 0;
- if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
struct page *page;
pgtable_t pgtable;
pmd_t orig_pmd;
@@ -1370,8 +1404,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
if (is_huge_zero_pmd(orig_pmd)) {
- tlb->mm->nr_ptes--;
- spin_unlock(&tlb->mm->page_table_lock);
+ atomic_long_dec(&tlb->mm->nr_ptes);
+ spin_unlock(ptl);
put_huge_zero_page();
} else {
page = pmd_page(orig_pmd);
@@ -1379,8 +1413,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
VM_BUG_ON(page_mapcount(page) < 0);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
VM_BUG_ON(!PageHead(page));
- tlb->mm->nr_ptes--;
- spin_unlock(&tlb->mm->page_table_lock);
+ atomic_long_dec(&tlb->mm->nr_ptes);
+ spin_unlock(ptl);
tlb_remove_page(tlb, page);
}
pte_free(tlb->mm, pgtable);
@@ -1393,14 +1427,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned char *vec)
{
+ spinlock_t *ptl;
int ret = 0;
- if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
/*
* All logical pages in the range are present
* if backed by a huge page.
*/
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
memset(vec, 1, (end - addr) >> PAGE_SHIFT);
ret = 1;
}
@@ -1413,6 +1448,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd)
{
+ spinlock_t *old_ptl, *new_ptl;
int ret = 0;
pmd_t pmd;
@@ -1433,41 +1469,79 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
goto out;
}
- ret = __pmd_trans_huge_lock(old_pmd, vma);
+ /*
+ * We don't have to worry about the ordering of src and dst
+ * ptlocks because exclusive mmap_sem prevents deadlock.
+ */
+ ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl);
if (ret == 1) {
+ new_ptl = pmd_lockptr(mm, new_pmd);
+ if (new_ptl != old_ptl)
+ spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
- spin_unlock(&mm->page_table_lock);
+ if (new_ptl != old_ptl) {
+ pgtable_t pgtable;
+
+ /*
+ * Move preallocated PTE page table if new_pmd is on
+ * different PMD page table.
+ */
+ pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
+ pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
+
+ spin_unlock(new_ptl);
+ }
+ spin_unlock(old_ptl);
}
out:
return ret;
}
+/*
+ * Returns
+ * - 0 if PMD could not be locked
+ * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
+ * - HPAGE_PMD_NR if protections changed and TLB flush necessary
+ */
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot, int prot_numa)
{
struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
int ret = 0;
- if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
pmd_t entry;
- entry = pmdp_get_and_clear(mm, addr, pmd);
+ ret = 1;
if (!prot_numa) {
+ entry = pmdp_get_and_clear(mm, addr, pmd);
entry = pmd_modify(entry, newprot);
+ ret = HPAGE_PMD_NR;
BUG_ON(pmd_write(entry));
} else {
struct page *page = pmd_page(*pmd);
- /* only check non-shared pages */
- if (page_mapcount(page) == 1 &&
+ /*
+ * Do not trap faults against the zero page. The
+ * read-only data is likely to be read-cached on the
+ * local CPU cache and it is less useful to know about
+ * local vs remote hits on the zero page.
+ */
+ if (!is_huge_zero_page(page) &&
!pmd_numa(*pmd)) {
+ entry = pmdp_get_and_clear(mm, addr, pmd);
entry = pmd_mknuma(entry);
+ ret = HPAGE_PMD_NR;
}
}
- set_pmd_at(mm, addr, pmd, entry);
- spin_unlock(&vma->vm_mm->page_table_lock);
- ret = 1;
+
+ /* Set PMD if cleared earlier */
+ if (ret == HPAGE_PMD_NR)
+ set_pmd_at(mm, addr, pmd, entry);
+
+ spin_unlock(ptl);
}
return ret;
@@ -1480,12 +1554,13 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
* Note that if it returns 1, this routine returns without unlocking page
* table locks. So callers must unlock them.
*/
-int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
+int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+ spinlock_t **ptl)
{
- spin_lock(&vma->vm_mm->page_table_lock);
+ *ptl = pmd_lock(vma->vm_mm, pmd);
if (likely(pmd_trans_huge(*pmd))) {
if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(*ptl);
wait_split_huge_page(vma->anon_vma, pmd);
return -1;
} else {
@@ -1494,27 +1569,37 @@ int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
return 1;
}
}
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(*ptl);
return 0;
}
+/*
+ * This function returns whether a given @page is mapped onto the @address
+ * in the virtual space of @mm.
+ *
+ * When it is, this function returns the pmd pointer with the page table lock
+ * held, passing the lock back to the caller via @ptl.
+ * Otherwise, it returns NULL without holding the page table lock.
+ */
pmd_t *page_check_address_pmd(struct page *page,
struct mm_struct *mm,
unsigned long address,
- enum page_check_address_pmd_flag flag)
+ enum page_check_address_pmd_flag flag,
+ spinlock_t **ptl)
{
- pmd_t *pmd, *ret = NULL;
+ pmd_t *pmd;
if (address & ~HPAGE_PMD_MASK)
- goto out;
+ return NULL;
pmd = mm_find_pmd(mm, address);
if (!pmd)
- goto out;
+ return NULL;
+ *ptl = pmd_lock(mm, pmd);
if (pmd_none(*pmd))
- goto out;
+ goto unlock;
if (pmd_page(*pmd) != page)
- goto out;
+ goto unlock;
/*
* split_vma() may create temporary aliased mappings. There is
* no risk as long as all huge pmd are found and have their
@@ -1524,14 +1609,15 @@ pmd_t *page_check_address_pmd(struct page *page,
*/
if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
pmd_trans_splitting(*pmd))
- goto out;
+ goto unlock;
if (pmd_trans_huge(*pmd)) {
VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
!pmd_trans_splitting(*pmd));
- ret = pmd;
+ return pmd;
}
-out:
- return ret;
+unlock:
+ spin_unlock(*ptl);
+ return NULL;
}
static int __split_huge_page_splitting(struct page *page,
@@ -1539,6 +1625,7 @@ static int __split_huge_page_splitting(struct page *page,
unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
pmd_t *pmd;
int ret = 0;
/* For mmu_notifiers */
@@ -1546,9 +1633,8 @@ static int __split_huge_page_splitting(struct page *page,
const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(&mm->page_table_lock);
pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
+ PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl);
if (pmd) {
/*
* We can't temporarily set the pmd to null in order
@@ -1559,8 +1645,8 @@ static int __split_huge_page_splitting(struct page *page,
*/
pmdp_splitting_flush(vma, address, pmd);
ret = 1;
+ spin_unlock(ptl);
}
- spin_unlock(&mm->page_table_lock);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return ret;
@@ -1648,7 +1734,7 @@ static void __split_huge_page_refcount(struct page *page,
page_tail->mapping = page->mapping;
page_tail->index = page->index + i;
- page_nid_xchg_last(page_tail, page_nid_last(page));
+ page_cpupid_xchg_last(page_tail, page_cpupid_last(page));
BUG_ON(!PageAnon(page_tail));
BUG_ON(!PageUptodate(page_tail));
@@ -1661,7 +1747,6 @@ static void __split_huge_page_refcount(struct page *page,
BUG_ON(atomic_read(&page->_count) <= 0);
__mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
- __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
ClearPageCompound(page);
compound_unlock(page);
@@ -1692,14 +1777,14 @@ static int __split_huge_page_map(struct page *page,
unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
pmd_t *pmd, _pmd;
int ret = 0, i;
pgtable_t pgtable;
unsigned long haddr;
- spin_lock(&mm->page_table_lock);
pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
+ PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl);
if (pmd) {
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
@@ -1754,8 +1839,8 @@ static int __split_huge_page_map(struct page *page,
pmdp_invalidate(vma, address, pmd);
pmd_populate(mm, pmd, pgtable);
ret = 1;
+ spin_unlock(ptl);
}
- spin_unlock(&mm->page_table_lock);
return ret;
}
@@ -2152,7 +2237,34 @@ static void khugepaged_alloc_sleep(void)
msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
}
+static int khugepaged_node_load[MAX_NUMNODES];
+
#ifdef CONFIG_NUMA
+static int khugepaged_find_target_node(void)
+{
+ static int last_khugepaged_target_node = NUMA_NO_NODE;
+ int nid, target_node = 0, max_value = 0;
+
+ /* find first node with max normal pages hit */
+ for (nid = 0; nid < MAX_NUMNODES; nid++)
+ if (khugepaged_node_load[nid] > max_value) {
+ max_value = khugepaged_node_load[nid];
+ target_node = nid;
+ }
+
+ /* do some balance if several nodes have the same hit record */
+ if (target_node <= last_khugepaged_target_node)
+ for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
+ nid++)
+ if (max_value == khugepaged_node_load[nid]) {
+ target_node = nid;
+ break;
+ }
+
+ last_khugepaged_target_node = target_node;
+ return target_node;
+}
+
static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
{
if (IS_ERR(*hpage)) {
@@ -2186,9 +2298,8 @@ static struct page
* mmap_sem in read mode is good idea also to allow greater
* scalability.
*/
- *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
- node, __GFP_OTHER_NODE);
-
+ *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
+ khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
/*
* After allocating the hugepage, release the mmap_sem read lock in
* preparation for taking it in write mode.
@@ -2204,6 +2315,17 @@ static struct page
return *hpage;
}
#else
+static int khugepaged_find_target_node(void)
+{
+ return 0;
+}
+
+static inline struct page *alloc_hugepage(int defrag)
+{
+ return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
+ HPAGE_PMD_ORDER);
+}
+
static struct page *khugepaged_alloc_hugepage(bool *wait)
{
struct page *hpage;
@@ -2270,7 +2392,7 @@ static void collapse_huge_page(struct mm_struct *mm,
pte_t *pte;
pgtable_t pgtable;
struct page *new_page;
- spinlock_t *ptl;
+ spinlock_t *pmd_ptl, *pte_ptl;
int isolated;
unsigned long hstart, hend;
unsigned long mmun_start; /* For mmu_notifiers */
@@ -2296,6 +2418,8 @@ static void collapse_huge_page(struct mm_struct *mm,
goto out;
vma = find_vma(mm, address);
+ if (!vma)
+ goto out;
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (address < hstart || address + HPAGE_PMD_SIZE > hend)
@@ -2311,12 +2435,12 @@ static void collapse_huge_page(struct mm_struct *mm,
anon_vma_lock_write(vma->anon_vma);
pte = pte_offset_map(pmd, address);
- ptl = pte_lockptr(mm, pmd);
+ pte_ptl = pte_lockptr(mm, pmd);
mmun_start = address;
mmun_end = address + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(&mm->page_table_lock); /* probably unnecessary */
+ pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
* After this gup_fast can't run anymore. This also removes
* any huge TLB entry from the CPU so we won't allow
@@ -2324,16 +2448,16 @@ static void collapse_huge_page(struct mm_struct *mm,
* to avoid the risk of CPU bugs in that area.
*/
_pmd = pmdp_clear_flush(vma, address, pmd);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- spin_lock(ptl);
+ spin_lock(pte_ptl);
isolated = __collapse_huge_page_isolate(vma, address, pte);
- spin_unlock(ptl);
+ spin_unlock(pte_ptl);
if (unlikely(!isolated)) {
pte_unmap(pte);
- spin_lock(&mm->page_table_lock);
+ spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
/*
* We can only use set_pmd_at when establishing
@@ -2341,7 +2465,7 @@ static void collapse_huge_page(struct mm_struct *mm,
* points to regular pagetables. Use pmd_populate for that
*/
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(pmd_ptl);
anon_vma_unlock_write(vma->anon_vma);
goto out;
}
@@ -2352,12 +2476,13 @@ static void collapse_huge_page(struct mm_struct *mm,
*/
anon_vma_unlock_write(vma->anon_vma);
- __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
+ __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl);
pte_unmap(pte);
__SetPageUptodate(new_page);
pgtable = pmd_pgtable(_pmd);
- _pmd = mk_huge_pmd(new_page, vma);
+ _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
+ _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
/*
* spin_lock() below is not the equivalent of smp_wmb(), so
@@ -2366,13 +2491,13 @@ static void collapse_huge_page(struct mm_struct *mm,
*/
smp_wmb();
- spin_lock(&mm->page_table_lock);
+ spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(pmd_ptl);
*hpage = NULL;
@@ -2407,6 +2532,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
if (pmd_trans_huge(*pmd))
goto out;
+ memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
@@ -2423,12 +2549,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
if (unlikely(!page))
goto out_unmap;
/*
- * Chose the node of the first page. This could
- * be more sophisticated and look at more pages,
- * but isn't for now.
+ * Record which node the original page is from and save this
+ * information to khugepaged_node_load[].
+ * khugepaged will allocate the hugepage from the node that has
+ * the max hit record.
*/
- if (node == NUMA_NO_NODE)
- node = page_to_nid(page);
+ node = page_to_nid(page);
+ khugepaged_node_load[node]++;
VM_BUG_ON(PageCompound(page));
if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
goto out_unmap;
@@ -2443,9 +2570,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
ret = 1;
out_unmap:
pte_unmap_unlock(pte, ptl);
- if (ret)
+ if (ret) {
+ node = khugepaged_find_target_node();
/* collapse_huge_page will return with the mmap_sem released */
collapse_huge_page(mm, address, hpage, vma, node);
+ }
out:
return ret;
}
@@ -2697,6 +2826,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd)
{
+ spinlock_t *ptl;
struct page *page;
struct mm_struct *mm = vma->vm_mm;
unsigned long haddr = address & HPAGE_PMD_MASK;
@@ -2707,29 +2837,37 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
mmun_start = haddr;
mmun_end = haddr + HPAGE_PMD_SIZE;
+again:
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(&mm->page_table_lock);
+ ptl = pmd_lock(mm, pmd);
if (unlikely(!pmd_trans_huge(*pmd))) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return;
}
if (is_huge_zero_pmd(*pmd)) {
__split_huge_zero_page_pmd(vma, haddr, pmd);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return;
}
page = pmd_page(*pmd);
VM_BUG_ON(!page_count(page));
get_page(page);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
split_huge_page(page);
put_page(page);
- BUG_ON(pmd_trans_huge(*pmd));
+
+ /*
+ * We don't always have down_write of mmap_sem here: a racing
+ * do_huge_pmd_wp_page() might have copied-on-write to another
+ * huge page before our split_huge_page() got the anon_vma lock.
+ */
+ if (unlikely(pmd_trans_huge(*pmd)))
+ goto again;
}
void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,