diff options
Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r-- | mm/mempolicy.c | 261 |
1 files changed, 186 insertions, 75 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4baf12e534d1..eca4a3129129 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -123,16 +123,19 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES]; static struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; - int node; if (!pol) { - node = numa_node_id(); - if (node != NUMA_NO_NODE) - pol = &preferred_node_policy[node]; + int node = numa_node_id(); - /* preferred_node_policy is not initialised early in boot */ - if (!pol->mode) - pol = NULL; + if (node != NUMA_NO_NODE) { + pol = &preferred_node_policy[node]; + /* + * preferred_node_policy is not initialised early in + * boot + */ + if (!pol->mode) + pol = NULL; + } } return pol; @@ -473,8 +476,11 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { static void migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags); -/* Scan through pages checking if pages follow certain conditions. */ -static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, +/* + * Scan through pages checking if pages follow certain conditions, + * and move them to the pagelist if they do. + */ +static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) @@ -512,7 +518,32 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, return addr != end; } -static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, +static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, + pmd_t *pmd, const nodemask_t *nodes, unsigned long flags, + void *private) +{ +#ifdef CONFIG_HUGETLB_PAGE + int nid; + struct page *page; + spinlock_t *ptl; + + ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); + page = pte_page(huge_ptep_get((pte_t *)pmd)); + nid = page_to_nid(page); + if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) + goto unlock; + /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ + if (flags & (MPOL_MF_MOVE_ALL) || + (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) + isolate_huge_page(page, private); +unlock: + spin_unlock(ptl); +#else + BUG(); +#endif +} + +static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) @@ -523,17 +554,24 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (!pmd_present(*pmd)) + continue; + if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) { + queue_pages_hugetlb_pmd_range(vma, pmd, nodes, + flags, private); + continue; + } split_huge_page_pmd(vma, addr, pmd); if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; - if (check_pte_range(vma, pmd, addr, next, nodes, + if (queue_pages_pte_range(vma, pmd, addr, next, nodes, flags, private)) return -EIO; } while (pmd++, addr = next, addr != end); return 0; } -static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, +static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) @@ -544,16 +582,18 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); + if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) + continue; if (pud_none_or_clear_bad(pud)) continue; - if (check_pmd_range(vma, pud, addr, next, nodes, + if (queue_pages_pmd_range(vma, pud, addr, next, nodes, flags, private)) return -EIO; } while (pud++, addr = next, addr != end); return 0; } -static inline int check_pgd_range(struct vm_area_struct *vma, +static inline int queue_pages_pgd_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) @@ -566,7 +606,7 @@ static inline int check_pgd_range(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - if (check_pud_range(vma, pgd, addr, next, nodes, + if (queue_pages_pud_range(vma, pgd, addr, next, nodes, flags, private)) return -EIO; } while (pgd++, addr = next, addr != end); @@ -604,12 +644,14 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ /* - * Check if all pages in a range are on a set of nodes. - * If pagelist != NULL then isolate pages from the LRU and - * put them on the pagelist. + * Walk through page tables and collect pages to be migrated. + * + * If pages found in a given range are on a set of nodes (determined by + * @nodes and @flags,) it's isolated and queued to the pagelist which is + * passed via @private.) */ static struct vm_area_struct * -check_range(struct mm_struct *mm, unsigned long start, unsigned long end, +queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) { int err; @@ -635,9 +677,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, return ERR_PTR(-EFAULT); } - if (is_vm_hugetlb_page(vma)) - goto next; - if (flags & MPOL_MF_LAZY) { change_prot_numa(vma, start, endvma); goto next; @@ -647,7 +686,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && vma_migratable(vma))) { - err = check_pgd_range(vma, start, endvma, nodes, + err = queue_pages_pgd_range(vma, start, endvma, nodes, flags, private); if (err) { first = ERR_PTR(err); @@ -990,7 +1029,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, static struct page *new_node_page(struct page *page, unsigned long node, int **x) { - return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); + if (PageHuge(page)) + return alloc_huge_page_node(page_hstate(compound_head(page)), + node); + else + return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); } /* @@ -1013,14 +1056,14 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. */ VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); - check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, + queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, MIGRATE_SYNC, MR_SYSCALL); if (err) - putback_lru_pages(&pagelist); + putback_movable_pages(&pagelist); } return err; @@ -1083,7 +1126,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, tmp = *from; while (!nodes_empty(tmp)) { int s,d; - int source = -1; + int source = NUMA_NO_NODE; int dest = 0; for_each_node_mask(s, tmp) { @@ -1118,7 +1161,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, if (!node_isset(dest, tmp)) break; } - if (source == -1) + if (source == NUMA_NO_NODE) break; node_clear(source, tmp); @@ -1154,10 +1197,14 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int * break; vma = vma->vm_next; } - /* - * if !vma, alloc_page_vma() will use task or system default policy + * queue_pages_range() confirms that @page belongs to some vma, + * so vma shouldn't be NULL. */ + BUG_ON(!vma); + + if (PageHuge(page)) + return alloc_huge_page_noerr(vma, address, 1); return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); } #else @@ -1249,7 +1296,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (err) goto mpol_out; - vma = check_range(mm, start, end, nmask, + vma = queue_pages_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); err = PTR_ERR(vma); /* maybe ... */ @@ -1265,7 +1312,7 @@ static long do_mbind(unsigned long start, unsigned long len, (unsigned long)vma, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); if (nr_failed) - putback_lru_pages(&pagelist); + putback_movable_pages(&pagelist); } if (nr_failed && (flags & MPOL_MF_STRICT)) @@ -1633,6 +1680,30 @@ struct mempolicy *get_vma_policy(struct task_struct *task, return pol; } +bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma) +{ + struct mempolicy *pol = get_task_policy(task); + if (vma) { + if (vma->vm_ops && vma->vm_ops->get_policy) { + bool ret = false; + + pol = vma->vm_ops->get_policy(vma, vma->vm_start); + if (pol && (pol->flags & MPOL_F_MOF)) + ret = true; + mpol_cond_put(pol); + + return ret; + } else if (vma->vm_policy) { + pol = vma->vm_policy; + } + } + + if (!pol) + return default_policy.flags & MPOL_F_MOF; + + return pol->flags & MPOL_F_MOF; +} + static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) { enum zone_type dynamic_policy_zone = policy_zone; @@ -1765,7 +1836,7 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned nnodes = nodes_weight(pol->v.nodes); unsigned target; int c; - int nid = -1; + int nid = NUMA_NO_NODE; if (!nnodes) return numa_node_id(); @@ -1802,11 +1873,11 @@ static inline unsigned interleave_nid(struct mempolicy *pol, /* * Return the bit number of a random bit set in the nodemask. - * (returns -1 if nodemask is empty) + * (returns NUMA_NO_NODE if nodemask is empty) */ int node_random(const nodemask_t *maskp) { - int w, bit = -1; + int w, bit = NUMA_NO_NODE; w = nodes_weight(*maskp); if (w) @@ -2065,6 +2136,16 @@ retry_cpuset: } EXPORT_SYMBOL(alloc_pages_current); +int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) +{ + struct mempolicy *pol = mpol_dup(vma_policy(src)); + + if (IS_ERR(pol)) + return PTR_ERR(pol); + dst->vm_policy = pol; + return 0; +} + /* * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it * rebinds the mempolicy its copying by calling mpol_rebind_policy() @@ -2221,6 +2302,35 @@ static void sp_free(struct sp_node *n) kmem_cache_free(sn_cache, n); } +#ifdef CONFIG_NUMA_BALANCING +static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) +{ + /* Never defer a private fault */ + if (cpupid_match_pid(p, last_cpupid)) + return false; + + if (p->numa_migrate_deferred) { + p->numa_migrate_deferred--; + return true; + } + return false; +} + +static inline void defer_numa_migrate(struct task_struct *p) +{ + p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred; +} +#else +static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) +{ + return false; +} + +static inline void defer_numa_migrate(struct task_struct *p) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ + /** * mpol_misplaced - check whether current page node is valid in policy * @@ -2244,6 +2354,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long struct zone *zone; int curnid = page_to_nid(page); unsigned long pgoff; + int thiscpu = raw_smp_processor_id(); + int thisnid = cpu_to_node(thiscpu); int polnid = -1; int ret = -1; @@ -2292,9 +2404,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long /* Migrate the page towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { - int last_nid; + int last_cpupid; + int this_cpupid; - polnid = numa_node_id(); + polnid = thisnid; + this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid); /* * Multi-stage node selection is used in conjunction @@ -2317,8 +2431,25 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long * it less likely we act on an unlikely task<->page * relation. */ - last_nid = page_nid_xchg_last(page, polnid); - if (last_nid != polnid) + last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) { + + /* See sysctl_numa_balancing_migrate_deferred comment */ + if (!cpupid_match_pid(current, last_cpupid)) + defer_numa_migrate(current); + + goto out; + } + + /* + * The quadratic filter above reduces extraneous migration + * of shared pages somewhat. This code reduces it even more, + * reducing the overhead of page migrations of shared pages. + * This makes workloads with shared pages rely more on + * "move task near its memory", and less on "move memory + * towards its task", which is exactly what we want. + */ + if (numa_migrate_deferred(current, last_cpupid)) goto out; } @@ -2784,62 +2915,45 @@ out: * @maxlen: length of @buffer * @pol: pointer to mempolicy to be formatted * - * Convert a mempolicy into a string. - * Returns the number of characters in buffer (if positive) - * or an error (negative) + * Convert @pol into a string. If @buffer is too short, truncate the string. + * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the + * longest flag, "relative", and to display at least a few node ids. */ -int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) +void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { char *p = buffer; - int l; - nodemask_t nodes; - unsigned short mode; - unsigned short flags = pol ? pol->flags : 0; - - /* - * Sanity check: room for longest mode, flag and some nodes - */ - VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16); + nodemask_t nodes = NODE_MASK_NONE; + unsigned short mode = MPOL_DEFAULT; + unsigned short flags = 0; - if (!pol || pol == &default_policy) - mode = MPOL_DEFAULT; - else + if (pol && pol != &default_policy) { mode = pol->mode; + flags = pol->flags; + } switch (mode) { case MPOL_DEFAULT: - nodes_clear(nodes); break; - case MPOL_PREFERRED: - nodes_clear(nodes); if (flags & MPOL_F_LOCAL) mode = MPOL_LOCAL; else node_set(pol->v.preferred_node, nodes); break; - case MPOL_BIND: - /* Fall through */ case MPOL_INTERLEAVE: nodes = pol->v.nodes; break; - default: - return -EINVAL; + WARN_ON_ONCE(1); + snprintf(p, maxlen, "unknown"); + return; } - l = strlen(policy_modes[mode]); - if (buffer + maxlen < p + l + 1) - return -ENOSPC; - - strcpy(p, policy_modes[mode]); - p += l; + p += snprintf(p, maxlen, "%s", policy_modes[mode]); if (flags & MPOL_MODE_FLAGS) { - if (buffer + maxlen < p + 2) - return -ENOSPC; - *p++ = '='; + p += snprintf(p, buffer + maxlen - p, "="); /* * Currently, the only defined flags are mutually exclusive @@ -2851,10 +2965,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) } if (!nodes_empty(nodes)) { - if (buffer + maxlen < p + 2) - return -ENOSPC; - *p++ = ':'; + p += snprintf(p, buffer + maxlen - p, ":"); p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); } - return p - buffer; } |