 mm/mempolicy.c | 261 +++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 186 insertions(+), 75 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4baf12e534d1..eca4a3129129 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -123,16 +123,19 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES];
static struct mempolicy *get_task_policy(struct task_struct *p)
{
struct mempolicy *pol = p->mempolicy;
- int node;
if (!pol) {
- node = numa_node_id();
- if (node != NUMA_NO_NODE)
- pol = &preferred_node_policy[node];
+ int node = numa_node_id();
- /* preferred_node_policy is not initialised early in boot */
- if (!pol->mode)
- pol = NULL;
+ if (node != NUMA_NO_NODE) {
+ pol = &preferred_node_policy[node];
+ /*
+ * preferred_node_policy is not initialised early in
+ * boot
+ */
+ if (!pol->mode)
+ pol = NULL;
+ }
}
return pol;
@@ -473,8 +476,11 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
static void migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags);
-/* Scan through pages checking if pages follow certain conditions. */
-static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+/*
+ * Scan through the pages, checking whether each one meets the given
+ * conditions, and move it to the pagelist if it does.
+ */
+static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
const nodemask_t *nodes, unsigned long flags,
void *private)
@@ -512,7 +518,32 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
return addr != end;
}
-static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
+ pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
+ void *private)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+ int nid;
+ struct page *page;
+ spinlock_t *ptl;
+
+ ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
+ page = pte_page(huge_ptep_get((pte_t *)pmd));
+ nid = page_to_nid(page);
+ if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
+ goto unlock;
+ /* With MPOL_MF_MOVE, we migrate only unshared hugepages. */
+ if (flags & (MPOL_MF_MOVE_ALL) ||
+ (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
+ isolate_huge_page(page, private);
+unlock:
+ spin_unlock(ptl);
+#else
+ BUG();
+#endif
+}
+
+static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
const nodemask_t *nodes, unsigned long flags,
void *private)
@@ -523,17 +554,24 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (!pmd_present(*pmd))
+ continue;
+ if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
+ queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
+ flags, private);
+ continue;
+ }
split_huge_page_pmd(vma, addr, pmd);
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
continue;
- if (check_pte_range(vma, pmd, addr, next, nodes,
+ if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
flags, private))
return -EIO;
} while (pmd++, addr = next, addr != end);
return 0;
}
-static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end,
const nodemask_t *nodes, unsigned long flags,
void *private)
@@ -544,16 +582,18 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
+ if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
+ continue;
if (pud_none_or_clear_bad(pud))
continue;
- if (check_pmd_range(vma, pud, addr, next, nodes,
+ if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
flags, private))
return -EIO;
} while (pud++, addr = next, addr != end);
return 0;
}
-static inline int check_pgd_range(struct vm_area_struct *vma,
+static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
const nodemask_t *nodes, unsigned long flags,
void *private)
@@ -566,7 +606,7 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- if (check_pud_range(vma, pgd, addr, next, nodes,
+ if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
flags, private))
return -EIO;
} while (pgd++, addr = next, addr != end);
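The nodemask test used by queue_pages_pte_range() and
queue_pages_hugetlb_pmd_range() above is easy to misread. A minimal
userspace sketch, with a plain unsigned long standing in for nodemask_t
and an illustrative flag value (neither is from this patch):

	#include <stdbool.h>

	#define MPOL_MF_INVERT 0x1000	/* illustrative value only */

	/*
	 * Sketch of "node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)":
	 * without MPOL_MF_INVERT, pages are queued only if their node IS in
	 * the mask (migrate_to_node() passes just the source node); with
	 * MPOL_MF_INVERT (as do_mbind() passes), pages are queued only if
	 * their node is NOT in the mask, i.e. only misplaced pages.
	 */
	static bool skip_page(int nid, unsigned long nodes, unsigned long flags)
	{
		bool in_mask = (nodes >> nid) & 1;

		return in_mask == !!(flags & MPOL_MF_INVERT);
	}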
@@ -604,12 +644,14 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
/*
- * Check if all pages in a range are on a set of nodes.
- * If pagelist != NULL then isolate pages from the LRU and
- * put them on the pagelist.
+ * Walk through page tables and collect pages to be migrated.
+ *
+ * If pages found in a given range are on a set of nodes (determined by
+ * @nodes and @flags), they are isolated and queued to the pagelist
+ * passed via @private.
*/
static struct vm_area_struct *
-check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
+queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
const nodemask_t *nodes, unsigned long flags, void *private)
{
int err;
@@ -635,9 +677,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
return ERR_PTR(-EFAULT);
}
- if (is_vm_hugetlb_page(vma))
- goto next;
-
if (flags & MPOL_MF_LAZY) {
change_prot_numa(vma, start, endvma);
goto next;
@@ -647,7 +686,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
vma_migratable(vma))) {
- err = check_pgd_range(vma, start, endvma, nodes,
+ err = queue_pages_pgd_range(vma, start, endvma, nodes,
flags, private);
if (err) {
first = ERR_PTR(err);
@@ -990,7 +1029,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
- return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
+ if (PageHuge(page))
+ return alloc_huge_page_node(page_hstate(compound_head(page)),
+ node);
+ else
+ return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
}
/*
@@ -1013,14 +1056,14 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
*/
VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
- check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
+ queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_node_page, dest,
MIGRATE_SYNC, MR_SYSCALL);
if (err)
- putback_lru_pages(&pagelist);
+ putback_movable_pages(&pagelist);
}
return err;
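For reference, a hedged sketch of the contract assumed by the hunk
above (the surrounding declarations are illustrative): migrate_pages()
hands each queued page to new_node_page() with @dest as the private
argument, and on failure the pages still left on the list must be given
back. putback_movable_pages() also handles hugetlb pages, which the
putback_lru_pages() call it replaces did not.

	LIST_HEAD(pagelist);
	int err;

	/* ... queue_pages_range() fills pagelist ... */
	if (!list_empty(&pagelist)) {
		err = migrate_pages(&pagelist, new_node_page, dest,
				    MIGRATE_SYNC, MR_SYSCALL);
		if (err)
			putback_movable_pages(&pagelist);
	}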
@@ -1083,7 +1126,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
tmp = *from;
while (!nodes_empty(tmp)) {
int s,d;
- int source = -1;
+ int source = NUMA_NO_NODE;
int dest = 0;
for_each_node_mask(s, tmp) {
@@ -1118,7 +1161,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
if (!node_isset(dest, tmp))
break;
}
- if (source == -1)
+ if (source == NUMA_NO_NODE)
break;
node_clear(source, tmp);
@@ -1154,10 +1197,14 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
break;
vma = vma->vm_next;
}
-
/*
- * if !vma, alloc_page_vma() will use task or system default policy
+ * queue_pages_range() confirms that @page belongs to some vma,
+ * so vma shouldn't be NULL.
*/
+ BUG_ON(!vma);
+
+ if (PageHuge(page))
+ return alloc_huge_page_noerr(vma, address, 1);
return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
}
#else
@@ -1249,7 +1296,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (err)
goto mpol_out;
- vma = check_range(mm, start, end, nmask,
+ vma = queue_pages_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist);
err = PTR_ERR(vma); /* maybe ... */
@@ -1265,7 +1312,7 @@ static long do_mbind(unsigned long start, unsigned long len,
(unsigned long)vma,
MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
if (nr_failed)
- putback_lru_pages(&pagelist);
+ putback_movable_pages(&pagelist);
}
if (nr_failed && (flags & MPOL_MF_STRICT))
@@ -1633,6 +1680,30 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
return pol;
}
+bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
+{
+ struct mempolicy *pol = get_task_policy(task);
+ if (vma) {
+ if (vma->vm_ops && vma->vm_ops->get_policy) {
+ bool ret = false;
+
+ pol = vma->vm_ops->get_policy(vma, vma->vm_start);
+ if (pol && (pol->flags & MPOL_F_MOF))
+ ret = true;
+ mpol_cond_put(pol);
+
+ return ret;
+ } else if (vma->vm_policy) {
+ pol = vma->vm_policy;
+ }
+ }
+
+ if (!pol)
+ return default_policy.flags & MPOL_F_MOF;
+
+ return pol->flags & MPOL_F_MOF;
+}
+
static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
enum zone_type dynamic_policy_zone = policy_zone;
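vma_policy_mof() has no caller in this file; presumably it is meant for
the NUMA balancing scan path. A hypothetical caller sketch (the loop,
@p, and @mm are assumptions, not from this patch):

	struct vm_area_struct *vma;

	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		/* Skip VMAs whose effective policy does not request
		 * migrate-on-fault (MPOL_F_MOF). */
		if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
			continue;
		/* ... install NUMA hinting faults on this VMA ... */
	}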
@@ -1765,7 +1836,7 @@ static unsigned offset_il_node(struct mempolicy *pol,
unsigned nnodes = nodes_weight(pol->v.nodes);
unsigned target;
int c;
- int nid = -1;
+ int nid = NUMA_NO_NODE;
if (!nnodes)
return numa_node_id();
@@ -1802,11 +1873,11 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
/*
* Return the bit number of a random bit set in the nodemask.
- * (returns -1 if nodemask is empty)
+ * (returns NUMA_NO_NODE if nodemask is empty)
*/
int node_random(const nodemask_t *maskp)
{
- int w, bit = -1;
+ int w, bit = NUMA_NO_NODE;
w = nodes_weight(*maskp);
if (w)
@@ -2065,6 +2136,16 @@ retry_cpuset:
}
EXPORT_SYMBOL(alloc_pages_current);
+int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
+{
+ struct mempolicy *pol = mpol_dup(vma_policy(src));
+
+ if (IS_ERR(pol))
+ return PTR_ERR(pol);
+ dst->vm_policy = pol;
+ return 0;
+}
+
/*
* If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
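vma_dup_policy() wraps a pattern that callers (e.g. when splitting or
copying a VMA) previously open-coded. A hypothetical caller sketch
(names and error label assumed):

	/* Was: pol = mpol_dup(vma_policy(old));
	 *      if (IS_ERR(pol)) { err = PTR_ERR(pol); goto out; }
	 *      new->vm_policy = pol;
	 */
	err = vma_dup_policy(old, new);
	if (err)
		goto out_free_vma;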
@@ -2221,6 +2302,35 @@ static void sp_free(struct sp_node *n)
kmem_cache_free(sn_cache, n);
}
+#ifdef CONFIG_NUMA_BALANCING
+static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
+{
+ /* Never defer a private fault */
+ if (cpupid_match_pid(p, last_cpupid))
+ return false;
+
+ if (p->numa_migrate_deferred) {
+ p->numa_migrate_deferred--;
+ return true;
+ }
+ return false;
+}
+
+static inline void defer_numa_migrate(struct task_struct *p)
+{
+ p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
+}
+#else
+static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
+{
+ return false;
+}
+
+static inline void defer_numa_migrate(struct task_struct *p)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
/**
* mpol_misplaced - check whether current page node is valid in policy
*
@@ -2244,6 +2354,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
struct zone *zone;
int curnid = page_to_nid(page);
unsigned long pgoff;
+ int thiscpu = raw_smp_processor_id();
+ int thisnid = cpu_to_node(thiscpu);
int polnid = -1;
int ret = -1;
@@ -2292,9 +2404,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
/* Migrate the page towards the node whose CPU is referencing it */
if (pol->flags & MPOL_F_MORON) {
- int last_nid;
+ int last_cpupid;
+ int this_cpupid;
- polnid = numa_node_id();
+ polnid = thisnid;
+ this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid);
/*
* Multi-stage node selection is used in conjunction
@@ -2317,8 +2431,25 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
* it less likely we act on an unlikely task<->page
* relation.
*/
- last_nid = page_nid_xchg_last(page, polnid);
- if (last_nid != polnid)
+ last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+ if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
+
+ /* See sysctl_numa_balancing_migrate_deferred comment */
+ if (!cpupid_match_pid(current, last_cpupid))
+ defer_numa_migrate(current);
+
+ goto out;
+ }
+
+ /*
+ * The quadratic filter above reduces extraneous migration
+ * of shared pages somewhat. This code reduces it further,
+ * cutting the overhead of migrating shared pages. It makes
+ * workloads with shared pages rely more on "move task near
+ * its memory", and less on "move memory towards its task",
+ * which is exactly what we want.
+ */
+ if (numa_migrate_deferred(current, last_cpupid))
goto out;
}
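A userspace sketch of the two-stage filter described above. For
simplicity the "cpupid" here is just a node id; the real one packs cpu
and pid bits and lives in the page flags:

	#include <stdbool.h>

	/* A page is migrated only when two consecutive hinting faults come
	 * from the same node; the first fault merely records itself. */
	struct page_stub { int last_nid; };	/* -1 == no fault seen yet */

	static bool should_migrate(struct page_stub *page, int this_nid)
	{
		int last = page->last_nid;

		page->last_nid = this_nid;	/* the xchg step */
		return last == this_nid;	/* -1 never matches a node */
	}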
@@ -2784,62 +2915,45 @@ out:
* @maxlen: length of @buffer
* @pol: pointer to mempolicy to be formatted
*
- * Convert a mempolicy into a string.
- * Returns the number of characters in buffer (if positive)
- * or an error (negative)
+ * Convert @pol into a string. If @buffer is too short, truncate the string.
+ * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
+ * longest flag, "relative", and to display at least a few node ids.
*/
-int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
+void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
char *p = buffer;
- int l;
- nodemask_t nodes;
- unsigned short mode;
- unsigned short flags = pol ? pol->flags : 0;
-
- /*
- * Sanity check: room for longest mode, flag and some nodes
- */
- VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
+ nodemask_t nodes = NODE_MASK_NONE;
+ unsigned short mode = MPOL_DEFAULT;
+ unsigned short flags = 0;
- if (!pol || pol == &default_policy)
- mode = MPOL_DEFAULT;
- else
+ if (pol && pol != &default_policy) {
mode = pol->mode;
+ flags = pol->flags;
+ }
switch (mode) {
case MPOL_DEFAULT:
- nodes_clear(nodes);
break;
-
case MPOL_PREFERRED:
- nodes_clear(nodes);
if (flags & MPOL_F_LOCAL)
mode = MPOL_LOCAL;
else
node_set(pol->v.preferred_node, nodes);
break;
-
case MPOL_BIND:
- /* Fall through */
case MPOL_INTERLEAVE:
nodes = pol->v.nodes;
break;
-
default:
- return -EINVAL;
+ WARN_ON_ONCE(1);
+ snprintf(p, maxlen, "unknown");
+ return;
}
- l = strlen(policy_modes[mode]);
- if (buffer + maxlen < p + l + 1)
- return -ENOSPC;
-
- strcpy(p, policy_modes[mode]);
- p += l;
+ p += snprintf(p, maxlen, "%s", policy_modes[mode]);
if (flags & MPOL_MODE_FLAGS) {
- if (buffer + maxlen < p + 2)
- return -ENOSPC;
- *p++ = '=';
+ p += snprintf(p, buffer + maxlen - p, "=");
/*
* Currently, the only defined flags are mutually exclusive
@@ -2851,10 +2965,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
}
if (!nodes_empty(nodes)) {
- if (buffer + maxlen < p + 2)
- return -ENOSPC;
- *p++ = ':';
+ p += snprintf(p, buffer + maxlen - p, ":");
p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
}
- return p - buffer;
}
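With the void return, callers no longer need to check for -ENOSPC; a
hypothetical caller sketch:

	char buf[64];	/* >= 32, per the recommendation above */

	mpol_to_str(buf, sizeof(buf), pol);
	pr_debug("mempolicy: %s\n", buf);	/* e.g. "interleave=static:0-3" */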