From a09a79f66874c905af35d5bb5e5f2fdc7b6b894d Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 9 May 2011 13:01:09 +0200 Subject: Don't lock guardpage if the stack is growing up Linux kernel excludes guard page when performing mlock on a VMA with down-growing stack. However, some architectures have up-growing stack and locking the guard page should be excluded in this case too. This patch fixes lvm2 on PA-RISC (and possibly other architectures with up-growing stack). lvm2 calculates number of used pages when locking and when unlocking and reports an internal error if the numbers mismatch. [ Patch changed fairly extensively to also fix /proc//maps for the grows-up case, and to move things around a bit to clean it all up and share the infrstructure with the /proc bits. Tested on ia64 that has both grow-up and grow-down segments - Linus ] Signed-off-by: Mikulas Patocka Tested-by: Tony Luck Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 2e7addfd9803..318d8654989b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -214,7 +214,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) int flags = vma->vm_flags; unsigned long ino = 0; unsigned long long pgoff = 0; - unsigned long start; + unsigned long start, end; dev_t dev = 0; int len; @@ -227,13 +227,15 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) /* We don't show the stack guard page in /proc/maps */ start = vma->vm_start; - if (vma->vm_flags & VM_GROWSDOWN) - if (!vma_stack_continue(vma->vm_prev, vma->vm_start)) - start += PAGE_SIZE; + if (stack_guard_page_start(vma, start)) + start += PAGE_SIZE; + end = vma->vm_end; + if (stack_guard_page_end(vma, end)) + end -= PAGE_SIZE; seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", start, - vma->vm_end, + end, flags & VM_READ ? 'r' : '-', flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', -- cgit v1.2.3 From f69ff943df6972aae96c10733b6847fa094d8a59 Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Tue, 24 May 2011 17:12:47 -0700 Subject: mm: proc: move show_numa_map() to fs/proc/task_mmu.c Moving show_numa_map() from mempolicy.c to task_mmu.c solves several issues. - Having the show() operation "miles away" from the corresponding seq_file iteration operations is a maintenance burden. - The need to export ad hoc info like struct proc_maps_private is eliminated. - The implementation of show_numa_map() can be improved in a simple manner by cooperating with the other seq_file operations (start, stop, etc) -- something that would be messy to do without this change. Signed-off-by: Stephen Wilson Reviewed-by: KOSAKI Motohiro Cc: Hugh Dickins Cc: David Rientjes Cc: Lee Schermerhorn Cc: Alexey Dobriyan Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 184 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 182 insertions(+), 2 deletions(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 318d8654989b..2ed53d18b2ef 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -858,8 +858,188 @@ const struct file_operations proc_pagemap_operations = { #endif /* CONFIG_PROC_PAGE_MONITOR */ #ifdef CONFIG_NUMA -extern int show_numa_map(struct seq_file *m, void *v); +struct numa_maps { + struct vm_area_struct *vma; + unsigned long pages; + unsigned long anon; + unsigned long active; + unsigned long writeback; + unsigned long mapcount_max; + unsigned long dirty; + unsigned long swapcache; + unsigned long node[MAX_NUMNODES]; +}; + +static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty) +{ + int count = page_mapcount(page); + + md->pages++; + if (pte_dirty || PageDirty(page)) + md->dirty++; + + if (PageSwapCache(page)) + md->swapcache++; + + if (PageActive(page) || PageUnevictable(page)) + md->active++; + + if (PageWriteback(page)) + md->writeback++; + + if (PageAnon(page)) + md->anon++; + + if (count > md->mapcount_max) + md->mapcount_max = count; + + md->node[page_to_nid(page)]++; +} + +static int gather_pte_stats(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct numa_maps *md; + spinlock_t *ptl; + pte_t *orig_pte; + pte_t *pte; + + md = walk->private; + orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + do { + struct page *page; + int nid; + + if (!pte_present(*pte)) + continue; + + page = vm_normal_page(md->vma, addr, *pte); + if (!page) + continue; + + if (PageReserved(page)) + continue; + + nid = page_to_nid(page); + if (!node_isset(nid, node_states[N_HIGH_MEMORY])) + continue; + + gather_stats(page, md, pte_dirty(*pte)); + + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap_unlock(orig_pte, ptl); + return 0; +} +#ifdef CONFIG_HUGETLB_PAGE +static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, struct mm_walk *walk) +{ + struct numa_maps *md; + struct page *page; + + if (pte_none(*pte)) + return 0; + + page = pte_page(*pte); + if (!page) + return 0; + + md = walk->private; + gather_stats(page, md, pte_dirty(*pte)); + return 0; +} + +#else +static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, struct mm_walk *walk) +{ + return 0; +} +#endif + +/* + * Display pages allocated per node and memory policy via /proc. + */ +static int show_numa_map(struct seq_file *m, void *v) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma = v; + struct numa_maps *md; + struct file *file = vma->vm_file; + struct mm_struct *mm = vma->vm_mm; + struct mm_walk walk = {}; + struct mempolicy *pol; + int n; + char buffer[50]; + + if (!mm) + return 0; + + md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL); + if (!md) + return 0; + + md->vma = vma; + + walk.hugetlb_entry = gather_hugetbl_stats; + walk.pmd_entry = gather_pte_stats; + walk.private = md; + walk.mm = mm; + + pol = get_vma_policy(priv->task, vma, vma->vm_start); + mpol_to_str(buffer, sizeof(buffer), pol, 0); + mpol_cond_put(pol); + + seq_printf(m, "%08lx %s", vma->vm_start, buffer); + + if (file) { + seq_printf(m, " file="); + seq_path(m, &file->f_path, "\n\t= "); + } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + seq_printf(m, " heap"); + } else if (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack) { + seq_printf(m, " stack"); + } + + walk_page_range(vma->vm_start, vma->vm_end, &walk); + + if (!md->pages) + goto out; + + if (md->anon) + seq_printf(m, " anon=%lu", md->anon); + + if (md->dirty) + seq_printf(m, " dirty=%lu", md->dirty); + + if (md->pages != md->anon && md->pages != md->dirty) + seq_printf(m, " mapped=%lu", md->pages); + + if (md->mapcount_max > 1) + seq_printf(m, " mapmax=%lu", md->mapcount_max); + + if (md->swapcache) + seq_printf(m, " swapcache=%lu", md->swapcache); + + if (md->active < md->pages && !is_vm_hugetlb_page(vma)) + seq_printf(m, " active=%lu", md->active); + + if (md->writeback) + seq_printf(m, " writeback=%lu", md->writeback); + + for_each_node_state(n, N_HIGH_MEMORY) + if (md->node[n]) + seq_printf(m, " N%d=%lu", n, md->node[n]); +out: + seq_putc(m, '\n'); + kfree(md); + + if (m->count < m->size) + m->version = (vma != priv->tail_vma) ? vma->vm_start : 0; + return 0; +} static const struct seq_operations proc_pid_numa_maps_op = { .start = m_start, .next = m_next, @@ -878,4 +1058,4 @@ const struct file_operations proc_numa_maps_operations = { .llseek = seq_lseek, .release = seq_release_private, }; -#endif +#endif /* CONFIG_NUMA */ -- cgit v1.2.3 From 5b52fc890bece77bffb9fade69239f71384ef02b Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Tue, 24 May 2011 17:12:49 -0700 Subject: proc: allocate storage for numa_maps statistics once In show_numa_map() we collect statistics into a numa_maps structure. Since the number of NUMA nodes can be very large, this structure is not a candidate for stack allocation. Instead of going thru a kmalloc()+kfree() cycle each time show_numa_map() is invoked, perform the allocation just once when /proc/pid/numa_maps is opened. Performing the allocation when numa_maps is opened, and thus before a reference to the target tasks mm is taken, eliminates a potential stalemate condition in the oom-killer as originally described by Hugh Dickins: ... imagine what happens if the system is out of memory, and the mm we're looking at is selected for killing by the OOM killer: while we wait in __get_free_page for more memory, no memory is freed from the selected mm because it cannot reach exit_mmap while we hold that reference. Signed-off-by: Stephen Wilson Reviewed-by: KOSAKI Motohiro Cc: Hugh Dickins Cc: David Rientjes Cc: Lee Schermerhorn Cc: Alexey Dobriyan Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 2ed53d18b2ef..2c9db29ea358 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -871,6 +871,11 @@ struct numa_maps { unsigned long node[MAX_NUMNODES]; }; +struct numa_maps_private { + struct proc_maps_private proc_maps; + struct numa_maps md; +}; + static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty) { int count = page_mapcount(page); @@ -963,9 +968,10 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, */ static int show_numa_map(struct seq_file *m, void *v) { - struct proc_maps_private *priv = m->private; + struct numa_maps_private *numa_priv = m->private; + struct proc_maps_private *proc_priv = &numa_priv->proc_maps; struct vm_area_struct *vma = v; - struct numa_maps *md; + struct numa_maps *md = &numa_priv->md; struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; struct mm_walk walk = {}; @@ -976,9 +982,8 @@ static int show_numa_map(struct seq_file *m, void *v) if (!mm) return 0; - md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL); - if (!md) - return 0; + /* Ensure we start with an empty set of numa_maps statistics. */ + memset(md, 0, sizeof(*md)); md->vma = vma; @@ -987,7 +992,7 @@ static int show_numa_map(struct seq_file *m, void *v) walk.private = md; walk.mm = mm; - pol = get_vma_policy(priv->task, vma, vma->vm_start); + pol = get_vma_policy(proc_priv->task, vma, vma->vm_start); mpol_to_str(buffer, sizeof(buffer), pol, 0); mpol_cond_put(pol); @@ -1034,12 +1039,12 @@ static int show_numa_map(struct seq_file *m, void *v) seq_printf(m, " N%d=%lu", n, md->node[n]); out: seq_putc(m, '\n'); - kfree(md); if (m->count < m->size) - m->version = (vma != priv->tail_vma) ? vma->vm_start : 0; + m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0; return 0; } + static const struct seq_operations proc_pid_numa_maps_op = { .start = m_start, .next = m_next, @@ -1049,7 +1054,20 @@ static const struct seq_operations proc_pid_numa_maps_op = { static int numa_maps_open(struct inode *inode, struct file *file) { - return do_maps_open(inode, file, &proc_pid_numa_maps_op); + struct numa_maps_private *priv; + int ret = -ENOMEM; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv) { + priv->proc_maps.pid = proc_pid(inode); + ret = seq_open(file, &proc_pid_numa_maps_op); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = priv; + } else { + kfree(priv); + } + } + return ret; } const struct file_operations proc_numa_maps_operations = { -- cgit v1.2.3 From ca16d140af91febe25daeb9e032bf8bd46b8c31f Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 26 May 2011 19:16:19 +0900 Subject: mm: don't access vm_flags as 'int' The type of vma->vm_flags is 'unsigned long'. Neither 'int' nor 'unsigned int'. This patch fixes such misuse. Signed-off-by: KOSAKI Motohiro [ Changed to use a typedef - we'll extend it to cover more cases later, since there has been discussion about making it a 64-bit type.. - Linus ] Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 2c9db29ea358..db15935fa757 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -211,7 +211,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; - int flags = vma->vm_flags; + vm_flags_t flags = vma->vm_flags; unsigned long ino = 0; unsigned long long pgoff = 0; unsigned long start, end; -- cgit v1.2.3 From 0a8cb8e34149251ad1f280fe099a4f971554639a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 26 May 2011 16:25:50 -0700 Subject: fs/proc: convert to kstrtoX() Convert fs/proc/ from strict_strto*() to kstrto*() functions. Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index db15935fa757..30a6a72d05b2 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -536,15 +536,17 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, char buffer[PROC_NUMBUF]; struct mm_struct *mm; struct vm_area_struct *vma; - long type; + int type; + int rv; memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - if (strict_strtol(strstrip(buffer), 10, &type)) - return -EINVAL; + rv = kstrtoint(strstrip(buffer), 10, &type); + if (rv < 0) + return rv; if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED) return -EINVAL; task = get_proc_task(file->f_path.dentry->d_inode); -- cgit v1.2.3 From 98bc93e505c03403479c6669c4ff97301cee6199 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 26 May 2011 16:25:53 -0700 Subject: proc: fix pagemap_read() error case Currently, pagemap_read() has three error and/or corner case handling mistake. (1) If ppos parameter is wrong, mm refcount will be leak. (2) If count parameter is 0, mm refcount will be leak too. (3) If the current task is sleeping in kmalloc() and the system is out of memory and oom-killer kill the proc associated task, mm_refcount prevent the task free its memory. then system may hang up. check_mem_permission gets a reference to the mm. If we __get_free_page after check_mem_permission, imagine what happens if the system is out of memory, and the mm we're looking at is selected for killing by the OOM killer: while we wait in __get_free_page for more memory, no memory is freed from the selected mm because it cannot reach exit_mmap while we hold that reference. This patch fixes the above three. Signed-off-by: KOSAKI Motohiro Cc: Hugh Dickins Cc: Jovi Zhang Acked-by: Hugh Dickins Cc: Stephen Wilson Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 30a6a72d05b2..25b6a887adb9 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -771,18 +771,12 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!task) goto out; - mm = mm_for_maps(task); - ret = PTR_ERR(mm); - if (!mm || IS_ERR(mm)) - goto out_task; - ret = -EINVAL; /* file position must be aligned */ if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) goto out_task; ret = 0; - if (!count) goto out_task; @@ -790,7 +784,12 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); ret = -ENOMEM; if (!pm.buffer) - goto out_mm; + goto out_task; + + mm = mm_for_maps(task); + ret = PTR_ERR(mm); + if (!mm || IS_ERR(mm)) + goto out_free; pagemap_walk.pmd_entry = pagemap_pte_range; pagemap_walk.pte_hole = pagemap_pte_hole; @@ -833,7 +832,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, len = min(count, PM_ENTRY_BYTES * pm.pos); if (copy_to_user(buf, pm.buffer, len)) { ret = -EFAULT; - goto out_free; + goto out_mm; } copied += len; buf += len; @@ -843,10 +842,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!ret || ret == PM_END_OF_BUFFER) ret = copied; -out_free: - kfree(pm.buffer); out_mm: mmput(mm); +out_free: + kfree(pm.buffer); out_task: put_task_struct(task); out: -- cgit v1.2.3