From 2ed7c03ec17779afb4fcfa3b8c61df61bd4879ba Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:13:54 +0100 Subject: [CVE-2009-0029] Convert all system calls to return a long Convert all system calls to return a long. This should be a NOP since all converted types should have the same size anyway. With the exception of sys_exit_group which returned void. But that doesn't matter since the system call doesn't return. Signed-off-by: Heiko Carstens --- mm/filemap.c | 2 +- mm/mmap.c | 2 +- mm/mremap.c | 2 +- mm/nommu.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index ceba0bd03662..538b75ed6236 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1374,7 +1374,7 @@ do_readahead(struct address_space *mapping, struct file *filp, return 0; } -asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) +asmlinkage long sys_readahead(int fd, loff_t offset, size_t count) { ssize_t ret; struct file *file; diff --git a/mm/mmap.c b/mm/mmap.c index 749623196cb9..a970d890cb21 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -245,7 +245,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) return next; } -asmlinkage unsigned long sys_brk(unsigned long brk) +asmlinkage long sys_brk(unsigned long brk) { unsigned long rlim, retval; unsigned long newbrk, oldbrk; diff --git a/mm/mremap.c b/mm/mremap.c index 646de959aa58..5572e0825d80 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -420,7 +420,7 @@ out_nc: return ret; } -asmlinkage unsigned long sys_mremap(unsigned long addr, +asmlinkage long sys_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) { diff --git a/mm/nommu.c b/mm/nommu.c index 60ed8375c986..ee3e78927739 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -416,7 +416,7 @@ EXPORT_SYMBOL(vm_insert_page); * to a regular file. in this case, the unmapping will need * to invoke file system routines that need the global lock. */ -asmlinkage unsigned long sys_brk(unsigned long brk) +asmlinkage long sys_brk(unsigned long brk) { struct mm_struct *mm = current->mm; -- cgit v1.2.3 From 6673e0c3fbeaed2cd08e2fd4a4aa97382d6fedb0 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:02 +0100 Subject: [CVE-2009-0029] System call wrapper special cases System calls with an unsigned long long argument can't be converted with the standard wrappers since that would include a cast to long, which in turn means that we would lose the upper 32 bit on 32 bit architectures. Also semctl can't use the standard wrapper since it has a 'union' parameter. So we handle them as special case and add some extra wrappers instead. Signed-off-by: Heiko Carstens --- mm/fadvise.c | 18 ++++++++++++++++-- mm/filemap.c | 9 ++++++++- 2 files changed, 24 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/fadvise.c b/mm/fadvise.c index a1da969bd980..54a0f8040afa 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -24,7 +24,7 @@ * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could * deactivate the pages and clear PG_Referenced. 
*/ -asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) +SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) { struct file *file = fget(fd); struct address_space *mapping; @@ -126,12 +126,26 @@ out: fput(file); return ret; } +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_fadvise64_64(long fd, loff_t offset, loff_t len, long advice) +{ + return SYSC_fadvise64_64((int) fd, offset, len, (int) advice); +} +SYSCALL_ALIAS(sys_fadvise64_64, SyS_fadvise64_64); +#endif #ifdef __ARCH_WANT_SYS_FADVISE64 -asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice) +SYSCALL_DEFINE(fadvise64)(int fd, loff_t offset, size_t len, int advice) { return sys_fadvise64_64(fd, offset, len, advice); } +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_fadvise64(long fd, loff_t offset, long len, long advice) +{ + return SYSC_fadvise64((int) fd, offset, (size_t)len, (int)advice); +} +SYSCALL_ALIAS(sys_fadvise64, SyS_fadvise64); +#endif #endif diff --git a/mm/filemap.c b/mm/filemap.c index 538b75ed6236..23acefe51808 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1374,7 +1374,7 @@ do_readahead(struct address_space *mapping, struct file *filp, return 0; } -asmlinkage long sys_readahead(int fd, loff_t offset, size_t count) +SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) { ssize_t ret; struct file *file; @@ -1393,6 +1393,13 @@ asmlinkage long sys_readahead(int fd, loff_t offset, size_t count) } return ret; } +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_readahead(long fd, loff_t offset, long count) +{ + return SYSC_readahead((int) fd, offset, (size_t) count); +} +SYSCALL_ALIAS(sys_readahead, SyS_readahead); +#endif #ifdef CONFIG_MMU /** -- cgit v1.2.3 From 6a6160a7b5c27b3c38651baef92a14fa7072b3c1 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:15 +0100 Subject: [CVE-2009-0029] System call wrappers part 13 Signed-off-by: Heiko Carstens --- mm/fremap.c | 4 ++-- mm/mlock.c | 4 ++-- mm/mmap.c | 4 ++-- mm/mprotect.c | 4 ++-- mm/mremap.c | 6 +++--- mm/msync.c | 2 +- mm/nommu.c | 11 +++++------ 7 files changed, 17 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/fremap.c b/mm/fremap.c index 62d5bbda921a..736ba7f3306a 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -120,8 +120,8 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, * and the vma's default protection is used. Arbitrary protections * might be implemented in the future. 
*/ -asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, - unsigned long prot, unsigned long pgoff, unsigned long flags) +SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, + unsigned long, prot, unsigned long, pgoff, unsigned long, flags) { struct mm_struct *mm = current->mm; struct address_space *mapping; diff --git a/mm/mlock.c b/mm/mlock.c index e125156c664e..04d5e7429c55 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -530,7 +530,7 @@ static int do_mlock(unsigned long start, size_t len, int on) return error; } -asmlinkage long sys_mlock(unsigned long start, size_t len) +SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) { unsigned long locked; unsigned long lock_limit; @@ -558,7 +558,7 @@ asmlinkage long sys_mlock(unsigned long start, size_t len) return error; } -asmlinkage long sys_munlock(unsigned long start, size_t len) +SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; diff --git a/mm/mmap.c b/mm/mmap.c index a970d890cb21..8d95902e9a38 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -245,7 +245,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) return next; } -asmlinkage long sys_brk(unsigned long brk) +SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long rlim, retval; unsigned long newbrk, oldbrk; @@ -1948,7 +1948,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) EXPORT_SYMBOL(do_munmap); -asmlinkage long sys_munmap(unsigned long addr, size_t len) +SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { int ret; struct mm_struct *mm = current->mm; diff --git a/mm/mprotect.c b/mm/mprotect.c index d0f6e7ce09f1..abe2694e13f4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -217,8 +217,8 @@ fail: return error; } -asmlinkage long -sys_mprotect(unsigned long start, size_t len, unsigned long prot) +SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, + unsigned long, prot) { unsigned long vm_flags, nstart, end, tmp, reqprot; struct vm_area_struct *vma, *prev; diff --git a/mm/mremap.c b/mm/mremap.c index 5572e0825d80..a39b7b91be46 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -420,9 +420,9 @@ out_nc: return ret; } -asmlinkage long sys_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr) +SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, + unsigned long, new_len, unsigned long, flags, + unsigned long, new_addr) { unsigned long ret; diff --git a/mm/msync.c b/mm/msync.c index 07dae08cf31c..4083209b7f02 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -28,7 +28,7 @@ * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to * applications. */ -asmlinkage long sys_msync(unsigned long start, size_t len, int flags) +SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) { unsigned long end; struct mm_struct *mm = current->mm; diff --git a/mm/nommu.c b/mm/nommu.c index ee3e78927739..8cee8c8ff0f2 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -416,7 +416,7 @@ EXPORT_SYMBOL(vm_insert_page); * to a regular file. in this case, the unmapping will need * to invoke file system routines that need the global lock. 
*/ -asmlinkage long sys_brk(unsigned long brk) +SYSCALL_DEFINE1(brk, unsigned long, brk) { struct mm_struct *mm = current->mm; @@ -1573,7 +1573,7 @@ erase_whole_vma: } EXPORT_SYMBOL(do_munmap); -asmlinkage long sys_munmap(unsigned long addr, size_t len) +SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { int ret; struct mm_struct *mm = current->mm; @@ -1657,10 +1657,9 @@ unsigned long do_mremap(unsigned long addr, } EXPORT_SYMBOL(do_mremap); -asmlinkage -unsigned long sys_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr) +SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, + unsigned long, new_len, unsigned long, flags, + unsigned long, new_addr) { unsigned long ret; -- cgit v1.2.3 From 3480b25743cb7404928d57efeaa3d085708b04c2 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:16 +0100 Subject: [CVE-2009-0029] System call wrappers part 14 Signed-off-by: Heiko Carstens --- mm/madvise.c | 2 +- mm/mincore.c | 4 ++-- mm/mlock.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index f9349c18a1b5..b9ce574827c8 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -281,7 +281,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. */ -asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) +SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { unsigned long end, tmp; struct vm_area_struct * vma, *prev; diff --git a/mm/mincore.c b/mm/mincore.c index 5178800bc129..8cb508f84ea4 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -177,8 +177,8 @@ none_mapped: * mapped * -EAGAIN - A kernel resource was temporarily unavailable. 
*/ -asmlinkage long sys_mincore(unsigned long start, size_t len, - unsigned char __user * vec) +SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, + unsigned char __user *, vec) { long retval; unsigned long pages; diff --git a/mm/mlock.c b/mm/mlock.c index 04d5e7429c55..2904a347e476 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -595,7 +595,7 @@ out: return 0; } -asmlinkage long sys_mlockall(int flags) +SYSCALL_DEFINE1(mlockall, int, flags) { unsigned long lock_limit; int ret = -EINVAL; @@ -623,7 +623,7 @@ out: return ret; } -asmlinkage long sys_munlockall(void) +SYSCALL_DEFINE0(munlockall) { int ret; -- cgit v1.2.3 From c4ea37c26a691ad0b7e86aa5884aab27830e95c9 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:28 +0100 Subject: [CVE-2009-0029] System call wrappers part 26 Signed-off-by: Heiko Carstens --- mm/swapfile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index da422c47e2ee..f48b831e5e5c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1377,7 +1377,7 @@ out: return ret; } -asmlinkage long sys_swapoff(const char __user * specialfile) +SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct * p = NULL; unsigned short *swap_map; @@ -1633,7 +1633,7 @@ late_initcall(max_swapfiles_check); * * The swapon system call */ -asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) +SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct * p; char *name = NULL; -- cgit v1.2.3 From 938bb9f5e840eddbf54e4f62f6c5ba9b3ae12c9d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:30 +0100 Subject: [CVE-2009-0029] System call wrappers part 28 Signed-off-by: Heiko Carstens --- mm/mempolicy.c | 24 +++++++++++------------- mm/migrate.c | 8 ++++---- 2 files changed, 15 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e412ffa8e52e..3eb4a6fdc043 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1068,10 +1068,9 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, return copy_to_user(mask, nodes_addr(*nodes), copy) ? 
-EFAULT : 0; } -asmlinkage long sys_mbind(unsigned long start, unsigned long len, - unsigned long mode, - unsigned long __user *nmask, unsigned long maxnode, - unsigned flags) +SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, + unsigned long, mode, unsigned long __user *, nmask, + unsigned long, maxnode, unsigned, flags) { nodemask_t nodes; int err; @@ -1091,8 +1090,8 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, } /* Set the process memory policy */ -asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, - unsigned long maxnode) +SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask, + unsigned long, maxnode) { int err; nodemask_t nodes; @@ -1110,9 +1109,9 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, return do_set_mempolicy(mode, flags, &nodes); } -asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, - const unsigned long __user *old_nodes, - const unsigned long __user *new_nodes) +SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, + const unsigned long __user *, old_nodes, + const unsigned long __user *, new_nodes) { const struct cred *cred = current_cred(), *tcred; struct mm_struct *mm; @@ -1185,10 +1184,9 @@ out: /* Retrieve NUMA policy */ -asmlinkage long sys_get_mempolicy(int __user *policy, - unsigned long __user *nmask, - unsigned long maxnode, - unsigned long addr, unsigned long flags) +SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, + unsigned long __user *, nmask, unsigned long, maxnode, + unsigned long, addr, unsigned long, flags) { int err; int uninitialized_var(pval); diff --git a/mm/migrate.c b/mm/migrate.c index a30ea5fcf9f1..2bb4e1d63520 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1055,10 +1055,10 @@ out: * Move a list of pages in the address space of the currently executing * process. */ -asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, - const void __user * __user *pages, - const int __user *nodes, - int __user *status, int flags) +SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, + const void __user * __user *, pages, + const int __user *, nodes, + int __user *, status, int, flags) { const struct cred *cred = current_cred(), *tcred; struct task_struct *task; -- cgit v1.2.3 From 822c18f2e38cbc775792ab65ace4f9198678dec9 Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Thu, 15 Jan 2009 13:50:48 -0800 Subject: alpha: fix vmalloc breakage On alpha, we have to map some stuff in the VMALLOC space very early in the boot process (to make SRM console callbacks work and so on, see arch/alpha/mm/init.c). For old VM allocator, we just manually placed a vm_struct onto the global vmlist and this worked for ages. Unfortunately, the new allocator isn't aware of this, so it constantly tries to allocate the VM space which is already in use, making vmalloc on alpha defunct. This patch forces KVA to import vmlist entries on init. 
[akpm@linux-foundation.org: remove unneeded check (per Johannes)] Signed-off-by: Ivan Kokshaysky Cc: Nick Piggin Cc: Johannes Weiner Cc: Richard Henderson Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c5db9a7264d9..7e00b280648a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -984,6 +985,8 @@ EXPORT_SYMBOL(vm_map_ram); void __init vmalloc_init(void) { + struct vmap_area *va; + struct vm_struct *tmp; int i; for_each_possible_cpu(i) { @@ -996,6 +999,14 @@ void __init vmalloc_init(void) vbq->nr_dirty = 0; } + /* Import existing vmlist entries. */ + for (tmp = vmlist; tmp; tmp = tmp->next) { + va = alloc_bootmem(sizeof(struct vmap_area)); + va->flags = tmp->flags | VM_VM_AREA; + va->va_start = (unsigned long)tmp->addr; + va->va_end = va->va_start + tmp->size; + __insert_vmap_area(va); + } vmap_initialized = true; } -- cgit v1.2.3 From bd112db872c2f69993c86f458467acb4a14da010 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 15 Jan 2009 13:51:11 -0800 Subject: memcg: fix mem_cgroup_get_reclaim_stat_from_page In case of swapin, a new page is added to lru before it is charged, so page->pc->mem_cgroup points to NULL or last mem_cgroup the page was charged before. In the latter case, if the mem_cgroup has already freed by rmdir, the area pointed to by page->pc->mem_cgroup may have invalid data. Actually, I saw general protection fault. general protection fault: 0000 [#1] SMP last sysfs file: /sys/devices/system/cpu/cpu15/cache/index1/shared_cpu_map CPU 4 Modules linked in: ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp ipv6 autofs4 hidp rfcomm l2cap bluetooth sunrpc dm_mirror dm_region_hash dm_log dm_multipath dm_mod rfkill input_polldev sbs sbshc battery ac lp sg ide_cd_mod cdrom button serio_raw acpi_memhotplug parport_pc e1000 rtc_cmos parport rtc_core rtc_lib i2c_i801 i2c_core shpchp pcspkr ata_piix libata megaraid_mbox megaraid_mm sd_mod scsi_mod ext3 jbd ehci_hcd ohci_hcd uhci_hcd [last unloaded: microcode] Pid: 26038, comm: page01 Tainted: G W 2.6.28-rc9-mm1-mmotm-2008-12-22-16-14-f2ab3dea #1 RIP: 0010:[] [] update_page_reclaim_stat+0x2f/0x42 RSP: 0000:ffff8801ee457da8 EFLAGS: 00010002 RAX: 32353438312021c8 RBX: 0000000000000000 RCX: 32353438312021c8 RDX: 0000000000000000 RSI: ffff8800cb0b1000 RDI: ffff8801164d1d28 RBP: ffff880110002cb8 R08: ffff88010f2eae23 R09: 0000000000000001 R10: ffff8800bc514b00 R11: ffff880110002c00 R12: 0000000000000000 R13: ffff88000f484100 R14: 0000000000000003 R15: 00000000001200d2 FS: 00007f8a261726f0(0000) GS:ffff88010f2eaa80(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007f8a25d22000 CR3: 00000001ef18c000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process page01 (pid: 26038, threadinfo ffff8801ee456000, task ffff8800b585b960) Stack: ffffe200071ee568 ffff880110001f00 0000000000000000 ffffffff8028ea17 ffff88000f484100 0000000000000000 0000000000000020 00007f8a25d22000 ffff8800bc514b00 ffffffff8028ec34 0000000000000000 0000000000016fd8 Call Trace: [] ? ____pagevec_lru_add+0xc1/0x13c [] ? drain_cpu_pagevecs+0x36/0x89 [] ? swapin_readahead+0x78/0x98 [] ? handle_mm_fault+0x3d9/0x741 [] ? do_page_fault+0x3ce/0x78c [] ? trace_hardirqs_off_thunk+0x3a/0x3c [] ? 
page_fault+0x1f/0x30 Code: cc 55 48 8d af b8 0d 00 00 48 89 f7 53 89 d3 e8 39 85 02 00 48 63 d3 48 ff 44 d5 10 45 85 e4 74 05 48 ff 44 d5 00 48 85 c0 74 0e <48> ff 44 d0 10 45 85 e4 74 04 48 ff 04 d0 5b 5d 41 5c c3 41 54 RIP [] update_page_reclaim_stat+0x2f/0x42 RSP Signed-off-by: Daisuke Nishimura Acked-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Pavel Emelyanov Cc: Li Zefan Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e2996b80601f..b66512771167 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -358,6 +358,10 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) return; pc = lookup_page_cgroup(page); + /* + * Used bit is set without atomic ops but after smp_wmb(). + * For making pc->mem_cgroup visible, insert smp_rmb() here. + */ smp_rmb(); /* unused page is not rotated. */ if (!PageCgroupUsed(pc)) @@ -374,7 +378,10 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) if (mem_cgroup_disabled()) return; pc = lookup_page_cgroup(page); - /* barrier to sync with "charge" */ + /* + * Used bit is set without atomic ops but after smp_wmb(). + * For making pc->mem_cgroup visible, insert smp_rmb() here. + */ smp_rmb(); if (!PageCgroupUsed(pc)) return; @@ -559,6 +566,14 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) return NULL; pc = lookup_page_cgroup(page); + /* + * Used bit is set without atomic ops but after smp_wmb(). + * For making pc->mem_cgroup visible, insert smp_rmb() here. + */ + smp_rmb(); + if (!PageCgroupUsed(pc)) + return NULL; + mz = page_cgroup_zoneinfo(pc); if (!mz) return NULL; -- cgit v1.2.3 From 40d58138f832a48208cdce57d6572a033b1f7a23 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 15 Jan 2009 13:51:12 -0800 Subject: memcg: fix error path of mem_cgroup_move_parent There is a bug in error path of mem_cgroup_move_parent. Extra refcnt got from try_charge should be dropped, and usages incremented by try_charge should be decremented in both error paths: A: failure at get_page_unless_zero B: failure at isolate_lru_page This bug makes this parent directory unremovable. In case of A, rmdir doesn't return, because res.usage doesn't go down to 0 at mem_cgroup_force_empty even after all the pc in lru are removed. In case of B, rmdir fails and returns -EBUSY, because it has extra ref counts even after res.usage goes down to 0. 
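To make the unwind rule concrete, here is a minimal userspace sketch of the pattern this fix establishes (try_charge/put_ref/uncharge below are stand-in names, not the kernel functions): whatever try_charge() took has to be given back on every failure path, not only on success.

#include <stdbool.h>
#include <stdio.h>

static int usage, refcnt;

static void try_charge(void) { usage++; refcnt++; }  /* one charge + one extra ref */
static void uncharge(void)   { usage--; }
static void put_ref(void)    { refcnt--; }

/*
 * Toy model of the move-to-parent path: every failure after try_charge()
 * must release both the extra reference and the charged usage.
 * (In the real kernel the success path transfers the charge to the
 * parent; that part is left out of this sketch.)
 */
static int move_to_parent(bool get_page_ok, bool isolate_ok)
{
	int ret = 0;

	try_charge();

	if (!get_page_ok) {		/* failure point A */
		ret = -1;
		goto undo;
	}
	if (!isolate_ok) {		/* failure point B */
		ret = -1;
		goto undo;
	}

	put_ref();			/* success: drop only the extra reference */
	return 0;

undo:
	put_ref();			/* drop the extra refcnt taken by try_charge() */
	uncharge();			/* and the usage it accounted */
	return ret;
}

int main(void)
{
	move_to_parent(false, true);
	move_to_parent(true, false);
	printf("usage=%d refcnt=%d (both 0 means nothing leaked)\n", usage, refcnt);
	return 0;
}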
Signed-off-by: Daisuke Nishimura Acked-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Cc: Pavel Emelyanov Cc: Li Zefan Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b66512771167..7be9b35d7ffb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -994,14 +994,15 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, if (pc->mem_cgroup != from) goto out; - css_put(&from->css); res_counter_uncharge(&from->res, PAGE_SIZE); mem_cgroup_charge_statistics(from, pc, false); if (do_swap_account) res_counter_uncharge(&from->memsw, PAGE_SIZE); + css_put(&from->css); + + css_get(&to->css); pc->mem_cgroup = to; mem_cgroup_charge_statistics(to, pc, true); - css_get(&to->css); ret = 0; out: unlock_page_cgroup(pc); @@ -1034,8 +1035,10 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, if (ret || !parent) return ret; - if (!get_page_unless_zero(page)) - return -EBUSY; + if (!get_page_unless_zero(page)) { + ret = -EBUSY; + goto uncharge; + } ret = isolate_lru_page(page); @@ -1044,19 +1047,23 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, ret = mem_cgroup_move_account(pc, child, parent); - /* drop extra refcnt by try_charge() (move_account increment one) */ - css_put(&parent->css); putback_lru_page(page); if (!ret) { put_page(page); + /* drop extra refcnt by try_charge() */ + css_put(&parent->css); return 0; } - /* uncharge if move fails */ + cancel: + put_page(page); +uncharge: + /* drop extra refcnt by try_charge() */ + css_put(&parent->css); + /* uncharge if move fails */ res_counter_uncharge(&parent->res, PAGE_SIZE); if (do_swap_account) res_counter_uncharge(&parent->memsw, PAGE_SIZE); - put_page(page); return ret; } -- cgit v1.2.3 From c268e9946d7dc30ac4e55cdc3f43c8af1ae8153c Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 15 Jan 2009 13:51:13 -0800 Subject: memcg: fix hierarchical reclaim If root_mem has no children, last_scaned_child is set to root_mem itself. But after some children added to root_mem, mem_cgroup_get_next_node can mem_cgroup_put the root_mem although root_mem has not been mem_cgroup_get. This patch fixes this behavior by: - Set last_scanned_child to NULL if root_mem has no children or DFS search has returned to root_mem itself(root_mem is not a "child" of root_mem). Make mem_cgroup_get_first_node return root_mem in this case. There are no mem_cgroup_get/put for root_mem. - Rename mem_cgroup_get_next_node to __mem_cgroup_get_next_node, and mem_cgroup_get_first_node to mem_cgroup_get_next_node. Make mem_cgroup_hierarchical_reclaim call only new mem_cgroup_get_next_node. 
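As an aside, the intended traversal semantics can be sketched with a generic child/sibling tree in plain C. This is a toy model only, not the memcg code: the real child list is a cgroup list_head and the walk needs the hierarchy mutex.

#include <stddef.h>
#include <stdio.h>

struct node {
	const char  *name;
	struct node *parent;
	struct node *first_child;
	struct node *next_sibling;
};

/*
 * Pre-order walk of root's subtree.  Returns the node to visit after
 * 'curr', or NULL once the walk has wrapped back around to 'root'
 * (root itself is not treated as one of its own children).
 */
static struct node *next_child(struct node *curr, struct node *root)
{
	if (curr->first_child)
		return curr->first_child;

	while (curr != root) {
		if (curr->next_sibling)
			return curr->next_sibling;
		curr = curr->parent;
	}
	return NULL;		/* caller falls back to reclaiming from root */
}

int main(void)
{
	struct node root = { "root", NULL, NULL, NULL };
	struct node a    = { "a", &root, NULL, NULL };
	struct node b    = { "b", &root, NULL, NULL };

	root.first_child = &a;
	a.next_sibling   = &b;

	for (struct node *n = root.first_child; n; n = next_child(n, &root))
		printf("scan %s\n", n->name);	/* prints: scan a, scan b */
	return 0;
}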
Signed-off-by: Daisuke Nishimura Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Li Zefan Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 68 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 36 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7be9b35d7ffb..322625f551c2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -633,7 +633,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, * called with hierarchy_mutex held */ static struct mem_cgroup * -mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem) +__mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem) { struct cgroup *cgroup, *curr_cgroup, *root_cgroup; @@ -644,19 +644,16 @@ mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem) /* * Walk down to children */ - mem_cgroup_put(curr); cgroup = list_entry(curr_cgroup->children.next, struct cgroup, sibling); curr = mem_cgroup_from_cont(cgroup); - mem_cgroup_get(curr); goto done; } visit_parent: if (curr_cgroup == root_cgroup) { - mem_cgroup_put(curr); - curr = root_mem; - mem_cgroup_get(curr); + /* caller handles NULL case */ + curr = NULL; goto done; } @@ -664,11 +661,9 @@ visit_parent: * Goto next sibling */ if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) { - mem_cgroup_put(curr); cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup, sibling); curr = mem_cgroup_from_cont(cgroup); - mem_cgroup_get(curr); goto done; } @@ -679,7 +674,6 @@ visit_parent: goto visit_parent; done: - root_mem->last_scanned_child = curr; return curr; } @@ -689,40 +683,46 @@ done: * that to reclaim free pages from. */ static struct mem_cgroup * -mem_cgroup_get_first_node(struct mem_cgroup *root_mem) +mem_cgroup_get_next_node(struct mem_cgroup *root_mem) { struct cgroup *cgroup; - struct mem_cgroup *ret; + struct mem_cgroup *orig, *next; bool obsolete; - obsolete = mem_cgroup_is_obsolete(root_mem->last_scanned_child); - /* * Scan all children under the mem_cgroup mem */ mutex_lock(&mem_cgroup_subsys.hierarchy_mutex); + + orig = root_mem->last_scanned_child; + obsolete = mem_cgroup_is_obsolete(orig); + if (list_empty(&root_mem->css.cgroup->children)) { - ret = root_mem; + /* + * root_mem might have children before and last_scanned_child + * may point to one of them. We put it later. + */ + if (orig) + VM_BUG_ON(!obsolete); + next = NULL; goto done; } - if (!root_mem->last_scanned_child || obsolete) { - - if (obsolete && root_mem->last_scanned_child) - mem_cgroup_put(root_mem->last_scanned_child); - + if (!orig || obsolete) { cgroup = list_first_entry(&root_mem->css.cgroup->children, struct cgroup, sibling); - ret = mem_cgroup_from_cont(cgroup); - mem_cgroup_get(ret); + next = mem_cgroup_from_cont(cgroup); } else - ret = mem_cgroup_get_next_node(root_mem->last_scanned_child, - root_mem); + next = __mem_cgroup_get_next_node(orig, root_mem); done: - root_mem->last_scanned_child = ret; + if (next) + mem_cgroup_get(next); + root_mem->last_scanned_child = next; + if (orig) + mem_cgroup_put(orig); mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex); - return ret; + return (next) ? 
next : root_mem; } static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) @@ -780,21 +780,18 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, if (!root_mem->use_hierarchy) return ret; - next_mem = mem_cgroup_get_first_node(root_mem); + next_mem = mem_cgroup_get_next_node(root_mem); while (next_mem != root_mem) { if (mem_cgroup_is_obsolete(next_mem)) { - mem_cgroup_put(next_mem); - next_mem = mem_cgroup_get_first_node(root_mem); + next_mem = mem_cgroup_get_next_node(root_mem); continue; } ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap, get_swappiness(next_mem)); if (mem_cgroup_check_under_limit(root_mem)) return 0; - mutex_lock(&mem_cgroup_subsys.hierarchy_mutex); - next_mem = mem_cgroup_get_next_node(next_mem, root_mem); - mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex); + next_mem = mem_cgroup_get_next_node(root_mem); } return ret; } @@ -2254,7 +2251,14 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, static void mem_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cont) { - mem_cgroup_put(mem_cgroup_from_cont(cont)); + struct mem_cgroup *mem = mem_cgroup_from_cont(cont); + struct mem_cgroup *last_scanned_child = mem->last_scanned_child; + + if (last_scanned_child) { + VM_BUG_ON(!mem_cgroup_is_obsolete(last_scanned_child)); + mem_cgroup_put(last_scanned_child); + } + mem_cgroup_put(mem); } static int mem_cgroup_populate(struct cgroup_subsys *ss, -- cgit v1.2.3 From 4d1c627389c8ba6d9e703208567ffcdbd356f682 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 15 Jan 2009 13:51:14 -0800 Subject: memcg: make oom less frequently In previous implementation, mem_cgroup_try_charge checked the return value of mem_cgroup_try_to_free_pages, and just retried if some pages had been reclaimed. But now, try_charge(and mem_cgroup_hierarchical_reclaim called from it) only checks whether the usage is less than the limit. This patch tries to change the behavior as before to cause oom less frequently. Signed-off-by: Daisuke Nishimura Acked-by: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Li Zefan Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 322625f551c2..fb62b4335fa9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -773,10 +773,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, * but there might be left over accounting, even after children * have left. 
*/ - ret = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap, + ret += try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap, get_swappiness(root_mem)); if (mem_cgroup_check_under_limit(root_mem)) - return 0; + return 1; /* indicate reclaim has succeeded */ if (!root_mem->use_hierarchy) return ret; @@ -787,10 +787,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, next_mem = mem_cgroup_get_next_node(root_mem); continue; } - ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap, + ret += try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap, get_swappiness(next_mem)); if (mem_cgroup_check_under_limit(root_mem)) - return 0; + return 1; /* indicate reclaim has succeeded */ next_mem = mem_cgroup_get_next_node(root_mem); } return ret; @@ -875,6 +875,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, noswap); + if (ret) + continue; /* * try_to_free_mem_cgroup_pages() might not give us a full -- cgit v1.2.3 From 46666d8ac42893f90edde7e57a11bc8749d7e89c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 15 Jan 2009 13:51:15 -0800 Subject: revert "mm: vmalloc use mutex for purge" Revert commit e97a630eb0f5b8b380fd67504de6cedebb489003 ("mm: vmalloc use mutex for purge") Bryan Donlan reports: : After testing 2.6.29-rc1 on xen-x86 with a btrfs root filesystem, I : got the OOPS quoted below and a hard freeze shortly after boot. : Boot messages and config are attached. : : ------------[ cut here ]------------ : Kernel BUG at c05ef80d [verbose debug info unavailable] : invalid opcode: 0000 [#1] SMP : last sysfs file: /sys/block/xvdc/size : Modules linked in: : : Pid: 0, comm: swapper Not tainted (2.6.29-rc1 #6) : EIP: 0061:[] EFLAGS: 00010087 CPU: 2 : EIP is at schedule+0x7cd/0x950 : EAX: d5aeca80 EBX: 00000002 ECX: 00000000 EDX: d4cb9a40 : ESI: c12f5600 EDI: d4cb9a40 EBP: d6033fa4 ESP: d6033ef4 : DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0069 : Process swapper (pid: 0, ti=d6032000 task=d6020b70 task.ti=d6032000) : Stack: : 000d85bc 00000000 000186a0 00000000 0dd11410 c0105417 c12efe00 0dc367c3 : 00000011 c0105d46 d5a5d310 deadbeef d4cb9a40 c07cc600 c05f1340 c12e0060 : deadbeef d6020b70 d6020d08 00000002 c014377d 00000000 c12f5600 00002c22 : Call Trace: : [] xen_force_evtchn_callback+0x17/0x30 : [] check_events+0x8/0x12 : [] _spin_unlock_irqrestore+0x20/0x40 : [] hrtimer_start_range_ns+0x12d/0x2e0 : [] tick_nohz_restart_sched_tick+0x146/0x160 : [] cpu_idle+0xa5/0xc0 and bisected it to this commit. Let's remove it now while we have a think about the problem. Reported-by: Bryan Donlan Tested-by: Christophe Saout Cc: Nick Piggin Cc: Ingo Molnar Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7e00b280648a..75f49d312e8c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -496,7 +495,7 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, int sync, int force_flush) { - static DEFINE_MUTEX(purge_lock); + static DEFINE_SPINLOCK(purge_lock); LIST_HEAD(valist); struct vmap_area *va; int nr = 0; @@ -507,10 +506,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, * the case that isn't actually used at the moment anyway. 
*/ if (!sync && !force_flush) { - if (!mutex_trylock(&purge_lock)) + if (!spin_trylock(&purge_lock)) return; } else - mutex_lock(&purge_lock); + spin_lock(&purge_lock); rcu_read_lock(); list_for_each_entry_rcu(va, &vmap_area_list, list) { @@ -542,7 +541,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, __free_vmap_area(va); spin_unlock(&vmap_area_lock); } - mutex_unlock(&purge_lock); + spin_unlock(&purge_lock); } /* -- cgit v1.2.3 From 0eb253e223c88b982461e59154fcad1b82597592 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 15 Jan 2009 13:51:25 -0800 Subject: memcg: fix section mismatch At system boot when creating the top cgroup, mem_cgroup_create() calls enable_swap_cgroup() which is marked as __init, so mark mem_cgroup_create() as __ref to avoid false section mismatch warning. Reported-by: Rakib Mullick Signed-off-by: Li Zefan Acked-by; KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fb62b4335fa9..f0dc076adf05 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2202,7 +2202,7 @@ static void __init enable_swap_cgroup(void) } #endif -static struct cgroup_subsys_state * +static struct cgroup_subsys_state * __ref mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) { struct mem_cgroup *mem, *parent; -- cgit v1.2.3 From 068b38c1fa7a9210608f27ac521897ccc5f9b726 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 15 Jan 2009 13:51:26 -0800 Subject: memcg: fix a race when setting memory.swappiness (suppose: memcg->use_hierarchy == 0 and memcg->swappiness == 60) echo 10 > /memcg/0/swappiness | mem_cgroup_swappiness_write() | ... | echo 1 > /memcg/0/use_hierarchy | mkdir /mnt/0/1 | sub_memcg->swappiness = 60; memcg->swappiness = 10; | In the above scenario, we end up having 2 different swappiness values in a single hierarchy. We should hold cgroup_lock() when cheking cgrp->children list. Signed-off-by: Li Zefan Acked-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Paul Menage Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f0dc076adf05..4d0ea3ceba6d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1992,6 +1992,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup *parent; + if (val > 100) return -EINVAL; @@ -1999,15 +2000,22 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, return -EINVAL; parent = mem_cgroup_from_cont(cgrp->parent); + + cgroup_lock(); + /* If under hierarchy, only empty-root can set this value */ if ((parent->use_hierarchy) || - (memcg->use_hierarchy && !list_empty(&cgrp->children))) + (memcg->use_hierarchy && !list_empty(&cgrp->children))) { + cgroup_unlock(); return -EINVAL; + } spin_lock(&memcg->reclaim_param_lock); memcg->swappiness = val; spin_unlock(&memcg->reclaim_param_lock); + cgroup_unlock(); + return 0; } -- cgit v1.2.3 From eb6434d9e79a72d35d68811efd68fe8bab8f5baf Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 21 Jan 2009 17:45:47 +0900 Subject: nommu: Stub in vm_map_ram()/vm_unmap_ram()/vm_unmap_aliases(). 
Presently we do not support these interfaces, so make them BUG() wrappers as per the rest of the vmap interface on nommu. Fixes up the modular xfs build. Signed-off-by: Paul Mundt --- mm/nommu.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 8cee8c8ff0f2..0c3e7d2114f6 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -10,7 +10,7 @@ * Copyright (c) 2000-2003 David McCullough * Copyright (c) 2000-2001 D Jeff Dionne * Copyright (c) 2002 Greg Ungerer - * Copyright (c) 2007-2008 Paul Mundt + * Copyright (c) 2007-2009 Paul Mundt */ #include @@ -394,6 +394,24 @@ void vunmap(const void *addr) } EXPORT_SYMBOL(vunmap); +void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) +{ + BUG(); + return NULL; +} +EXPORT_SYMBOL(vm_map_ram); + +void vm_unmap_ram(const void *mem, unsigned int count) +{ + BUG(); +} +EXPORT_SYMBOL(vm_unmap_ram); + +void vm_unmap_aliases(void) +{ +} +EXPORT_SYMBOL_GPL(vm_unmap_aliases); + /* * Implement a stub for vmalloc_sync_all() if the architecture chose not to * have one. -- cgit v1.2.3 From 05ae6fa31874eda2484da13c5dc4ddee8a47a0a4 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Tue, 13 Jan 2009 17:30:22 +1000 Subject: uclinux: add process name to allocation error message This patch adds the name of the process to the bad allocation error message on non-MMU systems. Changed suggested by jsujjavanich@syntech-fuelmaster.com Signed-off-by: Greg Ungerer --- mm/nommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 0c3e7d2114f6..2fcf47d449b4 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1161,8 +1161,8 @@ error_free: return ret; enomem: - printk("Allocation of length %lu from process %d failed\n", - len, current->pid); + printk("Allocation of length %lu from process %d (%s) failed\n", + len, current->pid, current->comm); show_free_areas(); return -ENOMEM; } -- cgit v1.2.3 From 3718909448116bf4411445468c58acc946379f92 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 27 Jan 2009 18:59:46 -0800 Subject: slub: fix per cpu kmem_cache_cpu array memory leak The per cpu array of kmem_cache_cpu structures accomodates NR_KMEM_CACHE_CPU such structs. When this array overflows and a struct is allocated by kmalloc(), it may have an address at the upper bound of this array. If this happens, it does not get freed and the per cpu kmem_cache_cpu_free pointer will be out of bounds after kmem_cache_destroy() or cpu offlining. 
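The off-by-one is easiest to see on a toy array: a pointer lies inside a static array of N elements only if it is below 'base + N'; 'base + N' itself is one past the end. A minimal sketch with made-up names:

#include <stdbool.h>
#include <stdio.h>

#define NR_SLOTS 4

static int slots[NR_SLOTS];

/*
 * A pointer belongs to the static array only if it is in
 * [slots, slots + NR_SLOTS).  Using '>' instead of '>=' for the upper
 * bound wrongly treats 'slots + NR_SLOTS' (one past the end) as part
 * of the array -- the off-by-one this patch fixes.
 */
static bool in_static_array(int *p)
{
	return p >= slots && p < slots + NR_SLOTS;
}

int main(void)
{
	printf("%d\n", in_static_array(&slots[0]));		/* 1: inside */
	printf("%d\n", in_static_array(slots + NR_SLOTS));	/* 0: one past the end */
	return 0;
}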
Cc: Christoph Lameter Signed-off-by: David Rientjes Signed-off-by: Pekka Enberg --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 6392ae5cc6b1..bdc9abb08a23 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1996,7 +1996,7 @@ static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) { if (c < per_cpu(kmem_cache_cpu, cpu) || - c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { + c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { kfree(c); return; } -- cgit v1.2.3 From de33c8db5910cda599899dd431cc30d7c1018cbf Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 29 Jan 2009 17:46:42 -0800 Subject: Fix OOPS in mmap_region() when merging adjacent VM_LOCKED file segments As of commit ba470de43188cdbff795b5da43a1474523c6c2fb ("map: handle mlocked pages during map, remap, unmap") we now use the 'vma' variable at the end of mmap_region() to handle the page-in of newly mapped mlocked pages. However, if we merged adjacent vma's together, the vma we're using may be stale. We historically consciously avoided using it after the merge operation, but that got overlooked when redoing the locked page handling. This commit simplifies mmap_region() by doing any vma merges early, avoiding the issue entirely, and 'vma' will always be valid. As pointed out by Hugh Dickins, this depends on any drivers that change the page offset of flags to have set one of the VM_SPECIAL bits (so that they cannot trigger the early merge logic), but that's true in general. Reported-and-tested-by: Maksim Yevmenkin Cc: Lee Schermerhorn Cc: Nick Piggin Cc: Andrew Morton Cc: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/mmap.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 8d95902e9a38..d3fa10a726cf 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1134,16 +1134,11 @@ munmap_back: } /* - * Can we just expand an old private anonymous mapping? - * The VM_SHARED test is necessary because shmem_zero_setup - * will create the file object for a shared anonymous map below. + * Can we just expand an old mapping? */ - if (!file && !(vm_flags & VM_SHARED)) { - vma = vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, NULL, pgoff, NULL); - if (vma) - goto out; - } + vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL); + if (vma) + goto out; /* * Determine the object being mapped and call the appropriate @@ -1206,17 +1201,8 @@ munmap_back: if (vma_wants_writenotify(vma)) vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); - if (file && vma_merge(mm, prev, addr, vma->vm_end, - vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { - mpol_put(vma_policy(vma)); - kmem_cache_free(vm_area_cachep, vma); - fput(file); - if (vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(mm); - } else { - vma_link(mm, vma, prev, rb_link, rb_parent); - file = vma->vm_file; - } + vma_link(mm, vma, prev, rb_link, rb_parent); + file = vma->vm_file; /* Once vma denies write, undo our temporary denial count */ if (correct_wcount) -- cgit v1.2.3 From 7bcc1bb1232de6efc0b85e0c7fe38e90b2436318 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 29 Jan 2009 14:25:11 -0800 Subject: memcg: get/put parents at create/free The lifetime of struct cgroup and struct mem_cgroup is different and mem_cgroup has its own reference count for handling references from swap_cgroup. 
This causes strange problem that the parent mem_cgroup dies while child mem_cgroup alive, and this problem causes a bug in case of use_hierarchy==1 because res_counter_uncharge climbs up the tree. This patch is for avoiding it by getting the parent at create, and putting it at freeing. Signed-off-by: Daisuke Nishimura Reviewed-by; KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Pavel Emelyanov Cc: Li Zefan Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4d0ea3ceba6d..76feccd26dc3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -202,6 +202,7 @@ pcg_default_flags[NR_CHARGE_TYPE] = { static void mem_cgroup_get(struct mem_cgroup *mem); static void mem_cgroup_put(struct mem_cgroup *mem); +static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, struct page_cgroup *pc, @@ -2193,10 +2194,23 @@ static void mem_cgroup_get(struct mem_cgroup *mem) static void mem_cgroup_put(struct mem_cgroup *mem) { - if (atomic_dec_and_test(&mem->refcnt)) + if (atomic_dec_and_test(&mem->refcnt)) { + struct mem_cgroup *parent = parent_mem_cgroup(mem); __mem_cgroup_free(mem); + if (parent) + mem_cgroup_put(parent); + } } +/* + * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. + */ +static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) +{ + if (!mem->res.parent) + return NULL; + return mem_cgroup_from_res_counter(mem->res.parent, res); +} #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP static void __init enable_swap_cgroup(void) @@ -2235,6 +2249,13 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) if (parent && parent->use_hierarchy) { res_counter_init(&mem->res, &parent->res); res_counter_init(&mem->memsw, &parent->memsw); + /* + * We increment refcnt of the parent to ensure that we can + * safely access it on res_counter_charge/uncharge. + * This refcnt will be decremented when freeing this + * mem_cgroup(see mem_cgroup_put). + */ + mem_cgroup_get(parent); } else { res_counter_init(&mem->res, NULL); res_counter_init(&mem->memsw, NULL); -- cgit v1.2.3 From 85d9fc89fb0f0703df6444f260187c088a8d59ff Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 29 Jan 2009 14:25:13 -0800 Subject: memcg: fix refcnt handling at swapoff Now, at swapoff, even while try_charge() fails, commit is executed. This is a bug which turns the refcnt of cgroup_subsys_state negative. 
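The lifetime rule being introduced -- a child pins its parent for as long as the child can still uncharge into it -- can be sketched in a few lines of plain C. Illustrative only; the names and structure are invented, not the memcg API.

#include <stdio.h>
#include <stdlib.h>

struct group {
	struct group *parent;
	int           refcnt;
};

static void get(struct group *g) { g->refcnt++; }

static void put(struct group *g)
{
	if (--g->refcnt == 0) {
		struct group *parent = g->parent;

		printf("freeing group %p\n", (void *)g);
		free(g);
		if (parent)
			put(parent);	/* the child's reference kept the parent alive */
	}
}

static struct group *create(struct group *parent)
{
	struct group *g = calloc(1, sizeof(*g));

	g->parent = parent;
	g->refcnt = 1;
	if (parent)
		get(parent);		/* pin the parent for the child's lifetime */
	return g;
}

int main(void)
{
	struct group *root  = create(NULL);
	struct group *child = create(root);

	put(root);			/* root survives: child still holds a reference */
	put(child);			/* now both are freed, child first, then root */
	return 0;
}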
Reported-by: Li Zefan Tested-by: Li Zefan Tested-by: Daisuke Nishimura Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Daisuke Nishimura Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index f48b831e5e5c..7e6304dfafab 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -698,8 +698,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, pte_t *pte; int ret = 1; - if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) + if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) { ret = -ENOMEM; + goto out_nolock; + } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { @@ -723,6 +725,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, activate_page(page); out: pte_unmap_unlock(pte, ptl); +out_nolock: return ret; } -- cgit v1.2.3 From 299b4eaa302138426d5a9ecd954de1f565d76c94 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 29 Jan 2009 14:25:17 -0800 Subject: memcg: NULL pointer dereference at rmdir on some NUMA systems N_POSSIBLE doesn't means there is memory...and force_empty can visit invalid node which have no pgdat. To visit all valid nodes, N_HIGH_MEMORY should be used. Reported-by: Li Zefan Signed-off-by: KAMEZAWA Hiroyuki Tested-by: Li Zefan Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 76feccd26dc3..8e4be9cb2a6a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1685,7 +1685,7 @@ move_account: /* This is for making all *used* pages to be on LRU. */ lru_add_drain_all(); ret = 0; - for_each_node_state(node, N_POSSIBLE) { + for_each_node_state(node, N_HIGH_MEMORY) { for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { enum lru_list l; for_each_lru(l) { -- cgit v1.2.3 From 33bfad54b58cf05cfe6678c3ec9235d4bc8db4c2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 30 Jan 2009 11:37:22 -0800 Subject: Allow opportunistic merging of VM_CAN_NONLINEAR areas Commit de33c8db5910cda599899dd431cc30d7c1018cbf ("Fix OOPS in mmap_region() when merging adjacent VM_LOCKED file segments") unified the vma merging of anonymous and file maps to just one place, which simplified the code and fixed a use-after-free bug that could cause an oops. But by doing the merge opportunistically before even having called ->mmap() on the file method, it now compares two different 'vm_flags' values: the pre-mmap() value of the new not-yet-formed vma, and previous mappings of the same file around it. And in doing so, it refused to merge the common file case, which adds a marker to say "I can be made non-linear". This fixes it by just adding a set of flags that don't have to match, because we know they are ok to merge. Currently it's only that single VM_CAN_NONLINEAR flag, but at least conceptually there could be others in the future. 
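The mask test used below in is_mergeable_vma() is the usual "flags may differ only in the tolerated bits" check; a tiny standalone sketch (flag names invented):

#include <stdbool.h>
#include <stdio.h>

#define FLAG_READ	0x1u
#define FLAG_WRITE	0x2u
#define FLAG_NONLINEAR	0x4u	/* stands in for VM_CAN_NONLINEAR */

/* Bits two mappings may disagree on and still be merged. */
#define MERGEABLE_MASK	FLAG_NONLINEAR

static bool flags_mergeable(unsigned int a, unsigned int b)
{
	/* XOR leaves only the differing bits; mask off the tolerated ones. */
	return ((a ^ b) & ~MERGEABLE_MASK) == 0;
}

int main(void)
{
	printf("%d\n", flags_mergeable(FLAG_READ | FLAG_WRITE,
				       FLAG_READ | FLAG_WRITE | FLAG_NONLINEAR)); /* 1 */
	printf("%d\n", flags_mergeable(FLAG_READ, FLAG_READ | FLAG_WRITE));	   /* 0 */
	return 0;
}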
Reported-and-acked-by: Hugh Dickins Cc: Lee Schermerhorn Cc: Nick Piggin Cc: Andrew Morton Cc: Greg KH Signed-off-by: Linus Torvalds --- mm/mmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index d3fa10a726cf..c581df14d0de 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -658,6 +658,9 @@ again: remove_next = 1 + (end > next->vm_end); validate_mm(mm); } +/* Flags that can be inherited from an existing mapping when merging */ +#define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR) + /* * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those. @@ -665,7 +668,7 @@ again: remove_next = 1 + (end > next->vm_end); static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) { - if (vma->vm_flags != vm_flags) + if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS) return 0; if (vma->vm_file != file) return 0; -- cgit v1.2.3 From fc8744adc870a8d4366908221508bb113d8b72ee Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 31 Jan 2009 15:08:56 -0800 Subject: Stop playing silly games with the VM_ACCOUNT flag The mmap_region() code would temporarily set the VM_ACCOUNT flag for anonymous shared mappings just to inform shmem_zero_setup() that it should enable accounting for the resulting shm object. It would then clear the flag after calling ->mmap (for the /dev/zero case) or doing shmem_zero_setup() (for the MAP_ANON case). This just resulted in vma merge issues, but also made for just unnecessary confusion. Use the already-existing VM_NORESERVE flag for this instead, and let shmem_{zero|file}_setup() just figure it out from that. This also happens to make it obvious that the new DRI2 GEM layer uses a non-reserving backing store for its object allocation - which is quite possibly not intentional. But since I didn't want to change semantics in this patch, I left it alone, and just updated the caller to use the new flag semantics. Signed-off-by: Linus Torvalds --- mm/mmap.c | 48 +++++++++++++++++++++++++----------------------- mm/shmem.c | 2 +- 2 files changed, 26 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index c581df14d0de..214b6a258eeb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1090,6 +1090,15 @@ int vma_wants_writenotify(struct vm_area_struct *vma) mapping_cap_account_dirty(vma->vm_file->f_mapping); } +/* + * We account for memory if it's a private writeable mapping, + * and VM_NORESERVE wasn't set. + */ +static inline int accountable_mapping(unsigned int vm_flags) +{ + return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; +} + unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, unsigned int vm_flags, unsigned long pgoff, @@ -1117,23 +1126,24 @@ munmap_back: if (!may_expand_vm(mm, len >> PAGE_SHIFT)) return -ENOMEM; - if (flags & MAP_NORESERVE) + /* + * Set 'VM_NORESERVE' if we should not account for the + * memory use of this mapping. We only honor MAP_NORESERVE + * if we're allowed to overcommit memory. + */ + if ((flags & MAP_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER) + vm_flags |= VM_NORESERVE; + if (!accountable) vm_flags |= VM_NORESERVE; - if (accountable && (!(flags & MAP_NORESERVE) || - sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { - if (vm_flags & VM_SHARED) { - /* Check memory availability in shmem_file_setup? 
*/ - vm_flags |= VM_ACCOUNT; - } else if (vm_flags & VM_WRITE) { - /* - * Private writable mapping: check memory availability - */ - charged = len >> PAGE_SHIFT; - if (security_vm_enough_memory(charged)) - return -ENOMEM; - vm_flags |= VM_ACCOUNT; - } + /* + * Private writable mapping: check memory availability + */ + if (accountable_mapping(vm_flags)) { + charged = len >> PAGE_SHIFT; + if (security_vm_enough_memory(charged)) + return -ENOMEM; + vm_flags |= VM_ACCOUNT; } /* @@ -1184,14 +1194,6 @@ munmap_back: goto free_vma; } - /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform - * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) - * that memory reservation must be checked; but that reservation - * belongs to shared memory object, not to vma: so now clear it. - */ - if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT)) - vma->vm_flags &= ~VM_ACCOUNT; - /* Can addr have changed?? * * Answer: Yes, several device drivers can do it in their diff --git a/mm/shmem.c b/mm/shmem.c index 5d0de96c9789..19d566ccdeea 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2628,7 +2628,7 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) goto close_file; #ifdef CONFIG_SHMEM - SHMEM_I(inode)->flags = flags & VM_ACCOUNT; + SHMEM_I(inode)->flags = (flags & VM_NORESERVE) ? 0 : VM_ACCOUNT; #endif d_instantiate(dentry, inode); inode->i_size = size; -- cgit v1.2.3 From 27421e211a39784694b597dbf35848b88363c248 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 1 Feb 2009 11:00:16 -0800 Subject: Manually revert "mlock: downgrade mmap sem while populating mlocked regions" This essentially reverts commit 8edb08caf68184fb170f4f69c7445929e199eaea. It downgraded our mmap semaphore to a read-lock while mlocking pages, in order to allow other threads (and external accesses like "ps" et al) to walk the vma lists and take page faults etc. Which is a nice idea, but the implementation does not work. Because we cannot upgrade the lock back to a write lock without releasing the mmap semaphore, the code had to release the lock entirely and then re-take it as a writelock. However, that meant that the caller possibly lost the vma chain that it was following, since now another thread could come in and mmap/munmap the range. The code tried to work around that by just looking up the vma again and erroring out if that happened, but quite frankly, that was just a buggy hack that doesn't actually protect against anything (the other thread could just have replaced the vma with another one instead of totally unmapping it). The only way to downgrade to a read map _reliably_ is to do it at the end, which is likely the right thing to do: do all the 'vma' operations with the write-lock held, then downgrade to a read after completing them all, and then do the "populate the newly mlocked regions" while holding just the read lock. And then just drop the read-lock and return to user space. The (perhaps somewhat simpler) alternative is to just make all the callers of mlock_vma_pages_range() know that the mmap lock got dropped, and just re-grab the mmap semaphore if it needs to mlock more than one vma region. So we can do this "downgrade mmap sem while populating mlocked regions" thing right, but the way it was done here was absolutely not correct. Thus the revert, in the expectation that we will do it all correctly some day. 
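For reference, the ordering described above -- finish every vma change under the write lock, downgrade exactly once at the end, then do the slow page population under the read lock -- looks roughly like this. This is a schematic sketch with stand-in lock functions, not the real rwsem API.

#include <stdio.h>

/* Stand-ins for taking, downgrading and releasing mmap_sem. */
static void lock_write(void)	    { puts("write lock taken"); }
static void downgrade_to_read(void) { puts("downgraded to read lock"); }
static void unlock_read(void)	    { puts("read lock dropped"); }

static void update_vmas(void)	    { puts("vma list updated"); }
static void populate_pages(void)    { puts("mlocked pages faulted in"); }

/*
 * The reliable variant: the lock is never dropped and re-taken in the
 * middle, so the vma chain stays valid; other readers are only blocked
 * for the short vma-update phase, not for the long population phase.
 */
int main(void)
{
	lock_write();
	update_vmas();		/* all vma modifications happen here */
	downgrade_to_read();	/* one-way downgrade, done only at the end */
	populate_pages();
	unlock_read();
	return 0;
}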
Cc: Lee Schermerhorn Cc: Rik van Riel Cc: Andrew Morton Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/mlock.c | 47 ++--------------------------------------------- 1 file changed, 2 insertions(+), 45 deletions(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 2904a347e476..028ec482fdd4 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -294,14 +294,10 @@ static inline int __mlock_posix_error_return(long retval) * * return number of pages [> 0] to be removed from locked_vm on success * of "special" vmas. - * - * return negative error if vma spanning @start-@range disappears while - * mmap semaphore is dropped. Unlikely? */ long mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - struct mm_struct *mm = vma->vm_mm; int nr_pages = (end - start) / PAGE_SIZE; BUG_ON(!(vma->vm_flags & VM_LOCKED)); @@ -314,20 +310,8 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))) { - long error; - downgrade_write(&mm->mmap_sem); - - error = __mlock_vma_pages_range(vma, start, end, 1); - up_read(&mm->mmap_sem); - /* vma can change or disappear */ - down_write(&mm->mmap_sem); - vma = find_vma(mm, start); - /* non-NULL vma must contain @start, but need to check @end */ - if (!vma || end > vma->vm_end) - return -ENOMEM; - - return 0; /* hide other errors from mmap(), et al */ + return __mlock_vma_pages_range(vma, start, end, 1); } /* @@ -438,41 +422,14 @@ success: vma->vm_flags = newflags; if (lock) { - /* - * mmap_sem is currently held for write. Downgrade the write - * lock to a read lock so that other faults, mmap scans, ... - * while we fault in all pages. - */ - downgrade_write(&mm->mmap_sem); - ret = __mlock_vma_pages_range(vma, start, end, 1); - /* - * Need to reacquire mmap sem in write mode, as our callers - * expect this. We have no support for atomically upgrading - * a sem to write, so we need to check for ranges while sem - * is unlocked. - */ - up_read(&mm->mmap_sem); - /* vma can change or disappear */ - down_write(&mm->mmap_sem); - *prev = find_vma(mm, start); - /* non-NULL *prev must contain @start, but need to check @end */ - if (!(*prev) || end > (*prev)->vm_end) - ret = -ENOMEM; - else if (ret > 0) { + if (ret > 0) { mm->locked_vm -= ret; ret = 0; } else ret = __mlock_posix_error_return(ret); /* translate if needed */ } else { - /* - * TODO: for unlocking, pages will already be resident, so - * we don't need to wait for allocations/reclaim/pagein, ... - * However, unlocking a very large region can still take a - * while. Should we downgrade the semaphore for both lock - * AND unlock ? - */ __mlock_vma_pages_range(vma, start, end, 0); } -- cgit v1.2.3 From dcf6a79dda5cc2a2bec183e50d829030c0972aaa Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 2 Feb 2009 18:33:49 +0200 Subject: write-back: fix nr_to_write counter Commit 05fe478dd04e02fa230c305ab9b5616669821dd3 introduced some @wbc->nr_to_write breakage. It made the following changes: 1. Decrement wbc->nr_to_write instead of nr_to_write 2. Decrement wbc->nr_to_write _only_ if wbc->sync_mode == WB_SYNC_NONE 3. If synced nr_to_write pages, stop only if if wbc->sync_mode == WB_SYNC_NONE, otherwise keep going. However, according to the commit message, the intention was to only make change 3. Change 1 is a bug. Change 2 does not seem to be necessary, and it breaks UBIFS expectations, so if needed, it should be done separately later. 
And change 2 does not seem to be documented in the commit message. This patch does the following: 1. Undo changes 1 and 2 2. Add a comment explaining change 3 (it very useful to have comments in _code_, not only in the commit). Signed-off-by: Artem Bityutskiy Acked-by: Nick Piggin Cc: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b493db7841dc..dc32dae01e5f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1051,13 +1051,22 @@ continue_unlock: } } - if (wbc->sync_mode == WB_SYNC_NONE) { - wbc->nr_to_write--; - if (wbc->nr_to_write <= 0) { - done = 1; - break; - } + if (nr_to_write > 0) + nr_to_write--; + else if (wbc->sync_mode == WB_SYNC_NONE) { + /* + * We stop writing back only if we are not + * doing integrity sync. In case of integrity + * sync we have to keep going because someone + * may be concurrently dirtying pages, and we + * might have synced a lot of newly appeared + * dirty pages, but have not synced all of the + * old dirty pages. + */ + done = 1; + break; } + if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; done = 1; -- cgit v1.2.3 From ab92661d5d9514647346047f30f67a7f35ffea67 Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 4 Feb 2009 15:12:16 -0800 Subject: do_wp_page: fix regression with execute in place Fix do_wp_page for VM_MIXEDMAP mappings. In the case where pfn_valid returns 0 for a pfn at the beginning of do_wp_page and the mapping is not shared writable, the code branches to label `gotten:' with old_page == NULL. In case the vma is locked (vma->vm_flags & VM_LOCKED), lock_page, clear_page_mlock, and unlock_page try to access the old_page. This patch checks whether old_page is valid before it is dereferenced. The regression was introduced by "mlock: mlocked pages are unevictable" (commit b291f000393f5a0b679012b39d79fbc85c018233). Signed-off-by: Carsten Otte Cc: Nick Piggin Cc: Heiko Carstens Cc: [2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 22bfa7a47a0b..baa999e87cd2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1999,7 +1999,7 @@ gotten: * Don't let another task, with possibly unlocked vma, * keep the mlocked page. */ - if (vma->vm_flags & VM_LOCKED) { + if ((vma->vm_flags & VM_LOCKED) && old_page) { lock_page(old_page); /* for LRU manipulation */ clear_page_mlock(old_page); unlock_page(old_page); -- cgit v1.2.3 From d5b562330ec766292a3ac54ae5e0673610bd5b3d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 8 Feb 2009 20:56:58 +0000 Subject: mm: fix error case in mlock downgrade reversion Commit 27421e211a39784694b597dbf35848b88363c248, Manually revert "mlock: downgrade mmap sem while populating mlocked regions", has introduced its own regression: __mlock_vma_pages_range() may report an error (for example, -EFAULT from trying to lock down pages from beyond EOF), but mlock_vma_pages_range() must hide that from its callers as before. 
Reported-by: Sami Farin Signed-off-by: Hugh Dickins Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/mlock.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 028ec482fdd4..037161d61b4e 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -311,7 +311,10 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))) { - return __mlock_vma_pages_range(vma, start, end, 1); + __mlock_vma_pages_range(vma, start, end, 1); + + /* Hide errors from mmap() and other callers */ + return 0; } /* -- cgit v1.2.3 From 5a6fe125950676015f5108fb71b2a67441755003 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Feb 2009 14:02:27 +0000 Subject: Do not account for the address space used by hugetlbfs using VM_ACCOUNT When overcommit is disabled, the core VM accounts for pages used by anonymous shared, private mappings and special mappings. It keeps track of VMAs that should be accounted for with VM_ACCOUNT and VMAs that never had a reserve with VM_NORESERVE. Overcommit for hugetlbfs is much riskier than overcommit for base pages due to contiguity requirements. It avoids overcommiting on both shared and private mappings using reservation counters that are checked and updated during mmap(). This ensures (within limits) that hugepages exist in the future when faults occurs or it is too easy to applications to be SIGKILLed. As hugetlbfs makes its own reservations of a different unit to the base page size, VM_ACCOUNT should never be set. Even if the units were correct, we would double account for the usage in the core VM and hugetlbfs. VM_NORESERVE may be set because an application can request no reserves be made for hugetlbfs at the risk of getting killed later. With commit fc8744adc870a8d4366908221508bb113d8b72ee, VM_NORESERVE and VM_ACCOUNT are getting unconditionally set for hugetlbfs-backed mappings. This breaks the accounting for both the core VM and hugetlbfs, can trigger an OOM storm when hugepage pools are too small lockups and corrupted counters otherwise are used. This patch brings hugetlbfs more in line with how the core VM treats VM_NORESERVE but prevents VM_ACCOUNT being set. Signed-off-by: Mel Gorman Signed-off-by: Linus Torvalds --- mm/fremap.c | 2 +- mm/hugetlb.c | 39 +++++++++++++++++++++++++-------------- mm/mmap.c | 38 ++++++++++++++++++++++---------------- mm/mprotect.c | 5 +++-- 4 files changed, 51 insertions(+), 33 deletions(-) (limited to 'mm') diff --git a/mm/fremap.c b/mm/fremap.c index 736ba7f3306a..b6ec85abbb39 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -198,7 +198,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, flags &= MAP_NONBLOCK; get_file(file); addr = mmap_region(file, start, size, - flags, vma->vm_flags, pgoff, 1); + flags, vma->vm_flags, pgoff); fput(file); if (IS_ERR_VALUE(addr)) { err = addr; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 618e98304080..207464209546 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2269,14 +2269,12 @@ void hugetlb_change_protection(struct vm_area_struct *vma, int hugetlb_reserve_pages(struct inode *inode, long from, long to, - struct vm_area_struct *vma) + struct vm_area_struct *vma, + int acctflag) { - long ret, chg; + long ret = 0, chg; struct hstate *h = hstate_inode(inode); - if (vma && vma->vm_flags & VM_NORESERVE) - return 0; - /* * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. 
Private mappings need @@ -2285,22 +2283,25 @@ int hugetlb_reserve_pages(struct inode *inode, */ if (!vma || vma->vm_flags & VM_SHARED) chg = region_chg(&inode->i_mapping->private_list, from, to); - else { - struct resv_map *resv_map = resv_map_alloc(); - if (!resv_map) - return -ENOMEM; - + else chg = to - from; - set_vma_resv_map(vma, resv_map); - set_vma_resv_flags(vma, HPAGE_RESV_OWNER); - } - if (chg < 0) return chg; if (hugetlb_get_quota(inode->i_mapping, chg)) return -ENOSPC; + + /* + * Only apply hugepage reservation if asked. We still have to + * take the filesystem quota because it is an upper limit + * defined for the mount and not necessarily memory as a whole + */ + if (acctflag & VM_NORESERVE) { + reset_vma_resv_huge_pages(vma); + return 0; + } + ret = hugetlb_acct_memory(h, chg); if (ret < 0) { hugetlb_put_quota(inode->i_mapping, chg); @@ -2308,6 +2309,16 @@ int hugetlb_reserve_pages(struct inode *inode, } if (!vma || vma->vm_flags & VM_SHARED) region_add(&inode->i_mapping->private_list, from, to); + else { + struct resv_map *resv_map = resv_map_alloc(); + + if (!resv_map) + return -ENOMEM; + + set_vma_resv_map(vma, resv_map); + set_vma_resv_flags(vma, HPAGE_RESV_OWNER); + } + return 0; } diff --git a/mm/mmap.c b/mm/mmap.c index 214b6a258eeb..eb1270bebe67 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -918,7 +918,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, struct inode *inode; unsigned int vm_flags; int error; - int accountable = 1; unsigned long reqprot = prot; /* @@ -1019,8 +1018,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, return -EPERM; vm_flags &= ~VM_MAYEXEC; } - if (is_file_hugepages(file)) - accountable = 0; if (!file->f_op || !file->f_op->mmap) return -ENODEV; @@ -1053,8 +1050,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, if (error) return error; - return mmap_region(file, addr, len, flags, vm_flags, pgoff, - accountable); + return mmap_region(file, addr, len, flags, vm_flags, pgoff); } EXPORT_SYMBOL(do_mmap_pgoff); @@ -1092,17 +1088,23 @@ int vma_wants_writenotify(struct vm_area_struct *vma) /* * We account for memory if it's a private writeable mapping, - * and VM_NORESERVE wasn't set. + * not hugepages and VM_NORESERVE wasn't set. */ -static inline int accountable_mapping(unsigned int vm_flags) +static inline int accountable_mapping(struct file *file, unsigned int vm_flags) { + /* + * hugetlb has its own accounting separate from the core VM + * VM_HUGETLB may not be set yet so we cannot check for that flag. + */ + if (file && is_file_hugepages(file)) + return 0; + return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; } unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, - unsigned int vm_flags, unsigned long pgoff, - int accountable) + unsigned int vm_flags, unsigned long pgoff) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; @@ -1128,18 +1130,22 @@ munmap_back: /* * Set 'VM_NORESERVE' if we should not account for the - * memory use of this mapping. We only honor MAP_NORESERVE - * if we're allowed to overcommit memory. + * memory use of this mapping. 
*/ - if ((flags & MAP_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER) - vm_flags |= VM_NORESERVE; - if (!accountable) - vm_flags |= VM_NORESERVE; + if ((flags & MAP_NORESERVE)) { + /* We honor MAP_NORESERVE if allowed to overcommit */ + if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) + vm_flags |= VM_NORESERVE; + + /* hugetlb applies strict overcommit unless MAP_NORESERVE */ + if (file && is_file_hugepages(file)) + vm_flags |= VM_NORESERVE; + } /* * Private writable mapping: check memory availability */ - if (accountable_mapping(vm_flags)) { + if (accountable_mapping(file, vm_flags)) { charged = len >> PAGE_SHIFT; if (security_vm_enough_memory(charged)) return -ENOMEM; diff --git a/mm/mprotect.c b/mm/mprotect.c index abe2694e13f4..258197b76fb4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -151,10 +151,11 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, /* * If we make a private mapping writable we increase our commit; * but (without finer accounting) cannot reduce our commit if we - * make it unwritable again. + * make it unwritable again. hugetlb mapping were accounted for + * even if read-only so there is no need to account for them here */ if (newflags & VM_WRITE) { - if (!(oldflags & (VM_ACCOUNT|VM_WRITE| + if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| VM_SHARED|VM_NORESERVE))) { charged = nrpages; if (security_vm_enough_memory(charged)) -- cgit v1.2.3 From 17c9d12e126cb0de8d535dc1908c4819d712bc68 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 11 Feb 2009 16:34:16 +0000 Subject: Do not account for hugetlbfs quota at mmap() time if mapping [SHM|MAP]_NORESERVE Commit 5a6fe125950676015f5108fb71b2a67441755003 brought hugetlbfs more in line with the core VM by obeying VM_NORESERVE and not reserving hugepages for both shared and private mappings when [SHM|MAP]_NORESERVE are specified. However, it is still taking filesystem quota unconditionally. At fault time, if there are no reserves and attempt is made to allocate the page and account for filesystem quota. If either fail, the fault fails. The impact is that quota is getting accounted for twice. This patch partially reverts 5a6fe125950676015f5108fb71b2a67441755003. To help prevent this mistake happening again, it improves the documentation of hugetlb_reserve_pages() Reported-by: Andy Whitcroft Signed-off-by: Mel Gorman Acked-by: Andy Whitcroft Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 53 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 207464209546..107da3d809a8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2272,9 +2272,17 @@ int hugetlb_reserve_pages(struct inode *inode, struct vm_area_struct *vma, int acctflag) { - long ret = 0, chg; + long ret, chg; struct hstate *h = hstate_inode(inode); + /* + * Only apply hugepage reservation if asked. At fault time, an + * attempt will be made for VM_NORESERVE to allocate a page + * and filesystem quota without using reserves + */ + if (acctflag & VM_NORESERVE) + return 0; + /* * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. 
Private mappings need @@ -2283,42 +2291,47 @@ int hugetlb_reserve_pages(struct inode *inode, */ if (!vma || vma->vm_flags & VM_SHARED) chg = region_chg(&inode->i_mapping->private_list, from, to); - else + else { + struct resv_map *resv_map = resv_map_alloc(); + if (!resv_map) + return -ENOMEM; + chg = to - from; + set_vma_resv_map(vma, resv_map); + set_vma_resv_flags(vma, HPAGE_RESV_OWNER); + } + if (chg < 0) return chg; + /* There must be enough filesystem quota for the mapping */ if (hugetlb_get_quota(inode->i_mapping, chg)) return -ENOSPC; /* - * Only apply hugepage reservation if asked. We still have to - * take the filesystem quota because it is an upper limit - * defined for the mount and not necessarily memory as a whole + * Check enough hugepages are available for the reservation. + * Hand back the quota if there are not */ - if (acctflag & VM_NORESERVE) { - reset_vma_resv_huge_pages(vma); - return 0; - } - ret = hugetlb_acct_memory(h, chg); if (ret < 0) { hugetlb_put_quota(inode->i_mapping, chg); return ret; } + + /* + * Account for the reservations made. Shared mappings record regions + * that have reservations as they are shared by multiple VMAs. + * When the last VMA disappears, the region map says how much + * the reservation was and the page cache tells how much of + * the reservation was consumed. Private mappings are per-VMA and + * only the consumed reservations are tracked. When the VMA + * disappears, the original reservation is the VMA size and the + * consumed reservations are stored in the map. Hence, nothing + * else has to be done for private mappings here + */ if (!vma || vma->vm_flags & VM_SHARED) region_add(&inode->i_mapping->private_list, from, to); - else { - struct resv_map *resv_map = resv_map_alloc(); - - if (!resv_map) - return -ENOMEM; - - set_vma_resv_map(vma, resv_map); - set_vma_resv_flags(vma, HPAGE_RESV_OWNER); - } - return 0; } -- cgit v1.2.3 From 1001c9fb8721ab395e21f571ed2aaa523cdd1e29 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Wed, 11 Feb 2009 13:04:18 -0800 Subject: migration: migrate_vmas should check "vma" migrate_vmas() should check "vma" not "vma->vm_next" for for-loop condition. Signed-off-by: Daisuke Nishimura Cc: Christoph Lameter Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 2bb4e1d63520..a9eff3f092f6 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1129,7 +1129,7 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, struct vm_area_struct *vma; int err = 0; - for(vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) { + for (vma = mm->mmap; vma && !err; vma = vma->vm_next) { if (vma->vm_ops && vma->vm_ops->migrate) { err = vma->vm_ops->migrate(vma, to, from, flags); if (err) -- cgit v1.2.3 From fc3501d411d34823fb9be248a95a0c44f945866f Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Wed, 11 Feb 2009 13:04:23 -0800 Subject: mm: fix dirty_bytes/dirty_background_bytes sysctls on 64bit arches We need to pass an unsigned long as the minimum, because it gets casted to an unsigned long in the sysctl handler. If we pass an int, we'll access four more bytes on 64bit arches, resulting in a random minimum value. 
[rientjes@google.com: fix type of `old_bytes'] Signed-off-by: Sven Wegener Cc: Peter Zijlstra Cc: Dave Chinner Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index dc32dae01e5f..c17005e73974 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -209,7 +209,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - int old_bytes = vm_dirty_bytes; + unsigned long old_bytes = vm_dirty_bytes; int ret; ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); -- cgit v1.2.3 From 508b9f8efdad123b202b228f71f59feba51e4fb5 Mon Sep 17 00:00:00 2001 From: MinChan Kim Date: Wed, 11 Feb 2009 13:04:27 -0800 Subject: mm: fix mlocked page counter mismatch When I tested following program, I found that the mlocked counter is strange. It cannot free some mlocked pages. It is because try_to_unmap_file() doesn't check real page mappings in vmas. That is because the goal of an address_space for a file is to find all processes into which the file's specific interval is mapped. It is related to the file's interval, not to pages. Even if the page isn't really mapped by the vma, it returns SWAP_MLOCK since the vma has VM_LOCKED, then calls try_to_mlock_page. After this the mlocked counter is increased again. COWed anon page in a file-backed vma could be a such case. This patch resolves it. -- my test program -- int main() { mlockall(MCL_CURRENT); return 0; } -- before -- root@barrios-target-linux:~# cat /proc/meminfo | egrep 'Mlo|Unev' Unevictable: 0 kB Mlocked: 0 kB -- after -- root@barrios-target-linux:~# cat /proc/meminfo | egrep 'Mlo|Unev' Unevictable: 8 kB Mlocked: 8 kB Signed-off-by: MinChan Kim Acked-by: Lee Schermerhorn Acked-by: KOSAKI Motohiro Tested-by: Lee Schermerhorn Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index ac4af8cffbf9..16521664010d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1072,7 +1072,8 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { if (MLOCK_PAGES && unlikely(unlock)) { - if (!(vma->vm_flags & VM_LOCKED)) + if (!((vma->vm_flags & VM_LOCKED) && + page_mapped_in_vma(page, vma))) continue; /* must visit all vmas */ ret = SWAP_MLOCK; } else { -- cgit v1.2.3 From 2e9c23724328ae4e56c42a35a717a956d7d3001d Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 11 Feb 2009 13:04:29 -0800 Subject: memcg: use __GFP_NOWARN in page cgroup allocation page_cgroup's page allocation at init/memory hotplug uses kmalloc() and vmalloc(). If kmalloc() failes, vmalloc() is used. This is because vmalloc() is very limited resource on 32bit systems. We want to use kmalloc() first. But in this kind of call, __GFP_NOWARN should be specified. 
Reported-by: Heiko Carstens Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Acked-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 7006a11350c8..ceecfbb143fa 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -114,7 +114,8 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn) nid = page_to_nid(pfn_to_page(pfn)); table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; if (slab_is_available()) { - base = kmalloc_node(table_size, GFP_KERNEL, nid); + base = kmalloc_node(table_size, + GFP_KERNEL | __GFP_NOWARN, nid); if (!base) base = vmalloc_node(table_size, nid); } else { -- cgit v1.2.3 From 89e1219004b3657cc014521663eeef0744f1c99d Mon Sep 17 00:00:00 2001 From: Federico Cuello Date: Wed, 11 Feb 2009 13:04:39 -0800 Subject: writeback: fix break condition Commit dcf6a79dda5cc2a2bec183e50d829030c0972aaa ("write-back: fix nr_to_write counter") fixed nr_to_write counter, but didn't set the break condition properly. If nr_to_write == 0 after being decremented it will loop one more time before setting done = 1 and breaking the loop. [akpm@linux-foundation.org: coding-style fixes] Cc: Artem Bityutskiy Acked-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c17005e73974..6106a5c7ed44 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1051,20 +1051,23 @@ continue_unlock: } } - if (nr_to_write > 0) + if (nr_to_write > 0) { nr_to_write--; - else if (wbc->sync_mode == WB_SYNC_NONE) { - /* - * We stop writing back only if we are not - * doing integrity sync. In case of integrity - * sync we have to keep going because someone - * may be concurrently dirtying pages, and we - * might have synced a lot of newly appeared - * dirty pages, but have not synced all of the - * old dirty pages. - */ - done = 1; - break; + if (nr_to_write == 0 && + wbc->sync_mode == WB_SYNC_NONE) { + /* + * We stop writing back only if we are + * not doing integrity sync. In case of + * integrity sync we have to keep going + * because someone may be concurrently + * dirtying pages, and we might have + * synced a lot of newly appeared dirty + * pages, but have not synced all of the + * old dirty pages. + */ + done = 1; + break; + } } if (wbc->nonblocking && bdi_write_congested(bdi)) { -- cgit v1.2.3 From 9480c53e9b2aa13a06283ffb96bb8f1873ac4e9a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 11 Feb 2009 13:04:41 -0800 Subject: mm: rearrange exit_mmap() to unlock before arch_exit_mmap Christophe Saout reported [in precursor to: http://marc.info/?l=linux-kernel&m=123209902707347&w=4]: > Note that I also some a different issue with CONFIG_UNEVICTABLE_LRU. > Seems like Xen tears down current->mm early on process termination, so > that __get_user_pages in exit_mmap causes nasty messages when the > process had any mlocked pages. (in fact, it somehow manages to get into > the swapping code and produces a null pointer dereference trying to get > a swap token) Jeremy explained: Yes. In the normal case under Xen, an in-use pagetable is "pinned", meaning that it is RO to the kernel, and all updates must go via hypercall (or writes are trapped and emulated, which is much the same thing). 
An unpinned pagetable is not currently in use by any process, and can be directly accessed as normal RW pages. As an optimisation at process exit time, we unpin the pagetable as early as possible (switching the process to init_mm), so that all the normal pagetable teardown can happen with direct memory accesses. This happens in exit_mmap() -> arch_exit_mmap(). The munlocking happens a few lines below. The obvious thing to do would be to move arch_exit_mmap() to below the munlock code, but I think we'd want to call it even if mm->mmap is NULL, just to be on the safe side. Thus, this patch: exit_mmap() needs to unlock any locked vmas before calling arch_exit_mmap, as the latter may switch the current mm to init_mm, which would cause the former to fail. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Lee Schermerhorn Cc: Christophe Saout Cc: Keir Fraser Cc: Christophe Saout Cc: Alex Williamson Cc: [2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index eb1270bebe67..00ced3ee49a8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2084,12 +2084,8 @@ void exit_mmap(struct mm_struct *mm) unsigned long end; /* mm's last user has gone, and its about to be pulled down */ - arch_exit_mmap(mm); mmu_notifier_release(mm); - if (!mm->mmap) /* Can happen if dup_mmap() received an OOM */ - return; - if (mm->locked_vm) { vma = mm->mmap; while (vma) { @@ -2098,7 +2094,13 @@ void exit_mmap(struct mm_struct *mm) vma = vma->vm_next; } } + + arch_exit_mmap(mm); + vma = mm->mmap; + if (!vma) /* Can happen if dup_mmap() received an OOM */ + return; + lru_add_drain(); flush_cache_mm(mm); tlb = tlb_gather_mmu(mm, 1); -- cgit v1.2.3
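For reference, a minimal userspace sketch along the lines of the test program quoted in the "mm: fix mlocked page counter mismatch" change above. It locks the current address space with mlockall(MCL_CURRENT) and prints the Unevictable/Mlocked lines from /proc/meminfo; the original report compared those counters from the shell after the process had exited, so the in-process output here is only a convenience. It assumes a Linux system exposing /proc/meminfo and is not part of any of the patches above.

/*
 * Sketch based on the mlockall() test program from the counter-mismatch
 * report.  Prints the Unevictable/Mlocked counters before and after
 * locking the current address space.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

static void show_mlock_counters(const char *when)
{
	FILE *f = fopen("/proc/meminfo", "r");
	char line[128];

	if (!f) {
		perror("fopen /proc/meminfo");
		return;
	}
	printf("%s:\n", when);
	while (fgets(line, sizeof(line), f)) {
		/* keep only the two counters the report looks at */
		if (!strncmp(line, "Unevictable:", 12) ||
		    !strncmp(line, "Mlocked:", 8))
			fputs(line, stdout);
	}
	fclose(f);
}

int main(void)
{
	show_mlock_counters("before mlockall");
	if (mlockall(MCL_CURRENT)) {
		perror("mlockall");
		return 1;
	}
	show_mlock_counters("after mlockall");
	/* the original report re-checked /proc/meminfo after this exits */
	return 0;
}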