Diffstat (limited to 'mm')
-rw-r--r-- | mm/backing-dev.c | 23
-rw-r--r-- | mm/bounce.c | 4
-rw-r--r-- | mm/compaction.c | 29
-rw-r--r-- | mm/debug-pagealloc.c | 3
-rw-r--r-- | mm/filemap.c | 34
-rw-r--r-- | mm/filemap_xip.c | 7
-rw-r--r-- | mm/huge_memory.c | 103
-rw-r--r-- | mm/hugetlb.c | 11
-rw-r--r-- | mm/kmemleak.c | 161
-rw-r--r-- | mm/ksm.c | 12
-rw-r--r-- | mm/memblock.c | 13
-rw-r--r-- | mm/memcontrol.c | 1245
-rw-r--r-- | mm/memory-failure.c | 2
-rw-r--r-- | mm/memory.c | 45
-rw-r--r-- | mm/memory_hotplug.c | 2
-rw-r--r-- | mm/mempolicy.c | 5
-rw-r--r-- | mm/migrate.c | 173
-rw-r--r-- | mm/mlock.c | 3
-rw-r--r-- | mm/mmap.c | 17
-rw-r--r-- | mm/mprotect.c | 3
-rw-r--r-- | mm/nommu.c | 9
-rw-r--r-- | mm/oom_kill.c | 42
-rw-r--r-- | mm/page_alloc.c | 76
-rw-r--r-- | mm/page_cgroup.c | 166
-rw-r--r-- | mm/percpu-vm.c | 3
-rw-r--r-- | mm/percpu.c | 12
-rw-r--r-- | mm/process_vm_access.c | 23
-rw-r--r-- | mm/rmap.c | 20
-rw-r--r-- | mm/shmem.c | 57
-rw-r--r-- | mm/slab.c | 39
-rw-r--r-- | mm/slub.c | 86
-rw-r--r-- | mm/swap.c | 89
-rw-r--r-- | mm/swapfile.c | 39
-rw-r--r-- | mm/truncate.c | 2
-rw-r--r-- | mm/vmalloc.c | 17
-rw-r--r-- | mm/vmscan.c | 790
-rw-r--r-- | mm/vmstat.c | 2
37 files changed, 1868 insertions(+), 1499 deletions(-)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 7ba8feae11b8..dd8e2aafb07e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -318,7 +318,7 @@ static void wakeup_timer_fn(unsigned long data) if (bdi->wb.task) { trace_writeback_wake_thread(bdi); wake_up_process(bdi->wb.task); - } else { + } else if (bdi->dev) { /* * When bdi tasks are inactive for long time, they are killed. * In this case we have to wake-up the forker thread which @@ -584,6 +584,8 @@ EXPORT_SYMBOL(bdi_register_dev); */ static void bdi_wb_shutdown(struct backing_dev_info *bdi) { + struct task_struct *task; + if (!bdi_cap_writeback_dirty(bdi)) return; @@ -602,8 +604,13 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) * Finally, kill the kernel thread. We don't need to be RCU * safe anymore, since the bdi is gone from visibility. */ - if (bdi->wb.task) - kthread_stop(bdi->wb.task); + spin_lock_bh(&bdi->wb_lock); + task = bdi->wb.task; + bdi->wb.task = NULL; + spin_unlock_bh(&bdi->wb_lock); + + if (task) + kthread_stop(task); } /* @@ -623,7 +630,9 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { - if (bdi->dev) { + struct device *dev = bdi->dev; + + if (dev) { bdi_set_min_ratio(bdi, 0); trace_writeback_bdi_unregister(bdi); bdi_prune_sb(bdi); @@ -632,8 +641,12 @@ void bdi_unregister(struct backing_dev_info *bdi) if (!bdi_cap_flush_forker(bdi)) bdi_wb_shutdown(bdi); bdi_debug_unregister(bdi); - device_unregister(bdi->dev); + + spin_lock_bh(&bdi->wb_lock); bdi->dev = NULL; + spin_unlock_bh(&bdi->wb_lock); + + device_unregister(dev); } } EXPORT_SYMBOL(bdi_unregister); diff --git a/mm/bounce.c b/mm/bounce.c index 4e9ae722af83..d1be02ca1889 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -50,9 +50,9 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) unsigned char *vto; local_irq_save(flags); - vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); + vto = kmap_atomic(to->bv_page); memcpy(vto + to->bv_offset, vfrom, to->bv_len); - kunmap_atomic(vto, KM_BOUNCE_READ); + kunmap_atomic(vto); local_irq_restore(flags); } diff --git a/mm/compaction.c b/mm/compaction.c index e6670c34eb49..d9ebebe1a2aa 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -313,12 +313,34 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, } else if (!locked) spin_lock_irq(&zone->lru_lock); + /* + * migrate_pfn does not necessarily start aligned to a + * pageblock. Ensure that pfn_valid is called when moving + * into a new MAX_ORDER_NR_PAGES range in case of large + * memory holes within the zone + */ + if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { + if (!pfn_valid(low_pfn)) { + low_pfn += MAX_ORDER_NR_PAGES - 1; + continue; + } + } + if (!pfn_valid_within(low_pfn)) continue; nr_scanned++; - /* Get the page and skip if free */ + /* + * Get the page and ensure the page is within the same zone. + * See the comment in isolate_freepages about overlapping + * nodes. It is deliberate that the new zone lock is not taken + * as memory compaction should not move pages between nodes. 
+ */ page = pfn_to_page(low_pfn); + if (page_zone(page) != zone) + continue; + + /* Skip if free */ if (PageBuddy(page)) continue; @@ -350,7 +372,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, } if (!cc->sync) - mode |= ISOLATE_CLEAN; + mode |= ISOLATE_ASYNC_MIGRATE; /* Try isolate the page */ if (__isolate_lru_page(page, mode, 0) != 0) @@ -557,7 +579,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; err = migrate_pages(&cc->migratepages, compaction_alloc, (unsigned long)cc, false, - cc->sync); + cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; @@ -671,6 +693,7 @@ static int compact_node(int nid) .nr_freepages = 0, .nr_migratepages = 0, .order = -1, + .sync = true, }; zone = &pgdat->node_zones[zoneid]; diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c index 7cea557407f4..789ff70c8a4a 100644 --- a/mm/debug-pagealloc.c +++ b/mm/debug-pagealloc.c @@ -95,9 +95,6 @@ static void unpoison_pages(struct page *page, int n) void kernel_map_pages(struct page *page, int numpages, int enable) { - if (!debug_pagealloc_enabled) - return; - if (enable) unpoison_pages(page, numpages); else diff --git a/mm/filemap.c b/mm/filemap.c index c4ee2e918bea..2f8165075a5a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -393,24 +393,11 @@ EXPORT_SYMBOL(filemap_write_and_wait_range); int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) { int error; - struct mem_cgroup *memcg = NULL; VM_BUG_ON(!PageLocked(old)); VM_BUG_ON(!PageLocked(new)); VM_BUG_ON(new->mapping); - /* - * This is not page migration, but prepare_migration and - * end_migration does enough work for charge replacement. - * - * In the longer term we probably want a specialized function - * for moving the charge from old to new in a more efficient - * manner. - */ - error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask); - if (error) - return error; - error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (!error) { struct address_space *mapping = old->mapping; @@ -432,13 +419,12 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); + /* mem_cgroup codes must not be called under tree_lock */ + mem_cgroup_replace_page_cache(old, new); radix_tree_preload_end(); if (freepage) freepage(old); page_cache_release(old); - mem_cgroup_end_migration(memcg, old, new, true); - } else { - mem_cgroup_end_migration(memcg, old, new, false); } return error; @@ -1332,10 +1318,10 @@ int file_read_actor(read_descriptor_t *desc, struct page *page, * taking the kmap. 
*/ if (!fault_in_pages_writeable(desc->arg.buf, size)) { - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); left = __copy_to_user_inatomic(desc->arg.buf, kaddr + offset, size); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); if (left == 0) goto success; } @@ -1414,15 +1400,12 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, unsigned long seg = 0; size_t count; loff_t *ppos = &iocb->ki_pos; - struct blk_plug plug; count = 0; retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); if (retval) return retval; - blk_start_plug(&plug); - /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (filp->f_flags & O_DIRECT) { loff_t size; @@ -1438,8 +1421,12 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, retval = filemap_write_and_wait_range(mapping, pos, pos + iov_length(iov, nr_segs) - 1); if (!retval) { + struct blk_plug plug; + + blk_start_plug(&plug); retval = mapping->a_ops->direct_IO(READ, iocb, iov, pos, nr_segs); + blk_finish_plug(&plug); } if (retval > 0) { *ppos = pos + retval; @@ -1495,7 +1482,6 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, break; } out: - blk_finish_plug(&plug); return retval; } EXPORT_SYMBOL(generic_file_aio_read); @@ -2059,7 +2045,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, size_t copied; BUG_ON(!in_atomic()); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); if (likely(i->nr_segs == 1)) { int left; char __user *buf = i->iov->iov_base + i->iov_offset; @@ -2069,7 +2055,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, copied = __iovec_copy_from_user_inatomic(kaddr + offset, i->iov, i->iov_offset, bytes); } - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); return copied; } diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index f91b2f687343..a4eb31132229 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -263,7 +263,12 @@ found: xip_pfn); if (err == -ENOMEM) return VM_FAULT_OOM; - BUG_ON(err); + /* + * err == -EBUSY is fine, we've raced against another thread + * that faulted-in the same page + */ + if (err != -EBUSY) + BUG_ON(err); return VM_FAULT_NOPAGE; } else { int err, ret = VM_FAULT_OOM; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 36b3d988b4ef..8f7fc394f636 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -487,41 +487,68 @@ static struct attribute_group khugepaged_attr_group = { .attrs = khugepaged_attr, .name = "khugepaged", }; -#endif /* CONFIG_SYSFS */ -static int __init hugepage_init(void) +static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) { int err; -#ifdef CONFIG_SYSFS - static struct kobject *hugepage_kobj; -#endif - err = -EINVAL; - if (!has_transparent_hugepage()) { - transparent_hugepage_flags = 0; - goto out; - } - -#ifdef CONFIG_SYSFS - err = -ENOMEM; - hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); - if (unlikely(!hugepage_kobj)) { + *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); + if (unlikely(!*hugepage_kobj)) { printk(KERN_ERR "hugepage: failed kobject create\n"); - goto out; + return -ENOMEM; } - err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group); + err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); if (err) { printk(KERN_ERR "hugepage: failed register hugeage group\n"); - goto out; + goto delete_obj; } - err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group); + err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); if (err) { printk(KERN_ERR 
"hugepage: failed register hugeage group\n"); - goto out; + goto remove_hp_group; } -#endif + + return 0; + +remove_hp_group: + sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); +delete_obj: + kobject_put(*hugepage_kobj); + return err; +} + +static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) +{ + sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group); + sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); + kobject_put(hugepage_kobj); +} +#else +static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj) +{ + return 0; +} + +static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) +{ +} +#endif /* CONFIG_SYSFS */ + +static int __init hugepage_init(void) +{ + int err; + struct kobject *hugepage_kobj; + + if (!has_transparent_hugepage()) { + transparent_hugepage_flags = 0; + return -EINVAL; + } + + err = hugepage_init_sysfs(&hugepage_kobj); + if (err) + return err; err = khugepaged_slab_init(); if (err) @@ -545,7 +572,9 @@ static int __init hugepage_init(void) set_recommended_min_free_kbytes(); + return 0; out: + hugepage_exit_sysfs(hugepage_kobj); return err; } module_init(hugepage_init) @@ -642,6 +671,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, set_pmd_at(mm, haddr, pmd, entry); prepare_pmd_huge_pte(pgtable, mm); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); + mm->nr_ptes++; spin_unlock(&mm->page_table_lock); } @@ -760,6 +790,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd = pmd_mkold(pmd_wrprotect(pmd)); set_pmd_at(dst_mm, addr, dst_pmd, pmd); prepare_pmd_huge_pte(pgtable, dst_mm); + dst_mm->nr_ptes++; ret = 0; out_unlock: @@ -858,7 +889,6 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, } kfree(pages); - mm->nr_ptes++; smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); page_remove_rmap(page); @@ -997,7 +1027,7 @@ out: } int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, - pmd_t *pmd) + pmd_t *pmd, unsigned long addr) { int ret = 0; @@ -1013,10 +1043,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pgtable = get_pmd_huge_pte(tlb->mm); page = pmd_page(*pmd); pmd_clear(pmd); + tlb_remove_pmd_tlb_entry(tlb, pmd, addr); page_remove_rmap(page); VM_BUG_ON(page_mapcount(page) < 0); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); VM_BUG_ON(!PageHead(page)); + tlb->mm->nr_ptes--; spin_unlock(&tlb->mm->page_table_lock); tlb_remove_page(tlb, page); pte_free(tlb->mm, pgtable); @@ -1116,7 +1148,6 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, entry = pmd_modify(entry, newprot); set_pmd_at(mm, addr, pmd, entry); spin_unlock(&vma->vm_mm->page_table_lock); - flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE); ret = 1; } } else @@ -1199,16 +1230,16 @@ static int __split_huge_page_splitting(struct page *page, static void __split_huge_page_refcount(struct page *page) { int i; - unsigned long head_index = page->index; struct zone *zone = page_zone(page); - int zonestat; int tail_count = 0; /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irq(&zone->lru_lock); compound_lock(page); + /* complete memcg works before add pages to LRU */ + mem_cgroup_split_huge_fixup(page); - for (i = 1; i < HPAGE_PMD_NR; i++) { + for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { struct page *page_tail = page + i; /* tail_page->_mapcount cannot change */ @@ -1271,14 +1302,13 @@ static void __split_huge_page_refcount(struct page *page) BUG_ON(page_tail->mapping); page_tail->mapping = 
page->mapping; - page_tail->index = ++head_index; + page_tail->index = page->index + i; BUG_ON(!PageAnon(page_tail)); BUG_ON(!PageUptodate(page_tail)); BUG_ON(!PageDirty(page_tail)); BUG_ON(!PageSwapBacked(page_tail)); - mem_cgroup_split_huge_fixup(page, page_tail); lru_add_page_tail(zone, page, page_tail); } @@ -1288,15 +1318,6 @@ static void __split_huge_page_refcount(struct page *page) __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); - /* - * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics, - * so adjust those appropriately if this page is on the LRU. - */ - if (PageLRU(page)) { - zonestat = NR_LRU_BASE + page_lru(page); - __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1)); - } - ClearPageCompound(page); compound_unlock(page); spin_unlock_irq(&zone->lru_lock); @@ -1356,7 +1377,6 @@ static int __split_huge_page_map(struct page *page, pte_unmap(pte); } - mm->nr_ptes++; smp_wmb(); /* make pte visible before pmd */ /* * Up to this point the pmd is present and huge and @@ -1969,7 +1989,6 @@ static void collapse_huge_page(struct mm_struct *mm, set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache(vma, address, _pmd); prepare_pmd_huge_pte(pgtable, mm); - mm->nr_ptes--; spin_unlock(&mm->page_table_lock); #ifndef CONFIG_NUMA @@ -2064,7 +2083,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) { struct mm_struct *mm = mm_slot->mm; - VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); if (khugepaged_test_exit(mm)) { /* free mm_slot */ @@ -2094,7 +2113,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int progress = 0; VM_BUG_ON(!pages); - VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); if (khugepaged_scan.mm_slot) mm_slot = khugepaged_scan.mm_slot; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ea8c3a4cd2ae..a876871f6be5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2277,8 +2277,8 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, set_page_dirty(page); list_add(&page->lru, &page_list); } - spin_unlock(&mm->page_table_lock); flush_tlb_range(vma, start, end); + spin_unlock(&mm->page_table_lock); mmu_notifier_invalidate_range_end(mm, start, end); list_for_each_entry_safe(page, tmp, &page_list, lru) { page_remove_rmap(page); @@ -2508,6 +2508,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct hstate *h = hstate_vma(vma); int ret = VM_FAULT_SIGBUS; + int anon_rmap = 0; pgoff_t idx; unsigned long size; struct page *page; @@ -2562,14 +2563,13 @@ retry: spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); - page_dup_rmap(page); } else { lock_page(page); if (unlikely(anon_vma_prepare(vma))) { ret = VM_FAULT_OOM; goto backout_unlocked; } - hugepage_add_new_anon_rmap(page, vma, address); + anon_rmap = 1; } } else { /* @@ -2582,7 +2582,6 @@ retry: VM_FAULT_SET_HINDEX(h - hstates); goto backout_unlocked; } - page_dup_rmap(page); } /* @@ -2606,6 +2605,10 @@ retry: if (!huge_pte_none(huge_ptep_get(ptep))) goto backout; + if (anon_rmap) + hugepage_add_new_anon_rmap(page, vma, address); + else + page_dup_rmap(page); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); set_huge_pte_at(mm, address, ptep, new_pte); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index f3b2a00fe9c1..45eb6217bf38 100644 --- a/mm/kmemleak.c 
+++ b/mm/kmemleak.c @@ -100,6 +100,7 @@ #include <linux/kmemcheck.h> #include <linux/kmemleak.h> +#include <linux/memory_hotplug.h> /* * Kmemleak configuration and common defines. @@ -196,7 +197,9 @@ static atomic_t kmemleak_enabled = ATOMIC_INIT(0); static atomic_t kmemleak_initialized = ATOMIC_INIT(0); /* enables or disables early logging of the memory operations */ static atomic_t kmemleak_early_log = ATOMIC_INIT(1); -/* set if a fata kmemleak error has occurred */ +/* set if a kmemleak warning was issued */ +static atomic_t kmemleak_warning = ATOMIC_INIT(0); +/* set if a fatal kmemleak error has occurred */ static atomic_t kmemleak_error = ATOMIC_INIT(0); /* minimum and maximum address that may be valid pointers */ @@ -228,8 +231,10 @@ static int kmemleak_skip_disable; /* kmemleak operation type for early logging */ enum { KMEMLEAK_ALLOC, + KMEMLEAK_ALLOC_PERCPU, KMEMLEAK_FREE, KMEMLEAK_FREE_PART, + KMEMLEAK_FREE_PERCPU, KMEMLEAK_NOT_LEAK, KMEMLEAK_IGNORE, KMEMLEAK_SCAN_AREA, @@ -259,9 +264,10 @@ static void kmemleak_disable(void); /* * Print a warning and dump the stack trace. */ -#define kmemleak_warn(x...) do { \ - pr_warning(x); \ - dump_stack(); \ +#define kmemleak_warn(x...) do { \ + pr_warning(x); \ + dump_stack(); \ + atomic_set(&kmemleak_warning, 1); \ } while (0) /* @@ -403,8 +409,8 @@ static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) object = prio_tree_entry(node, struct kmemleak_object, tree_node); if (!alias && object->pointer != ptr) { - pr_warning("Found object by alias at 0x%08lx\n", ptr); - dump_stack(); + kmemleak_warn("Found object by alias at 0x%08lx\n", + ptr); dump_object_info(object); object = NULL; } @@ -794,9 +800,13 @@ static void __init log_early(int op_type, const void *ptr, size_t size, unsigned long flags; struct early_log *log; + if (atomic_read(&kmemleak_error)) { + /* kmemleak stopped recording, just count the requests */ + crt_early_log++; + return; + } + if (crt_early_log >= ARRAY_SIZE(early_log)) { - pr_warning("Early log buffer exceeded, " - "please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n"); kmemleak_disable(); return; } @@ -811,8 +821,7 @@ static void __init log_early(int op_type, const void *ptr, size_t size, log->ptr = ptr; log->size = size; log->min_count = min_count; - if (op_type == KMEMLEAK_ALLOC) - log->trace_len = __save_stack_trace(log->trace); + log->trace_len = __save_stack_trace(log->trace); crt_early_log++; local_irq_restore(flags); } @@ -846,6 +855,20 @@ out: rcu_read_unlock(); } +/* + * Log an early allocated block and populate the stack trace. + */ +static void early_alloc_percpu(struct early_log *log) +{ + unsigned int cpu; + const void __percpu *ptr = log->ptr; + + for_each_possible_cpu(cpu) { + log->ptr = per_cpu_ptr(ptr, cpu); + early_alloc(log); + } +} + /** * kmemleak_alloc - register a newly allocated object * @ptr: pointer to beginning of the object @@ -873,6 +896,34 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, EXPORT_SYMBOL_GPL(kmemleak_alloc); /** + * kmemleak_alloc_percpu - register a newly allocated __percpu object + * @ptr: __percpu pointer to beginning of the object + * @size: size of the object + * + * This function is called from the kernel percpu allocator when a new object + * (memory block) is allocated (alloc_percpu). It assumes GFP_KERNEL + * allocation. 
+ */ +void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) +{ + unsigned int cpu; + + pr_debug("%s(0x%p, %zu)\n", __func__, ptr, size); + + /* + * Percpu allocations are only scanned and not reported as leaks + * (min_count is set to 0). + */ + if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + for_each_possible_cpu(cpu) + create_object((unsigned long)per_cpu_ptr(ptr, cpu), + size, 0, GFP_KERNEL); + else if (atomic_read(&kmemleak_early_log)) + log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0); +} +EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); + +/** * kmemleak_free - unregister a previously registered object * @ptr: pointer to beginning of the object * @@ -911,6 +962,28 @@ void __ref kmemleak_free_part(const void *ptr, size_t size) EXPORT_SYMBOL_GPL(kmemleak_free_part); /** + * kmemleak_free_percpu - unregister a previously registered __percpu object + * @ptr: __percpu pointer to beginning of the object + * + * This function is called from the kernel percpu allocator when an object + * (memory block) is freed (free_percpu). + */ +void __ref kmemleak_free_percpu(const void __percpu *ptr) +{ + unsigned int cpu; + + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + for_each_possible_cpu(cpu) + delete_object_full((unsigned long)per_cpu_ptr(ptr, + cpu)); + else if (atomic_read(&kmemleak_early_log)) + log_early(KMEMLEAK_FREE_PERCPU, ptr, 0, 0); +} +EXPORT_SYMBOL_GPL(kmemleak_free_percpu); + +/** * kmemleak_not_leak - mark an allocated object as false positive * @ptr: pointer to beginning of the object * @@ -963,7 +1036,7 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (atomic_read(&kmemleak_enabled) && ptr && size && !IS_ERR(ptr)) add_scan_area((unsigned long)ptr, size, gfp); else if (atomic_read(&kmemleak_early_log)) log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); @@ -1220,9 +1293,9 @@ static void kmemleak_scan(void) #endif /* - * Struct page scanning for each node. The code below is not yet safe - * with MEMORY_HOTPLUG. + * Struct page scanning for each node. */ + lock_memory_hotplug(); for_each_online_node(i) { pg_data_t *pgdat = NODE_DATA(i); unsigned long start_pfn = pgdat->node_start_pfn; @@ -1241,6 +1314,7 @@ static void kmemleak_scan(void) scan_block(page, page + 1, NULL, 1); } } + unlock_memory_hotplug(); /* * Scanning the task stacks (may introduce false negatives). @@ -1467,9 +1541,6 @@ static const struct seq_operations kmemleak_seq_ops = { static int kmemleak_open(struct inode *inode, struct file *file) { - if (!atomic_read(&kmemleak_enabled)) - return -EBUSY; - return seq_open(file, &kmemleak_seq_ops); } @@ -1543,6 +1614,9 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, int buf_size; int ret; + if (!atomic_read(&kmemleak_enabled)) + return -EBUSY; + buf_size = min(size, (sizeof(buf) - 1)); if (strncpy_from_user(buf, user_buf, buf_size) < 0) return -EFAULT; @@ -1602,20 +1676,24 @@ static const struct file_operations kmemleak_fops = { }; /* - * Perform the freeing of the kmemleak internal objects after waiting for any - * current memory scan to complete. + * Stop the memory scanning thread and free the kmemleak internal objects if + * no previous scan thread (otherwise, kmemleak may still have some useful + * information on memory leaks). 
*/ static void kmemleak_do_cleanup(struct work_struct *work) { struct kmemleak_object *object; + bool cleanup = scan_thread == NULL; mutex_lock(&scan_mutex); stop_scan_thread(); - rcu_read_lock(); - list_for_each_entry_rcu(object, &object_list, object_list) - delete_object_full(object->pointer); - rcu_read_unlock(); + if (cleanup) { + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) + delete_object_full(object->pointer); + rcu_read_unlock(); + } mutex_unlock(&scan_mutex); } @@ -1632,7 +1710,6 @@ static void kmemleak_disable(void) return; /* stop any memory operation tracing */ - atomic_set(&kmemleak_early_log, 0); atomic_set(&kmemleak_enabled, 0); /* check whether it is too early for a kernel thread */ @@ -1659,6 +1736,17 @@ static int kmemleak_boot_config(char *str) } early_param("kmemleak", kmemleak_boot_config); +static void __init print_log_trace(struct early_log *log) +{ + struct stack_trace trace; + + trace.nr_entries = log->trace_len; + trace.entries = log->trace; + + pr_notice("Early log backtrace:\n"); + print_stack_trace(&trace, 2); +} + /* * Kmemleak initialization. */ @@ -1669,6 +1757,7 @@ void __init kmemleak_init(void) #ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF if (!kmemleak_skip_disable) { + atomic_set(&kmemleak_early_log, 0); kmemleak_disable(); return; } @@ -1681,12 +1770,18 @@ void __init kmemleak_init(void) scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); INIT_PRIO_TREE_ROOT(&object_tree_root); + if (crt_early_log >= ARRAY_SIZE(early_log)) + pr_warning("Early log buffer exceeded (%d), please increase " + "DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log); + /* the kernel is still in UP mode, so disabling the IRQs is enough */ local_irq_save(flags); - if (!atomic_read(&kmemleak_error)) { + atomic_set(&kmemleak_early_log, 0); + if (atomic_read(&kmemleak_error)) { + local_irq_restore(flags); + return; + } else atomic_set(&kmemleak_enabled, 1); - atomic_set(&kmemleak_early_log, 0); - } local_irq_restore(flags); /* @@ -1701,12 +1796,18 @@ void __init kmemleak_init(void) case KMEMLEAK_ALLOC: early_alloc(log); break; + case KMEMLEAK_ALLOC_PERCPU: + early_alloc_percpu(log); + break; case KMEMLEAK_FREE: kmemleak_free(log->ptr); break; case KMEMLEAK_FREE_PART: kmemleak_free_part(log->ptr, log->size); break; + case KMEMLEAK_FREE_PERCPU: + kmemleak_free_percpu(log->ptr); + break; case KMEMLEAK_NOT_LEAK: kmemleak_not_leak(log->ptr); break; @@ -1720,7 +1821,13 @@ void __init kmemleak_init(void) kmemleak_no_scan(log->ptr); break; default: - WARN_ON(1); + kmemleak_warn("Unknown early log operation: %d\n", + log->op_type); + } + + if (atomic_read(&kmemleak_warning)) { + print_log_trace(log); + atomic_set(&kmemleak_warning, 0); } } } @@ -672,9 +672,9 @@ error: static u32 calc_checksum(struct page *page) { u32 checksum; - void *addr = kmap_atomic(page, KM_USER0); + void *addr = kmap_atomic(page); checksum = jhash2(addr, PAGE_SIZE / 4, 17); - kunmap_atomic(addr, KM_USER0); + kunmap_atomic(addr); return checksum; } @@ -683,11 +683,11 @@ static int memcmp_pages(struct page *page1, struct page *page2) char *addr1, *addr2; int ret; - addr1 = kmap_atomic(page1, KM_USER0); - addr2 = kmap_atomic(page2, KM_USER1); + addr1 = kmap_atomic(page1); + addr2 = kmap_atomic(page2); ret = memcmp(addr1, addr2, PAGE_SIZE); - kunmap_atomic(addr2, KM_USER1); - kunmap_atomic(addr1, KM_USER0); + kunmap_atomic(addr2); + kunmap_atomic(addr1); return ret; } diff --git a/mm/memblock.c b/mm/memblock.c index 2f55f19b7c86..99f285599501 100644 --- a/mm/memblock.c +++ 
b/mm/memblock.c @@ -99,21 +99,21 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, phys_addr_t this_start, this_end, cand; u64 i; - /* align @size to avoid excessive fragmentation on reserved array */ - size = round_up(size, align); - /* pump up @end */ if (end == MEMBLOCK_ALLOC_ACCESSIBLE) end = memblock.current_limit; - /* adjust @start to avoid underflow and allocating the first page */ - start = max3(start, size, (phys_addr_t)PAGE_SIZE); + /* avoid allocating the first page */ + start = max_t(phys_addr_t, start, PAGE_SIZE); end = max(start, end); for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { this_start = clamp(this_start, start, end); this_end = clamp(this_end, start, end); + if (this_end < size) + continue; + cand = round_down(this_end - size, align); if (cand >= this_start) return cand; @@ -728,6 +728,9 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, { phys_addr_t found; + /* align @size to avoid excessive fragmentation on reserved array */ + size = round_up(size, align); + found = memblock_find_in_range_node(0, max_addr, size, align, nid); if (found && !memblock_reserve(found, size)) return found; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d87aa3510c5e..26c6f4ec20f4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -123,16 +123,22 @@ struct mem_cgroup_stat_cpu { unsigned long targets[MEM_CGROUP_NTARGETS]; }; +struct mem_cgroup_reclaim_iter { + /* css_id of the last scanned hierarchy member */ + int position; + /* scan generation, increased every round-trip */ + unsigned int generation; +}; + /* * per-zone information in memory controller. */ struct mem_cgroup_per_zone { - /* - * spin_lock to protect the per cgroup LRU - */ - struct list_head lists[NR_LRU_LISTS]; + struct lruvec lruvec; unsigned long count[NR_LRU_LISTS]; + struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; + struct zone_reclaim_stat reclaim_stat; struct rb_node tree_node; /* RB tree node */ unsigned long long usage_in_excess;/* Set to the value by which */ @@ -224,20 +230,35 @@ struct mem_cgroup { * the counter to account for memory usage */ struct res_counter res; - /* - * the counter to account for mem+swap usage. - */ - struct res_counter memsw; + + union { + /* + * the counter to account for mem+swap usage. + */ + struct res_counter memsw; + + /* + * rcu_freeing is used only when freeing struct mem_cgroup, + * so put it into a union to avoid wasting more memory. + * It must be disjoint from the css field. It could be + * in a union with the res field, but res plays a much + * larger part in mem_cgroup life than memsw, and might + * be of interest, even at time of free, when debugging. + * So share rcu_head with the less interesting memsw. + */ + struct rcu_head rcu_freeing; + /* + * But when using vfree(), that cannot be done at + * interrupt time, so we must then queue the work. + */ + struct work_struct work_freeing; + }; + /* * Per cgroup active and inactive list, similar to the * per zone LRU lists. */ struct mem_cgroup_lru_info info; - /* - * While reclaiming in a hierarchy, we cache the last child we - * reclaimed from. 
- */ - int last_scanned_child; int last_scanned_node; #if MAX_NUMNODES > 1 nodemask_t scan_nodes; @@ -366,22 +387,19 @@ enum charge_type { #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) -#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 -#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) static void mem_cgroup_get(struct mem_cgroup *memcg); static void mem_cgroup_put(struct mem_cgroup *memcg); /* Writing them here to avoid exposing memcg's inner layout */ #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM -#ifdef CONFIG_INET #include <net/sock.h> #include <net/ip.h> static bool mem_cgroup_is_root(struct mem_cgroup *memcg); void sock_update_memcg(struct sock *sk) { - if (static_branch(&memcg_socket_limit_enabled)) { + if (mem_cgroup_sockets_enabled) { struct mem_cgroup *memcg; BUG_ON(!sk->sk_prot->proto_cgroup); @@ -413,7 +431,7 @@ EXPORT_SYMBOL(sock_update_memcg); void sock_release_memcg(struct sock *sk) { - if (static_branch(&memcg_socket_limit_enabled) && sk->sk_cgrp) { + if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { struct mem_cgroup *memcg; WARN_ON(!sk->sk_cgrp->memcg); memcg = sk->sk_cgrp->memcg; @@ -421,6 +439,7 @@ void sock_release_memcg(struct sock *sk) } } +#ifdef CONFIG_INET struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) { if (!memcg || mem_cgroup_is_root(memcg)) @@ -566,7 +585,7 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) struct mem_cgroup_per_zone *mz; struct mem_cgroup_tree_per_zone *mctz; - for_each_node_state(node, N_POSSIBLE) { + for_each_node(node) { for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = mem_cgroup_zoneinfo(memcg, node, zone); mctz = soft_limit_tree_node_zone(node, zone); @@ -656,16 +675,6 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); } -void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val) -{ - this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); -} - -void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val) -{ - this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); -} - static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx) { @@ -749,37 +758,32 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, return total; } -static bool __memcg_event_check(struct mem_cgroup *memcg, int target) +static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, + enum mem_cgroup_events_target target) { unsigned long val, next; val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); next = __this_cpu_read(memcg->stat->targets[target]); /* from time_after() in jiffies.h */ - return ((long)next - (long)val < 0); -} - -static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target) -{ - unsigned long val, next; - - val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); - - switch (target) { - case MEM_CGROUP_TARGET_THRESH: - next = val + THRESHOLDS_EVENTS_TARGET; - break; - case MEM_CGROUP_TARGET_SOFTLIMIT: - next = val + SOFTLIMIT_EVENTS_TARGET; - break; - case MEM_CGROUP_TARGET_NUMAINFO: - next = val + NUMAINFO_EVENTS_TARGET; - break; - default: - return; + if ((long)next - (long)val < 0) { + switch (target) { + case MEM_CGROUP_TARGET_THRESH: + next = val + THRESHOLDS_EVENTS_TARGET; + break; + case MEM_CGROUP_TARGET_SOFTLIMIT: + next = val + SOFTLIMIT_EVENTS_TARGET; + 
break; + case MEM_CGROUP_TARGET_NUMAINFO: + next = val + NUMAINFO_EVENTS_TARGET; + break; + default: + break; + } + __this_cpu_write(memcg->stat->targets[target], next); + return true; } - - __this_cpu_write(memcg->stat->targets[target], next); + return false; } /* @@ -790,25 +794,28 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) { preempt_disable(); /* threshold event is triggered in finer grain than soft limit */ - if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) { + if (unlikely(mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_THRESH))) { + bool do_softlimit; + bool do_numainfo __maybe_unused; + + do_softlimit = mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_SOFTLIMIT); +#if MAX_NUMNODES > 1 + do_numainfo = mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_NUMAINFO); +#endif + preempt_enable(); + mem_cgroup_threshold(memcg); - __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_THRESH); - if (unlikely(__memcg_event_check(memcg, - MEM_CGROUP_TARGET_SOFTLIMIT))) { + if (unlikely(do_softlimit)) mem_cgroup_update_tree(memcg, page); - __mem_cgroup_target_update(memcg, - MEM_CGROUP_TARGET_SOFTLIMIT); - } #if MAX_NUMNODES > 1 - if (unlikely(__memcg_event_check(memcg, - MEM_CGROUP_TARGET_NUMAINFO))) { + if (unlikely(do_numainfo)) atomic_inc(&memcg->numainfo_events); - __mem_cgroup_target_update(memcg, - MEM_CGROUP_TARGET_NUMAINFO); - } #endif - } - preempt_enable(); + } else + preempt_enable(); } struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) @@ -853,83 +860,116 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return memcg; } -/* The caller has to guarantee "mem" exists before calling this */ -static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *memcg) +/** + * mem_cgroup_iter - iterate over memory cgroup hierarchy + * @root: hierarchy root + * @prev: previously returned memcg, NULL on first invocation + * @reclaim: cookie for shared reclaim walks, NULL for full walks + * + * Returns references to children of the hierarchy below @root, or + * @root itself, or %NULL after a full round-trip. + * + * Caller must pass the return value in @prev on subsequent + * invocations for reference counting, or use mem_cgroup_iter_break() + * to cancel a hierarchy walk before the round-trip is complete. + * + * Reclaimers can specify a zone and a priority level in @reclaim to + * divide up the memcgs in the hierarchy among all concurrent + * reclaimers operating on the same zone and priority. + */ +struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, + struct mem_cgroup *prev, + struct mem_cgroup_reclaim_cookie *reclaim) { - struct cgroup_subsys_state *css; - int found; + struct mem_cgroup *memcg = NULL; + int id = 0; - if (!memcg) /* ROOT cgroup has the smallest ID */ - return root_mem_cgroup; /*css_put/get against root is ignored*/ - if (!memcg->use_hierarchy) { - if (css_tryget(&memcg->css)) - return memcg; + if (mem_cgroup_disabled()) return NULL; - } - rcu_read_lock(); - /* - * searching a memory cgroup which has the smallest ID under given - * ROOT cgroup. 
(ID >= 1) - */ - css = css_get_next(&mem_cgroup_subsys, 1, &memcg->css, &found); - if (css && css_tryget(css)) - memcg = container_of(css, struct mem_cgroup, css); - else - memcg = NULL; - rcu_read_unlock(); - return memcg; -} -static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, - struct mem_cgroup *root, - bool cond) -{ - int nextid = css_id(&iter->css) + 1; - int found; - int hierarchy_used; - struct cgroup_subsys_state *css; + if (!root) + root = root_mem_cgroup; - hierarchy_used = iter->use_hierarchy; + if (prev && !reclaim) + id = css_id(&prev->css); - css_put(&iter->css); - /* If no ROOT, walk all, ignore hierarchy */ - if (!cond || (root && !hierarchy_used)) - return NULL; + if (prev && prev != root) + css_put(&prev->css); - if (!root) - root = root_mem_cgroup; + if (!root->use_hierarchy && root != root_mem_cgroup) { + if (prev) + return NULL; + return root; + } - do { - iter = NULL; - rcu_read_lock(); + while (!memcg) { + struct mem_cgroup_reclaim_iter *uninitialized_var(iter); + struct cgroup_subsys_state *css; + + if (reclaim) { + int nid = zone_to_nid(reclaim->zone); + int zid = zone_idx(reclaim->zone); + struct mem_cgroup_per_zone *mz; - css = css_get_next(&mem_cgroup_subsys, nextid, - &root->css, &found); - if (css && css_tryget(css)) - iter = container_of(css, struct mem_cgroup, css); + mz = mem_cgroup_zoneinfo(root, nid, zid); + iter = &mz->reclaim_iter[reclaim->priority]; + if (prev && reclaim->generation != iter->generation) + return NULL; + id = iter->position; + } + + rcu_read_lock(); + css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); + if (css) { + if (css == &root->css || css_tryget(css)) + memcg = container_of(css, + struct mem_cgroup, css); + } else + id = 0; rcu_read_unlock(); - /* If css is NULL, no more cgroups will be found */ - nextid = found + 1; - } while (css && !iter); - return iter; + if (reclaim) { + iter->position = id; + if (!css) + iter->generation++; + else if (!prev && memcg) + reclaim->generation = iter->generation; + } + + if (prev && !css) + return NULL; + } + return memcg; +} + +/** + * mem_cgroup_iter_break - abort a hierarchy walk prematurely + * @root: hierarchy root + * @prev: last visited hierarchy member as returned by mem_cgroup_iter() + */ +void mem_cgroup_iter_break(struct mem_cgroup *root, + struct mem_cgroup *prev) +{ + if (!root) + root = root_mem_cgroup; + if (prev && prev != root) + css_put(&prev->css); } + /* - * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please - * be careful that "break" loop is not allowed. We have reference count. - * Instead of that modify "cond" to be false and "continue" to exit the loop. + * Iteration constructs for visiting all cgroups (under a tree). If + * loops are exited prematurely (break), mem_cgroup_iter_break() must + * be used for reference counting. 
*/ -#define for_each_mem_cgroup_tree_cond(iter, root, cond) \ - for (iter = mem_cgroup_start_loop(root);\ - iter != NULL;\ - iter = mem_cgroup_get_next(iter, root, cond)) - -#define for_each_mem_cgroup_tree(iter, root) \ - for_each_mem_cgroup_tree_cond(iter, root, true) - -#define for_each_mem_cgroup_all(iter) \ - for_each_mem_cgroup_tree_cond(iter, NULL, true) +#define for_each_mem_cgroup_tree(iter, root) \ + for (iter = mem_cgroup_iter(root, NULL, NULL); \ + iter != NULL; \ + iter = mem_cgroup_iter(root, iter, NULL)) +#define for_each_mem_cgroup(iter) \ + for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ + iter != NULL; \ + iter = mem_cgroup_iter(NULL, iter, NULL)) static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { @@ -949,11 +989,11 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) goto out; switch (idx) { - case PGMAJFAULT: - mem_cgroup_pgmajfault(memcg, 1); - break; case PGFAULT: - mem_cgroup_pgfault(memcg, 1); + this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); + break; + case PGMAJFAULT: + this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); break; default: BUG(); @@ -963,6 +1003,27 @@ out: } EXPORT_SYMBOL(mem_cgroup_count_vm_event); +/** + * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg + * @zone: zone of the wanted lruvec + * @mem: memcg of the wanted lruvec + * + * Returns the lru list vector holding pages for the given @zone and + * @mem. This can be the global zone lruvec, if the memory controller + * is disabled. + */ +struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, + struct mem_cgroup *memcg) +{ + struct mem_cgroup_per_zone *mz; + + if (mem_cgroup_disabled()) + return &zone->lruvec; + + mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); + return &mz->lruvec; +} + /* * Following LRU functions are allowed to be used without PCG_LOCK. * Operations are called by routine of global LRU independently from memcg. @@ -977,180 +1038,104 @@ EXPORT_SYMBOL(mem_cgroup_count_vm_event); * When moving account, the page is not on LRU. It's isolated. */ -void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) -{ - struct page_cgroup *pc; - struct mem_cgroup_per_zone *mz; - - if (mem_cgroup_disabled()) - return; - pc = lookup_page_cgroup(page); - /* can happen while we handle swapcache. */ - if (!TestClearPageCgroupAcctLRU(pc)) - return; - VM_BUG_ON(!pc->mem_cgroup); - /* - * We don't check PCG_USED bit. It's cleared when the "page" is finally - * removed from global LRU. - */ - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - /* huge page split is done under lru_lock. so, we have no races. */ - MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); - if (mem_cgroup_is_root(pc->mem_cgroup)) - return; - VM_BUG_ON(list_empty(&pc->lru)); - list_del_init(&pc->lru); -} - -void mem_cgroup_del_lru(struct page *page) -{ - mem_cgroup_del_lru_list(page, page_lru(page)); -} - -/* - * Writeback is about to end against a page which has been marked for immediate - * reclaim. If it still appears to be reclaimable, move it to the tail of the - * inactive list. +/** + * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec + * @zone: zone of the page + * @page: the page + * @lru: current lru + * + * This function accounts for @page being added to @lru, and returns + * the lruvec for the given @zone and the memcg @page is charged to. + * + * The callsite is then responsible for physically linking the page to + * the returned lruvec->lists[@lru]. 
*/ -void mem_cgroup_rotate_reclaimable_page(struct page *page) -{ - struct mem_cgroup_per_zone *mz; - struct page_cgroup *pc; - enum lru_list lru = page_lru(page); - - if (mem_cgroup_disabled()) - return; - - pc = lookup_page_cgroup(page); - /* unused or root page is not rotated. */ - if (!PageCgroupUsed(pc)) - return; - /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ - smp_rmb(); - if (mem_cgroup_is_root(pc->mem_cgroup)) - return; - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - list_move_tail(&pc->lru, &mz->lists[lru]); -} - -void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) +struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, + enum lru_list lru) { struct mem_cgroup_per_zone *mz; + struct mem_cgroup *memcg; struct page_cgroup *pc; if (mem_cgroup_disabled()) - return; + return &zone->lruvec; pc = lookup_page_cgroup(page); - /* unused or root page is not rotated. */ - if (!PageCgroupUsed(pc)) - return; - /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ - smp_rmb(); - if (mem_cgroup_is_root(pc->mem_cgroup)) - return; - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - list_move(&pc->lru, &mz->lists[lru]); -} - -void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) -{ - struct page_cgroup *pc; - struct mem_cgroup_per_zone *mz; + memcg = pc->mem_cgroup; - if (mem_cgroup_disabled()) - return; - pc = lookup_page_cgroup(page); - VM_BUG_ON(PageCgroupAcctLRU(pc)); /* - * putback: charge: - * SetPageLRU SetPageCgroupUsed - * smp_mb smp_mb - * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU + * Surreptitiously switch any uncharged page to root: + * an uncharged page off lru does nothing to secure + * its former mem_cgroup from sudden removal. * - * Ensure that one of the two sides adds the page to the memcg - * LRU during a race. + * Our caller holds lru_lock, and PageCgroupUsed is updated + * under page_cgroup lock: between them, they make all uses + * of pc->mem_cgroup safe. */ - smp_mb(); - if (!PageCgroupUsed(pc)) - return; - /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ - smp_rmb(); - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - /* huge page split is done under lru_lock. so, we have no races. */ + if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup) + pc->mem_cgroup = memcg = root_mem_cgroup; + + mz = page_cgroup_zoneinfo(memcg, page); + /* compound_order() is stabilized through lru_lock */ MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); - SetPageCgroupAcctLRU(pc); - if (mem_cgroup_is_root(pc->mem_cgroup)) - return; - list_add(&pc->lru, &mz->lists[lru]); + return &mz->lruvec; } -/* - * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed - * while it's linked to lru because the page may be reused after it's fully - * uncharged. To handle that, unlink page_cgroup from LRU when charge it again. - * It's done under lock_page and expected that zone->lru_lock isnever held. +/** + * mem_cgroup_lru_del_list - account for removing an lru page + * @page: the page + * @lru: target lru + * + * This function accounts for @page being removed from @lru. + * + * The callsite is then responsible for physically unlinking + * @page->lru. 
*/ -static void mem_cgroup_lru_del_before_commit(struct page *page) +void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) { - unsigned long flags; - struct zone *zone = page_zone(page); - struct page_cgroup *pc = lookup_page_cgroup(page); + struct mem_cgroup_per_zone *mz; + struct mem_cgroup *memcg; + struct page_cgroup *pc; - /* - * Doing this check without taking ->lru_lock seems wrong but this - * is safe. Because if page_cgroup's USED bit is unset, the page - * will not be added to any memcg's LRU. If page_cgroup's USED bit is - * set, the commit after this will fail, anyway. - * This all charge/uncharge is done under some mutual execustion. - * So, we don't need to taking care of changes in USED bit. - */ - if (likely(!PageLRU(page))) + if (mem_cgroup_disabled()) return; - spin_lock_irqsave(&zone->lru_lock, flags); - /* - * Forget old LRU when this page_cgroup is *not* used. This Used bit - * is guarded by lock_page() because the page is SwapCache. - */ - if (!PageCgroupUsed(pc)) - mem_cgroup_del_lru_list(page, page_lru(page)); - spin_unlock_irqrestore(&zone->lru_lock, flags); + pc = lookup_page_cgroup(page); + memcg = pc->mem_cgroup; + VM_BUG_ON(!memcg); + mz = page_cgroup_zoneinfo(memcg, page); + /* huge page split is done under lru_lock. so, we have no races. */ + VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page))); + MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); } -static void mem_cgroup_lru_add_after_commit(struct page *page) +void mem_cgroup_lru_del(struct page *page) { - unsigned long flags; - struct zone *zone = page_zone(page); - struct page_cgroup *pc = lookup_page_cgroup(page); - /* - * putback: charge: - * SetPageLRU SetPageCgroupUsed - * smp_mb smp_mb - * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU - * - * Ensure that one of the two sides adds the page to the memcg - * LRU during a race. - */ - smp_mb(); - /* taking care of that the page is added to LRU while we commit it */ - if (likely(!PageLRU(page))) - return; - spin_lock_irqsave(&zone->lru_lock, flags); - /* link when the page is linked to LRU but page_cgroup isn't */ - if (PageLRU(page) && !PageCgroupAcctLRU(pc)) - mem_cgroup_add_lru_list(page, page_lru(page)); - spin_unlock_irqrestore(&zone->lru_lock, flags); + mem_cgroup_lru_del_list(page, page_lru(page)); } - -void mem_cgroup_move_lists(struct page *page, - enum lru_list from, enum lru_list to) +/** + * mem_cgroup_lru_move_lists - account for moving a page between lrus + * @zone: zone of the page + * @page: the page + * @from: current lru + * @to: target lru + * + * This function accounts for @page being moved between the lrus @from + * and @to, and returns the lruvec for the given @zone and the memcg + * @page is charged to. + * + * The callsite is then responsible for physically relinking + * @page->lru to the returned lruvec->lists[@to]. 
+ */ +struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone, + struct page *page, + enum lru_list from, + enum lru_list to) { - if (mem_cgroup_disabled()) - return; - mem_cgroup_del_lru_list(page, from); - mem_cgroup_add_lru_list(page, to); + /* XXX: Optimize this, especially for @from == @to */ + mem_cgroup_lru_del_list(page, from); + return mem_cgroup_lru_add_list(zone, page, to); } /* @@ -1175,10 +1160,21 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) struct task_struct *p; p = find_lock_task_mm(task); - if (!p) - return 0; - curr = try_get_mem_cgroup_from_mm(p->mm); - task_unlock(p); + if (p) { + curr = try_get_mem_cgroup_from_mm(p->mm); + task_unlock(p); + } else { + /* + * All threads may have already detached their mm's, but the oom + * killer still needs to detect if they have already been oom + * killed to prevent needlessly killing additional tasks. + */ + task_lock(task); + curr = mem_cgroup_from_task(task); + if (curr) + css_get(&curr->css); + task_unlock(task); + } if (!curr) return 0; /* @@ -1258,68 +1254,6 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) return &mz->reclaim_stat; } -unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, - struct list_head *dst, - unsigned long *scanned, int order, - isolate_mode_t mode, - struct zone *z, - struct mem_cgroup *mem_cont, - int active, int file) -{ - unsigned long nr_taken = 0; - struct page *page; - unsigned long scan; - LIST_HEAD(pc_list); - struct list_head *src; - struct page_cgroup *pc, *tmp; - int nid = zone_to_nid(z); - int zid = zone_idx(z); - struct mem_cgroup_per_zone *mz; - int lru = LRU_FILE * file + active; - int ret; - - BUG_ON(!mem_cont); - mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); - src = &mz->lists[lru]; - - scan = 0; - list_for_each_entry_safe_reverse(pc, tmp, src, lru) { - if (scan >= nr_to_scan) - break; - - if (unlikely(!PageCgroupUsed(pc))) - continue; - - page = lookup_cgroup_page(pc); - - if (unlikely(!PageLRU(page))) - continue; - - scan++; - ret = __isolate_lru_page(page, mode, file); - switch (ret) { - case 0: - list_move(&page->lru, dst); - mem_cgroup_del_lru(page); - nr_taken += hpage_nr_pages(page); - break; - case -EBUSY: - /* we don't affect global LRU but rotate in our LRU */ - mem_cgroup_rotate_lru_list(page, page_lru(page)); - break; - default: - break; - } - } - - *scanned = scan; - - trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken, - 0, 0, 0, mode); - - return nr_taken; -} - #define mem_cgroup_from_res_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) @@ -1536,41 +1470,40 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) return min(limit, memsw); } -/* - * Visit the first child (need not be the first child as per the ordering - * of the cgroup list, since we track last_scanned_child) of @mem and use - * that to reclaim free pages from. 
- */ -static struct mem_cgroup * -mem_cgroup_select_victim(struct mem_cgroup *root_memcg) +static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, + gfp_t gfp_mask, + unsigned long flags) { - struct mem_cgroup *ret = NULL; - struct cgroup_subsys_state *css; - int nextid, found; - - if (!root_memcg->use_hierarchy) { - css_get(&root_memcg->css); - ret = root_memcg; - } + unsigned long total = 0; + bool noswap = false; + int loop; - while (!ret) { - rcu_read_lock(); - nextid = root_memcg->last_scanned_child + 1; - css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css, - &found); - if (css && css_tryget(css)) - ret = container_of(css, struct mem_cgroup, css); + if (flags & MEM_CGROUP_RECLAIM_NOSWAP) + noswap = true; + if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum) + noswap = true; - rcu_read_unlock(); - /* Updates scanning parameter */ - if (!css) { - /* this means start scan from ID:1 */ - root_memcg->last_scanned_child = 0; - } else - root_memcg->last_scanned_child = found; + for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) { + if (loop) + drain_all_stock_async(memcg); + total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap); + /* + * Allow limit shrinkers, which are triggered directly + * by userspace, to catch signals and stop reclaim + * after minimal progress, regardless of the margin. + */ + if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK)) + break; + if (mem_cgroup_margin(memcg)) + break; + /* + * If nothing was reclaimed after two attempts, there + * may be no reclaimable pages in this hierarchy. + */ + if (loop && !total) + break; } - - return ret; + return total; } /** @@ -1710,61 +1643,35 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) } #endif -/* - * Scan the hierarchy if needed to reclaim memory. We remember the last child - * we reclaimed from, so that we don't end up penalizing one child extensively - * based on its position in the children list. - * - * root_memcg is the original ancestor that we've been reclaim from. - * - * We give up and return to the caller when we visit root_memcg twice. - * (other groups can be removed while we're walking....) - * - * If shrink==true, for avoiding to free too much, this returns immedieately. - */ -static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, - struct zone *zone, - gfp_t gfp_mask, - unsigned long reclaim_options, - unsigned long *total_scanned) -{ - struct mem_cgroup *victim; - int ret, total = 0; +static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, + struct zone *zone, + gfp_t gfp_mask, + unsigned long *total_scanned) +{ + struct mem_cgroup *victim = NULL; + int total = 0; int loop = 0; - bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; - bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; - bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; unsigned long excess; unsigned long nr_scanned; + struct mem_cgroup_reclaim_cookie reclaim = { + .zone = zone, + .priority = 0, + }; excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; - /* If memsw_is_minimum==1, swap-out is of-no-use. */ - if (!check_soft && !shrink && root_memcg->memsw_is_minimum) - noswap = true; - while (1) { - victim = mem_cgroup_select_victim(root_memcg); - if (victim == root_memcg) { + victim = mem_cgroup_iter(root_memcg, victim, &reclaim); + if (!victim) { loop++; - /* - * We are not draining per cpu cached charges during - * soft limit reclaim because global reclaim doesn't - * care about charges. 
It tries to free some memory and - * charges will not give any. - */ - if (!check_soft && loop >= 1) - drain_all_stock_async(root_memcg); if (loop >= 2) { /* * If we have not been able to reclaim * anything, it might because there are * no reclaimable pages under this hierarchy */ - if (!check_soft || !total) { - css_put(&victim->css); + if (!total) break; - } /* * We want to do more targeted reclaim. * excess >> 2 is not to excessive so as to @@ -1772,40 +1679,20 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, * coming back to reclaim from this cgroup */ if (total >= (excess >> 2) || - (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { - css_put(&victim->css); + (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) break; - } } - } - if (!mem_cgroup_reclaimable(victim, noswap)) { - /* this cgroup's local usage == 0 */ - css_put(&victim->css); continue; } - /* we use swappiness of local cgroup */ - if (check_soft) { - ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, - noswap, zone, &nr_scanned); - *total_scanned += nr_scanned; - } else - ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, - noswap); - css_put(&victim->css); - /* - * At shrinking usage, we can't check we should stop here or - * reclaim more. It's depends on callers. last_scanned_child - * will work enough for keeping fairness under tree. - */ - if (shrink) - return ret; - total += ret; - if (check_soft) { - if (!res_counter_soft_limit_excess(&root_memcg->res)) - return total; - } else if (mem_cgroup_margin(root_memcg)) - return total; + if (!mem_cgroup_reclaimable(victim, false)) + continue; + total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, + zone, &nr_scanned); + *total_scanned += nr_scanned; + if (!res_counter_soft_limit_excess(&root_memcg->res)) + break; } + mem_cgroup_iter_break(root_memcg, victim); return total; } @@ -1817,16 +1704,16 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) { struct mem_cgroup *iter, *failed = NULL; - bool cond = true; - for_each_mem_cgroup_tree_cond(iter, memcg, cond) { + for_each_mem_cgroup_tree(iter, memcg) { if (iter->oom_lock) { /* * this subtree of our hierarchy is already locked * so we cannot give a lock. 
*/ failed = iter; - cond = false; + mem_cgroup_iter_break(memcg, iter); + break; } else iter->oom_lock = true; } @@ -1838,11 +1725,10 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) * OK, we failed to lock the whole subtree so we have to clean up * what we set up to the failing subtree */ - cond = true; - for_each_mem_cgroup_tree_cond(iter, memcg, cond) { + for_each_mem_cgroup_tree(iter, memcg) { if (iter == failed) { - cond = false; - continue; + mem_cgroup_iter_break(memcg, iter); + break; } iter->oom_lock = false; } @@ -2007,7 +1893,7 @@ void mem_cgroup_update_page_stat(struct page *page, bool need_unlock = false; unsigned long uninitialized_var(flags); - if (unlikely(!pc)) + if (mem_cgroup_disabled()) return; rcu_read_lock(); @@ -2238,7 +2124,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, struct mem_cgroup *iter; if ((action == CPU_ONLINE)) { - for_each_mem_cgroup_all(iter) + for_each_mem_cgroup(iter) synchronize_mem_cgroup_on_move(iter, cpu); return NOTIFY_OK; } @@ -2246,7 +2132,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) return NOTIFY_OK; - for_each_mem_cgroup_all(iter) + for_each_mem_cgroup(iter) mem_cgroup_drain_pcp_counter(iter, cpu); stock = &per_cpu(memcg_stock, cpu); @@ -2300,8 +2186,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, if (!(gfp_mask & __GFP_WAIT)) return CHARGE_WOULDBLOCK; - ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, - gfp_mask, flags, NULL); + ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) return CHARGE_RETRY; /* @@ -2334,8 +2219,25 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, } /* - * Unlike exported interface, "oom" parameter is added. if oom==true, - * oom-killer can be invoked. + * __mem_cgroup_try_charge() does + * 1. detect memcg to be charged against from passed *mm and *ptr, + * 2. update res_counter + * 3. call memory reclaim if necessary. + * + * In some special case, if the task is fatal, fatal_signal_pending() or + * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup + * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon + * as possible without any hazards. 2: all pages should have a valid + * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg + * pointer, that is treated as a charge to root_mem_cgroup. + * + * So __mem_cgroup_try_charge() will return + * 0 ... on success, filling *ptr with a valid memcg pointer. + * -ENOMEM ... charge failure because of resource limits. + * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup. + * + * Unlike the exported interface, an "oom" parameter is added. if oom==true, + * the oom-killer can be invoked. */ static int __mem_cgroup_try_charge(struct mm_struct *mm, gfp_t gfp_mask, @@ -2364,7 +2266,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, * set, if so charge the init_mm (happens for pagecache usage). */ if (!*ptr && !mm) - goto bypass; + *ptr = root_mem_cgroup; again: if (*ptr) { /* css should be a valid one */ memcg = *ptr; @@ -2390,7 +2292,9 @@ again: * task-struct. So, mm->owner can be NULL. 
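
The comment block added above documents the three outcomes of __mem_cgroup_try_charge(): 0 with a valid memcg, -ENOMEM on a real limit failure, and -EINTR with *ptr pointed at root_mem_cgroup for fatal or OOM-killed tasks. A small self-contained sketch of how a caller treats those codes (names and the stub are illustrative); the swap-in charge paths later in this patch fold -EINTR back into 0 in exactly this way:

#include <errno.h>
#include <stdio.h>

struct memcg { const char *name; };
static struct memcg root_memcg = { "root" };    /* stand-in for root_mem_cgroup */

/* stub with the documented contract: 0, -ENOMEM, or -EINTR with root filled in */
static int try_charge(struct memcg **out, int simulate_fatal)
{
	if (simulate_fatal) {
		*out = &root_memcg;
		return -EINTR;
	}
	*out = &root_memcg;     /* pretend the normal lookup and charge succeeded */
	return 0;
}

static int charge_for_swapin(struct memcg **out, int fatal)
{
	int ret = try_charge(out, fatal);

	/* fatal task: the charge was bypassed to root, treat it as success */
	if (ret == -EINTR)
		ret = 0;
	return ret;             /* 0 on success, -ENOMEM on a real failure */
}

int main(void)
{
	struct memcg *m;
	printf("fatal path -> %d, charged to %s\n", charge_for_swapin(&m, 1), m->name);
	return 0;
}
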
*/ memcg = mem_cgroup_from_task(p); - if (!memcg || mem_cgroup_is_root(memcg)) { + if (!memcg) + memcg = root_mem_cgroup; + if (mem_cgroup_is_root(memcg)) { rcu_read_unlock(); goto done; } @@ -2465,8 +2369,8 @@ nomem: *ptr = NULL; return -ENOMEM; bypass: - *ptr = NULL; - return 0; + *ptr = root_mem_cgroup; + return -EINTR; } /* @@ -2522,7 +2426,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) memcg = NULL; } else if (PageSwapCache(page)) { ent.val = page_private(page); - id = lookup_swap_cgroup(ent); + id = lookup_swap_cgroup_id(ent); rcu_read_lock(); memcg = mem_cgroup_lookup(id); if (memcg && !css_tryget(&memcg->css)) @@ -2537,8 +2441,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, struct page *page, unsigned int nr_pages, struct page_cgroup *pc, - enum charge_type ctype) + enum charge_type ctype, + bool lrucare) { + struct zone *uninitialized_var(zone); + bool was_on_lru = false; + lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); @@ -2549,6 +2457,21 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, * we don't need page_cgroup_lock about tail pages, becase they are not * accessed by any other context at this point. */ + + /* + * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page + * may already be on some other mem_cgroup's LRU. Take care of it. + */ + if (lrucare) { + zone = page_zone(page); + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page)) { + ClearPageLRU(page); + del_page_from_lru_list(zone, page, page_lru(page)); + was_on_lru = true; + } + } + pc->mem_cgroup = memcg; /* * We access a page_cgroup asynchronously without lock_page_cgroup(). @@ -2572,8 +2495,18 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, break; } + if (lrucare) { + if (was_on_lru) { + VM_BUG_ON(PageLRU(page)); + SetPageLRU(page); + add_page_to_lru_list(zone, page, page_lru(page)); + } + spin_unlock_irq(&zone->lru_lock); + } + mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); unlock_page_cgroup(pc); + /* * "charge_statistics" updated event counter. Then, check it. * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. @@ -2585,44 +2518,29 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ - (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) + (1 << PCG_MIGRATION)) /* * Because tail pages are not marked as "used", set it. We're under - * zone->lru_lock, 'splitting on pmd' and compund_lock. + * zone->lru_lock, 'splitting on pmd' and compound_lock. + * charge/uncharge will be never happen and move_account() is done under + * compound_lock(), so we don't have to take care of races. */ -void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) +void mem_cgroup_split_huge_fixup(struct page *head) { struct page_cgroup *head_pc = lookup_page_cgroup(head); - struct page_cgroup *tail_pc = lookup_page_cgroup(tail); - unsigned long flags; + struct page_cgroup *pc; + int i; if (mem_cgroup_disabled()) return; - /* - * We have no races with charge/uncharge but will have races with - * page state accounting. 
- */ - move_lock_page_cgroup(head_pc, &flags); - - tail_pc->mem_cgroup = head_pc->mem_cgroup; - smp_wmb(); /* see __commit_charge() */ - if (PageCgroupAcctLRU(head_pc)) { - enum lru_list lru; - struct mem_cgroup_per_zone *mz; - - /* - * LRU flags cannot be copied because we need to add tail - *.page to LRU by generic call and our hook will be called. - * We hold lru_lock, then, reduce counter directly. - */ - lru = page_lru(head); - mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); - MEM_CGROUP_ZSTAT(mz, lru) -= 1; + for (i = 1; i < HPAGE_PMD_NR; i++) { + pc = head_pc + i; + pc->mem_cgroup = head_pc->mem_cgroup; + smp_wmb();/* see __commit_charge() */ + pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; } - tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; - move_unlock_page_cgroup(head_pc, &flags); } -#endif +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /** * mem_cgroup_move_account - move account of the page @@ -2737,7 +2655,7 @@ static int mem_cgroup_move_parent(struct page *page, parent = mem_cgroup_from_cont(pcg); ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); - if (ret || !parent) + if (ret) goto put_back; if (nr_pages > 1) @@ -2783,13 +2701,10 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, } pc = lookup_page_cgroup(page); - BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ - ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); - if (ret || !memcg) + if (ret == -ENOMEM) return ret; - - __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype); + __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype, false); return 0; } @@ -2798,45 +2713,22 @@ int mem_cgroup_newpage_charge(struct page *page, { if (mem_cgroup_disabled()) return 0; - /* - * If already mapped, we don't have to account. - * If page cache, page->mapping has address_space. - * But page->mapping may have out-of-use anon_vma pointer, - * detecit it by PageAnon() check. newly-mapped-anon's page->mapping - * is NULL. - */ - if (page_mapped(page) || (page->mapping && !PageAnon(page))) - return 0; - if (unlikely(!mm)) - mm = &init_mm; + VM_BUG_ON(page_mapped(page)); + VM_BUG_ON(page->mapping && !PageAnon(page)); + VM_BUG_ON(!mm); return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_MAPPED); + MEM_CGROUP_CHARGE_TYPE_MAPPED); } static void __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, enum charge_type ctype); -static void -__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg, - enum charge_type ctype) -{ - struct page_cgroup *pc = lookup_page_cgroup(page); - /* - * In some case, SwapCache, FUSE(splice_buf->radixtree), the page - * is already on LRU. It means the page may on some other page_cgroup's - * LRU. Take care of it. 
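
The rewritten mem_cgroup_split_huge_fixup() above walks the tail page_cgroups and gives each the head's memcg plus the head's flags with PCGF_NOCOPY_AT_SPLIT masked out. The masking idiom in isolation (the bit values below are made up for the sketch, not the kernel's PCG_* bits):

#include <stdio.h>

#define F_LOCK       (1u << 0)      /* illustrative stand-ins */
#define F_MOVE_LOCK  (1u << 1)
#define F_MIGRATION  (1u << 2)
#define F_USED       (1u << 3)

#define NOCOPY_AT_SPLIT (F_LOCK | F_MOVE_LOCK | F_MIGRATION)

struct pc { unsigned int flags; void *memcg; };

static void split_fixup(struct pc *head, struct pc *tails, int ntails)
{
	for (int i = 0; i < ntails; i++) {
		tails[i].memcg = head->memcg;
		/* inherit everything except the per-page transient bits */
		tails[i].flags = head->flags & ~NOCOPY_AT_SPLIT;
	}
}

int main(void)
{
	struct pc head = { F_USED | F_LOCK, (void *)&head };
	struct pc tails[3] = { { 0 } };

	split_fixup(&head, tails, 3);
	printf("tail flags %#x (lock bit dropped)\n", tails[0].flags);
	return 0;
}
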
- */ - mem_cgroup_lru_del_before_commit(page); - __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype); - mem_cgroup_lru_add_after_commit(page); - return; -} - int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { struct mem_cgroup *memcg = NULL; + enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; int ret; if (mem_cgroup_disabled()) @@ -2846,31 +2738,16 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (unlikely(!mm)) mm = &init_mm; + if (!page_is_file_cache(page)) + type = MEM_CGROUP_CHARGE_TYPE_SHMEM; - if (page_is_file_cache(page)) { - ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true); - if (ret || !memcg) - return ret; - - /* - * FUSE reuses pages without going through the final - * put that would remove them from the LRU list, make - * sure that they get relinked properly. - */ - __mem_cgroup_commit_charge_lrucare(page, memcg, - MEM_CGROUP_CHARGE_TYPE_CACHE); - return ret; - } - /* shmem */ - if (PageSwapCache(page)) { + if (!PageSwapCache(page)) + ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); + else { /* page is swapcache/shmem */ ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); if (!ret) - __mem_cgroup_commit_charge_swapin(page, memcg, - MEM_CGROUP_CHARGE_TYPE_SHMEM); - } else - ret = mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_SHMEM); - + __mem_cgroup_commit_charge_swapin(page, memcg, type); + } return ret; } @@ -2882,12 +2759,12 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, */ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, - gfp_t mask, struct mem_cgroup **ptr) + gfp_t mask, struct mem_cgroup **memcgp) { struct mem_cgroup *memcg; int ret; - *ptr = NULL; + *memcgp = NULL; if (mem_cgroup_disabled()) return 0; @@ -2905,27 +2782,35 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, memcg = try_get_mem_cgroup_from_page(page); if (!memcg) goto charge_cur_mm; - *ptr = memcg; - ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); + *memcgp = memcg; + ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true); css_put(&memcg->css); + if (ret == -EINTR) + ret = 0; return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; - return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); + ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); + if (ret == -EINTR) + ret = 0; + return ret; } static void -__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, +__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, enum charge_type ctype) { + struct page_cgroup *pc; + if (mem_cgroup_disabled()) return; - if (!ptr) + if (!memcg) return; - cgroup_exclude_rmdir(&ptr->css); + cgroup_exclude_rmdir(&memcg->css); - __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); + pc = lookup_page_cgroup(page); + __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype, true); /* * Now swap is on-memory. This means this page may be * counted both as mem and swap....double count. @@ -2935,21 +2820,22 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, */ if (do_swap_account && PageSwapCache(page)) { swp_entry_t ent = {.val = page_private(page)}; + struct mem_cgroup *swap_memcg; unsigned short id; - struct mem_cgroup *memcg; id = swap_cgroup_record(ent, 0); rcu_read_lock(); - memcg = mem_cgroup_lookup(id); - if (memcg) { + swap_memcg = mem_cgroup_lookup(id); + if (swap_memcg) { /* * This recorded memcg can be obsolete one. 
So, avoid * calling css_tryget */ - if (!mem_cgroup_is_root(memcg)) - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); - mem_cgroup_swap_statistics(memcg, false); - mem_cgroup_put(memcg); + if (!mem_cgroup_is_root(swap_memcg)) + res_counter_uncharge(&swap_memcg->memsw, + PAGE_SIZE); + mem_cgroup_swap_statistics(swap_memcg, false); + mem_cgroup_put(swap_memcg); } rcu_read_unlock(); } @@ -2958,13 +2844,14 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, * So, rmdir()->pre_destroy() can be called while we do this charge. * In that case, we need to call pre_destroy() again. check it here. */ - cgroup_release_and_wakeup_rmdir(&ptr->css); + cgroup_release_and_wakeup_rmdir(&memcg->css); } -void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) +void mem_cgroup_commit_charge_swapin(struct page *page, + struct mem_cgroup *memcg) { - __mem_cgroup_commit_charge_swapin(page, ptr, - MEM_CGROUP_CHARGE_TYPE_MAPPED); + __mem_cgroup_commit_charge_swapin(page, memcg, + MEM_CGROUP_CHARGE_TYPE_MAPPED); } void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) @@ -3054,7 +2941,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) * Check if our page_cgroup is valid */ pc = lookup_page_cgroup(page); - if (unlikely(!pc || !PageCgroupUsed(pc))) + if (unlikely(!PageCgroupUsed(pc))) return NULL; lock_page_cgroup(pc); @@ -3117,8 +3004,7 @@ void mem_cgroup_uncharge_page(struct page *page) /* early check. */ if (page_mapped(page)) return; - if (page->mapping && !PageAnon(page)) - return; + VM_BUG_ON(page->mapping && !PageAnon(page)); __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); } @@ -3293,14 +3179,14 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, * page belongs to. */ int mem_cgroup_prepare_migration(struct page *page, - struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) + struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask) { struct mem_cgroup *memcg = NULL; struct page_cgroup *pc; enum charge_type ctype; int ret = 0; - *ptr = NULL; + *memcgp = NULL; VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) @@ -3351,10 +3237,10 @@ int mem_cgroup_prepare_migration(struct page *page, if (!memcg) return 0; - *ptr = memcg; - ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); + *memcgp = memcg; + ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false); css_put(&memcg->css);/* drop extra refcnt */ - if (ret || *ptr == NULL) { + if (ret) { if (PageAnon(page)) { lock_page_cgroup(pc); ClearPageCgroupMigration(pc); @@ -3364,6 +3250,7 @@ int mem_cgroup_prepare_migration(struct page *page, */ mem_cgroup_uncharge_page(page); } + /* we'll need to revisit this error code (we have -EINTR) */ return -ENOMEM; } /* @@ -3379,7 +3266,7 @@ int mem_cgroup_prepare_migration(struct page *page, ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; else ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; - __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype); + __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype, false); return ret; } @@ -3432,12 +3319,51 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, cgroup_release_and_wakeup_rmdir(&memcg->css); } +/* + * At replace page cache, newpage is not under any memcg but it's on + * LRU. So, this function doesn't touch res_counter but handles LRU + * in correct way. Both pages are locked so we cannot race with uncharge. 
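
The function this comment introduces, mem_cgroup_replace_page_cache() (added just below), only moves the accounting: it decrements the cache statistics against the old page's memcg, clears its Used bit, and then commits the new page with lrucare set because the new page may already sit on an LRU. The shape of that handoff reduced to counters (a user-space sketch, not the kernel helpers):

#include <stdio.h>

struct memcg { long cache_pages; };
struct page_info { struct memcg *memcg; int used; };

/* uncharge side: drop the old page's contribution, res_counter untouched */
static void uncharge_old(struct page_info *old)
{
	old->memcg->cache_pages -= 1;
	old->used = 0;
}

/* commit side: the new page inherits the same memcg and is re-counted */
static void commit_new(struct page_info *newp, struct memcg *memcg)
{
	newp->memcg = memcg;
	newp->used = 1;
	memcg->cache_pages += 1;
}

int main(void)
{
	struct memcg m = { .cache_pages = 1 };
	struct page_info oldp = { &m, 1 }, newp = { 0 };

	struct memcg *owner = oldp.memcg;   /* read under the old pc lock in the kernel */
	uncharge_old(&oldp);
	commit_new(&newp, owner);
	printf("cache_pages=%ld, new page used=%d\n", m.cache_pages, newp.used);
	return 0;
}
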
+ */ +void mem_cgroup_replace_page_cache(struct page *oldpage, + struct page *newpage) +{ + struct mem_cgroup *memcg; + struct page_cgroup *pc; + enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; + + if (mem_cgroup_disabled()) + return; + + pc = lookup_page_cgroup(oldpage); + /* fix accounting on old pages */ + lock_page_cgroup(pc); + memcg = pc->mem_cgroup; + mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1); + ClearPageCgroupUsed(pc); + unlock_page_cgroup(pc); + + if (PageSwapBacked(oldpage)) + type = MEM_CGROUP_CHARGE_TYPE_SHMEM; + + /* + * Even if newpage->mapping was NULL before starting replacement, + * the newpage may be on LRU(or pagevec for LRU) already. We lock + * LRU while we overwrite pc->mem_cgroup. + */ + __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type, true); +} + #ifdef CONFIG_DEBUG_VM static struct page_cgroup *lookup_page_cgroup_used(struct page *page) { struct page_cgroup *pc; pc = lookup_page_cgroup(page); + /* + * Can be NULL while feeding pages into the page allocator for + * the first time, i.e. during boot or memory hotplug; + * or when mem_cgroup_disabled(). + */ if (likely(pc) && PageCgroupUsed(pc)) return pc; return NULL; @@ -3457,23 +3383,8 @@ void mem_cgroup_print_bad_page(struct page *page) pc = lookup_page_cgroup_used(page); if (pc) { - int ret = -1; - char *path; - - printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p", + printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", pc, pc->flags, pc->mem_cgroup); - - path = kmalloc(PATH_MAX, GFP_KERNEL); - if (path) { - rcu_read_lock(); - ret = cgroup_path(pc->mem_cgroup->css.cgroup, - path, PATH_MAX); - rcu_read_unlock(); - } - - printk(KERN_CONT "(%s)\n", - (ret < 0) ? "cannot get the path" : path); - kfree(path); } } #endif @@ -3534,9 +3445,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, if (!ret) break; - mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, - MEM_CGROUP_RECLAIM_SHRINK, - NULL); + mem_cgroup_reclaim(memcg, GFP_KERNEL, + MEM_CGROUP_RECLAIM_SHRINK); curusage = res_counter_read_u64(&memcg->res, RES_USAGE); /* Usage is reduced ? */ if (curusage >= oldusage) @@ -3594,10 +3504,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, if (!ret) break; - mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, - MEM_CGROUP_RECLAIM_NOSWAP | - MEM_CGROUP_RECLAIM_SHRINK, - NULL); + mem_cgroup_reclaim(memcg, GFP_KERNEL, + MEM_CGROUP_RECLAIM_NOSWAP | + MEM_CGROUP_RECLAIM_SHRINK); curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); /* Usage is reduced ? 
*/ if (curusage >= oldusage) @@ -3640,10 +3549,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, break; nr_scanned = 0; - reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, - gfp_mask, - MEM_CGROUP_RECLAIM_SOFT, - &nr_scanned); + reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone, + gfp_mask, &nr_scanned); nr_reclaimed += reclaimed; *total_scanned += nr_scanned; spin_lock(&mctz->lock); @@ -3711,22 +3618,23 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, int node, int zid, enum lru_list lru) { - struct zone *zone; struct mem_cgroup_per_zone *mz; - struct page_cgroup *pc, *busy; unsigned long flags, loop; struct list_head *list; + struct page *busy; + struct zone *zone; int ret = 0; zone = &NODE_DATA(node)->node_zones[zid]; mz = mem_cgroup_zoneinfo(memcg, node, zid); - list = &mz->lists[lru]; + list = &mz->lruvec.lists[lru]; loop = MEM_CGROUP_ZSTAT(mz, lru); /* give some margin against EBUSY etc...*/ loop += 256; busy = NULL; while (loop--) { + struct page_cgroup *pc; struct page *page; ret = 0; @@ -3735,24 +3643,24 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, spin_unlock_irqrestore(&zone->lru_lock, flags); break; } - pc = list_entry(list->prev, struct page_cgroup, lru); - if (busy == pc) { - list_move(&pc->lru, list); + page = list_entry(list->prev, struct page, lru); + if (busy == page) { + list_move(&page->lru, list); busy = NULL; spin_unlock_irqrestore(&zone->lru_lock, flags); continue; } spin_unlock_irqrestore(&zone->lru_lock, flags); - page = lookup_cgroup_page(pc); + pc = lookup_page_cgroup(page); ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); - if (ret == -ENOMEM) + if (ret == -ENOMEM || ret == -EINTR) break; if (ret == -EBUSY || ret == -EINVAL) { /* found lock contention or "pc" is obsolete. */ - busy = pc; + busy = page; cond_resched(); } else busy = NULL; @@ -4524,6 +4432,9 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, */ BUG_ON(!thresholds); + if (!thresholds->primary) + goto unlock; + usage = mem_cgroup_usage(memcg, type == _MEMSWAP); /* Check if a threshold crossed before removing */ @@ -4572,7 +4483,7 @@ swap_buffers: /* To be sure that nobody uses thresholds */ synchronize_rcu(); - +unlock: mutex_unlock(&memcg->thresholds_lock); } @@ -4691,10 +4602,9 @@ static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) return mem_cgroup_sockets_init(cont, ss); }; -static void kmem_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static void kmem_cgroup_destroy(struct cgroup *cont) { - mem_cgroup_sockets_destroy(cont, ss); + mem_cgroup_sockets_destroy(cont); } #else static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) @@ -4702,8 +4612,7 @@ static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) return 0; } -static void kmem_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static void kmem_cgroup_destroy(struct cgroup *cont) { } #endif @@ -4846,7 +4755,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; for_each_lru(l) - INIT_LIST_HEAD(&mz->lists[l]); + INIT_LIST_HEAD(&mz->lruvec.lists[l]); mz->usage_in_excess = 0; mz->on_tree = false; mz->mem = memcg; @@ -4889,6 +4798,27 @@ out_free: } /* + * Helpers for freeing a vzalloc()ed mem_cgroup by RCU, + * but in process context. 
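
The vfree_rcu()/vfree_work() helpers that this comment introduces split the free into two stages: the RCU callback may run in softirq context where vfree() is not allowed, so it only queues a work item and the actual vfree() happens later in process context. A stripped-down user-space analogue of that deferral, with a one-slot "workqueue" standing in for call_rcu() and schedule_work() (the kernel additionally overlays the work item inside the object being freed):

#include <stdio.h>
#include <stdlib.h>

/* one-slot "workqueue": the deferred free runs later, in process context */
static void (*pending_fn)(void *);
static void *pending_arg;

static void schedule_work(void (*fn)(void *), void *arg)
{
	pending_fn = fn;
	pending_arg = arg;
}

static void run_pending_work(void)      /* stands in for the worker thread */
{
	if (pending_fn)
		pending_fn(pending_arg);
	pending_fn = NULL;
}

static void free_work(void *obj)        /* stage 2: the real vfree() */
{
	free(obj);
	puts("freed in worker context");
}

static void free_rcu_cb(void *obj)      /* stage 1: the RCU callback */
{
	schedule_work(free_work, obj);  /* must not free here */
}

int main(void)
{
	void *memcg = malloc(128);
	free_rcu_cb(memcg);             /* "grace period ended" */
	run_pending_work();
	return 0;
}
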
The work_freeing structure is overlaid + * on the rcu_freeing structure, which itself is overlaid on memsw. + */ +static void vfree_work(struct work_struct *work) +{ + struct mem_cgroup *memcg; + + memcg = container_of(work, struct mem_cgroup, work_freeing); + vfree(memcg); +} +static void vfree_rcu(struct rcu_head *rcu_head) +{ + struct mem_cgroup *memcg; + + memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); + INIT_WORK(&memcg->work_freeing, vfree_work); + schedule_work(&memcg->work_freeing); +} + +/* * At destroying mem_cgroup, references from swap_cgroup can remain. * (scanning all at force_empty is too costly...) * @@ -4906,14 +4836,14 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) mem_cgroup_remove_from_trees(memcg); free_css_id(&mem_cgroup_subsys, &memcg->css); - for_each_node_state(node, N_POSSIBLE) + for_each_node(node) free_mem_cgroup_per_zone_info(memcg, node); free_percpu(memcg->stat); if (sizeof(struct mem_cgroup) < PAGE_SIZE) - kfree(memcg); + kfree_rcu(memcg, rcu_freeing); else - vfree(memcg); + call_rcu(&memcg->rcu_freeing, vfree_rcu); } static void mem_cgroup_get(struct mem_cgroup *memcg) @@ -4965,13 +4895,13 @@ static int mem_cgroup_soft_limit_tree_init(void) struct mem_cgroup_tree_per_zone *rtpz; int tmp, node, zone; - for_each_node_state(node, N_POSSIBLE) { + for_each_node(node) { tmp = node; if (!node_state(node, N_NORMAL_MEMORY)) tmp = -1; rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); if (!rtpn) - return 1; + goto err_cleanup; soft_limit_tree.rb_tree_per_node[node] = rtpn; @@ -4982,10 +4912,20 @@ static int mem_cgroup_soft_limit_tree_init(void) } } return 0; + +err_cleanup: + for_each_node(node) { + if (!soft_limit_tree.rb_tree_per_node[node]) + break; + kfree(soft_limit_tree.rb_tree_per_node[node]); + soft_limit_tree.rb_tree_per_node[node] = NULL; + } + return 1; + } static struct cgroup_subsys_state * __ref -mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) +mem_cgroup_create(struct cgroup *cont) { struct mem_cgroup *memcg, *parent; long error = -ENOMEM; @@ -4995,7 +4935,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) if (!memcg) return ERR_PTR(error); - for_each_node_state(node, N_POSSIBLE) + for_each_node(node) if (alloc_mem_cgroup_per_zone_info(memcg, node)) goto free_out; @@ -5033,7 +4973,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); } - memcg->last_scanned_child = 0; memcg->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&memcg->oom_notify); @@ -5048,20 +4987,18 @@ free_out: return ERR_PTR(error); } -static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static int mem_cgroup_pre_destroy(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); return mem_cgroup_force_empty(memcg, false); } -static void mem_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static void mem_cgroup_destroy(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - kmem_cgroup_destroy(ss, cont); + kmem_cgroup_destroy(cont); mem_cgroup_put(memcg); } @@ -5129,9 +5066,9 @@ one_by_one: } ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &memcg, false); - if (ret || !memcg) + if (ret) /* mem_cgroup_clear_mc() will do uncharge later */ - return -ENOMEM; + return ret; mc.precharge++; } return ret; @@ -5276,7 +5213,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, } /* There is a swap entry and a page doesn't exist or isn't 
charged */ if (ent.val && !ret && - css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { + css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) { ret = MC_TARGET_SWAP; if (target) target->ent = ent; @@ -5398,9 +5335,8 @@ static void mem_cgroup_clear_mc(void) mem_cgroup_end_move(from); } -static int mem_cgroup_can_attach(struct cgroup_subsys *ss, - struct cgroup *cgroup, - struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup *cgroup, + struct cgroup_taskset *tset) { struct task_struct *p = cgroup_taskset_first(tset); int ret = 0; @@ -5438,9 +5374,8 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, return ret; } -static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, - struct cgroup *cgroup, - struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup *cgroup, + struct cgroup_taskset *tset) { mem_cgroup_clear_mc(); } @@ -5555,9 +5490,8 @@ retry: up_read(&mm->mmap_sem); } -static void mem_cgroup_move_task(struct cgroup_subsys *ss, - struct cgroup *cont, - struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup *cont, + struct cgroup_taskset *tset) { struct task_struct *p = cgroup_taskset_first(tset); struct mm_struct *mm = get_task_mm(p); @@ -5572,20 +5506,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ -static int mem_cgroup_can_attach(struct cgroup_subsys *ss, - struct cgroup *cgroup, - struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup *cgroup, + struct cgroup_taskset *tset) { return 0; } -static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, - struct cgroup *cgroup, - struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup *cgroup, + struct cgroup_taskset *tset) { } -static void mem_cgroup_move_task(struct cgroup_subsys *ss, - struct cgroup *cont, - struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup *cont, + struct cgroup_taskset *tset) { } #endif diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 06d3479513aa..56080ea36140 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1557,7 +1557,7 @@ int soft_offline_page(struct page *page, int flags) page_is_file_cache(page)); list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, - 0, true); + 0, MIGRATE_SYNC); if (ret) { putback_lru_pages(&pagelist); pr_info("soft offline: %#lx: migration failed %d, type %lx\n", diff --git a/mm/memory.c b/mm/memory.c index 829d43735402..347e5fad1cfa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -293,7 +293,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { struct mmu_gather_batch *batch; - tlb->need_flush = 1; + VM_BUG_ON(!tlb->need_flush); if (tlb_fast_mode(tlb)) { free_page_and_swap_cache(page); @@ -878,15 +878,24 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, } if (likely(!non_swap_entry(entry))) rss[MM_SWAPENTS]++; - else if (is_write_migration_entry(entry) && - is_cow_mapping(vm_flags)) { - /* - * COW mappings require pages in both parent - * and child to be set to read. 
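
The copy_one_pte() hunk above starts accounting migration entries in the rss counters (anon versus file), and the matching zap path in the next hunk drops them again when the entry goes away. A reduced sketch of that paired bookkeeping (the enum and struct here are illustrative, not the kernel's MM_* counters):

#include <assert.h>
#include <stdio.h>

enum { RSS_ANON, RSS_FILE, NR_RSS };

struct entry { int is_migration; int is_anon; };

/* fork side: duplicating a migration entry keeps the page referenced */
static void copy_entry(long rss[NR_RSS], const struct entry *e)
{
	if (e->is_migration)
		rss[e->is_anon ? RSS_ANON : RSS_FILE]++;
}

/* unmap side: dropping the entry releases that reference */
static void zap_entry(long rss[NR_RSS], const struct entry *e)
{
	if (e->is_migration)
		rss[e->is_anon ? RSS_ANON : RSS_FILE]--;
}

int main(void)
{
	long rss[NR_RSS] = { 0 };
	struct entry e = { .is_migration = 1, .is_anon = 1 };

	copy_entry(rss, &e);
	zap_entry(rss, &e);
	assert(rss[RSS_ANON] == 0 && rss[RSS_FILE] == 0);
	puts("rss balanced");
	return 0;
}
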
- */ - make_migration_entry_read(&entry); - pte = swp_entry_to_pte(entry); - set_pte_at(src_mm, addr, src_pte, pte); + else if (is_migration_entry(entry)) { + page = migration_entry_to_page(entry); + + if (PageAnon(page)) + rss[MM_ANONPAGES]++; + else + rss[MM_FILEPAGES]++; + + if (is_write_migration_entry(entry) && + is_cow_mapping(vm_flags)) { + /* + * COW mappings require pages in both + * parent and child to be set to read. + */ + make_migration_entry_read(&entry); + pte = swp_entry_to_pte(entry); + set_pte_at(src_mm, addr, src_pte, pte); + } } } goto out_set_pte; @@ -1191,6 +1200,16 @@ again: if (!non_swap_entry(entry)) rss[MM_SWAPENTS]--; + else if (is_migration_entry(entry)) { + struct page *page; + + page = migration_entry_to_page(entry); + + if (PageAnon(page)) + rss[MM_ANONPAGES]--; + else + rss[MM_FILEPAGES]--; + } if (unlikely(!free_swap_and_cache(entry))) print_bad_pte(vma, addr, ptent, NULL); } @@ -1231,7 +1250,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, if (next-addr != HPAGE_PMD_SIZE) { VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); split_huge_page_pmd(vma->vm_mm, pmd); - } else if (zap_huge_pmd(tlb, vma, pmd)) + } else if (zap_huge_pmd(tlb, vma, pmd, addr)) continue; /* fall through */ } @@ -2428,7 +2447,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo * fails, we just zero-fill it. Live with it. */ if (unlikely(!src)) { - void *kaddr = kmap_atomic(dst, KM_USER0); + void *kaddr = kmap_atomic(dst); void __user *uaddr = (void __user *)(va & PAGE_MASK); /* @@ -2439,7 +2458,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) clear_page(kaddr); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); flush_dcache_page(dst); } else copy_user_highpage(dst, src, va, vma); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2168489c0bc9..6629fafd6ce4 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -809,7 +809,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) } /* this function returns # of failed pages */ ret = migrate_pages(&source, hotremove_migrate_alloc, 0, - true, true); + true, MIGRATE_SYNC); if (ret) putback_lru_pages(&source); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e3d58f088466..47296fee23db 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -640,10 +640,11 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long vmstart; unsigned long vmend; - vma = find_vma_prev(mm, start, &prev); + vma = find_vma(mm, start); if (!vma || vma->vm_start > start) return -EFAULT; + prev = vma->vm_prev; if (start > vma->vm_start) prev = vma; @@ -942,7 +943,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, - false, true); + false, MIGRATE_SYNC); if (err) putback_lru_pages(&pagelist); } diff --git a/mm/migrate.c b/mm/migrate.c index 89ea0854332e..1503b6b54ecb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -216,6 +216,56 @@ out: pte_unmap_unlock(ptep, ptl); } +#ifdef CONFIG_BLOCK +/* Returns true if all buffers are successfully locked */ +static bool buffer_migrate_lock_buffers(struct buffer_head *head, + enum migrate_mode mode) +{ + struct buffer_head *bh = head; + + /* Simple case, sync compaction */ + if (mode != MIGRATE_ASYNC) { + do { + get_bh(bh); + lock_buffer(bh); + bh = bh->b_this_page; + + } while (bh != head); + + return true; + } + + /* 
async case, we cannot block on lock_buffer so use trylock_buffer */ + do { + get_bh(bh); + if (!trylock_buffer(bh)) { + /* + * We failed to lock the buffer and cannot stall in + * async migration. Release the taken locks + */ + struct buffer_head *failed_bh = bh; + put_bh(failed_bh); + bh = head; + while (bh != failed_bh) { + unlock_buffer(bh); + put_bh(bh); + bh = bh->b_this_page; + } + return false; + } + + bh = bh->b_this_page; + } while (bh != head); + return true; +} +#else +static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, + enum migrate_mode mode) +{ + return true; +} +#endif /* CONFIG_BLOCK */ + /* * Replace the page in the mapping. * @@ -225,7 +275,8 @@ out: * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. */ static int migrate_page_move_mapping(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, + struct buffer_head *head, enum migrate_mode mode) { int expected_count; void **pslot; @@ -255,6 +306,20 @@ static int migrate_page_move_mapping(struct address_space *mapping, } /* + * In the async migration case of moving a page with buffers, lock the + * buffers using trylock before the mapping is moved. If the mapping + * was moved, we later failed to lock the buffers and could not move + * the mapping back due to an elevated page count, we would have to + * block waiting on other references to be dropped. + */ + if (mode == MIGRATE_ASYNC && head && + !buffer_migrate_lock_buffers(head, mode)) { + page_unfreeze_refs(page, expected_count); + spin_unlock_irq(&mapping->tree_lock); + return -EAGAIN; + } + + /* * Now we know that no one else is looking at the page. */ get_page(newpage); /* add cache reference */ @@ -380,7 +445,6 @@ void migrate_page_copy(struct page *newpage, struct page *page) ClearPageSwapCache(page); ClearPagePrivate(page); set_page_private(page, 0); - page->mapping = NULL; /* * If any waiters have accumulated on the new page then @@ -409,13 +473,14 @@ EXPORT_SYMBOL(fail_migrate_page); * Pages are locked upon entry and exit. */ int migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, + enum migrate_mode mode) { int rc; BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - rc = migrate_page_move_mapping(mapping, newpage, page); + rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); if (rc) return rc; @@ -432,28 +497,28 @@ EXPORT_SYMBOL(migrate_page); * exist. */ int buffer_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, enum migrate_mode mode) { struct buffer_head *bh, *head; int rc; if (!page_has_buffers(page)) - return migrate_page(mapping, newpage, page); + return migrate_page(mapping, newpage, page, mode); head = page_buffers(page); - rc = migrate_page_move_mapping(mapping, newpage, page); + rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); if (rc) return rc; - bh = head; - do { - get_bh(bh); - lock_buffer(bh); - bh = bh->b_this_page; - - } while (bh != head); + /* + * In the async case, migrate_page_move_mapping locked the buffers + * with an IRQ-safe spinlock held. 
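
buffer_migrate_lock_buffers(), added earlier in this hunk, either takes every buffer lock or, when a trylock fails in MIGRATE_ASYNC mode, unwinds the locks it already took and reports failure so the caller can back off with -EAGAIN. The same all-or-nothing shape on ordinary mutexes (a user-space analogue over an array rather than the circular buffer_head ring):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* try to lock locks[0..n-1]; on failure, release what we already hold */
static bool trylock_all(pthread_mutex_t *locks, int n)
{
	for (int i = 0; i < n; i++) {
		if (pthread_mutex_trylock(&locks[i]) != 0) {
			while (--i >= 0)
				pthread_mutex_unlock(&locks[i]);
			return false;   /* caller retries later */
		}
	}
	return true;
}

int main(void)
{
	pthread_mutex_t locks[3] = {
		PTHREAD_MUTEX_INITIALIZER,
		PTHREAD_MUTEX_INITIALIZER,
		PTHREAD_MUTEX_INITIALIZER,
	};

	if (trylock_all(locks, 3)) {
		puts("all locked, safe to move the mapping");
		for (int i = 0; i < 3; i++)
			pthread_mutex_unlock(&locks[i]);
	}
	return 0;
}

Unwinding in reverse keeps the failure path cheap and never blocks, which is the whole point of the async mode.
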
In the sync case, the buffers + * need to be locked now + */ + if (mode != MIGRATE_ASYNC) + BUG_ON(!buffer_migrate_lock_buffers(head, mode)); ClearPagePrivate(page); set_page_private(newpage, page_private(page)); @@ -530,10 +595,14 @@ static int writeout(struct address_space *mapping, struct page *page) * Default handling if a filesystem does not provide a migration function. */ static int fallback_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, enum migrate_mode mode) { - if (PageDirty(page)) + if (PageDirty(page)) { + /* Only writeback pages in full synchronous migration */ + if (mode != MIGRATE_SYNC) + return -EBUSY; return writeout(mapping, page); + } /* * Buffers may be managed in a filesystem specific way. @@ -543,7 +612,7 @@ static int fallback_migrate_page(struct address_space *mapping, !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; - return migrate_page(mapping, newpage, page); + return migrate_page(mapping, newpage, page, mode); } /* @@ -558,7 +627,7 @@ static int fallback_migrate_page(struct address_space *mapping, * == 0 - success */ static int move_to_new_page(struct page *newpage, struct page *page, - int remap_swapcache, bool sync) + int remap_swapcache, enum migrate_mode mode) { struct address_space *mapping; int rc; @@ -579,35 +648,25 @@ static int move_to_new_page(struct page *newpage, struct page *page, mapping = page_mapping(page); if (!mapping) - rc = migrate_page(mapping, newpage, page); - else { + rc = migrate_page(mapping, newpage, page, mode); + else if (mapping->a_ops->migratepage) /* - * Do not writeback pages if !sync and migratepage is - * not pointing to migrate_page() which is nonblocking - * (swapcache/tmpfs uses migratepage = migrate_page). + * Most pages have a mapping and most filesystems provide a + * migratepage callback. Anonymous pages are part of swap + * space which also has its own migratepage callback. This + * is the most common path for page migration. */ - if (PageDirty(page) && !sync && - mapping->a_ops->migratepage != migrate_page) - rc = -EBUSY; - else if (mapping->a_ops->migratepage) - /* - * Most pages have a mapping and most filesystems - * should provide a migration function. Anonymous - * pages are part of swap space which also has its - * own migration function. This is the most common - * path for page migration. 
- */ - rc = mapping->a_ops->migratepage(mapping, - newpage, page); - else - rc = fallback_migrate_page(mapping, newpage, page); - } + rc = mapping->a_ops->migratepage(mapping, + newpage, page, mode); + else + rc = fallback_migrate_page(mapping, newpage, page, mode); if (rc) { newpage->mapping = NULL; } else { if (remap_swapcache) remove_migration_ptes(page, newpage); + page->mapping = NULL; } unlock_page(newpage); @@ -616,7 +675,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, } static int __unmap_and_move(struct page *page, struct page *newpage, - int force, bool offlining, bool sync) + int force, bool offlining, enum migrate_mode mode) { int rc = -EAGAIN; int remap_swapcache = 1; @@ -625,7 +684,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, struct anon_vma *anon_vma = NULL; if (!trylock_page(page)) { - if (!force || !sync) + if (!force || mode == MIGRATE_ASYNC) goto out; /* @@ -671,10 +730,12 @@ static int __unmap_and_move(struct page *page, struct page *newpage, if (PageWriteback(page)) { /* - * For !sync, there is no point retrying as the retry loop - * is expected to be too short for PageWriteback to be cleared + * Only in the case of a full syncronous migration is it + * necessary to wait for PageWriteback. In the async case, + * the retry loop is too short and in the sync-light case, + * the overhead of stalling is too much */ - if (!sync) { + if (mode != MIGRATE_SYNC) { rc = -EBUSY; goto uncharge; } @@ -745,7 +806,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, skip_unmap: if (!page_mapped(page)) - rc = move_to_new_page(newpage, page, remap_swapcache, sync); + rc = move_to_new_page(newpage, page, remap_swapcache, mode); if (rc && remap_swapcache) remove_migration_ptes(page, page); @@ -768,7 +829,8 @@ out: * to the newly allocated page in newpage. 
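
__unmap_and_move() above keys its blocking decisions off enum migrate_mode instead of a bool: async migration never sleeps on the page lock, and only a full MIGRATE_SYNC migration waits for writeback, with the sync-light mode mentioned in the comment sitting in between. A compressed sketch of those two decision points (the enum below is an illustrative stand-in for the one defined outside this file):

#include <stdbool.h>
#include <stdio.h>

/* illustrative stand-in for enum migrate_mode */
enum migrate_mode { MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT, MIGRATE_SYNC };

/* async migration must not sleep waiting for the page lock */
static bool may_block_on_page_lock(enum migrate_mode mode, bool force)
{
	return force && mode != MIGRATE_ASYNC;
}

/* only a full sync migration is allowed to wait for writeback */
static bool may_wait_for_writeback(enum migrate_mode mode)
{
	return mode == MIGRATE_SYNC;
}

int main(void)
{
	printf("async: lock-wait=%d wb-wait=%d\n",
	       may_block_on_page_lock(MIGRATE_ASYNC, true),
	       may_wait_for_writeback(MIGRATE_ASYNC));
	printf("sync-light: lock-wait=%d wb-wait=%d\n",
	       may_block_on_page_lock(MIGRATE_SYNC_LIGHT, true),
	       may_wait_for_writeback(MIGRATE_SYNC_LIGHT));
	return 0;
}
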
*/ static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, bool offlining, bool sync) + struct page *page, int force, bool offlining, + enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -786,7 +848,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (unlikely(split_huge_page(page))) goto out; - rc = __unmap_and_move(page, newpage, force, offlining, sync); + rc = __unmap_and_move(page, newpage, force, offlining, mode); out: if (rc != -EAGAIN) { /* @@ -834,7 +896,8 @@ out: */ static int unmap_and_move_huge_page(new_page_t get_new_page, unsigned long private, struct page *hpage, - int force, bool offlining, bool sync) + int force, bool offlining, + enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -847,7 +910,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, rc = -EAGAIN; if (!trylock_page(hpage)) { - if (!force || !sync) + if (!force || mode != MIGRATE_SYNC) goto out; lock_page(hpage); } @@ -858,7 +921,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); if (!page_mapped(hpage)) - rc = move_to_new_page(new_hpage, hpage, 1, sync); + rc = move_to_new_page(new_hpage, hpage, 1, mode); if (rc) remove_migration_ptes(hpage, hpage); @@ -901,7 +964,7 @@ out: */ int migrate_pages(struct list_head *from, new_page_t get_new_page, unsigned long private, bool offlining, - bool sync) + enum migrate_mode mode) { int retry = 1; int nr_failed = 0; @@ -922,7 +985,7 @@ int migrate_pages(struct list_head *from, rc = unmap_and_move(get_new_page, private, page, pass > 2, offlining, - sync); + mode); switch(rc) { case -ENOMEM: @@ -952,7 +1015,7 @@ out: int migrate_huge_pages(struct list_head *from, new_page_t get_new_page, unsigned long private, bool offlining, - bool sync) + enum migrate_mode mode) { int retry = 1; int nr_failed = 0; @@ -969,7 +1032,7 @@ int migrate_huge_pages(struct list_head *from, rc = unmap_and_move_huge_page(get_new_page, private, page, pass > 2, offlining, - sync); + mode); switch(rc) { case -ENOMEM: @@ -1098,7 +1161,7 @@ set_status: err = 0; if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_page_node, - (unsigned long)pm, 0, true); + (unsigned long)pm, 0, MIGRATE_SYNC); if (err) putback_lru_pages(&pagelist); } diff --git a/mm/mlock.c b/mm/mlock.c index 4f4f53bdc65d..ef726e8aa8e9 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -385,10 +385,11 @@ static int do_mlock(unsigned long start, size_t len, int on) return -EINVAL; if (end == start) return 0; - vma = find_vma_prev(current->mm, start, &prev); + vma = find_vma(current->mm, start); if (!vma || vma->vm_start > start) return -ENOMEM; + prev = vma->vm_prev; if (start > vma->vm_start) prev = vma; diff --git a/mm/mmap.c b/mm/mmap.c index 3f758c7f4c81..da15a79b1441 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1266,8 +1266,9 @@ munmap_back: vma->vm_pgoff = pgoff; INIT_LIST_HEAD(&vma->anon_vma_chain); + error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */ + if (file) { - error = -EINVAL; if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) goto free_vma; if (vm_flags & VM_DENYWRITE) { @@ -1293,6 +1294,8 @@ munmap_back: pgoff = vma->vm_pgoff; vm_flags = vma->vm_flags; } else if (vm_flags & VM_SHARED) { + if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP))) + goto free_vma; error = shmem_zero_setup(vma); if (error) goto free_vma; @@ -1605,7 +1608,6 @@ EXPORT_SYMBOL(find_vma); /* * Same as find_vma, but also return a pointer to the previous VMA 
in *pprev. - * Note: pprev is set to NULL when return value is NULL. */ struct vm_area_struct * find_vma_prev(struct mm_struct *mm, unsigned long addr, @@ -1614,7 +1616,16 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr, struct vm_area_struct *vma; vma = find_vma(mm, addr); - *pprev = vma ? vma->vm_prev : NULL; + if (vma) { + *pprev = vma->vm_prev; + } else { + struct rb_node *rb_node = mm->mm_rb.rb_node; + *pprev = NULL; + while (rb_node) { + *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb); + rb_node = rb_node->rb_right; + } + } return vma; } diff --git a/mm/mprotect.c b/mm/mprotect.c index 5a688a2756be..f437d054c3bf 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -262,10 +262,11 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, down_write(¤t->mm->mmap_sem); - vma = find_vma_prev(current->mm, start, &prev); + vma = find_vma(current->mm, start); error = -ENOMEM; if (!vma) goto out; + prev = vma->vm_prev; if (unlikely(grows & PROT_GROWSDOWN)) { if (vma->vm_start >= end) goto out; diff --git a/mm/nommu.c b/mm/nommu.c index b982290fd962..f59e170fceb4 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -696,9 +696,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) if (vma->vm_file) { mapping = vma->vm_file->f_mapping; + mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); vma_prio_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); + mutex_unlock(&mapping->i_mmap_mutex); } /* add the VMA to the tree */ @@ -760,9 +762,11 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) if (vma->vm_file) { mapping = vma->vm_file->f_mapping; + mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); vma_prio_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); + mutex_unlock(&mapping->i_mmap_mutex); } /* remove from the MM's tree and list */ @@ -775,8 +779,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) if (vma->vm_next) vma->vm_next->vm_prev = vma->vm_prev; - - vma->vm_mm = NULL; } /* @@ -2052,6 +2054,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; down_write(&nommu_region_sem); + mutex_lock(&inode->i_mapping->i_mmap_mutex); /* search for VMAs that fall within the dead zone */ vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, @@ -2059,6 +2062,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, /* found one - only interested if it's shared out of the page * cache */ if (vma->vm_flags & VM_SHARED) { + mutex_unlock(&inode->i_mapping->i_mmap_mutex); up_write(&nommu_region_sem); return -ETXTBSY; /* not quite true, but near enough */ } @@ -2086,6 +2090,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, } } + mutex_unlock(&inode->i_mapping->i_mmap_mutex); up_write(&nommu_region_sem); return 0; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7c122faa05c5..2958fd8e7c9a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -152,7 +152,7 @@ struct task_struct *find_lock_task_mm(struct task_struct *p) /* return true if the task is not adequate as candidate victim task. 
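
The find_vma_prev() rewrite earlier in this hunk no longer hands back NULL in *pprev when no VMA covers or follows addr; instead it walks the rb-tree to its right-most node, which is by construction the last VMA in the address space. A toy BST version of that fallback (types here are illustrative):

#include <stdio.h>

struct node {                   /* illustrative stand-in for the VMA rbtree */
	unsigned long start, end;
	struct node *left, *right;
};

/*
 * If no mapping covers or follows the address, the "previous" mapping is
 * simply the right-most node of the tree.
 */
static struct node *rightmost(struct node *root)
{
	struct node *prev = NULL;

	while (root) {
		prev = root;
		root = root->right;
	}
	return prev;
}

int main(void)
{
	struct node a = { 0x1000, 0x2000, NULL, NULL };
	struct node b = { 0x3000, 0x4000, &a, NULL };   /* b is right-most */

	printf("prev for an address above all VMAs: [%lx-%lx)\n",
	       rightmost(&b)->start, rightmost(&b)->end);
	return 0;
}
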
*/ static bool oom_unkillable_task(struct task_struct *p, - const struct mem_cgroup *mem, const nodemask_t *nodemask) + const struct mem_cgroup *memcg, const nodemask_t *nodemask) { if (is_global_init(p)) return true; @@ -160,7 +160,7 @@ static bool oom_unkillable_task(struct task_struct *p, return true; /* When mem_cgroup_out_of_memory() and p is not member of the group */ - if (mem && !task_in_mem_cgroup(p, mem)) + if (memcg && !task_in_mem_cgroup(p, memcg)) return true; /* p may not have freeable memory in nodemask */ @@ -179,12 +179,12 @@ static bool oom_unkillable_task(struct task_struct *p, * predictable as possible. The goal is to return the highest value for the * task consuming the most memory to avoid subsequent oom failures. */ -unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, +unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages) { long points; - if (oom_unkillable_task(p, mem, nodemask)) + if (oom_unkillable_task(p, memcg, nodemask)) return 0; p = find_lock_task_mm(p); @@ -308,7 +308,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, * (not docbooked, we don't want this one cluttering up the manual) */ static struct task_struct *select_bad_process(unsigned int *ppoints, - unsigned long totalpages, struct mem_cgroup *mem, + unsigned long totalpages, struct mem_cgroup *memcg, const nodemask_t *nodemask) { struct task_struct *g, *p; @@ -320,7 +320,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, if (p->exit_state) continue; - if (oom_unkillable_task(p, mem, nodemask)) + if (oom_unkillable_task(p, memcg, nodemask)) continue; /* @@ -364,7 +364,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, } } - points = oom_badness(p, mem, nodemask, totalpages); + points = oom_badness(p, memcg, nodemask, totalpages); if (points > *ppoints) { chosen = p; *ppoints = points; @@ -387,14 +387,14 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, * * Call with tasklist_lock read-locked. 
*/ -static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask) +static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) { struct task_struct *p; struct task_struct *task; pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); for_each_process(p) { - if (oom_unkillable_task(p, mem, nodemask)) + if (oom_unkillable_task(p, memcg, nodemask)) continue; task = find_lock_task_mm(p); @@ -417,7 +417,7 @@ static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask) } static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, - struct mem_cgroup *mem, const nodemask_t *nodemask) + struct mem_cgroup *memcg, const nodemask_t *nodemask) { task_lock(current); pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " @@ -427,14 +427,14 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); - mem_cgroup_print_oom_info(mem, p); + mem_cgroup_print_oom_info(memcg, p); show_mem(SHOW_MEM_FILTER_NODES); if (sysctl_oom_dump_tasks) - dump_tasks(mem, nodemask); + dump_tasks(memcg, nodemask); } #define K(x) ((x) << (PAGE_SHIFT-10)) -static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) +static int oom_kill_task(struct task_struct *p) { struct task_struct *q; struct mm_struct *mm; @@ -484,7 +484,7 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, unsigned int points, unsigned long totalpages, - struct mem_cgroup *mem, nodemask_t *nodemask, + struct mem_cgroup *memcg, nodemask_t *nodemask, const char *message) { struct task_struct *victim = p; @@ -493,7 +493,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, unsigned int victim_points = 0; if (printk_ratelimit()) - dump_header(p, gfp_mask, order, mem, nodemask); + dump_header(p, gfp_mask, order, memcg, nodemask); /* * If the task is already exiting, don't alarm the sysadmin or kill @@ -524,7 +524,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* * oom_badness() returns 0 if the thread is unkillable */ - child_points = oom_badness(child, mem, nodemask, + child_points = oom_badness(child, memcg, nodemask, totalpages); if (child_points > victim_points) { victim = child; @@ -533,7 +533,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, } } while_each_thread(p, t); - return oom_kill_task(victim, mem); + return oom_kill_task(victim); } /* @@ -561,7 +561,7 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, } #ifdef CONFIG_CGROUP_MEM_RES_CTLR -void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) +void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask) { unsigned long limit; unsigned int points = 0; @@ -578,14 +578,14 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) } check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); - limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; + limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; read_lock(&tasklist_lock); retry: - p = select_bad_process(&points, limit, mem, NULL); + p = select_bad_process(&points, limit, memcg, NULL); if (!p || PTR_ERR(p) == -1UL) goto out; - if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL, + if (oom_kill_process(p, gfp_mask, 0, points, limit, memcg, NULL, "Memory cgroup out of memory")) goto 
retry; out: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 794e6715c226..a13ded1938f0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1981,14 +1981,20 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress, - bool sync_migration) + int migratetype, bool sync_migration, + bool *deferred_compaction, + unsigned long *did_some_progress) { struct page *page; - if (!order || compaction_deferred(preferred_zone)) + if (!order) return NULL; + if (compaction_deferred(preferred_zone)) { + *deferred_compaction = true; + return NULL; + } + current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, sync_migration); @@ -2016,7 +2022,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, * but not enough to satisfy watermarks. */ count_vm_event(COMPACTFAIL); - defer_compaction(preferred_zone); + + /* + * As async compaction considers a subset of pageblocks, only + * defer if the failure was a sync compaction failure. + */ + if (sync_migration) + defer_compaction(preferred_zone); cond_resched(); } @@ -2028,8 +2040,9 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress, - bool sync_migration) + int migratetype, bool sync_migration, + bool *deferred_compaction, + unsigned long *did_some_progress) { return NULL; } @@ -2179,6 +2192,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long pages_reclaimed = 0; unsigned long did_some_progress; bool sync_migration = false; + bool deferred_compaction = false; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -2259,12 +2273,22 @@ rebalance: zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress, - sync_migration); + migratetype, sync_migration, + &deferred_compaction, + &did_some_progress); if (page) goto got_pg; sync_migration = true; + /* + * If compaction is deferred for high-order allocations, it is because + * sync compaction recently failed. 
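
__alloc_pages_direct_compact() now reports deferral through a bool out-parameter instead of returning NULL silently, and only sync failures defer future compaction. The caller-side consequence, sketched below, is that a __GFP_NO_KSWAPD allocation can bail out before direct reclaim when compaction was recently deferred (the flag value and stubs are stand-ins):

#include <stdbool.h>
#include <stdio.h>

#define GFP_NO_KSWAPD (1u << 0)     /* illustrative stand-in for __GFP_NO_KSWAPD */

/* stand-in for __alloc_pages_direct_compact(): reports deferral to the caller */
static void *direct_compact(bool recently_failed_sync, bool *deferred)
{
	if (recently_failed_sync) {
		*deferred = true;
		return NULL;
	}
	return NULL;                /* compaction ran but found nothing */
}

static void *slowpath(unsigned int gfp_mask, bool recently_failed_sync)
{
	bool deferred = false;
	void *page = direct_compact(recently_failed_sync, &deferred);

	if (page)
		return page;
	/* callers that asked not to disturb the system fail early here */
	if (deferred && (gfp_mask & GFP_NO_KSWAPD)) {
		puts("compaction deferred: failing before direct reclaim");
		return NULL;
	}
	puts("falling through to direct reclaim");
	return NULL;
}

int main(void)
{
	slowpath(GFP_NO_KSWAPD, true);  /* THP-style allocation */
	slowpath(0, true);              /* ordinary allocation keeps going */
	return 0;
}

Only sync failures defer because async compaction examines a subset of pageblocks, so its failures are not conclusive.
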
In this is the case and the caller + * has requested the system not be heavily disrupted, fail the + * allocation now instead of entering direct reclaim + */ + if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) + goto nopage; + /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, @@ -2328,8 +2352,9 @@ rebalance: zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress, - sync_migration); + migratetype, sync_migration, + &deferred_compaction, + &did_some_progress); if (page) goto got_pg; } @@ -4237,7 +4262,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, memmap_pages; - enum lru_list l; + enum lru_list lru; size = zone_spanned_pages_in_node(nid, j, zones_size); realsize = size - zone_absent_pages_in_node(nid, j, @@ -4287,8 +4312,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone->zone_pgdat = pgdat; zone_pcp_init(zone); - for_each_lru(l) - INIT_LIST_HEAD(&zone->lru[l].list); + for_each_lru(lru) + INIT_LIST_HEAD(&zone->lruvec.lists[lru]); zone->reclaim_stat.recent_rotated[0] = 0; zone->reclaim_stat.recent_rotated[1] = 0; zone->reclaim_stat.recent_scanned[0] = 0; @@ -4642,8 +4667,10 @@ static void check_for_regular_memory(pg_data_t *pgdat) for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; - if (zone->present_pages) + if (zone->present_pages) { node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); + break; + } } #endif } @@ -5209,6 +5236,7 @@ void *__init alloc_large_system_hash(const char *tablename, max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; do_div(max, bucketsize); } + max = min(max, 0x80000000ULL); if (numentries > max) numentries = max; @@ -5386,7 +5414,25 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) bool is_pageblock_removable_nolock(struct page *page) { - struct zone *zone = page_zone(page); + struct zone *zone; + unsigned long pfn; + + /* + * We have to be careful here because we are iterating over memory + * sections which are not zone aware so we might end up outside of + * the zone but still within the section. + * We have to take care about the node as well. If the node is offline + * its NODE_DATA will be NULL - see page_zone. 
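Read as a standalone predicate, the zone-span check added just below amounts to the following hypothetical helper (pfn_in_zone_span() is not a kernel function, only a restatement of the test for clarity):

/* Hypothetical helper: true only if @page's pfn lies within @zone's span. */
static bool pfn_in_zone_span(struct zone *zone, struct page *page)
{
	unsigned long pfn = page_to_pfn(page);

	return pfn >= zone->zone_start_pfn &&
	       pfn < zone->zone_start_pfn + zone->spanned_pages;
}

Memory sections and zones do not have to line up, so a page reached through a section walk can name a zone it does not actually belong to; checking node_online() first also avoids dereferencing a NULL NODE_DATA() for an offline node, as the comment above notes.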
+ */ + if (!node_online(page_to_nid(page))) + return false; + + zone = page_zone(page); + pfn = page_to_pfn(page); + if (zone->zone_start_pfn > pfn || + zone->zone_start_pfn + zone->spanned_pages <= pfn) + return false; + return __count_immobile_pages(zone, page, 0); } diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 2d123f94a8df..1ccbd714059c 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -11,13 +11,6 @@ #include <linux/swapops.h> #include <linux/kmemleak.h> -static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id) -{ - pc->flags = 0; - set_page_cgroup_array_id(pc, id); - pc->mem_cgroup = NULL; - INIT_LIST_HEAD(&pc->lru); -} static unsigned long total_usage; #if !defined(CONFIG_SPARSEMEM) @@ -35,35 +28,27 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) struct page_cgroup *base; base = NODE_DATA(page_to_nid(page))->node_page_cgroup; +#ifdef CONFIG_DEBUG_VM + /* + * The sanity checks the page allocator does upon freeing a + * page can reach here before the page_cgroup arrays are + * allocated when feeding a range of pages to the allocator + * for the first time during bootup or memory hotplug. + */ if (unlikely(!base)) return NULL; - +#endif offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn; return base + offset; } -struct page *lookup_cgroup_page(struct page_cgroup *pc) -{ - unsigned long pfn; - struct page *page; - pg_data_t *pgdat; - - pgdat = NODE_DATA(page_cgroup_array_id(pc)); - pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn; - page = pfn_to_page(pfn); - VM_BUG_ON(pc != lookup_page_cgroup(page)); - return page; -} - static int __init alloc_node_page_cgroup(int nid) { - struct page_cgroup *base, *pc; + struct page_cgroup *base; unsigned long table_size; - unsigned long start_pfn, nr_pages, index; + unsigned long nr_pages; - start_pfn = NODE_DATA(nid)->node_start_pfn; nr_pages = NODE_DATA(nid)->node_spanned_pages; - if (!nr_pages) return 0; @@ -73,10 +58,6 @@ static int __init alloc_node_page_cgroup(int nid) table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (!base) return -ENOMEM; - for (index = 0; index < nr_pages; index++) { - pc = base + index; - init_page_cgroup(pc, nid); - } NODE_DATA(nid)->node_page_cgroup = base; total_usage += table_size; return 0; @@ -111,29 +92,23 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); - +#ifdef CONFIG_DEBUG_VM + /* + * The sanity checks the page allocator does upon freeing a + * page can reach here before the page_cgroup arrays are + * allocated when feeding a range of pages to the allocator + * for the first time during bootup or memory hotplug. 
+ */ if (!section->page_cgroup) return NULL; +#endif return section->page_cgroup + pfn; } -struct page *lookup_cgroup_page(struct page_cgroup *pc) -{ - struct mem_section *section; - struct page *page; - unsigned long nr; - - nr = page_cgroup_array_id(pc); - section = __nr_to_section(nr); - page = pfn_to_page(pc - section->page_cgroup); - VM_BUG_ON(pc != lookup_page_cgroup(page)); - return page; -} - static void *__meminit alloc_page_cgroup(size_t size, int nid) { + gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN; void *addr = NULL; - gfp_t flags = GFP_KERNEL | __GFP_NOWARN; addr = alloc_pages_exact_nid(nid, size, flags); if (addr) { @@ -142,39 +117,20 @@ static void *__meminit alloc_page_cgroup(size_t size, int nid) } if (node_state(nid, N_HIGH_MEMORY)) - addr = vmalloc_node(size, nid); + addr = vzalloc_node(size, nid); else - addr = vmalloc(size); + addr = vzalloc(size); return addr; } -#ifdef CONFIG_MEMORY_HOTPLUG -static void free_page_cgroup(void *addr) -{ - if (is_vmalloc_addr(addr)) { - vfree(addr); - } else { - struct page *page = virt_to_page(addr); - size_t table_size = - sizeof(struct page_cgroup) * PAGES_PER_SECTION; - - BUG_ON(PageReserved(page)); - free_pages_exact(addr, table_size); - } -} -#endif - static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) { - struct page_cgroup *base, *pc; struct mem_section *section; + struct page_cgroup *base; unsigned long table_size; - unsigned long nr; - int index; - nr = pfn_to_section_nr(pfn); - section = __nr_to_section(nr); + section = __pfn_to_section(pfn); if (section->page_cgroup) return 0; @@ -194,10 +150,6 @@ static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) return -ENOMEM; } - for (index = 0; index < PAGES_PER_SECTION; index++) { - pc = base + index; - init_page_cgroup(pc, nr); - } /* * The passed "pfn" may not be aligned to SECTION. For the calculation * we need to apply a mask. @@ -208,6 +160,20 @@ static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) return 0; } #ifdef CONFIG_MEMORY_HOTPLUG +static void free_page_cgroup(void *addr) +{ + if (is_vmalloc_addr(addr)) { + vfree(addr); + } else { + struct page *page = virt_to_page(addr); + size_t table_size = + sizeof(struct page_cgroup) * PAGES_PER_SECTION; + + BUG_ON(PageReserved(page)); + free_pages_exact(addr, table_size); + } +} + void __free_page_cgroup(unsigned long pfn) { struct mem_section *ms; @@ -366,7 +332,6 @@ struct swap_cgroup { unsigned short id; }; #define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) -#define SC_POS_MASK (SC_PER_PAGE - 1) /* * SwapCgroup implements "lookup" and "exchange" operations. @@ -408,6 +373,23 @@ not_enough_page: return -ENOMEM; } +static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, + struct swap_cgroup_ctrl **ctrlp) +{ + pgoff_t offset = swp_offset(ent); + struct swap_cgroup_ctrl *ctrl; + struct page *mappage; + struct swap_cgroup *sc; + + ctrl = &swap_cgroup_ctrl[swp_type(ent)]; + if (ctrlp) + *ctrlp = ctrl; + + mappage = ctrl->map[offset / SC_PER_PAGE]; + sc = page_address(mappage); + return sc + offset % SC_PER_PAGE; +} + /** * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. 
* @end: swap entry to be cmpxchged @@ -420,21 +402,13 @@ not_enough_page: unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, unsigned short old, unsigned short new) { - int type = swp_type(ent); - unsigned long offset = swp_offset(ent); - unsigned long idx = offset / SC_PER_PAGE; - unsigned long pos = offset & SC_POS_MASK; struct swap_cgroup_ctrl *ctrl; - struct page *mappage; struct swap_cgroup *sc; unsigned long flags; unsigned short retval; - ctrl = &swap_cgroup_ctrl[type]; + sc = lookup_swap_cgroup(ent, &ctrl); - mappage = ctrl->map[idx]; - sc = page_address(mappage); - sc += pos; spin_lock_irqsave(&ctrl->lock, flags); retval = sc->id; if (retval == old) @@ -455,21 +429,13 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, */ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) { - int type = swp_type(ent); - unsigned long offset = swp_offset(ent); - unsigned long idx = offset / SC_PER_PAGE; - unsigned long pos = offset & SC_POS_MASK; struct swap_cgroup_ctrl *ctrl; - struct page *mappage; struct swap_cgroup *sc; unsigned short old; unsigned long flags; - ctrl = &swap_cgroup_ctrl[type]; + sc = lookup_swap_cgroup(ent, &ctrl); - mappage = ctrl->map[idx]; - sc = page_address(mappage); - sc += pos; spin_lock_irqsave(&ctrl->lock, flags); old = sc->id; sc->id = id; @@ -479,28 +445,14 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) } /** - * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry + * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry * @ent: swap entry to be looked up. * * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) */ -unsigned short lookup_swap_cgroup(swp_entry_t ent) +unsigned short lookup_swap_cgroup_id(swp_entry_t ent) { - int type = swp_type(ent); - unsigned long offset = swp_offset(ent); - unsigned long idx = offset / SC_PER_PAGE; - unsigned long pos = offset & SC_POS_MASK; - struct swap_cgroup_ctrl *ctrl; - struct page *mappage; - struct swap_cgroup *sc; - unsigned short ret; - - ctrl = &swap_cgroup_ctrl[type]; - mappage = ctrl->map[idx]; - sc = page_address(mappage); - sc += pos; - ret = sc->id; - return ret; + return lookup_swap_cgroup(ent, NULL)->id; } int swap_cgroup_swapon(int type, unsigned long max_pages) diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 12a48a88c0d8..405d331804c3 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -184,8 +184,7 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, page_end - page_start); } - for (i = page_start; i < page_end; i++) - __clear_bit(i, populated); + bitmap_clear(populated, page_start, page_end - page_start); } /** diff --git a/mm/percpu.c b/mm/percpu.c index 716eb4acf2fc..f47af9123af7 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -67,6 +67,7 @@ #include <linux/spinlock.h> #include <linux/vmalloc.h> #include <linux/workqueue.h> +#include <linux/kmemleak.h> #include <asm/cacheflush.h> #include <asm/sections.h> @@ -710,6 +711,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) const char *err; int slot, off, new_alloc; unsigned long flags; + void __percpu *ptr; if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { WARN(true, "illegal size (%zu) or align (%zu) for " @@ -802,7 +804,9 @@ area_found: mutex_unlock(&pcpu_alloc_mutex); /* return address relative to base address */ - return __addr_to_pcpu_ptr(chunk->base_addr + off); + ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); + kmemleak_alloc_percpu(ptr, size); + return ptr; fail_unlock: spin_unlock_irqrestore(&pcpu_lock, 
flags); @@ -916,6 +920,8 @@ void free_percpu(void __percpu *ptr) if (!ptr) return; + kmemleak_free_percpu(ptr); + addr = __pcpu_ptr_to_addr(ptr); spin_lock_irqsave(&pcpu_lock, flags); @@ -1639,6 +1645,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, rc = -ENOMEM; goto out_free_areas; } + /* kmemleak tracks the percpu allocations separately */ + kmemleak_free(ptr); areas[group] = ptr; base = min(ptr, base); @@ -1753,6 +1761,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size, "for cpu%u\n", psize_str, cpu); goto enomem; } + /* kmemleak tracks the percpu allocations separately */ + kmemleak_free(ptr); pages[j++] = virt_to_page(ptr); } diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index e920aa3ce104..c20ff48994c2 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -298,23 +298,18 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, goto free_proc_pages; } - task_lock(task); - if (__ptrace_may_access(task, PTRACE_MODE_ATTACH)) { - task_unlock(task); - rc = -EPERM; - goto put_task_struct; - } - mm = task->mm; - - if (!mm || (task->flags & PF_KTHREAD)) { - task_unlock(task); - rc = -EINVAL; + mm = mm_access(task, PTRACE_MODE_ATTACH); + if (!mm || IS_ERR(mm)) { + rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; + /* + * Explicitly map EACCES to EPERM as EPERM is a more a + * appropriate error code for process_vw_readv/writev + */ + if (rc == -EACCES) + rc = -EPERM; goto put_task_struct; } - atomic_inc(&mm->mm_users); - task_unlock(task); - for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) { rc = process_vm_rw_single_vec( (unsigned long)rvec[i].iov_base, rvec[i].iov_len, diff --git a/mm/rmap.c b/mm/rmap.c index a2e5ce1fa081..c8454e06b6c8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -773,7 +773,7 @@ out: } static int page_referenced_anon(struct page *page, - struct mem_cgroup *mem_cont, + struct mem_cgroup *memcg, unsigned long *vm_flags) { unsigned int mapcount; @@ -796,7 +796,7 @@ static int page_referenced_anon(struct page *page, * counting on behalf of references from different * cgroups */ - if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) + if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) continue; referenced += page_referenced_one(page, vma, address, &mapcount, vm_flags); @@ -811,7 +811,7 @@ static int page_referenced_anon(struct page *page, /** * page_referenced_file - referenced check for object-based rmap * @page: the page we're checking references on. - * @mem_cont: target memory controller + * @memcg: target memory control group * @vm_flags: collect encountered vma->vm_flags who actually referenced the page * * For an object-based mapped page, find all the places it is mapped and @@ -822,7 +822,7 @@ static int page_referenced_anon(struct page *page, * This function is only called from page_referenced for object-based pages. 
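For the process_vm_access.c hunk above, the open-coded task_lock()/__ptrace_may_access()/mm_users sequence is replaced by a single mm_access() call. A caller-side sketch of that contract (hypothetical helper; the return-value handling mirrors what process_vm_rw_core() does above: NULL means there is no mm to operate on, an ERR_PTR means the ptrace check failed, and a real mm comes with a reference that must be dropped):

static int with_foreign_mm(struct task_struct *task)
{
	struct mm_struct *mm = mm_access(task, PTRACE_MODE_ATTACH);

	if (!mm)
		return -ESRCH;		/* kernel thread or task already exited */
	if (IS_ERR(mm))
		return PTR_ERR(mm);	/* typically -EACCES */

	/* ... operate on the target's memory, e.g. via get_user_pages() ... */

	mmput(mm);			/* drop the reference mm_access() took */
	return 0;
}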
*/ static int page_referenced_file(struct page *page, - struct mem_cgroup *mem_cont, + struct mem_cgroup *memcg, unsigned long *vm_flags) { unsigned int mapcount; @@ -864,7 +864,7 @@ static int page_referenced_file(struct page *page, * counting on behalf of references from different * cgroups */ - if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) + if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) continue; referenced += page_referenced_one(page, vma, address, &mapcount, vm_flags); @@ -880,7 +880,7 @@ static int page_referenced_file(struct page *page, * page_referenced - test if the page was referenced * @page: the page to test * @is_locked: caller holds lock on the page - * @mem_cont: target memory controller + * @memcg: target memory cgroup * @vm_flags: collect encountered vma->vm_flags who actually referenced the page * * Quick test_and_clear_referenced for all mappings to a page, @@ -888,7 +888,7 @@ static int page_referenced_file(struct page *page, */ int page_referenced(struct page *page, int is_locked, - struct mem_cgroup *mem_cont, + struct mem_cgroup *memcg, unsigned long *vm_flags) { int referenced = 0; @@ -904,13 +904,13 @@ int page_referenced(struct page *page, } } if (unlikely(PageKsm(page))) - referenced += page_referenced_ksm(page, mem_cont, + referenced += page_referenced_ksm(page, memcg, vm_flags); else if (PageAnon(page)) - referenced += page_referenced_anon(page, mem_cont, + referenced += page_referenced_anon(page, memcg, vm_flags); else if (page->mapping) - referenced += page_referenced_file(page, mem_cont, + referenced += page_referenced_file(page, memcg, vm_flags); if (we_locked) unlock_page(page); diff --git a/mm/shmem.c b/mm/shmem.c index feead1943d92..b7e195571862 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -379,7 +379,7 @@ static int shmem_free_swap(struct address_space *mapping, /* * Pagevec may contain swap entries, so shuffle up pages before releasing. */ -static void shmem_pagevec_release(struct pagevec *pvec) +static void shmem_deswap_pagevec(struct pagevec *pvec) { int i, j; @@ -389,7 +389,36 @@ static void shmem_pagevec_release(struct pagevec *pvec) pvec->pages[j++] = page; } pvec->nr = j; - pagevec_release(pvec); +} + +/* + * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists. + */ +void shmem_unlock_mapping(struct address_space *mapping) +{ + struct pagevec pvec; + pgoff_t indices[PAGEVEC_SIZE]; + pgoff_t index = 0; + + pagevec_init(&pvec, 0); + /* + * Minor point, but we might as well stop if someone else SHM_LOCKs it. + */ + while (!mapping_unevictable(mapping)) { + /* + * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it + * has finished, if it hits a row of PAGEVEC_SIZE swap entries. 
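The "deswap" step above exists because shmem keeps swap entries for its pages directly in the page-cache radix tree, encoded as exceptional (value) entries rather than struct page pointers, and pagevec_release() or the unevictable rescan must never see them. A sketch of how such an entry is told apart (the bit value follows the 3.x radix-tree convention and is shown only for illustration):

/* Value entries carry bit 1, which a struct page pointer (at least
 * 4-byte aligned) can never have set. */
#define EXCEPTIONAL_ENTRY_BIT	2UL

static inline int is_swap_value_entry(void *entry)
{
	return (unsigned long)entry & EXCEPTIONAL_ENTRY_BIT;
}

shmem_deswap_pagevec() compacts the pagevec in place so that the check_move_unevictable_pages() call in the loop below only ever operates on real pages.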
+ */ + pvec.nr = shmem_find_get_pages_and_swap(mapping, index, + PAGEVEC_SIZE, pvec.pages, indices); + if (!pvec.nr) + break; + index = indices[pvec.nr - 1] + 1; + shmem_deswap_pagevec(&pvec); + check_move_unevictable_pages(pvec.pages, pvec.nr); + pagevec_release(&pvec); + cond_resched(); + } } /* @@ -440,7 +469,8 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) } unlock_page(page); } - shmem_pagevec_release(&pvec); + shmem_deswap_pagevec(&pvec); + pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); index++; @@ -470,7 +500,8 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) continue; } if (index == start && indices[0] > end) { - shmem_pagevec_release(&pvec); + shmem_deswap_pagevec(&pvec); + pagevec_release(&pvec); break; } mem_cgroup_uncharge_start(); @@ -494,7 +525,8 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) } unlock_page(page); } - shmem_pagevec_release(&pvec); + shmem_deswap_pagevec(&pvec); + pagevec_release(&pvec); mem_cgroup_uncharge_end(); index++; } @@ -1068,13 +1100,6 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) user_shm_unlock(inode->i_size, user); info->flags &= ~VM_LOCKED; mapping_clear_unevictable(file->f_mapping); - /* - * Ensure that a racing putback_lru_page() can see - * the pages of this mapping are evictable when we - * skip them due to !PageLRU during the scan. - */ - smp_mb__after_clear_bit(); - scan_mapping_unevictable_pages(file->f_mapping); } retval = 0; @@ -1631,9 +1656,9 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s } inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); memcpy(kaddr, symname, len); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); set_page_dirty(page); unlock_page(page); page_cache_release(page); @@ -2445,6 +2470,10 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) return 0; } +void shmem_unlock_mapping(struct address_space *mapping) +{ +} + void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) { truncate_inode_pages_range(inode->i_mapping, lstart, lend); diff --git a/mm/slab.c b/mm/slab.c index 2acfa0d90943..f0bd7857ab3b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -481,11 +481,13 @@ EXPORT_SYMBOL(slab_buffer_size); #endif /* - * Do not go above this order unless 0 objects fit into the slab. + * Do not go above this order unless 0 objects fit into the slab or + * overridden on the command line. */ -#define BREAK_GFP_ORDER_HI 1 -#define BREAK_GFP_ORDER_LO 0 -static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; +#define SLAB_MAX_ORDER_HI 1 +#define SLAB_MAX_ORDER_LO 0 +static int slab_max_order = SLAB_MAX_ORDER_LO; +static bool slab_max_order_set __initdata; /* * Functions for storing/retrieving the cachep and or slab from the page @@ -854,6 +856,17 @@ static int __init noaliencache_setup(char *s) } __setup("noaliencache", noaliencache_setup); +static int __init slab_max_order_setup(char *str) +{ + get_option(&str, &slab_max_order); + slab_max_order = slab_max_order < 0 ? 0 : + min(slab_max_order, MAX_ORDER - 1); + slab_max_order_set = true; + + return 1; +} +__setup("slab_max_order=", slab_max_order_setup); + #ifdef CONFIG_NUMA /* * Special reaping functions for NUMA systems called from cache_reap(). 
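The slab_max_order= handler above replaces the old hard-coded slab_break_gfp_order heuristic with a boot-time override: out-of-range values are clamped to 0..MAX_ORDER-1, and (as the kmem_cache_init() hunk further below shows) the "more than 32MB of RAM" default is only applied when the parameter was not given. For example, booting with

	slab_max_order=0

pins SLAB to order-0 pages even on large-memory machines, while slab_max_order=3 raises the point at which calculate_slab_order() stops trying larger slabs to order 3 (eight contiguous pages).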
@@ -1502,10 +1515,11 @@ void __init kmem_cache_init(void) /* * Fragmentation resistance on low memory - only use bigger - * page orders on machines with more than 32MB of memory. + * page orders on machines with more than 32MB of memory if + * not overridden on the command line. */ - if (totalram_pages > (32 << 20) >> PAGE_SHIFT) - slab_break_gfp_order = BREAK_GFP_ORDER_HI; + if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT) + slab_max_order = SLAB_MAX_ORDER_HI; /* Bootstrap is tricky, because several objects are allocated * from caches that do not exist yet: @@ -1932,8 +1946,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) /* Print header */ if (lines == 0) { printk(KERN_ERR - "Slab corruption: %s start=%p, len=%d\n", - cachep->name, realobj, size); + "Slab corruption (%s): %s start=%p, len=%d\n", + print_tainted(), cachep->name, realobj, size); print_objinfo(cachep, objp, 0); } /* Hexdump the affected line */ @@ -2117,7 +2131,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, * Large number of objects is good, but very large slabs are * currently bad for the gfp()s. */ - if (gfporder >= slab_break_gfp_order) + if (gfporder >= slab_max_order) break; /* @@ -3042,8 +3056,9 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp) if (entries != cachep->num - slabp->inuse) { bad: printk(KERN_ERR "slab: Internal list corruption detected in " - "cache '%s'(%d), slabp %p(%d). Hexdump:\n", - cachep->name, cachep->num, slabp, slabp->inuse); + "cache '%s'(%d), slabp %p(%d). Tainted(%s). Hexdump:\n", + cachep->name, cachep->num, slabp, slabp->inuse, + print_tainted()); print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp, sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t), 1); diff --git a/mm/slub.c b/mm/slub.c index d99acbf14e01..4907563ef7ff 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -366,7 +366,8 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page const char *n) { VM_BUG_ON(!irqs_disabled()); -#ifdef CONFIG_CMPXCHG_DOUBLE +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, freelist_old, counters_old, @@ -400,7 +401,8 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_new, unsigned long counters_new, const char *n) { -#ifdef CONFIG_CMPXCHG_DOUBLE +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, freelist_old, counters_old, @@ -570,7 +572,7 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...) 
va_end(args); printk(KERN_ERR "========================================" "=====================================\n"); - printk(KERN_ERR "BUG %s: %s\n", s->name, buf); + printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); printk(KERN_ERR "----------------------------------------" "-------------------------------------\n\n"); } @@ -1901,11 +1903,14 @@ static void unfreeze_partials(struct kmem_cache *s) } if (l != m) { - if (l == M_PARTIAL) + if (l == M_PARTIAL) { remove_partial(n, page); - else + stat(s, FREE_REMOVE_PARTIAL); + } else { add_partial(n, page, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); + } l = m; } @@ -2124,6 +2129,37 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, } /* + * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist + * or deactivate the page. + * + * The page is still frozen if the return value is not NULL. + * + * If this function returns NULL then the page has been unfrozen. + */ +static inline void *get_freelist(struct kmem_cache *s, struct page *page) +{ + struct page new; + unsigned long counters; + void *freelist; + + do { + freelist = page->freelist; + counters = page->counters; + new.counters = counters; + VM_BUG_ON(!new.frozen); + + new.inuse = page->objects; + new.frozen = freelist != NULL; + + } while (!cmpxchg_double_slab(s, page, + freelist, counters, + NULL, new.counters, + "get_freelist")); + + return freelist; +} + +/* * Slow path. The lockless freelist is empty or we need to perform * debugging duties. * @@ -2144,8 +2180,6 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, { void **object; unsigned long flags; - struct page new; - unsigned long counters; local_irq_save(flags); #ifdef CONFIG_PREEMPT @@ -2166,31 +2200,14 @@ redo: goto new_slab; } - stat(s, ALLOC_SLOWPATH); - - do { - object = c->page->freelist; - counters = c->page->counters; - new.counters = counters; - VM_BUG_ON(!new.frozen); - - /* - * If there is no object left then we use this loop to - * deactivate the slab which is simple since no objects - * are left in the slab and therefore we do not need to - * put the page back onto the partial list. - * - * If there are objects left then we retrieve them - * and use them to refill the per cpu queue. - */ + /* must check again c->freelist in case of cpu migration or IRQ */ + object = c->freelist; + if (object) + goto load_freelist; - new.inuse = c->page->objects; - new.frozen = object != NULL; + stat(s, ALLOC_SLOWPATH); - } while (!__cmpxchg_double_slab(s, c->page, - object, counters, - NULL, new.counters, - "__slab_alloc")); + object = get_freelist(s, c->page); if (!object) { c->page = NULL; @@ -2999,7 +3016,8 @@ static int kmem_cache_open(struct kmem_cache *s, } } -#ifdef CONFIG_CMPXCHG_DOUBLE +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) /* Enable fast mode */ s->flags |= __CMPXCHG_DOUBLE; @@ -3028,7 +3046,9 @@ static int kmem_cache_open(struct kmem_cache *s, * per node list when we run out of per cpu objects. We only fetch 50% * to keep some capacity around for frees. 
*/ - if (s->size >= PAGE_SIZE) + if (kmem_cache_debug(s)) + s->cpu_partial = 0; + else if (s->size >= PAGE_SIZE) s->cpu_partial = 2; else if (s->size >= 1024) s->cpu_partial = 6; @@ -4637,6 +4657,8 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, err = strict_strtoul(buf, 10, &objects); if (err) return err; + if (objects && kmem_cache_debug(s)) + return -EINVAL; s->cpu_partial = objects; flush_all(s); diff --git a/mm/swap.c b/mm/swap.c index 67a09a633a09..14380e9fbe33 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -23,7 +23,6 @@ #include <linux/init.h> #include <linux/export.h> #include <linux/mm_inline.h> -#include <linux/buffer_head.h> /* for try_to_release_page() */ #include <linux/percpu_counter.h> #include <linux/percpu.h> #include <linux/cpu.h> @@ -54,7 +53,7 @@ static void __page_cache_release(struct page *page) spin_lock_irqsave(&zone->lru_lock, flags); VM_BUG_ON(!PageLRU(page)); __ClearPageLRU(page); - del_page_from_lru(zone, page); + del_page_from_lru_list(zone, page, page_off_lru(page)); spin_unlock_irqrestore(&zone->lru_lock, flags); } } @@ -232,12 +231,14 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, static void pagevec_move_tail_fn(struct page *page, void *arg) { int *pgmoved = arg; - struct zone *zone = page_zone(page); if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { enum lru_list lru = page_lru_base_type(page); - list_move_tail(&page->lru, &zone->lru[lru].list); - mem_cgroup_rotate_reclaimable_page(page); + struct lruvec *lruvec; + + lruvec = mem_cgroup_lru_move_lists(page_zone(page), + page, lru, lru); + list_move_tail(&page->lru, &lruvec->lists[lru]); (*pgmoved)++; } } @@ -368,7 +369,6 @@ void mark_page_accessed(struct page *page) SetPageReferenced(page); } } - EXPORT_SYMBOL(mark_page_accessed); void __lru_cache_add(struct page *page, enum lru_list lru) @@ -377,7 +377,7 @@ void __lru_cache_add(struct page *page, enum lru_list lru) page_cache_get(page); if (!pagevec_add(pvec, page)) - ____pagevec_lru_add(pvec, lru); + __pagevec_lru_add(pvec, lru); put_cpu_var(lru_add_pvecs); } EXPORT_SYMBOL(__lru_cache_add); @@ -476,12 +476,13 @@ static void lru_deactivate_fn(struct page *page, void *arg) */ SetPageReclaim(page); } else { + struct lruvec *lruvec; /* * The page's writeback ends up during pagevec * We moves tha page into tail of inactive. 
*/ - list_move_tail(&page->lru, &zone->lru[lru].list); - mem_cgroup_rotate_reclaimable_page(page); + lruvec = mem_cgroup_lru_move_lists(zone, page, lru, lru); + list_move_tail(&page->lru, &lruvec->lists[lru]); __count_vm_event(PGROTATED); } @@ -504,7 +505,7 @@ static void drain_cpu_pagevecs(int cpu) for_each_lru(lru) { pvec = &pvecs[lru - LRU_BASE]; if (pagevec_count(pvec)) - ____pagevec_lru_add(pvec, lru); + __pagevec_lru_add(pvec, lru); } pvec = &per_cpu(lru_rotate_pvecs, cpu); @@ -616,7 +617,7 @@ void release_pages(struct page **pages, int nr, int cold) } VM_BUG_ON(!PageLRU(page)); __ClearPageLRU(page); - del_page_from_lru(zone, page); + del_page_from_lru_list(zone, page, page_off_lru(page)); } list_add(&page->lru, &pages_to_free); @@ -644,22 +645,21 @@ void __pagevec_release(struct pagevec *pvec) release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); pagevec_reinit(pvec); } - EXPORT_SYMBOL(__pagevec_release); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* used by __split_huge_page_refcount() */ void lru_add_page_tail(struct zone* zone, struct page *page, struct page *page_tail) { - int active; + int uninitialized_var(active); enum lru_list lru; const int file = 0; - struct list_head *head; VM_BUG_ON(!PageHead(page)); VM_BUG_ON(PageCompound(page_tail)); VM_BUG_ON(PageLRU(page_tail)); - VM_BUG_ON(!spin_is_locked(&zone->lru_lock)); + VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&zone->lru_lock)); SetPageLRU(page_tail); @@ -672,19 +672,33 @@ void lru_add_page_tail(struct zone* zone, active = 0; lru = LRU_INACTIVE_ANON; } - update_page_reclaim_stat(zone, page_tail, file, active); - if (likely(PageLRU(page))) - head = page->lru.prev; - else - head = &zone->lru[lru].list; - __add_page_to_lru_list(zone, page_tail, lru, head); } else { SetPageUnevictable(page_tail); - add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE); + lru = LRU_UNEVICTABLE; } + + if (likely(PageLRU(page))) + list_add_tail(&page_tail->lru, &page->lru); + else { + struct list_head *list_head; + /* + * Head page has not yet been counted, as an hpage, + * so we must account for each subpage individually. + * + * Use the standard add function to put page_tail on the list, + * but then correct its position so they all end up in order. + */ + add_page_to_lru_list(zone, page_tail, lru); + list_head = page_tail->lru.prev; + list_move_tail(&page_tail->lru, list_head); + } + + if (!PageUnevictable(page)) + update_page_reclaim_stat(zone, page_tail, file, active); } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static void ____pagevec_lru_add_fn(struct page *page, void *arg) +static void __pagevec_lru_add_fn(struct page *page, void *arg) { enum lru_list lru = (enum lru_list)arg; struct zone *zone = page_zone(page); @@ -698,40 +712,21 @@ static void ____pagevec_lru_add_fn(struct page *page, void *arg) SetPageLRU(page); if (active) SetPageActive(page); - update_page_reclaim_stat(zone, page, file, active); add_page_to_lru_list(zone, page, lru); + update_page_reclaim_stat(zone, page, file, active); } /* * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. 
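The renamed __pagevec_lru_add() keeps the contract spelled out in the comment above: it consumes the pagevec's page references and reinitialises the pagevec. A minimal, hypothetical caller showing the batching shape that __lru_cache_add() and drain_cpu_pagevecs() use earlier in this file (one zone->lru_lock round trip per full pagevec instead of per page):

static void add_pages_to_inactive_file(struct page **pages, int nr)
{
	struct pagevec pvec;
	int i;

	pagevec_init(&pvec, 0);
	for (i = 0; i < nr; i++) {
		page_cache_get(pages[i]);		/* the pagevec holds a reference */
		if (!pagevec_add(&pvec, pages[i]))	/* 0 slots left: flush the batch */
			__pagevec_lru_add(&pvec, LRU_INACTIVE_FILE);
	}
	if (pagevec_count(&pvec))
		__pagevec_lru_add(&pvec, LRU_INACTIVE_FILE);
}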
*/ -void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) +void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) { VM_BUG_ON(is_unevictable_lru(lru)); - pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru); -} - -EXPORT_SYMBOL(____pagevec_lru_add); - -/* - * Try to drop buffers from the pages in a pagevec - */ -void pagevec_strip(struct pagevec *pvec) -{ - int i; - - for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - - if (page_has_private(page) && trylock_page(page)) { - if (page_has_private(page)) - try_to_release_page(page, 0); - unlock_page(page); - } - } + pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, (void *)lru); } +EXPORT_SYMBOL(__pagevec_lru_add); /** * pagevec_lookup - gang pagecache lookup @@ -755,7 +750,6 @@ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages); return pagevec_count(pvec); } - EXPORT_SYMBOL(pagevec_lookup); unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, @@ -765,7 +759,6 @@ unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, nr_pages, pvec->pages); return pagevec_count(pvec); } - EXPORT_SYMBOL(pagevec_lookup_tag); /* diff --git a/mm/swapfile.c b/mm/swapfile.c index 9520592d4231..00a962caab1a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -847,12 +847,13 @@ unsigned int count_swap_pages(int type, int free) static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct page *page) { - struct mem_cgroup *ptr; + struct mem_cgroup *memcg; spinlock_t *ptl; pte_t *pte; int ret = 1; - if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) { + if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, + GFP_KERNEL, &memcg)) { ret = -ENOMEM; goto out_nolock; } @@ -860,7 +861,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { if (ret > 0) - mem_cgroup_cancel_charge_swapin(ptr); + mem_cgroup_cancel_charge_swapin(memcg); ret = 0; goto out; } @@ -871,7 +872,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); page_add_anon_rmap(page, vma, addr); - mem_cgroup_commit_charge_swapin(page, ptr); + mem_cgroup_commit_charge_swapin(page, memcg); swap_free(entry); /* * Move the page to the active list so it is not @@ -2426,9 +2427,9 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) if (!(count & COUNT_CONTINUED)) goto out; - map = kmap_atomic(list_page, KM_USER0) + offset; + map = kmap_atomic(list_page) + offset; count = *map; - kunmap_atomic(map, KM_USER0); + kunmap_atomic(map); /* * If this continuation count now has some space in it, @@ -2471,7 +2472,7 @@ static bool swap_count_continued(struct swap_info_struct *si, offset &= ~PAGE_MASK; page = list_entry(head->lru.next, struct page, lru); - map = kmap_atomic(page, KM_USER0) + offset; + map = kmap_atomic(page) + offset; if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ goto init_map; /* jump over SWAP_CONT_MAX checks */ @@ -2481,26 +2482,26 @@ static bool swap_count_continued(struct swap_info_struct *si, * Think of how you add 1 to 999 */ while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { - kunmap_atomic(map, KM_USER0); + kunmap_atomic(map); page = list_entry(page->lru.next, struct page, lru); BUG_ON(page == 
head); - map = kmap_atomic(page, KM_USER0) + offset; + map = kmap_atomic(page) + offset; } if (*map == SWAP_CONT_MAX) { - kunmap_atomic(map, KM_USER0); + kunmap_atomic(map); page = list_entry(page->lru.next, struct page, lru); if (page == head) return false; /* add count continuation */ - map = kmap_atomic(page, KM_USER0) + offset; + map = kmap_atomic(page) + offset; init_map: *map = 0; /* we didn't zero the page */ } *map += 1; - kunmap_atomic(map, KM_USER0); + kunmap_atomic(map); page = list_entry(page->lru.prev, struct page, lru); while (page != head) { - map = kmap_atomic(page, KM_USER0) + offset; + map = kmap_atomic(page) + offset; *map = COUNT_CONTINUED; - kunmap_atomic(map, KM_USER0); + kunmap_atomic(map); page = list_entry(page->lru.prev, struct page, lru); } return true; /* incremented */ @@ -2511,22 +2512,22 @@ init_map: *map = 0; /* we didn't zero the page */ */ BUG_ON(count != COUNT_CONTINUED); while (*map == COUNT_CONTINUED) { - kunmap_atomic(map, KM_USER0); + kunmap_atomic(map); page = list_entry(page->lru.next, struct page, lru); BUG_ON(page == head); - map = kmap_atomic(page, KM_USER0) + offset; + map = kmap_atomic(page) + offset; } BUG_ON(*map == 0); *map -= 1; if (*map == 0) count = 0; - kunmap_atomic(map, KM_USER0); + kunmap_atomic(map); page = list_entry(page->lru.prev, struct page, lru); while (page != head) { - map = kmap_atomic(page, KM_USER0) + offset; + map = kmap_atomic(page) + offset; *map = SWAP_CONT_MAX | count; count = COUNT_CONTINUED; - kunmap_atomic(map, KM_USER0); + kunmap_atomic(map); page = list_entry(page->lru.prev, struct page, lru); } return count == COUNT_CONTINUED; diff --git a/mm/truncate.c b/mm/truncate.c index 632b15e29f74..a188058582e0 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -184,7 +184,7 @@ int invalidate_inode_page(struct page *page) } /** - * truncate_inode_pages - truncate range of pages specified by start & end byte offsets + * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets * @mapping: mapping to truncate * @lstart: offset from which to truncate * @lend: offset to which to truncate diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 877ca046f43d..94dff883b449 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1906,9 +1906,9 @@ static int aligned_vread(char *buf, char *addr, unsigned long count) * we can expect USER0 is not used (see vread/vwrite's * function description) */ - void *map = kmap_atomic(p, KM_USER0); + void *map = kmap_atomic(p); memcpy(buf, map + offset, length); - kunmap_atomic(map, KM_USER0); + kunmap_atomic(map); } else memset(buf, 0, length); @@ -1945,9 +1945,9 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) * we can expect USER0 is not used (see vread/vwrite's * function description) */ - void *map = kmap_atomic(p, KM_USER0); + void *map = kmap_atomic(p); memcpy(map + offset, buf, length); - kunmap_atomic(map, KM_USER0); + kunmap_atomic(map); } addr += length; buf += length; @@ -2378,7 +2378,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); if (!vas || !vms) - goto err_free; + goto err_free2; for (area = 0; area < nr_vms; area++) { vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL); @@ -2476,11 +2476,10 @@ found: err_free: for (area = 0; area < nr_vms; area++) { - if (vas) - kfree(vas[area]); - if (vms) - kfree(vms[area]); + kfree(vas[area]); + kfree(vms[area]); } +err_free2: kfree(vas); kfree(vms); return NULL; diff 
--git a/mm/vmscan.c b/mm/vmscan.c index 26f4a8a4e0c7..c52b23552659 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -26,7 +26,6 @@ #include <linux/buffer_head.h> /* for try_to_release_page(), buffer_heads_over_limit */ #include <linux/mm_inline.h> -#include <linux/pagevec.h> #include <linux/backing-dev.h> #include <linux/rmap.h> #include <linux/topology.h> @@ -103,8 +102,11 @@ struct scan_control { */ reclaim_mode_t reclaim_mode; - /* Which cgroup do we reclaim from */ - struct mem_cgroup *mem_cgroup; + /* + * The memory cgroup that hit its limit and as a result is the + * primary target of this reclaim invocation. + */ + struct mem_cgroup *target_mem_cgroup; /* * Nodemask of nodes allowed by the caller. If NULL, all nodes @@ -113,6 +115,11 @@ struct scan_control { nodemask_t *nodemask; }; +struct mem_cgroup_zone { + struct mem_cgroup *mem_cgroup; + struct zone *zone; +}; + #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) #ifdef ARCH_HAS_PREFETCH @@ -153,28 +160,45 @@ static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_CGROUP_MEM_RES_CTLR -#define scanning_global_lru(sc) (!(sc)->mem_cgroup) +static bool global_reclaim(struct scan_control *sc) +{ + return !sc->target_mem_cgroup; +} + +static bool scanning_global_lru(struct mem_cgroup_zone *mz) +{ + return !mz->mem_cgroup; +} #else -#define scanning_global_lru(sc) (1) +static bool global_reclaim(struct scan_control *sc) +{ + return true; +} + +static bool scanning_global_lru(struct mem_cgroup_zone *mz) +{ + return true; +} #endif -static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, - struct scan_control *sc) +static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz) { - if (!scanning_global_lru(sc)) - return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone); + if (!scanning_global_lru(mz)) + return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone); - return &zone->reclaim_stat; + return &mz->zone->reclaim_stat; } -static unsigned long zone_nr_lru_pages(struct zone *zone, - struct scan_control *sc, enum lru_list lru) +static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz, + enum lru_list lru) { - if (!scanning_global_lru(sc)) - return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, - zone_to_nid(zone), zone_idx(zone), BIT(lru)); + if (!scanning_global_lru(mz)) + return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup, + zone_to_nid(mz->zone), + zone_idx(mz->zone), + BIT(lru)); - return zone_page_state(zone, NR_LRU_BASE + lru); + return zone_page_state(mz->zone, NR_LRU_BASE + lru); } @@ -636,7 +660,7 @@ redo: * When racing with an mlock or AS_UNEVICTABLE clearing * (page is unlocked) make sure that if the other thread * does not observe our setting of PG_lru and fails - * isolation/check_move_unevictable_page, + * isolation/check_move_unevictable_pages, * we see PG_mlocked/AS_UNEVICTABLE cleared below and move * the page back to the evictable list. 
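The mem_cgroup_zone pair introduced above is the handle the reworked vmscan code passes around in place of a bare zone plus scan_control; when pages actually have to come off a list it is resolved to a lruvec, the per-zone (or per-memcg) bundle of LRU lists. A simplified sketch of that resolution, with struct lruvec reduced to the part these hunks rely on (the real definition lives in the mmzone/memcontrol headers):

/* One list per LRU type (inactive/active x anon/file, plus unevictable). */
struct lruvec {
	struct list_head lists[NR_LRU_LISTS];
};

/* mz->mem_cgroup == NULL means the zone's own lruvec is used; this is the
 * lookup isolate_lru_pages() performs further below. */
static struct lruvec *mz_lruvec(struct mem_cgroup_zone *mz)
{
	return mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup);
}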
* @@ -677,12 +701,13 @@ enum page_references { }; static enum page_references page_check_references(struct page *page, + struct mem_cgroup_zone *mz, struct scan_control *sc) { int referenced_ptes, referenced_page; unsigned long vm_flags; - referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags); + referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags); referenced_page = TestClearPageReferenced(page); /* Lumpy reclaim - ignore references */ @@ -738,7 +763,7 @@ static enum page_references page_check_references(struct page *page, * shrink_page_list() returns the number of reclaimed pages */ static unsigned long shrink_page_list(struct list_head *page_list, - struct zone *zone, + struct mem_cgroup_zone *mz, struct scan_control *sc, int priority, unsigned long *ret_nr_dirty, @@ -769,7 +794,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep; VM_BUG_ON(PageActive(page)); - VM_BUG_ON(page_zone(page) != zone); + VM_BUG_ON(page_zone(page) != mz->zone); sc->nr_scanned++; @@ -803,7 +828,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } } - references = page_check_references(page, sc); + references = page_check_references(page, mz, sc); switch (references) { case PAGEREF_ACTIVATE: goto activate_locked; @@ -994,8 +1019,8 @@ keep_lumpy: * back off and wait for congestion to clear because further reclaim * will encounter the same problem */ - if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc)) - zone_set_flag(zone, ZONE_CONGESTED); + if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc)) + zone_set_flag(mz->zone, ZONE_CONGESTED); free_hot_cold_page_list(&free_pages, 1); @@ -1049,8 +1074,39 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) ret = -EBUSY; - if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page))) - return ret; + /* + * To minimise LRU disruption, the caller can indicate that it only + * wants to isolate pages it will be able to operate on without + * blocking - clean pages for the most part. + * + * ISOLATE_CLEAN means that only clean pages should be isolated. This + * is used by reclaim when it is cannot write to backing storage + * + * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages + * that it is possible to migrate without blocking + */ + if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) { + /* All the caller can do on PageWriteback is block */ + if (PageWriteback(page)) + return ret; + + if (PageDirty(page)) { + struct address_space *mapping; + + /* ISOLATE_CLEAN means only clean pages */ + if (mode & ISOLATE_CLEAN) + return ret; + + /* + * Only pages without mappings or that have a + * ->migratepage callback are possible to migrate + * without blocking + */ + mapping = page_mapping(page); + if (mapping && !mapping->a_ops->migratepage) + return ret; + } + } if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) return ret; @@ -1079,25 +1135,36 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) * Appropriate locks must be held before calling this function. * * @nr_to_scan: The number of pages to look through on the list. - * @src: The LRU list to pull pages off. + * @mz: The mem_cgroup_zone to pull pages from. * @dst: The temp list to put pages on to. - * @scanned: The number of pages that were scanned. + * @nr_scanned: The number of pages that were scanned. 
* @order: The caller's attempted allocation order * @mode: One of the LRU isolation modes + * @active: True [1] if isolating active pages * @file: True [1] if isolating file [!anon] pages * * returns how many pages were moved onto *@dst. */ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, - struct list_head *src, struct list_head *dst, - unsigned long *scanned, int order, isolate_mode_t mode, - int file) + struct mem_cgroup_zone *mz, struct list_head *dst, + unsigned long *nr_scanned, int order, isolate_mode_t mode, + int active, int file) { + struct lruvec *lruvec; + struct list_head *src; unsigned long nr_taken = 0; unsigned long nr_lumpy_taken = 0; unsigned long nr_lumpy_dirty = 0; unsigned long nr_lumpy_failed = 0; unsigned long scan; + int lru = LRU_BASE; + + lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup); + if (active) + lru += LRU_ACTIVE; + if (file) + lru += LRU_FILE; + src = &lruvec->lists[lru]; for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { struct page *page; @@ -1113,15 +1180,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, switch (__isolate_lru_page(page, mode, file)) { case 0: + mem_cgroup_lru_del(page); list_move(&page->lru, dst); - mem_cgroup_del_lru(page); nr_taken += hpage_nr_pages(page); break; case -EBUSY: /* else it is being freed elsewhere */ list_move(&page->lru, src); - mem_cgroup_rotate_lru_list(page, page_lru(page)); continue; default: @@ -1171,13 +1237,17 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, break; if (__isolate_lru_page(cursor_page, mode, file) == 0) { + unsigned int isolated_pages; + + mem_cgroup_lru_del(cursor_page); list_move(&cursor_page->lru, dst); - mem_cgroup_del_lru(cursor_page); - nr_taken += hpage_nr_pages(cursor_page); - nr_lumpy_taken++; + isolated_pages = hpage_nr_pages(cursor_page); + nr_taken += isolated_pages; + nr_lumpy_taken += isolated_pages; if (PageDirty(cursor_page)) - nr_lumpy_dirty++; + nr_lumpy_dirty += isolated_pages; scan++; + pfn += isolated_pages - 1; } else { /* * Check if the page is freed already. @@ -1203,57 +1273,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, nr_lumpy_failed++; } - *scanned = scan; + *nr_scanned = scan; trace_mm_vmscan_lru_isolate(order, nr_to_scan, scan, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, - mode); + mode, file); return nr_taken; } -static unsigned long isolate_pages_global(unsigned long nr, - struct list_head *dst, - unsigned long *scanned, int order, - isolate_mode_t mode, - struct zone *z, int active, int file) -{ - int lru = LRU_BASE; - if (active) - lru += LRU_ACTIVE; - if (file) - lru += LRU_FILE; - return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, - mode, file); -} - -/* - * clear_active_flags() is a helper for shrink_active_list(), clearing - * any active bits from the pages in the list. 
- */ -static unsigned long clear_active_flags(struct list_head *page_list, - unsigned int *count) -{ - int nr_active = 0; - int lru; - struct page *page; - - list_for_each_entry(page, page_list, lru) { - int numpages = hpage_nr_pages(page); - lru = page_lru_base_type(page); - if (PageActive(page)) { - lru += LRU_ACTIVE; - ClearPageActive(page); - nr_active += numpages; - } - if (count) - count[lru] += numpages; - } - - return nr_active; -} - /** * isolate_lru_page - tries to isolate a page from its LRU list * @page: page to isolate from its LRU list @@ -1313,7 +1342,7 @@ static int too_many_isolated(struct zone *zone, int file, if (current_is_kswapd()) return 0; - if (!scanning_global_lru(sc)) + if (!global_reclaim(sc)) return 0; if (file) { @@ -1327,27 +1356,21 @@ static int too_many_isolated(struct zone *zone, int file, return isolated > inactive; } -/* - * TODO: Try merging with migrations version of putback_lru_pages - */ static noinline_for_stack void -putback_lru_pages(struct zone *zone, struct scan_control *sc, - unsigned long nr_anon, unsigned long nr_file, - struct list_head *page_list) +putback_inactive_pages(struct mem_cgroup_zone *mz, + struct list_head *page_list) { - struct page *page; - struct pagevec pvec; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - - pagevec_init(&pvec, 1); + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); + struct zone *zone = mz->zone; + LIST_HEAD(pages_to_free); /* * Put back any unfreeable pages. */ - spin_lock(&zone->lru_lock); while (!list_empty(page_list)) { + struct page *page = lru_to_page(page_list); int lru; - page = lru_to_page(page_list); + VM_BUG_ON(PageLRU(page)); list_del(&page->lru); if (unlikely(!page_evictable(page, NULL))) { @@ -1364,30 +1387,53 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, int numpages = hpage_nr_pages(page); reclaim_stat->recent_rotated[file] += numpages; } - if (!pagevec_add(&pvec, page)) { - spin_unlock_irq(&zone->lru_lock); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); + if (put_page_testzero(page)) { + __ClearPageLRU(page); + __ClearPageActive(page); + del_page_from_lru_list(zone, page, lru); + + if (unlikely(PageCompound(page))) { + spin_unlock_irq(&zone->lru_lock); + (*get_compound_page_dtor(page))(page); + spin_lock_irq(&zone->lru_lock); + } else + list_add(&page->lru, &pages_to_free); } } - __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); - spin_unlock_irq(&zone->lru_lock); - pagevec_release(&pvec); + /* + * To save our caller's stack, now use input list for pages to free. 
+ */ + list_splice(&pages_to_free, page_list); } -static noinline_for_stack void update_isolated_counts(struct zone *zone, - struct scan_control *sc, - unsigned long *nr_anon, - unsigned long *nr_file, - struct list_head *isolated_list) +static noinline_for_stack void +update_isolated_counts(struct mem_cgroup_zone *mz, + struct list_head *page_list, + unsigned long *nr_anon, + unsigned long *nr_file) { - unsigned long nr_active; + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); + struct zone *zone = mz->zone; unsigned int count[NR_LRU_LISTS] = { 0, }; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + unsigned long nr_active = 0; + struct page *page; + int lru; + + /* + * Count pages and clear active flags + */ + list_for_each_entry(page, page_list, lru) { + int numpages = hpage_nr_pages(page); + lru = page_lru_base_type(page); + if (PageActive(page)) { + lru += LRU_ACTIVE; + ClearPageActive(page); + nr_active += numpages; + } + count[lru] += numpages; + } - nr_active = clear_active_flags(isolated_list, count); __count_vm_events(PGDEACTIVATE, nr_active); __mod_zone_page_state(zone, NR_ACTIVE_FILE, @@ -1401,8 +1447,6 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone, *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; - __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file); reclaim_stat->recent_scanned[0] += *nr_anon; reclaim_stat->recent_scanned[1] += *nr_file; @@ -1454,8 +1498,8 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, * of reclaimed pages */ static noinline_for_stack unsigned long -shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, - struct scan_control *sc, int priority, int file) +shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, + struct scan_control *sc, int priority, int file) { LIST_HEAD(page_list); unsigned long nr_scanned; @@ -1466,6 +1510,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, unsigned long nr_dirty = 0; unsigned long nr_writeback = 0; isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; + struct zone *zone = mz->zone; while (unlikely(too_many_isolated(zone, file, sc))) { congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -1488,9 +1533,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, spin_lock_irq(&zone->lru_lock); - if (scanning_global_lru(sc)) { - nr_taken = isolate_pages_global(nr_to_scan, &page_list, - &nr_scanned, sc->order, reclaim_mode, zone, 0, file); + nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, + &nr_scanned, sc->order, + reclaim_mode, 0, file); + if (global_reclaim(sc)) { zone->pages_scanned += nr_scanned; if (current_is_kswapd()) __count_zone_vm_events(PGSCAN_KSWAPD, zone, @@ -1498,14 +1544,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, else __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); - } else { - nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, - &nr_scanned, sc->order, reclaim_mode, zone, - sc->mem_cgroup, 0, file); - /* - * mem_cgroup_isolate_pages() keeps track of - * scanned pages on its own. 
- */ } if (nr_taken == 0) { @@ -1513,26 +1551,37 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, return 0; } - update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list); + update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); + + __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); spin_unlock_irq(&zone->lru_lock); - nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority, + nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, &nr_dirty, &nr_writeback); /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { set_reclaim_mode(priority, sc, true); - nr_reclaimed += shrink_page_list(&page_list, zone, sc, + nr_reclaimed += shrink_page_list(&page_list, mz, sc, priority, &nr_dirty, &nr_writeback); } - local_irq_disable(); + spin_lock_irq(&zone->lru_lock); + if (current_is_kswapd()) __count_vm_events(KSWAPD_STEAL, nr_reclaimed); __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); - putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); + putback_inactive_pages(mz, &page_list); + + __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); + + spin_unlock_irq(&zone->lru_lock); + + free_hot_cold_page_list(&page_list, 1); /* * If reclaim is isolating dirty pages under writeback, it implies @@ -1588,30 +1637,47 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, static void move_active_pages_to_lru(struct zone *zone, struct list_head *list, + struct list_head *pages_to_free, enum lru_list lru) { unsigned long pgmoved = 0; - struct pagevec pvec; struct page *page; - pagevec_init(&pvec, 1); + if (buffer_heads_over_limit) { + spin_unlock_irq(&zone->lru_lock); + list_for_each_entry(page, list, lru) { + if (page_has_private(page) && trylock_page(page)) { + if (page_has_private(page)) + try_to_release_page(page, 0); + unlock_page(page); + } + } + spin_lock_irq(&zone->lru_lock); + } while (!list_empty(list)) { + struct lruvec *lruvec; + page = lru_to_page(list); VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - list_move(&page->lru, &zone->lru[lru].list); - mem_cgroup_add_lru_list(page, lru); + lruvec = mem_cgroup_lru_add_list(zone, page, lru); + list_move(&page->lru, &lruvec->lists[lru]); pgmoved += hpage_nr_pages(page); - if (!pagevec_add(&pvec, page) || list_empty(list)) { - spin_unlock_irq(&zone->lru_lock); - if (buffer_heads_over_limit) - pagevec_strip(&pvec); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); + if (put_page_testzero(page)) { + __ClearPageLRU(page); + __ClearPageActive(page); + del_page_from_lru_list(zone, page, lru); + + if (unlikely(PageCompound(page))) { + spin_unlock_irq(&zone->lru_lock); + (*get_compound_page_dtor(page))(page); + spin_lock_irq(&zone->lru_lock); + } else + list_add(&page->lru, pages_to_free); } } __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); @@ -1619,19 +1685,22 @@ static void move_active_pages_to_lru(struct zone *zone, __count_vm_events(PGDEACTIVATE, pgmoved); } -static void shrink_active_list(unsigned long nr_pages, struct zone *zone, - struct scan_control *sc, int priority, int file) +static void shrink_active_list(unsigned long nr_to_scan, + struct mem_cgroup_zone *mz, + struct scan_control *sc, + int priority, int file) { unsigned long nr_taken; - unsigned long pgscanned; + unsigned long nr_scanned; unsigned long vm_flags; LIST_HEAD(l_hold); /* The pages which were snipped off */ 
LIST_HEAD(l_active); LIST_HEAD(l_inactive); struct page *page; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); unsigned long nr_rotated = 0; isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; + struct zone *zone = mz->zone; lru_add_drain(); @@ -1641,26 +1710,16 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, reclaim_mode |= ISOLATE_CLEAN; spin_lock_irq(&zone->lru_lock); - if (scanning_global_lru(sc)) { - nr_taken = isolate_pages_global(nr_pages, &l_hold, - &pgscanned, sc->order, - reclaim_mode, zone, - 1, file); - zone->pages_scanned += pgscanned; - } else { - nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, - &pgscanned, sc->order, - reclaim_mode, zone, - sc->mem_cgroup, 1, file); - /* - * mem_cgroup_isolate_pages() keeps track of - * scanned pages on its own. - */ - } + + nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, + &nr_scanned, sc->order, + reclaim_mode, 1, file); + if (global_reclaim(sc)) + zone->pages_scanned += nr_scanned; reclaim_stat->recent_scanned[file] += nr_taken; - __count_zone_vm_events(PGREFILL, zone, pgscanned); + __count_zone_vm_events(PGREFILL, zone, nr_scanned); if (file) __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken); else @@ -1678,7 +1737,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, continue; } - if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { + if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { nr_rotated += hpage_nr_pages(page); /* * Identify referenced, file-backed active pages and @@ -1711,12 +1770,14 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, */ reclaim_stat->recent_rotated[file] += nr_rotated; - move_active_pages_to_lru(zone, &l_active, + move_active_pages_to_lru(zone, &l_active, &l_hold, LRU_ACTIVE + file * LRU_FILE); - move_active_pages_to_lru(zone, &l_inactive, + move_active_pages_to_lru(zone, &l_inactive, &l_hold, LRU_BASE + file * LRU_FILE); __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); + + free_hot_cold_page_list(&l_hold, 1); } #ifdef CONFIG_SWAP @@ -1741,10 +1802,8 @@ static int inactive_anon_is_low_global(struct zone *zone) * Returns true if the zone does not have enough inactive anon pages, * meaning some active anon pages need to be deactivated. */ -static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) +static int inactive_anon_is_low(struct mem_cgroup_zone *mz) { - int low; - /* * If we don't have swap space, anonymous page deactivation * is pointless. 
@@ -1752,15 +1811,14 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) if (!total_swap_pages) return 0; - if (scanning_global_lru(sc)) - low = inactive_anon_is_low_global(zone); - else - low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone); - return low; + if (!scanning_global_lru(mz)) + return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup, + mz->zone); + + return inactive_anon_is_low_global(mz->zone); } #else -static inline int inactive_anon_is_low(struct zone *zone, - struct scan_control *sc) +static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz) { return 0; } @@ -1778,8 +1836,7 @@ static int inactive_file_is_low_global(struct zone *zone) /** * inactive_file_is_low - check if file pages need to be deactivated - * @zone: zone to check - * @sc: scan control of this context + * @mz: memory cgroup and zone to check * * When the system is doing streaming IO, memory pressure here * ensures that active file pages get deactivated, until more @@ -1791,45 +1848,44 @@ static int inactive_file_is_low_global(struct zone *zone) * This uses a different ratio than the anonymous pages, because * the page cache uses a use-once replacement algorithm. */ -static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) +static int inactive_file_is_low(struct mem_cgroup_zone *mz) { - int low; + if (!scanning_global_lru(mz)) + return mem_cgroup_inactive_file_is_low(mz->mem_cgroup, + mz->zone); - if (scanning_global_lru(sc)) - low = inactive_file_is_low_global(zone); - else - low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone); - return low; + return inactive_file_is_low_global(mz->zone); } -static int inactive_list_is_low(struct zone *zone, struct scan_control *sc, - int file) +static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file) { if (file) - return inactive_file_is_low(zone, sc); + return inactive_file_is_low(mz); else - return inactive_anon_is_low(zone, sc); + return inactive_anon_is_low(mz); } static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, - struct zone *zone, struct scan_control *sc, int priority) + struct mem_cgroup_zone *mz, + struct scan_control *sc, int priority) { int file = is_file_lru(lru); if (is_active_lru(lru)) { - if (inactive_list_is_low(zone, sc, file)) - shrink_active_list(nr_to_scan, zone, sc, priority, file); + if (inactive_list_is_low(mz, file)) + shrink_active_list(nr_to_scan, mz, sc, priority, file); return 0; } - return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); + return shrink_inactive_list(nr_to_scan, mz, sc, priority, file); } -static int vmscan_swappiness(struct scan_control *sc) +static int vmscan_swappiness(struct mem_cgroup_zone *mz, + struct scan_control *sc) { - if (scanning_global_lru(sc)) + if (global_reclaim(sc)) return vm_swappiness; - return mem_cgroup_swappiness(sc->mem_cgroup); + return mem_cgroup_swappiness(mz->mem_cgroup); } /* @@ -1840,15 +1896,15 @@ static int vmscan_swappiness(struct scan_control *sc) * * nr[0] = anon pages to scan; nr[1] = file pages to scan */ -static void get_scan_count(struct zone *zone, struct scan_control *sc, - unsigned long *nr, int priority) +static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, + unsigned long *nr, int priority) { unsigned long anon, file, free; unsigned long anon_prio, file_prio; unsigned long ap, fp; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); u64 fraction[2], 
denominator; - enum lru_list l; + enum lru_list lru; int noswap = 0; bool force_scan = false; @@ -1862,9 +1918,9 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, * latencies, so it's better to scan a minimum amount there as * well. */ - if (scanning_global_lru(sc) && current_is_kswapd()) + if (current_is_kswapd() && mz->zone->all_unreclaimable) force_scan = true; - if (!scanning_global_lru(sc)) + if (!global_reclaim(sc)) force_scan = true; /* If we have no swap space, do not bother scanning anon pages. */ @@ -1876,16 +1932,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, goto out; } - anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); - file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + anon = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) + + zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); + file = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) + + zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); - if (scanning_global_lru(sc)) { - free = zone_page_state(zone, NR_FREE_PAGES); + if (global_reclaim(sc)) { + free = zone_page_state(mz->zone, NR_FREE_PAGES); /* If we have very few page cache pages, force-scan anon pages. */ - if (unlikely(file + free <= high_wmark_pages(zone))) { + if (unlikely(file + free <= high_wmark_pages(mz->zone))) { fraction[0] = 1; fraction[1] = 0; denominator = 1; @@ -1897,8 +1953,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, * With swappiness at 100, anonymous and file have the same priority. * This scanning priority is essentially the inverse of IO cost. */ - anon_prio = vmscan_swappiness(sc); - file_prio = 200 - vmscan_swappiness(sc); + anon_prio = vmscan_swappiness(mz, sc); + file_prio = 200 - vmscan_swappiness(mz, sc); /* * OK, so we have swap space and a fair amount of page cache @@ -1911,7 +1967,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, * * anon in [0], file in [1] */ - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&mz->zone->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { reclaim_stat->recent_scanned[0] /= 2; reclaim_stat->recent_rotated[0] /= 2; @@ -1932,24 +1988,24 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); fp /= reclaim_stat->recent_rotated[1] + 1; - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&mz->zone->lru_lock); fraction[0] = ap; fraction[1] = fp; denominator = ap + fp + 1; out: - for_each_evictable_lru(l) { - int file = is_file_lru(l); + for_each_evictable_lru(lru) { + int file = is_file_lru(lru); unsigned long scan; - scan = zone_nr_lru_pages(zone, sc, l); + scan = zone_nr_lru_pages(mz, lru); if (priority || noswap) { scan >>= priority; if (!scan && force_scan) scan = SWAP_CLUSTER_MAX; scan = div64_u64(scan * fraction[file], denominator); } - nr[l] = scan; + nr[lru] = scan; } } @@ -1960,7 +2016,7 @@ out: * back to the allocator and call try_to_compact_zone(), we ensure that * there are enough free pages for it to be likely successful */ -static inline bool should_continue_reclaim(struct zone *zone, +static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, unsigned long nr_reclaimed, unsigned long nr_scanned, struct scan_control *sc) @@ -2000,15 +2056,15 @@ static inline bool should_continue_reclaim(struct zone *zone, * inactive lists are large enough, continue reclaiming */ pages_for_compaction = (2UL << sc->order); - 
inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); if (nr_swap_pages > 0) - inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); + inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); if (sc->nr_reclaimed < pages_for_compaction && inactive_lru_pages > pages_for_compaction) return true; /* If compaction would go ahead or the allocation would succeed, stop */ - switch (compaction_suitable(zone, sc->order)) { + switch (compaction_suitable(mz->zone, sc->order)) { case COMPACT_PARTIAL: case COMPACT_CONTINUE: return false; @@ -2020,12 +2076,12 @@ static inline bool should_continue_reclaim(struct zone *zone, /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ -static void shrink_zone(int priority, struct zone *zone, - struct scan_control *sc) +static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz, + struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; - enum lru_list l; + enum lru_list lru; unsigned long nr_reclaimed, nr_scanned; unsigned long nr_to_reclaim = sc->nr_to_reclaim; struct blk_plug plug; @@ -2033,19 +2089,19 @@ static void shrink_zone(int priority, struct zone *zone, restart: nr_reclaimed = 0; nr_scanned = sc->nr_scanned; - get_scan_count(zone, sc, nr, priority); + get_scan_count(mz, sc, nr, priority); blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { - for_each_evictable_lru(l) { - if (nr[l]) { + for_each_evictable_lru(lru) { + if (nr[lru]) { nr_to_scan = min_t(unsigned long, - nr[l], SWAP_CLUSTER_MAX); - nr[l] -= nr_to_scan; + nr[lru], SWAP_CLUSTER_MAX); + nr[lru] -= nr_to_scan; - nr_reclaimed += shrink_list(l, nr_to_scan, - zone, sc, priority); + nr_reclaimed += shrink_list(lru, nr_to_scan, + mz, sc, priority); } } /* @@ -2066,17 +2122,89 @@ restart: * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_anon_is_low(zone, sc)) - shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); + if (inactive_anon_is_low(mz)) + shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0); /* reclaim/compaction might need reclaim to continue */ - if (should_continue_reclaim(zone, nr_reclaimed, + if (should_continue_reclaim(mz, nr_reclaimed, sc->nr_scanned - nr_scanned, sc)) goto restart; throttle_vm_writeout(sc->gfp_mask); } +static void shrink_zone(int priority, struct zone *zone, + struct scan_control *sc) +{ + struct mem_cgroup *root = sc->target_mem_cgroup; + struct mem_cgroup_reclaim_cookie reclaim = { + .zone = zone, + .priority = priority, + }; + struct mem_cgroup *memcg; + + memcg = mem_cgroup_iter(root, NULL, &reclaim); + do { + struct mem_cgroup_zone mz = { + .mem_cgroup = memcg, + .zone = zone, + }; + + shrink_mem_cgroup_zone(priority, &mz, sc); + /* + * Limit reclaim has historically picked one memcg and + * scanned it with decreasing priority levels until + * nr_to_reclaim had been reclaimed. This priority + * cycle is thus over after a single memcg. + * + * Direct reclaim and kswapd, on the other hand, have + * to scan all memory cgroups to fulfill the overall + * scan target for the zone. 
+ */ + if (!global_reclaim(sc)) { + mem_cgroup_iter_break(root, memcg); + break; + } + memcg = mem_cgroup_iter(root, memcg, &reclaim); + } while (memcg); +} + +/* Returns true if compaction should go ahead for a high-order request */ +static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) +{ + unsigned long balance_gap, watermark; + bool watermark_ok; + + /* Do not consider compaction for orders reclaim is meant to satisfy */ + if (sc->order <= PAGE_ALLOC_COSTLY_ORDER) + return false; + + /* + * Compaction takes time to run and there are potentially other + * callers using the pages just freed. Continue reclaiming until + * there is a buffer of free pages available to give compaction + * a reasonable chance of completing and allocating the page + */ + balance_gap = min(low_wmark_pages(zone), + (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / + KSWAPD_ZONE_BALANCE_GAP_RATIO); + watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); + watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); + + /* + * If compaction is deferred, reclaim up to a point where + * compaction will have a chance of success when re-enabled + */ + if (compaction_deferred(zone)) + return watermark_ok; + + /* If compaction is not ready to start, keep reclaiming */ + if (!compaction_suitable(zone, sc->order)) + return false; + + return watermark_ok; +} + /* * This is the direct reclaim path, for page-allocating processes. We only * try to reclaim pages from zones which will satisfy the caller's allocation @@ -2094,8 +2222,9 @@ restart: * scan then give up on it. * * This function returns true if a zone is being reclaimed for a costly - * high-order allocation and compaction is either ready to begin or deferred. - * This indicates to the caller that it should retry the allocation or fail. + * high-order allocation and compaction is ready to begin. This indicates to + * the caller that it should consider retrying the allocation instead of + * further reclaim. */ static bool shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) @@ -2104,7 +2233,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; - bool should_abort_reclaim = false; + bool aborted_reclaim = false; for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { @@ -2114,7 +2243,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, * Take care memory controller reclaiming has small influence * to global LRU. */ - if (scanning_global_lru(sc)) { + if (global_reclaim(sc)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; if (zone->all_unreclaimable && priority != DEF_PRIORITY) @@ -2129,10 +2258,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, * noticable problem, like transparent huge page * allocations. 
*/ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER && - (compaction_suitable(zone, sc->order) || - compaction_deferred(zone))) { - should_abort_reclaim = true; + if (compaction_ready(zone, sc)) { + aborted_reclaim = true; continue; } } @@ -2154,7 +2281,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, shrink_zone(priority, zone, sc); } - return should_abort_reclaim; + return aborted_reclaim; } static bool zone_reclaimable(struct zone *zone) @@ -2208,25 +2335,25 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; unsigned long writeback_threshold; + bool aborted_reclaim; get_mems_allowed(); delayacct_freepages_start(); - if (scanning_global_lru(sc)) + if (global_reclaim(sc)) count_vm_event(ALLOCSTALL); for (priority = DEF_PRIORITY; priority >= 0; priority--) { sc->nr_scanned = 0; if (!priority) - disable_swap_token(sc->mem_cgroup); - if (shrink_zones(priority, zonelist, sc)) - break; + disable_swap_token(sc->target_mem_cgroup); + aborted_reclaim = shrink_zones(priority, zonelist, sc); /* * Don't shrink slabs when reclaiming memory from * over limit cgroups */ - if (scanning_global_lru(sc)) { + if (global_reclaim(sc)) { unsigned long lru_pages = 0; for_each_zone_zonelist(zone, z, zonelist, gfp_zone(sc->gfp_mask)) { @@ -2287,8 +2414,12 @@ out: if (oom_killer_disabled) return 0; + /* Aborted reclaim to try compaction? don't OOM, then */ + if (aborted_reclaim) + return 1; + /* top priority shrink_zones still had more to do? don't OOM, then */ - if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) + if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc)) return 1; return 0; @@ -2305,7 +2436,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .may_unmap = 1, .may_swap = 1, .order = order, - .mem_cgroup = NULL, + .target_mem_cgroup = NULL, .nodemask = nodemask, }; struct shrink_control shrink = { @@ -2325,7 +2456,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_CGROUP_MEM_RES_CTLR -unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, +unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, gfp_t gfp_mask, bool noswap, struct zone *zone, unsigned long *nr_scanned) @@ -2337,7 +2468,11 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, .may_unmap = 1, .may_swap = !noswap, .order = 0, - .mem_cgroup = mem, + .target_mem_cgroup = memcg, + }; + struct mem_cgroup_zone mz = { + .mem_cgroup = memcg, + .zone = zone, }; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | @@ -2354,7 +2489,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. 
*/ - shrink_zone(0, zone, &sc); + shrink_mem_cgroup_zone(0, &mz, &sc); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -2362,7 +2497,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, return sc.nr_reclaimed; } -unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, +unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, gfp_t gfp_mask, bool noswap) { @@ -2375,7 +2510,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .may_swap = !noswap, .nr_to_reclaim = SWAP_CLUSTER_MAX, .order = 0, - .mem_cgroup = mem_cont, + .target_mem_cgroup = memcg, .nodemask = NULL, /* we don't care the placement */ .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), @@ -2389,7 +2524,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, * take care of from where we get pages. So the node where we start the * scan does not need to be the current node. */ - nid = mem_cgroup_select_victim_node(mem_cont); + nid = mem_cgroup_select_victim_node(memcg); zonelist = NODE_DATA(nid)->node_zonelists; @@ -2405,6 +2540,29 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, } #endif +static void age_active_anon(struct zone *zone, struct scan_control *sc, + int priority) +{ + struct mem_cgroup *memcg; + + if (!total_swap_pages) + return; + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + struct mem_cgroup_zone mz = { + .mem_cgroup = memcg, + .zone = zone, + }; + + if (inactive_anon_is_low(&mz)) + shrink_active_list(SWAP_CLUSTER_MAX, &mz, + sc, priority, 0); + + memcg = mem_cgroup_iter(NULL, memcg, NULL); + } while (memcg); +} + /* * pgdat_balanced is used when checking if a node is balanced for high-order * allocations. Only zones that meet watermarks and are in a zone allowed @@ -2525,7 +2683,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, */ .nr_to_reclaim = ULONG_MAX, .order = order, - .mem_cgroup = NULL, + .target_mem_cgroup = NULL, }; struct shrink_control shrink = { .gfp_mask = sc.gfp_mask, @@ -2564,9 +2722,7 @@ loop_again: * Do some background aging of the anon list, to give * pages a chance to be referenced before reclaiming. */ - if (inactive_anon_is_low(zone, &sc)) - shrink_active_list(SWAP_CLUSTER_MAX, zone, - &sc, priority, 0); + age_active_anon(zone, &sc, priority); if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) { @@ -3342,97 +3498,61 @@ int page_evictable(struct page *page, struct vm_area_struct *vma) return 1; } +#ifdef CONFIG_SHMEM /** - * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list - * @page: page to check evictability and move to appropriate lru list - * @zone: zone page is in + * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list + * @pages: array of pages to check + * @nr_pages: number of pages to check * - * Checks a page for evictability and moves the page to the appropriate - * zone lru list. - * - * Restrictions: zone->lru_lock must be held, page must be on LRU and must - * have PageUnevictable set. 
- */ -static void check_move_unevictable_page(struct page *page, struct zone *zone) -{ - VM_BUG_ON(PageActive(page)); - -retry: - ClearPageUnevictable(page); - if (page_evictable(page, NULL)) { - enum lru_list l = page_lru_base_type(page); - - __dec_zone_state(zone, NR_UNEVICTABLE); - list_move(&page->lru, &zone->lru[l].list); - mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l); - __inc_zone_state(zone, NR_INACTIVE_ANON + l); - __count_vm_event(UNEVICTABLE_PGRESCUED); - } else { - /* - * rotate unevictable list - */ - SetPageUnevictable(page); - list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); - mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE); - if (page_evictable(page, NULL)) - goto retry; - } -} - -/** - * scan_mapping_unevictable_pages - scan an address space for evictable pages - * @mapping: struct address_space to scan for evictable pages + * Checks pages for evictability and moves them to the appropriate lru list. * - * Scan all pages in mapping. Check unevictable pages for - * evictability and move them to the appropriate zone lru list. + * This function is only used for SysV IPC SHM_UNLOCK. */ -void scan_mapping_unevictable_pages(struct address_space *mapping) +void check_move_unevictable_pages(struct page **pages, int nr_pages) { - pgoff_t next = 0; - pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - struct zone *zone; - struct pagevec pvec; - - if (mapping->nrpages == 0) - return; - - pagevec_init(&pvec, 0); - while (next < end && - pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { - int i; - int pg_scanned = 0; - - zone = NULL; + struct lruvec *lruvec; + struct zone *zone = NULL; + int pgscanned = 0; + int pgrescued = 0; + int i; - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - pgoff_t page_index = page->index; - struct zone *pagezone = page_zone(page); + for (i = 0; i < nr_pages; i++) { + struct page *page = pages[i]; + struct zone *pagezone; - pg_scanned++; - if (page_index > next) - next = page_index; - next++; + pgscanned++; + pagezone = page_zone(page); + if (pagezone != zone) { + if (zone) + spin_unlock_irq(&zone->lru_lock); + zone = pagezone; + spin_lock_irq(&zone->lru_lock); + } - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); - } + if (!PageLRU(page) || !PageUnevictable(page)) + continue; - if (PageLRU(page) && PageUnevictable(page)) - check_move_unevictable_page(page, zone); + if (page_evictable(page, NULL)) { + enum lru_list lru = page_lru_base_type(page); + + VM_BUG_ON(PageActive(page)); + ClearPageUnevictable(page); + __dec_zone_state(zone, NR_UNEVICTABLE); + lruvec = mem_cgroup_lru_move_lists(zone, page, + LRU_UNEVICTABLE, lru); + list_move(&page->lru, &lruvec->lists[lru]); + __inc_zone_state(zone, NR_INACTIVE_ANON + lru); + pgrescued++; } - if (zone) - spin_unlock_irq(&zone->lru_lock); - pagevec_release(&pvec); - - count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned); } + if (zone) { + __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); + __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); + spin_unlock_irq(&zone->lru_lock); + } } +#endif /* CONFIG_SHMEM */ static void warn_scan_unevictable_pages(void) { diff --git a/mm/vmstat.c b/mm/vmstat.c index 8fd603b1665e..f600557a7659 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -295,7 +295,7 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item) } EXPORT_SYMBOL(__dec_zone_page_state); -#ifdef CONFIG_CMPXCHG_LOCAL +#ifdef 
CONFIG_HAVE_CMPXCHG_LOCAL /* * If we have cmpxchg_local support then we do not need to incur the overhead * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
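
The final vmstat.c hunk above keys the lockless counter path off CONFIG_HAVE_CMPXCHG_LOCAL: when a local cmpxchg is available, per-cpu counters can be updated with a compare-and-exchange retry loop instead of a local_irq_save()/local_irq_restore() pair. A minimal userspace analogue of that retry pattern, using C11 atomics on an ordinary global rather than the kernel's per-cpu this_cpu_cmpxchg(), might look like the sketch below; the names and the use of a single shared atomic are illustrative assumptions, not kernel code.

#include <stdatomic.h>
#include <stdio.h>

/* Stand-in for a vmstat counter; the kernel keeps these in per-cpu storage. */
static _Atomic long vm_stat_diff;

/* Add 'delta' with a cmpxchg retry loop instead of disabling interrupts. */
static void counter_add(long delta)
{
	long old, new;

	do {
		old = atomic_load_explicit(&vm_stat_diff, memory_order_relaxed);
		new = old + delta;
		/* Retry if another updater slipped in between the load and the cmpxchg. */
	} while (!atomic_compare_exchange_weak_explicit(&vm_stat_diff, &old, new,
							memory_order_relaxed,
							memory_order_relaxed));
}

int main(void)
{
	counter_add(3);
	counter_add(-1);
	printf("counter = %ld\n", (long)atomic_load(&vm_stat_diff));
	return 0;
}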
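
Earlier in the patch, the new compaction_ready() helper decides whether reclaim for a costly high-order allocation can stop and hand over to compaction. The decision reduces to simple arithmetic: a balance gap derived from the zone size (capped at the low watermark) is added to the high watermark plus twice the requested allocation size, and reclaim continues until the free-page count clears that total (unless compaction is deferred, in which case the watermark test alone decides). The sketch below only works through those numbers for an assumed zone; KSWAPD_ZONE_BALANCE_GAP_RATIO and all zone figures are illustrative values, not taken from a running kernel.

#include <stdio.h>

#define KSWAPD_ZONE_BALANCE_GAP_RATIO 100	/* assumed value for illustration */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Hypothetical zone: ~1GB of 4K pages with made-up watermarks. */
	unsigned long present_pages = 262144;
	unsigned long low_wmark = 1024;
	unsigned long high_wmark = 3072;
	int order = 9;				/* e.g. a THP-sized request */

	unsigned long balance_gap = min_ul(low_wmark,
			(present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO - 1) /
			KSWAPD_ZONE_BALANCE_GAP_RATIO);
	unsigned long watermark = high_wmark + balance_gap + (2UL << order);

	printf("balance_gap = %lu pages\n", balance_gap);
	printf("reclaim target before handing over to compaction = %lu pages\n",
	       watermark);
	return 0;
}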