From 58a84aa92723d1ac3e1cc4e3b0ff49291663f7e1 Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Thu, 8 Dec 2011 14:34:18 -0800 Subject: thp: set compound tail page _count to zero Commit 70b50f94f1644 ("mm: thp: tail page refcounting fix") keeps all page_tail->_count zero at all times. But the current kernel does not set page_tail->_count to zero if a 1GB page is utilized. So when an IOMMU 1GB page is used by KVM, it wil result in a kernel oops because a tail page's _count does not equal zero. kernel BUG at include/linux/mm.h:386! invalid opcode: 0000 [#1] SMP Call Trace: gup_pud_range+0xb8/0x19d get_user_pages_fast+0xcb/0x192 ? trace_hardirqs_off+0xd/0xf hva_to_pfn+0x119/0x2f2 gfn_to_pfn_memslot+0x2c/0x2e kvm_iommu_map_pages+0xfd/0x1c1 kvm_iommu_map_memslots+0x7c/0xbd kvm_iommu_map_guest+0xaa/0xbf kvm_vm_ioctl_assigned_device+0x2ef/0xa47 kvm_vm_ioctl+0x36c/0x3a2 do_vfs_ioctl+0x49e/0x4e4 sys_ioctl+0x5a/0x7c system_call_fastpath+0x16/0x1b RIP gup_huge_pud+0xf2/0x159 Signed-off-by: Youquan Song Reviewed-by: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bb28a5f9db8d..73f17c0293c0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -576,6 +576,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) __SetPageHead(page); for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { __SetPageTail(p); + set_page_count(p, 0); p->first_page = page; } } -- cgit v1.2.3 From 10fbcf4c6cb122005cdf36fc24d7683da92c7a27 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Wed, 21 Dec 2011 14:48:43 -0800 Subject: convert 'memory' sysdev_class to a regular subsystem This moves the 'memory sysdev_class' over to a regular 'memory' subsystem and converts the devices to regular devices. The sysdev drivers are implemented as subsystem interfaces now. After all sysdev classes are ported to regular driver core entities, the sysdev implementation will be entirely removed from the kernel. Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- mm/hugetlb.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dae27ba3be2c..ad713e2d61bc 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1591,9 +1591,9 @@ static void __init hugetlb_sysfs_init(void) /* * node_hstate/s - associate per node hstate attributes, via their kobjects, - * with node sysdevs in node_devices[] using a parallel array. The array - * index of a node sysdev or _hstate == node id. - * This is here to avoid any static dependency of the node sysdev driver, in + * with node devices in node_devices[] using a parallel array. The array + * index of a node device or _hstate == node id. + * This is here to avoid any static dependency of the node device driver, in * the base kernel, on the hugetlb module. */ struct node_hstate { @@ -1603,7 +1603,7 @@ struct node_hstate { struct node_hstate node_hstates[MAX_NUMNODES]; /* - * A subset of global hstate attributes for node sysdevs + * A subset of global hstate attributes for node devices */ static struct attribute *per_node_hstate_attrs[] = { &nr_hugepages_attr.attr, @@ -1617,7 +1617,7 @@ static struct attribute_group per_node_hstate_attr_group = { }; /* - * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj. + * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. * Returns node id via non-NULL nidp. */ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) @@ -1640,13 +1640,13 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) } /* - * Unregister hstate attributes from a single node sysdev. + * Unregister hstate attributes from a single node device. * No-op if no hstate attributes attached. */ void hugetlb_unregister_node(struct node *node) { struct hstate *h; - struct node_hstate *nhs = &node_hstates[node->sysdev.id]; + struct node_hstate *nhs = &node_hstates[node->dev.id]; if (!nhs->hugepages_kobj) return; /* no hstate attributes */ @@ -1662,7 +1662,7 @@ void hugetlb_unregister_node(struct node *node) } /* - * hugetlb module exit: unregister hstate attributes from node sysdevs + * hugetlb module exit: unregister hstate attributes from node devices * that have them. */ static void hugetlb_unregister_all_nodes(void) @@ -1670,7 +1670,7 @@ static void hugetlb_unregister_all_nodes(void) int nid; /* - * disable node sysdev registrations. + * disable node device registrations. */ register_hugetlbfs_with_node(NULL, NULL); @@ -1682,20 +1682,20 @@ static void hugetlb_unregister_all_nodes(void) } /* - * Register hstate attributes for a single node sysdev. + * Register hstate attributes for a single node device. * No-op if attributes already registered. */ void hugetlb_register_node(struct node *node) { struct hstate *h; - struct node_hstate *nhs = &node_hstates[node->sysdev.id]; + struct node_hstate *nhs = &node_hstates[node->dev.id]; int err; if (nhs->hugepages_kobj) return; /* already allocated */ nhs->hugepages_kobj = kobject_create_and_add("hugepages", - &node->sysdev.kobj); + &node->dev.kobj); if (!nhs->hugepages_kobj) return; @@ -1706,7 +1706,7 @@ void hugetlb_register_node(struct node *node) if (err) { printk(KERN_ERR "Hugetlb: Unable to add hstate %s" " for node %d\n", - h->name, node->sysdev.id); + h->name, node->dev.id); hugetlb_unregister_node(node); break; } @@ -1715,8 +1715,8 @@ void hugetlb_register_node(struct node *node) /* * hugetlb init time: register hstate attributes for all registered node - * sysdevs of nodes that have memory. All on-line nodes should have - * registered their associated sysdev by this time. + * devices of nodes that have memory. All on-line nodes should have + * registered their associated device by this time. */ static void hugetlb_register_all_nodes(void) { @@ -1724,12 +1724,12 @@ static void hugetlb_register_all_nodes(void) for_each_node_state(nid, N_HIGH_MEMORY) { struct node *node = &node_devices[nid]; - if (node->sysdev.id == nid) + if (node->dev.id == nid) hugetlb_register_node(node); } /* - * Let the node sysdev driver know we're here so it can + * Let the node device driver know we're here so it can * [un]register hstate attributes on node hotplug. */ register_hugetlbfs_with_node(hugetlb_register_node, -- cgit v1.2.3 From b0365c8d0cb6e79eb5f21418ae61ab511f31b575 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Wed, 28 Dec 2011 15:57:16 -0800 Subject: mm: hugetlb: fix non-atomic enqueue of huge page If a huge page is enqueued under the protection of hugetlb_lock, then the operation is atomic and safe. Signed-off-by: Hillf Danton Reviewed-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: [2.6.37+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 73f17c0293c0..2316840b337a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -901,7 +901,6 @@ retry: h->resv_huge_pages += delta; ret = 0; - spin_unlock(&hugetlb_lock); /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) { if ((--needed) < 0) @@ -915,6 +914,7 @@ retry: VM_BUG_ON(page_count(page)); enqueue_huge_page(h, page); } + spin_unlock(&hugetlb_lock); /* Free unnecessary surplus pages to the buddy allocator */ free: -- cgit v1.2.3 From a734bcc812146cfba530e1adaf609fce1357982e Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Tue, 10 Jan 2012 15:07:20 -0800 Subject: hugetlb: detect race upon page allocation failure during COW Currently we are not rechecking pte_same in hugetlb_cow after we take ptl lock again in the page allocation failure code path and simply retry again. This is not an issue at the moment because hugetlb fault path is protected by hugetlb_instantiation_mutex so we cannot race. The original page is locked and so we cannot race even with the page migration. Let's add the pte_same check anyway as we want to be consistent with the other check later in this function and be safe if we ever remove the mutex. [mhocko@suse.cz: reworded the changelog] Signed-off-by: Hillf Danton Signed-off-by: Michal Hocko Cc: Andrea Arcangeli Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7acd12503f73..2c551b28ba69 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2408,7 +2408,14 @@ retry_avoidcopy: BUG_ON(page_count(old_page) != 1); BUG_ON(huge_pte_none(pte)); spin_lock(&mm->page_table_lock); - goto retry_avoidcopy; + ptep = huge_pte_offset(mm, address & huge_page_mask(h)); + if (likely(pte_same(huge_ptep_get(ptep), pte))) + goto retry_avoidcopy; + /* + * race occurs while re-acquiring page_table_lock, and + * our job is done. + */ + return 0; } WARN_ON_ONCE(1); } -- cgit v1.2.3 From ef009b25f4f8a77d2b32067d424d5ac757dcdc5b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Jan 2012 15:07:21 -0800 Subject: hugetlb: clarify hugetlb_instantiation_mutex usage Let's make it clear that we cannot race with other fault handlers due to hugetlb (global) mutex. Also make it clear that we want to keep pte_same checks anayway to have a transition from the global mutex easier. Signed-off-by: Michal Hocko Cc: Hillf Danton Cc: Andrea Arcangeli Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2c551b28ba69..49e693b7fd0c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2349,6 +2349,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, /* * Hugetlb_cow() should be called with page lock of the original hugepage held. + * Called with hugetlb_instantiation_mutex held and pte_page locked so we + * cannot race with other handlers or page migration. + * Keep the pte_same checks anyway to make transition from the mutex easier. */ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte, -- cgit v1.2.3 From 1e16a539ac16e7b3a8c2cee188897d4bdb88e6e8 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 10 Jan 2012 15:07:22 -0800 Subject: mm/hugetlb.c: fix virtual address handling in hugetlb fault handle_mm_fault() passes 'faulted' address to hugetlb_fault(). This address is not aligned to a hugepage boundary. Most of the functions for hugetlb pages are aware of that and calculate an alignment themselves. However some functions such as copy_user_huge_page() and clear_huge_page() don't handle alignment by themselves. This patch make hugeltb_fault() fix the alignment and pass an aligned addresss (to address of a faulted hugepage) to functions. [akpm@linux-foundation.org: use &=] Signed-off-by: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 49e693b7fd0c..ab89d6f382d1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2640,6 +2640,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); + address &= huge_page_mask(h); + ptep = huge_pte_offset(mm, address); if (ptep) { entry = huge_ptep_get(ptep); -- cgit v1.2.3 From 0c176d52b0b2619f231b2bbf329b90c028134f58 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Tue, 10 Jan 2012 15:08:19 -0800 Subject: mm: hugetlb: fix pgoff computation when unmapping page from vma The computation for pgoff is incorrect, at least with (vma->vm_pgoff >> PAGE_SHIFT) involved. It is fixed with the available method if HPAGE_SIZE is concerned in page cache lookup. [akpm@linux-foundation.org: use vma_hugecache_offset() directly, per Michal] Signed-off-by: Hillf Danton Cc: Mel Gorman Cc: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Cc: Andrea Arcangeli Cc: David Rientjes Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ab89d6f382d1..bb7dc405634f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2315,8 +2315,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * from page cache lookup which is in HPAGE_SIZE units. */ address = address & huge_page_mask(h); - pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) - + (vma->vm_pgoff >> PAGE_SHIFT); + pgoff = vma_hugecache_offset(h, vma, address); mapping = (struct address_space *)page_private(page); /* -- cgit v1.2.3 From ea5768c74b8e0d6a866508fc6399d5ff958da5e3 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Tue, 10 Jan 2012 15:08:30 -0800 Subject: mm/hugetlb.c: avoid bogus counter of surplus huge page If we have to hand back the newly allocated huge page to page allocator, for any reason, the changed counter should be recovered. This affects only s390 at present. Signed-off-by: Hillf Danton Reviewed-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bb7dc405634f..ea8c3a4cd2ae 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -800,7 +800,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) if (page && arch_prepare_hugepage(page)) { __free_pages(page, huge_page_order(h)); - return NULL; + page = NULL; } spin_lock(&hugetlb_lock); -- cgit v1.2.3 From 409eb8c2611b4310947a150af988111f7f52ab15 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Fri, 20 Jan 2012 14:34:13 -0800 Subject: mm/hugetlb.c: undo change to page mapcount in fault handler Page mapcount should be updated only if we are sure that the page ends up in the page table otherwise we would leak if we couldn't COW due to reservations or if idx is out of bounds. Signed-off-by: Hillf Danton Reviewed-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ea8c3a4cd2ae..5f34bd8dda34 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2508,6 +2508,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct hstate *h = hstate_vma(vma); int ret = VM_FAULT_SIGBUS; + int anon_rmap = 0; pgoff_t idx; unsigned long size; struct page *page; @@ -2562,14 +2563,13 @@ retry: spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); - page_dup_rmap(page); } else { lock_page(page); if (unlikely(anon_vma_prepare(vma))) { ret = VM_FAULT_OOM; goto backout_unlocked; } - hugepage_add_new_anon_rmap(page, vma, address); + anon_rmap = 1; } } else { /* @@ -2582,7 +2582,6 @@ retry: VM_FAULT_SET_HINDEX(h - hstates); goto backout_unlocked; } - page_dup_rmap(page); } /* @@ -2606,6 +2605,10 @@ retry: if (!huge_pte_none(huge_ptep_get(ptep))) goto backout; + if (anon_rmap) + hugepage_add_new_anon_rmap(page, vma, address); + else + page_dup_rmap(page); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); set_huge_pte_at(mm, address, ptep, new_pte); -- cgit v1.2.3