From 040fa02077de01c7e08fa75be6125e4ca5636011 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 3 Jul 2013 15:01:16 -0700 Subject: clear_refs: sanitize accepted commands declaration This is the implementation of the soft-dirty bit concept that should help keep track of changes in user memory, which in turn is very-very required by the checkpoint-restore project (http://criu.org). To create a dump of an application(s) we save all the information about it to files, and the biggest part of such dump is the contents of tasks' memory. However, there are usage scenarios where it's not required to get _all_ the task memory while creating a dump. For example, when doing periodical dumps, it's only required to take full memory dump only at the first step and then take incremental changes of memory. Another example is live migration. We copy all the memory to the destination node without stopping all tasks, then stop them, check for what pages has changed, dump it and the rest of the state, then copy it to the destination node. This decreases freeze time significantly. That said, some help from kernel to watch how processes modify the contents of their memory is required. The proposal is to track changes with the help of new soft-dirty bit this way: 1. First do "echo 4 > /proc/$pid/clear_refs". At that point kernel clears the soft dirty _and_ the writable bits from all ptes of process $pid. From now on every write to any page will result in #pf and the subsequent call to pte_mkdirty/pmd_mkdirty, which in turn will set the soft dirty flag. 2. Then read the /proc/$pid/pagemap2 and check the soft-dirty bit reported there (the 55'th one). If set, the respective pte was written to since last call to clear refs. The soft-dirty bit is the _PAGE_BIT_HIDDEN one. Although it's used by kmemcheck, the latter one marks kernel pages with it, while the former bit is put on user pages so they do not conflict to each other. This patch: A new clear-refs type will be added in the next patch, so prepare code for that. [akpm@linux-foundation.org: don't assume that sizeof(enum clear_refs_types) == sizeof(int)] Signed-off-by: Pavel Emelyanov Cc: Matt Mackall Cc: Xiao Guangrong Cc: Glauber Costa Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3e636d864d56..dad0809db551 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -688,6 +688,13 @@ const struct file_operations proc_tid_smaps_operations = { .release = seq_release_private, }; +enum clear_refs_types { + CLEAR_REFS_ALL = 1, + CLEAR_REFS_ANON, + CLEAR_REFS_MAPPED, + CLEAR_REFS_LAST, +}; + static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -719,10 +726,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, return 0; } -#define CLEAR_REFS_ALL 1 -#define CLEAR_REFS_ANON 2 -#define CLEAR_REFS_MAPPED 3 - static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -730,7 +733,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, char buffer[PROC_NUMBUF]; struct mm_struct *mm; struct vm_area_struct *vma; - int type; + enum clear_refs_types type; + int itype; int rv; memset(buffer, 0, sizeof(buffer)); @@ -738,10 +742,11 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - rv = kstrtoint(strstrip(buffer), 10, &type); + rv = kstrtoint(strstrip(buffer), 10, &itype); if (rv < 0) return rv; - if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED) + type = (enum clear_refs_types)itype; + if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) return -EINVAL; task = get_proc_task(file_inode(file)); if (!task) -- cgit v1.2.3 From af9de7eb180fa9b74c2cdc256349304a58c63c02 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 3 Jul 2013 15:01:18 -0700 Subject: clear_refs: introduce private struct for mm_walk In the next patch the clear-refs-type will be required in clear_refs_pte_range funciton, so prepare the walk->private to carry this info. Signed-off-by: Pavel Emelyanov Cc: Matt Mackall Cc: Xiao Guangrong Cc: Glauber Costa Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index dad0809db551..ef6f6c62dfee 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -695,10 +695,15 @@ enum clear_refs_types { CLEAR_REFS_LAST, }; +struct clear_refs_private { + struct vm_area_struct *vma; +}; + static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct vm_area_struct *vma = walk->private; + struct clear_refs_private *cp = walk->private; + struct vm_area_struct *vma = cp->vma; pte_t *pte, ptent; spinlock_t *ptl; struct page *page; @@ -753,13 +758,16 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { + struct clear_refs_private cp = { + }; struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range, .mm = mm, + .private = &cp, }; down_read(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) { - clear_refs_walk.private = vma; + cp.vma = vma; if (is_vm_hugetlb_page(vma)) continue; /* -- cgit v1.2.3 From 2b0a9f017548f05e42fbf7e67c4a626c1ebd5e12 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 3 Jul 2013 15:01:19 -0700 Subject: pagemap: introduce pagemap_entry_t without pmshift bits These bits are always constant (== PAGE_SHIFT) and just occupy space in the entry. Moreover, in next patch we will need to report one more bit in the pagemap, but all bits are already busy on it. That said, describe the pagemap entry that has 6 more free zero bits. Signed-off-by: Pavel Emelyanov Cc: Matt Mackall Cc: Xiao Guangrong Cc: Glauber Costa Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ef6f6c62dfee..39d641292579 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -807,6 +807,7 @@ typedef struct { struct pagemapread { int pos, len; pagemap_entry_t *buffer; + bool v2; }; #define PAGEMAP_WALK_SIZE (PMD_SIZE) @@ -820,14 +821,16 @@ struct pagemapread { #define PM_PSHIFT_BITS 6 #define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) #define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) -#define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) +#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) #define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) #define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) +/* in "new" pagemap pshift bits are occupied with more status bits */ +#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) #define PM_PRESENT PM_STATUS(4LL) #define PM_SWAP PM_STATUS(2LL) #define PM_FILE PM_STATUS(1LL) -#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT) +#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0) #define PM_END_OF_BUFFER 1 static inline pagemap_entry_t make_pme(u64 val) @@ -850,7 +853,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, struct pagemapread *pm = walk->private; unsigned long addr; int err = 0; - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); for (addr = start; addr < end; addr += PAGE_SIZE) { err = add_to_pagemap(addr, &pme, pm); @@ -860,7 +863,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, return err; } -static void pte_to_pagemap_entry(pagemap_entry_t *pme, +static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, struct vm_area_struct *vma, unsigned long addr, pte_t pte) { u64 frame, flags; @@ -879,18 +882,18 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, if (is_migration_entry(entry)) page = migration_entry_to_page(entry); } else { - *pme = make_pme(PM_NOT_PRESENT); + *pme = make_pme(PM_NOT_PRESENT(pm->v2)); return; } if (page && !PageAnon(page)) flags |= PM_FILE; - *pme = make_pme(PM_PFRAME(frame) | PM_PSHIFT(PAGE_SHIFT) | flags); + *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, 0) | flags); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, +static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, pmd_t pmd, int offset) { /* @@ -900,12 +903,12 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, */ if (pmd_present(pmd)) *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) - | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); + | PM_STATUS2(pm->v2, 0) | PM_PRESENT); else - *pme = make_pme(PM_NOT_PRESENT); + *pme = make_pme(PM_NOT_PRESENT(pm->v2)); } #else -static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, +static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, pmd_t pmd, int offset) { } @@ -918,7 +921,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct pagemapread *pm = walk->private; pte_t *pte; int err = 0; - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); @@ -928,7 +931,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, offset = (addr & ~PAGEMAP_WALK_MASK) >> PAGE_SHIFT; - thp_pmd_to_pagemap_entry(&pme, *pmd, offset); + thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset); err = add_to_pagemap(addr, &pme, pm); if (err) break; @@ -945,7 +948,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, * and need a new, higher one */ if (vma && (addr >= vma->vm_end)) { vma = find_vma(walk->mm, addr); - pme = make_pme(PM_NOT_PRESENT); + pme = make_pme(PM_NOT_PRESENT(pm->v2)); } /* check that 'vma' actually covers this address, @@ -953,7 +956,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (vma && (vma->vm_start <= addr) && !is_vm_hugetlb_page(vma)) { pte = pte_offset_map(pmd, addr); - pte_to_pagemap_entry(&pme, vma, addr, *pte); + pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); /* unmap before userspace copy */ pte_unmap(pte); } @@ -968,14 +971,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } #ifdef CONFIG_HUGETLB_PAGE -static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, +static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, pte_t pte, int offset) { if (pte_present(pte)) *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) - | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); + | PM_STATUS2(pm->v2, 0) | PM_PRESENT); else - *pme = make_pme(PM_NOT_PRESENT); + *pme = make_pme(PM_NOT_PRESENT(pm->v2)); } /* This function walks within one hugetlb entry in the single call */ @@ -989,7 +992,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, for (; addr != end; addr += PAGE_SIZE) { int offset = (addr & ~hmask) >> PAGE_SHIFT; - huge_pte_to_pagemap_entry(&pme, *pte, offset); + huge_pte_to_pagemap_entry(&pme, pm, *pte, offset); err = add_to_pagemap(addr, &pme, pm); if (err) return err; @@ -1051,6 +1054,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!count) goto out_task; + pm.v2 = false; pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); ret = -ENOMEM; -- cgit v1.2.3 From 0f8975ec4db2c8b5bd111b211292ca9be0feb6b8 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 3 Jul 2013 15:01:20 -0700 Subject: mm: soft-dirty bits for user memory changes tracking The soft-dirty is a bit on a PTE which helps to track which pages a task writes to. In order to do this tracking one should 1. Clear soft-dirty bits from PTEs ("echo 4 > /proc/PID/clear_refs) 2. Wait some time. 3. Read soft-dirty bits (55'th in /proc/PID/pagemap2 entries) To do this tracking, the writable bit is cleared from PTEs when the soft-dirty bit is. Thus, after this, when the task tries to modify a page at some virtual address the #PF occurs and the kernel sets the soft-dirty bit on the respective PTE. Note, that although all the task's address space is marked as r/o after the soft-dirty bits clear, the #PF-s that occur after that are processed fast. This is so, since the pages are still mapped to physical memory, and thus all the kernel does is finds this fact out and puts back writable, dirty and soft-dirty bits on the PTE. Another thing to note, is that when mremap moves PTEs they are marked with soft-dirty as well, since from the user perspective mremap modifies the virtual memory at mremap's new address. Signed-off-by: Pavel Emelyanov Cc: Matt Mackall Cc: Xiao Guangrong Cc: Glauber Costa Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 47 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 5 deletions(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 39d641292579..a18e065c1c3e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -692,13 +693,32 @@ enum clear_refs_types { CLEAR_REFS_ALL = 1, CLEAR_REFS_ANON, CLEAR_REFS_MAPPED, + CLEAR_REFS_SOFT_DIRTY, CLEAR_REFS_LAST, }; struct clear_refs_private { struct vm_area_struct *vma; + enum clear_refs_types type; }; +static inline void clear_soft_dirty(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte) +{ +#ifdef CONFIG_MEM_SOFT_DIRTY + /* + * The soft-dirty tracker uses #PF-s to catch writes + * to pages, so write-protect the pte as well. See the + * Documentation/vm/soft-dirty.txt for full description + * of how soft-dirty works. + */ + pte_t ptent = *pte; + ptent = pte_wrprotect(ptent); + ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + set_pte_at(vma->vm_mm, addr, pte, ptent); +#endif +} + static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -718,6 +738,11 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, if (!pte_present(ptent)) continue; + if (cp->type == CLEAR_REFS_SOFT_DIRTY) { + clear_soft_dirty(vma, addr, pte); + continue; + } + page = vm_normal_page(vma, addr, ptent); if (!page) continue; @@ -759,6 +784,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, mm = get_task_mm(task); if (mm) { struct clear_refs_private cp = { + .type = type, }; struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range, @@ -766,6 +792,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, .private = &cp, }; down_read(&mm->mmap_sem); + if (type == CLEAR_REFS_SOFT_DIRTY) + mmu_notifier_invalidate_range_start(mm, 0, -1); for (vma = mm->mmap; vma; vma = vma->vm_next) { cp.vma = vma; if (is_vm_hugetlb_page(vma)) @@ -786,6 +814,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, walk_page_range(vma->vm_start, vma->vm_end, &clear_refs_walk); } + if (type == CLEAR_REFS_SOFT_DIRTY) + mmu_notifier_invalidate_range_end(mm, 0, -1); flush_tlb_mm(mm); up_read(&mm->mmap_sem); mmput(mm); @@ -827,6 +857,7 @@ struct pagemapread { /* in "new" pagemap pshift bits are occupied with more status bits */ #define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) +#define __PM_SOFT_DIRTY (1LL) #define PM_PRESENT PM_STATUS(4LL) #define PM_SWAP PM_STATUS(2LL) #define PM_FILE PM_STATUS(1LL) @@ -868,6 +899,7 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, { u64 frame, flags; struct page *page = NULL; + int flags2 = 0; if (pte_present(pte)) { frame = pte_pfn(pte); @@ -888,13 +920,15 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, if (page && !PageAnon(page)) flags |= PM_FILE; + if (pte_soft_dirty(pte)) + flags2 |= __PM_SOFT_DIRTY; - *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, 0) | flags); + *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, - pmd_t pmd, int offset) + pmd_t pmd, int offset, int pmd_flags2) { /* * Currently pmd for thp is always present because thp can not be @@ -903,13 +937,13 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *p */ if (pmd_present(pmd)) *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) - | PM_STATUS2(pm->v2, 0) | PM_PRESENT); + | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); else *pme = make_pme(PM_NOT_PRESENT(pm->v2)); } #else static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, - pmd_t pmd, int offset) + pmd_t pmd, int offset, int pmd_flags2) { } #endif @@ -926,12 +960,15 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { + int pmd_flags2; + + pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0); for (; addr != end; addr += PAGE_SIZE) { unsigned long offset; offset = (addr & ~PAGEMAP_WALK_MASK) >> PAGE_SHIFT; - thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset); + thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2); err = add_to_pagemap(addr, &pme, pm); if (err) break; -- cgit v1.2.3 From 541c237c0923f567c9c4cabb8a81635baadc713f Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 3 Jul 2013 15:01:22 -0700 Subject: pagemap: prepare to reuse constant bits with page-shift In order to reuse bits from pagemap entries gracefully, we leave the entries as is but on pagemap open emit a warning in dmesg, that bits 55-60 are about to change in a couple of releases. Next, if a user issues soft-dirty clear command via the clear_refs file (it was disabled before v3.9) we assume that he's aware of the new pagemap format, note that fact and report the bits in pagemap in the new manner. The "migration strategy" looks like this then: 1. existing users are not affected -- they don't touch soft-dirty feature, thus see old bits in pagemap, but are warned and have time to fix themselves 2. those who use soft-dirty know about new pagemap format 3. some time soon we get rid of any signs of page-shift in pagemap as well as this trick with clear-soft-dirty affecting pagemap format. Signed-off-by: Pavel Emelyanov Cc: Matt Mackall Cc: Xiao Guangrong Cc: Glauber Costa Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a18e065c1c3e..dbf61f6174f0 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -689,6 +689,23 @@ const struct file_operations proc_tid_smaps_operations = { .release = seq_release_private, }; +/* + * We do not want to have constant page-shift bits sitting in + * pagemap entries and are about to reuse them some time soon. + * + * Here's the "migration strategy": + * 1. when the system boots these bits remain what they are, + * but a warning about future change is printed in log; + * 2. once anyone clears soft-dirty bits via clear_refs file, + * these flag is set to denote, that user is aware of the + * new API and those page-shift bits change their meaning. + * The respective warning is printed in dmesg; + * 3. In a couple of releases we will remove all the mentions + * of page-shift in pagemap entries. + */ + +static bool soft_dirty_cleared __read_mostly; + enum clear_refs_types { CLEAR_REFS_ALL = 1, CLEAR_REFS_ANON, @@ -778,6 +795,13 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, type = (enum clear_refs_types)itype; if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) return -EINVAL; + + if (type == CLEAR_REFS_SOFT_DIRTY) { + soft_dirty_cleared = true; + pr_warn_once("The pagemap bits 55-60 has changed their meaning! " + "See the linux/Documentation/vm/pagemap.txt for details.\n"); + } + task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; @@ -1091,7 +1115,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!count) goto out_task; - pm.v2 = false; + pm.v2 = soft_dirty_cleared; pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); ret = -ENOMEM; @@ -1164,9 +1188,18 @@ out: return ret; } +static int pagemap_open(struct inode *inode, struct file *file) +{ + pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " + "to stop being page-shift some time soon. See the " + "linux/Documentation/vm/pagemap.txt for details.\n"); + return 0; +} + const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, + .open = pagemap_open, }; #endif /* CONFIG_PROC_PAGE_MONITOR */ -- cgit v1.2.3 From 179ef71cbc085252e3fe6b8159263a7ed1d88ea4 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 13 Aug 2013 16:00:49 -0700 Subject: mm: save soft-dirty bits on swapped pages Andy Lutomirski reported that if a page with _PAGE_SOFT_DIRTY bit set get swapped out, the bit is getting lost and no longer available when pte read back. To resolve this we introduce _PTE_SWP_SOFT_DIRTY bit which is saved in pte entry for the page being swapped out. When such page is to be read back from a swap cache we check for bit presence and if it's there we clear it and restore the former _PAGE_SOFT_DIRTY bit back. One of the problem was to find a place in pte entry where we can save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. The _PAGE_PSE was chosen for that, it doesn't intersect with swap entry format stored in pte. Reported-by: Andy Lutomirski Signed-off-by: Cyrill Gorcunov Acked-by: Pavel Emelyanov Cc: Matt Mackall Cc: Xiao Guangrong Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Cc: Peter Zijlstra Cc: "Aneesh Kumar K.V" Reviewed-by: Minchan Kim Reviewed-by: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index dbf61f6174f0..e2d9bdce5e7e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -730,8 +730,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, * of how soft-dirty works. */ pte_t ptent = *pte; - ptent = pte_wrprotect(ptent); - ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + + if (pte_present(ptent)) { + ptent = pte_wrprotect(ptent); + ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + } else if (is_swap_pte(ptent)) { + ptent = pte_swp_clear_soft_dirty(ptent); + } + set_pte_at(vma->vm_mm, addr, pte, ptent); #endif } @@ -752,14 +758,15 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { ptent = *pte; - if (!pte_present(ptent)) - continue; if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty(vma, addr, pte); continue; } + if (!pte_present(ptent)) + continue; + page = vm_normal_page(vma, addr, ptent); if (!page) continue; @@ -930,8 +937,10 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, flags = PM_PRESENT; page = vm_normal_page(vma, addr, pte); } else if (is_swap_pte(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); - + swp_entry_t entry; + if (pte_swp_soft_dirty(pte)) + flags2 |= __PM_SOFT_DIRTY; + entry = pte_to_swp_entry(pte); frame = swp_type(entry) | (swp_offset(entry) << MAX_SWAPFILES_SHIFT); flags = PM_SWAP; -- cgit v1.2.3 From 41bb3476b361ef38576cf9d539b19bae2ac93167 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 13 Aug 2013 16:00:51 -0700 Subject: mm: save soft-dirty bits on file pages Andy reported that if file page get reclaimed we lose the soft-dirty bit if it was there, so save _PAGE_BIT_SOFT_DIRTY bit when page address get encoded into pte entry. Thus when #pf happens on such non-present pte we can restore it back. Reported-by: Andy Lutomirski Signed-off-by: Cyrill Gorcunov Acked-by: Pavel Emelyanov Cc: Matt Mackall Cc: Xiao Guangrong Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Cc: Peter Zijlstra Cc: "Aneesh Kumar K.V" Cc: Minchan Kim Cc: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e2d9bdce5e7e..a11720767abc 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -736,6 +736,8 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); } else if (is_swap_pte(ptent)) { ptent = pte_swp_clear_soft_dirty(ptent); + } else if (pte_file(ptent)) { + ptent = pte_file_clear_soft_dirty(ptent); } set_pte_at(vma->vm_mm, addr, pte, ptent); -- cgit v1.2.3 From 8c8296223f3abb142be8fc31711b18a704c0e7d8 Mon Sep 17 00:00:00 2001 From: yonghua zheng Date: Tue, 13 Aug 2013 16:01:03 -0700 Subject: fs/proc/task_mmu.c: fix buffer overflow in add_page_map() Recently we met quite a lot of random kernel panic issues after enabling CONFIG_PROC_PAGE_MONITOR. After debuggind we found this has something to do with following bug in pagemap: In struct pagemapread: struct pagemapread { int pos, len; pagemap_entry_t *buffer; bool v2; }; pos is number of PM_ENTRY_BYTES in buffer, but len is the size of buffer, it is a mistake to compare pos and len in add_page_map() for checking buffer is full or not, and this can lead to buffer overflow and random kernel panic issue. Correct len to be total number of PM_ENTRY_BYTES in buffer. [akpm@linux-foundation.org: document pagemapread.pos and .len units, fix PM_ENTRY_BYTES definition] Signed-off-by: Yonghua Zheng Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a11720767abc..107d026f5d6e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -868,7 +868,7 @@ typedef struct { } pagemap_entry_t; struct pagemapread { - int pos, len; + int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ pagemap_entry_t *buffer; bool v2; }; @@ -876,7 +876,7 @@ struct pagemapread { #define PAGEMAP_WALK_SIZE (PMD_SIZE) #define PAGEMAP_WALK_MASK (PMD_MASK) -#define PM_ENTRY_BYTES sizeof(u64) +#define PM_ENTRY_BYTES sizeof(pagemap_entry_t) #define PM_STATUS_BITS 3 #define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) #define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) @@ -1127,8 +1127,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, goto out_task; pm.v2 = soft_dirty_cleared; - pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); - pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); + pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); + pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); ret = -ENOMEM; if (!pm.buffer) goto out_task; -- cgit v1.2.3