From f7e839dd36fd940b0202cfb7d39b2a1b2dc59b1b Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:20 -0700 Subject: readahead: move max_sane_readahead() calls into force_page_cache_readahead() Impact: code simplification. Cc: Nick Piggin Signed-off-by: Wu Fengguang Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 1b60f30cebfa..dcef9fd6b92e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1390,8 +1390,7 @@ do_readahead(struct address_space *mapping, struct file *filp, if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) return -EINVAL; - force_page_cache_readahead(mapping, filp, index, - max_sane_readahead(nr)); + force_page_cache_readahead(mapping, filp, index, nr); return 0; } -- cgit v1.2.3 From ef00e08e26dd5d84271ef706262506b82195e752 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 16 Jun 2009 15:31:25 -0700 Subject: readahead: clean up and simplify the code for filemap page fault readahead This shouldn't really change behavior all that much, but the single rather complex function with read-ahead inside a loop etc is broken up into more manageable pieces. The behaviour is also less subtle, with the read-ahead being done up-front rather than inside some subtle loop and thus avoiding the now unnecessary extra state variables (ie "did_readaround" is gone). Fengguang: the code split in fact fixed a bug reported by Pavel Levshin: the PGMAJFAULT accounting used to be bypassed when MADV_RANDOM is set, in which case the original code will directly jump to no_cached_page reading. Cc: Pavel Levshin Cc: Cc: Nick Piggin Signed-off-by: Wu Fengguang Signed-off-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 156 ++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 89 insertions(+), 67 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index dcef9fd6b92e..827536485599 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1456,6 +1456,68 @@ static int page_cache_read(struct file *file, pgoff_t offset) #define MMAP_LOTSAMISS (100) +/* + * Synchronous readahead happens when we don't even find + * a page in the page cache at all. + */ +static void do_sync_mmap_readahead(struct vm_area_struct *vma, + struct file_ra_state *ra, + struct file *file, + pgoff_t offset) +{ + unsigned long ra_pages; + struct address_space *mapping = file->f_mapping; + + /* If we don't want any read-ahead, don't bother */ + if (VM_RandomReadHint(vma)) + return; + + if (VM_SequentialReadHint(vma)) { + page_cache_sync_readahead(mapping, ra, file, offset, 1); + return; + } + + if (ra->mmap_miss < INT_MAX) + ra->mmap_miss++; + + /* + * Do we miss much more than hit in this file? If so, + * stop bothering with read-ahead. It will only hurt. + */ + if (ra->mmap_miss > MMAP_LOTSAMISS) + return; + + ra_pages = max_sane_readahead(ra->ra_pages); + if (ra_pages) { + pgoff_t start = 0; + + if (offset > ra_pages / 2) + start = offset - ra_pages / 2; + do_page_cache_readahead(mapping, file, start, ra_pages); + } +} + +/* + * Asynchronous readahead happens when we find the page and PG_readahead, + * so we want to possibly extend the readahead further.. + */ +static void do_async_mmap_readahead(struct vm_area_struct *vma, + struct file_ra_state *ra, + struct file *file, + struct page *page, + pgoff_t offset) +{ + struct address_space *mapping = file->f_mapping; + + /* If we don't want any read-ahead, don't bother */ + if (VM_RandomReadHint(vma)) + return; + if (ra->mmap_miss > 0) + ra->mmap_miss--; + if (PageReadahead(page)) + page_cache_async_readahead(mapping, ra, file, page, offset, 1); +} + /** * filemap_fault - read in file data for page fault handling * @vma: vma in which the fault was taken @@ -1475,78 +1537,44 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct address_space *mapping = file->f_mapping; struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; + pgoff_t offset = vmf->pgoff; struct page *page; pgoff_t size; - int did_readaround = 0; int ret = 0; size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (vmf->pgoff >= size) + if (offset >= size) return VM_FAULT_SIGBUS; - /* If we don't want any read-ahead, don't bother */ - if (VM_RandomReadHint(vma)) - goto no_cached_page; - /* * Do we have something in the page cache already? */ -retry_find: - page = find_lock_page(mapping, vmf->pgoff); - /* - * For sequential accesses, we use the generic readahead logic. - */ - if (VM_SequentialReadHint(vma)) { - if (!page) { - page_cache_sync_readahead(mapping, ra, file, - vmf->pgoff, 1); - page = find_lock_page(mapping, vmf->pgoff); - if (!page) - goto no_cached_page; - } - if (PageReadahead(page)) { - page_cache_async_readahead(mapping, ra, file, page, - vmf->pgoff, 1); - } - } - - if (!page) { - unsigned long ra_pages; - - ra->mmap_miss++; - + page = find_get_page(mapping, offset); + if (likely(page)) { /* - * Do we miss much more than hit in this file? If so, - * stop bothering with read-ahead. It will only hurt. + * We found the page, so try async readahead before + * waiting for the lock. */ - if (ra->mmap_miss > MMAP_LOTSAMISS) - goto no_cached_page; + do_async_mmap_readahead(vma, ra, file, page, offset); + lock_page(page); - /* - * To keep the pgmajfault counter straight, we need to - * check did_readaround, as this is an inner loop. - */ - if (!did_readaround) { - ret = VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - } - did_readaround = 1; - ra_pages = max_sane_readahead(file->f_ra.ra_pages); - if (ra_pages) { - pgoff_t start = 0; - - if (vmf->pgoff > ra_pages / 2) - start = vmf->pgoff - ra_pages / 2; - do_page_cache_readahead(mapping, file, start, ra_pages); + /* Did it get truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + put_page(page); + goto no_cached_page; } - page = find_lock_page(mapping, vmf->pgoff); + } else { + /* No page in the page cache at all */ + do_sync_mmap_readahead(vma, ra, file, offset); + count_vm_event(PGMAJFAULT); + ret = VM_FAULT_MAJOR; +retry_find: + page = find_lock_page(mapping, offset); if (!page) goto no_cached_page; } - if (!did_readaround) - ra->mmap_miss--; - /* * We have a locked page in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error. @@ -1554,18 +1582,18 @@ retry_find: if (unlikely(!PageUptodate(page))) goto page_not_uptodate; - /* Must recheck i_size under page lock */ + /* + * Found the page and have a reference on it. + * We must recheck i_size under page lock. + */ size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(vmf->pgoff >= size)) { + if (unlikely(offset >= size)) { unlock_page(page); page_cache_release(page); return VM_FAULT_SIGBUS; } - /* - * Found the page and have a reference on it. - */ - ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; + ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT; vmf->page = page; return ret | VM_FAULT_LOCKED; @@ -1574,7 +1602,7 @@ no_cached_page: * We're only likely to ever get here if MADV_RANDOM is in * effect. */ - error = page_cache_read(file, vmf->pgoff); + error = page_cache_read(file, offset); /* * The page we want has now been added to the page cache. @@ -1594,12 +1622,6 @@ no_cached_page: return VM_FAULT_SIGBUS; page_not_uptodate: - /* IO error path */ - if (!did_readaround) { - ret = VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - } - /* * Umm, take care of errors if the page isn't up-to-date. * Try to re-read it _once_. We do this synchronously, -- cgit v1.2.3 From 70ac23cfa31f68289d4b720c6162b3929ab4de36 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:28 -0700 Subject: readahead: sequential mmap readahead Auto-detect sequential mmap reads and do readahead for them. The sequential mmap readahead will be triggered when - sync readahead: it's a major fault and (prev_offset == offset-1); - async readahead: minor fault on PG_readahead page with valid readahead state. The benefits of doing readahead instead of read-around: - less I/O wait thanks to async readahead - double real I/O size and no more cache hits The single stream case is improved a little. For 100,000 sequential mmap reads: user system cpu total (1-1) plain -mm, 128KB readaround: 3.224 2.554 48.40% 11.838 (1-2) plain -mm, 256KB readaround: 3.170 2.392 46.20% 11.976 (2) patched -mm, 128KB readahead: 3.117 2.448 47.33% 11.607 The patched (2) has smallest total time, since it has no cache hit overheads and less I/O block time(thanks to async readahead). Here the I/O size makes no much difference, since there's only one single stream. Note that (1-1)'s real I/O size is 64KB and (1-2)'s real I/O size is 128KB, since the half of the read-around pages will be readahead cache hits. This is going to make _real_ differences for _concurrent_ IO streams. Cc: Nick Piggin Signed-off-by: Wu Fengguang Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 827536485599..99977f0a94e4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1472,7 +1472,8 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, if (VM_RandomReadHint(vma)) return; - if (VM_SequentialReadHint(vma)) { + if (VM_SequentialReadHint(vma) || + offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) { page_cache_sync_readahead(mapping, ra, file, offset, 1); return; } -- cgit v1.2.3 From 2fad6f5deee5556f511eab58da78737a23ddb35d Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:29 -0700 Subject: readahead: enforce full readahead size on async mmap readahead We need this in one particular case and two more general ones. Now we do async readahead for sequential mmap reads, and do it with the help of PG_readahead. For normal reads, PG_readahead is the sufficient condition to do a sequential readahead. But unfortunately, for mmap reads, there is a tiny nuisance: [11736.998347] readahead-init0(process: sh/23926, file: sda1/w3m, offset=0:4503599627370495, ra=0+4-3) = 4 [11737.014985] readahead-around(process: w3m/23926, file: sda1/w3m, offset=0:0, ra=290+32-0) = 17 [11737.019488] readahead-around(process: w3m/23926, file: sda1/w3m, offset=0:0, ra=118+32-0) = 32 [11737.024921] readahead-interleaved(process: w3m/23926, file: sda1/w3m, offset=0:2, ra=4+6-6) = 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~ An unfavorably small readahead. The original dumb read-around size could be more efficient. That happened because ld-linux.so does a read(832) in L1 before mmap(), which triggers a 4-page readahead, with the second page tagged PG_readahead. L0: open("/lib/libc.so.6", O_RDONLY) = 3 L1: read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\340\342"..., 832) = 832 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ L2: fstat(3, {st_mode=S_IFREG|0755, st_size=1420624, ...}) = 0 L3: mmap(NULL, 3527256, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7fac6e51d000 L4: mprotect(0x7fac6e671000, 2097152, PROT_NONE) = 0 L5: mmap(0x7fac6e871000, 20480, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x154000) = 0x7fac6e871000 L6: mmap(0x7fac6e876000, 16984, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7fac6e876000 L7: close(3) = 0 In general, the PG_readahead flag will also be hit in cases - sequential reads - clustered random reads A full readahead size is desirable in both cases. Cc: Nick Piggin Signed-off-by: Wu Fengguang Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 99977f0a94e4..5c0c6518f341 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1516,7 +1516,8 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, if (ra->mmap_miss > 0) ra->mmap_miss--; if (PageReadahead(page)) - page_cache_async_readahead(mapping, ra, file, page, offset, 1); + page_cache_async_readahead(mapping, ra, file, + page, offset, ra->ra_pages); } /** -- cgit v1.2.3 From d30a11004e3411909f2448546f036a011978062e Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:30 -0700 Subject: readahead: record mmap read-around states in file_ra_state Mmap read-around now shares the same code style and data structure with readahead code. This also removes do_page_cache_readahead(). Its last user, mmap read-around, has been changed to call ra_submit(). The no-readahead-if-congested logic is dumped by the way. Users will be pretty sensitive about the slow loading of executables. So it's unfavorable to disabled mmap read-around on a congested queue. [akpm@linux-foundation.org: coding-style fixes] Cc: Nick Piggin Signed-off-by: Fengguang Wu Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 5c0c6518f341..734891d0663d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1488,13 +1488,15 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, if (ra->mmap_miss > MMAP_LOTSAMISS) return; + /* + * mmap read-around + */ ra_pages = max_sane_readahead(ra->ra_pages); if (ra_pages) { - pgoff_t start = 0; - - if (offset > ra_pages / 2) - start = offset - ra_pages / 2; - do_page_cache_readahead(mapping, file, start, ra_pages); + ra->start = max_t(long, 0, offset - ra_pages/2); + ra->size = ra_pages; + ra->async_size = 0; + ra_submit(ra, mapping, file); } } -- cgit v1.2.3 From 61b7cbdba2f3c588a0cf3db574c562805454b09b Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:36 -0700 Subject: readahead: remove redundant test in shrink_readahead_size_eio() Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 734891d0663d..2e9bcc2e7977 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1004,9 +1004,6 @@ EXPORT_SYMBOL(grab_cache_page_nowait); static void shrink_readahead_size_eio(struct file *filp, struct file_ra_state *ra) { - if (!ra->ra_pages) - return; - ra->ra_pages /= 4; } -- cgit v1.2.3 From 7ffc59b4d0bdfa00e882339f85b8a969bb7021e2 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:38 -0700 Subject: readahead: enforce full sync mmap readahead size Now that we do readahead for sequential mmap reads, here is a simple evaluation of the impacts, and one further optimization. It's an NFS-root debian desktop system, readahead size = 60 pages. The numbers are grabbed after a fresh boot into console. approach pgmajfault RA miss ratio mmap IO count avg IO size(pages) A 383 31.6% 383 11 B 225 32.4% 390 11 C 224 32.6% 307 13 case A: mmap sync/async readahead disabled case B: mmap sync/async readahead enabled, with enforced full async readahead size case C: mmap sync/async readahead enabled, with enforced full sync/async readahead size or: A = vanilla 2.6.30-rc1 B = A plus mmap readahead C = B plus this patch The numbers show that - there are good possibilities for random mmap reads to trigger readahead - 'pgmajfault' is reduced by 1/3, due to the _async_ nature of readahead - case C can further reduce IO count by 1/4 - readahead miss ratios are not quite affected The theory is - readahead is _good_ for clustered random reads, and can perform _better_ than readaround because they could be _async_. - async readahead size is guaranteed to be larger than readaround size, and they are _async_, hence will mostly behave better However for B - sync readahead size could be smaller than readaround size, hence may make things worse by produce more smaller IOs which will be fixed by this patch. Final conclusion: - mmap readahead reduced major faults by 1/3 and no obvious overheads; - mmap io can be further reduced by 1/4 with this patch. Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 2e9bcc2e7977..6846a902f5cf 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1471,7 +1471,8 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, if (VM_SequentialReadHint(vma) || offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) { - page_cache_sync_readahead(mapping, ra, file, offset, 1); + page_cache_sync_readahead(mapping, ra, file, offset, + ra->ra_pages); return; } -- cgit v1.2.3 From 6484eb3e2a81807722c5f28efef94d8338b7b996 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:54 -0700 Subject: page allocator: do not check NUMA node ID when the caller knows the node is valid Callers of alloc_pages_node() can optionally specify -1 as a node to mean "allocate from the current node". However, a number of the callers in fast paths know for a fact their node is valid. To avoid a comparison and branch, this patch adds alloc_pages_exact_node() that only checks the nid with VM_BUG_ON(). Callers that know their node is valid are then converted. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Acked-by: Paul Mundt [for the SLOB NUMA bits] Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 6846a902f5cf..22396713feb9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -521,7 +521,7 @@ struct page *__page_cache_alloc(gfp_t gfp) { if (cpuset_do_page_mem_spread()) { int n = cpuset_mem_spread_node(); - return alloc_pages_node(n, gfp, 0); + return alloc_pages_exact_node(n, gfp, 0); } return alloc_pages(gfp, 0); } -- cgit v1.2.3