From 5b8ba10198a109f8a02380648c5d29000caa9c55 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 27 Jun 2011 16:18:01 -0700 Subject: mm: move vmtruncate_range to truncate.c You would expect to find vmtruncate_range() next to vmtruncate() in mm/truncate.c: move it there. Signed-off-by: Hugh Dickins Acked-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'mm/truncate.c') diff --git a/mm/truncate.c b/mm/truncate.c index 3a29a6180212..5b4c3a4847e9 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -603,3 +603,27 @@ int vmtruncate(struct inode *inode, loff_t offset) return 0; } EXPORT_SYMBOL(vmtruncate); + +int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) +{ + struct address_space *mapping = inode->i_mapping; + + /* + * If the underlying filesystem is not going to provide + * a way to truncate a range of blocks (punch a hole) - + * we should return failure right now. + */ + if (!inode->i_op->truncate_range) + return -ENOSYS; + + mutex_lock(&inode->i_mutex); + down_write(&inode->i_alloc_sem); + unmap_mapping_range(mapping, offset, (end - offset), 1); + truncate_inode_pages_range(mapping, offset, end); + unmap_mapping_range(mapping, offset, (end - offset), 1); + inode->i_op->truncate_range(inode, offset, end); + up_write(&inode->i_alloc_sem); + mutex_unlock(&inode->i_mutex); + + return 0; +} -- cgit v1.2.3 From 94c1e62df4494b79782cb9c7279f827212d1de70 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 27 Jun 2011 16:18:03 -0700 Subject: tmpfs: take control of its truncate_range 2.6.35's new truncate convention gave tmpfs the opportunity to control its file truncation, no longer enforced from outside by vmtruncate(). We shall want to build upon that, to handle pagecache and swap together. 
Slightly redefine the ->truncate_range interface: let it now be called between the unmap_mapping_range()s, with the filesystem responsible for doing the truncate_inode_pages_range() from it - just as the filesystem is nowadays responsible for doing that from its ->setattr. Let's rename shmem_notify_change() to shmem_setattr(). Instead of calling the generic truncate_setsize(), bring that code in so we can call shmem_truncate_range() - which will later be updated to perform its own variant of truncate_inode_pages_range(). Remove the punch_hole unmap_mapping_range() from shmem_truncate_range(): now that the COW's unmap_mapping_range() comes after ->truncate_range, there is no need to call it a third time. Export shmem_truncate_range() and add it to the list in shmem_fs.h, so that i915_gem_object_truncate() can call it explicitly in future; get this patch in first, then update drm/i915 once this is available (until then, i915 will just be doing the truncate_inode_pages() twice). Though introduced five years ago, no other filesystem is implementing ->truncate_range, and its only other user is madvise(,,MADV_REMOVE): we expect to convert it to fallocate(,FALLOC_FL_PUNCH_HOLE,,) shortly, whereupon ->truncate_range can be removed from inode_operations - shmem_truncate_range() will help i915 across that transition too. 
Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/truncate.c') diff --git a/mm/truncate.c b/mm/truncate.c index 5b4c3a4847e9..29a9b8a5a31a 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -619,9 +619,9 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) mutex_lock(&inode->i_mutex); down_write(&inode->i_alloc_sem); unmap_mapping_range(mapping, offset, (end - offset), 1); - truncate_inode_pages_range(mapping, offset, end); - unmap_mapping_range(mapping, offset, (end - offset), 1); inode->i_op->truncate_range(inode, offset, end); + /* unmap again to remove racily COWed private pages */ + unmap_mapping_range(mapping, offset, (end - offset), 1); up_write(&inode->i_alloc_sem); mutex_unlock(&inode->i_mutex); -- cgit v1.2.3 From 08142579b6ca35883c1ed066a2681de6f6917062 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 27 Jun 2011 16:18:10 -0700 Subject: mm: fix assertion mapping->nrpages == 0 in end_writeback() Under heavy memory and filesystem load, users observe the assertion mapping->nrpages == 0 in end_writeback() trigger. This can be caused by page reclaim reclaiming the last page from a mapping in the following race: CPU0 CPU1 ... shrink_page_list() __remove_mapping() __delete_from_page_cache() radix_tree_delete() evict_inode() truncate_inode_pages() truncate_inode_pages_range() pagevec_lookup() - finds nothing end_writeback() mapping->nrpages != 0 -> BUG page->mapping = NULL mapping->nrpages-- Fix the problem by doing a reliable check of mapping->nrpages under mapping->tree_lock in end_writeback(). Analyzed by Jay , lost in LKML, and dug out by Miklos Szeredi . 
Cc: Jay Cc: Miklos Szeredi Signed-off-by: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm/truncate.c') diff --git a/mm/truncate.c b/mm/truncate.c index 29a9b8a5a31a..e13f22efaad7 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -304,6 +304,11 @@ EXPORT_SYMBOL(truncate_inode_pages_range); * @lstart: offset from which to truncate * * Called under (and serialised by) inode->i_mutex. + * + * Note: When this function returns, there can be a page in the process of + * deletion (inside __delete_from_page_cache()) in the specified range. Thus + * mapping->nrpages can be non-zero when this function returns even after + * truncation of the whole mapping. */ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) { -- cgit v1.2.3 From bd5fe6c5eb9c548d7f07fe8f89a150bb6705e8e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jun 2011 14:29:43 -0400 Subject: fs: kill i_alloc_sem i_alloc_sem is a rather special rw_semaphore. It's the last one that may be released by a non-owner, and its write side is always mirrored by real exclusion. Its intended use is to wait for all pending direct I/O requests to finish before starting a truncate. Replace it with a hand-grown construct: - exclusion for truncates is already guaranteed by i_mutex, so it can simply fall away - the reader side is replaced by an i_dio_count member in struct inode that counts the number of pending direct I/O requests. Truncate can't proceed as long as it's non-zero - when i_dio_count reaches zero we wake up a pending truncate using wake_up_bit on a new bit in i_flags - new references to i_dio_count can't appear while we are waiting for it to read zero because the direct I/O count always needs i_mutex (or an equivalent like XFS's i_iolock) for starting a new operation. 
This scheme is much simpler, and saves the space of a spinlock_t and a struct list_head in struct inode (typically 160 bits on a non-debug 64-bit system). Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- mm/truncate.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm/truncate.c') diff --git a/mm/truncate.c b/mm/truncate.c index e13f22efaad7..003c6c685fc8 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -622,12 +622,11 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) return -ENOSYS; mutex_lock(&inode->i_mutex); - down_write(&inode->i_alloc_sem); + inode_dio_wait(inode); unmap_mapping_range(mapping, offset, (end - offset), 1); inode->i_op->truncate_range(inode, offset, end); /* unmap again to remove racily COWed private pages */ unmap_mapping_range(mapping, offset, (end - offset), 1); - up_write(&inode->i_alloc_sem); mutex_unlock(&inode->i_mutex); return 0; -- cgit v1.2.3 From 8a549bea51138be2126a2cc6aabe8f17ef66b79b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:24 -0700 Subject: mm: tidy vmtruncate_range and related functions Use consistent variable names in truncate_pagecache(), truncate_setsize(), vmtruncate() and vmtruncate_range(). unmap_mapping_range() and vmtruncate_range() have mismatched interfaces: don't change either, but make the vmtruncates more precise about what they expect unmap_mapping_range() to do. vmtruncate_range() is currently called only with page-aligned start and end+1: can handle unaligned start, but unaligned end+1 would hit BUG_ON in truncate_inode_pages_range() (lacks partial clearing of the end page). 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'mm/truncate.c') diff --git a/mm/truncate.c b/mm/truncate.c index 003c6c685fc8..c924764e2ce5 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -531,8 +531,8 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); /** * truncate_pagecache - unmap and remove pagecache that has been truncated * @inode: inode - * @old: old file offset - * @new: new file offset + * @oldsize: old file size + * @newsize: new file size * * inode's new i_size must already be written before truncate_pagecache * is called. @@ -544,9 +544,10 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); * situations such as writepage being called for a page that has already * had its underlying blocks deallocated. */ -void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) +void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize) { struct address_space *mapping = inode->i_mapping; + loff_t holebegin = round_up(newsize, PAGE_SIZE); /* * unmap_mapping_range is called twice, first simply for @@ -557,9 +558,9 @@ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) * truncate_inode_pages finishes, hence the second * unmap_mapping_range call must be made for correctness. 
*/ - unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, new); - unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); + unmap_mapping_range(mapping, holebegin, 0, 1); + truncate_inode_pages(mapping, newsize); + unmap_mapping_range(mapping, holebegin, 0, 1); } EXPORT_SYMBOL(truncate_pagecache); @@ -589,29 +590,31 @@ EXPORT_SYMBOL(truncate_setsize); /** * vmtruncate - unmap mappings "freed" by truncate() syscall * @inode: inode of the file used - * @offset: file offset to start truncating + * @newsize: file offset to start truncating * * This function is deprecated and truncate_setsize or truncate_pagecache * should be used instead, together with filesystem specific block truncation. */ -int vmtruncate(struct inode *inode, loff_t offset) +int vmtruncate(struct inode *inode, loff_t newsize) { int error; - error = inode_newsize_ok(inode, offset); + error = inode_newsize_ok(inode, newsize); if (error) return error; - truncate_setsize(inode, offset); + truncate_setsize(inode, newsize); if (inode->i_op->truncate) inode->i_op->truncate(inode); return 0; } EXPORT_SYMBOL(vmtruncate); -int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) +int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) { struct address_space *mapping = inode->i_mapping; + loff_t holebegin = round_up(lstart, PAGE_SIZE); + loff_t holelen = 1 + lend - holebegin; /* * If the underlying filesystem is not going to provide @@ -623,10 +626,10 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) mutex_lock(&inode->i_mutex); inode_dio_wait(inode); - unmap_mapping_range(mapping, offset, (end - offset), 1); - inode->i_op->truncate_range(inode, offset, end); + unmap_mapping_range(mapping, holebegin, holelen, 1); + inode->i_op->truncate_range(inode, lstart, lend); /* unmap again to remove racily COWed private pages */ - unmap_mapping_range(mapping, offset, (end - offset), 1); + unmap_mapping_range(mapping, holebegin, holelen, 
1); mutex_unlock(&inode->i_mutex); return 0; -- cgit v1.2.3 From b85e0effd3dcbf9118b896232f59526ab1a39a74 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:25 -0700 Subject: mm: consistent truncate and invalidate loops Make the pagevec_lookup loops in truncate_inode_pages_range(), invalidate_mapping_pages() and invalidate_inode_pages2_range() more consistent with each other. They were relying upon page->index of an unlocked page, but apologizing for it: accept it, embrace it, add comments and WARN_ONs, and simplify the index handling. invalidate_inode_pages2_range() had special handling for a wrapped page->index + 1 = 0 case; but MAX_LFS_FILESIZE doesn't let us anywhere near there, and a corrupt page->index in the radix_tree could cause more trouble than that would catch. Remove that wrapped handling. invalidate_inode_pages2_range() uses min() to limit the pagevec_lookup when near the end of the range: copy that into the other two, although it's less useful than you might think (it limits the use of the buffer, rather than the indices looked up). Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 110 +++++++++++++++++++++++++--------------------------------- 1 file changed, 47 insertions(+), 63 deletions(-) (limited to 'mm/truncate.c') diff --git a/mm/truncate.c b/mm/truncate.c index c924764e2ce5..dc459014f777 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -199,9 +199,6 @@ int invalidate_inode_page(struct page *page) * The first pass will remove most pages, so the search cost of the second pass * is low. * - * When looking at page->index outside the page lock we need to be careful to - * copy it into a local to avoid races (it could change at any time). - * * We pass down the cache-hot hint to the page freeing code. Even if the * mapping is large, it is probably the case that the final pages are the most * recently touched, and freeing happens in ascending file offset order. 
@@ -210,10 +207,10 @@ void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart, loff_t lend) { const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; - pgoff_t end; const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); struct pagevec pvec; - pgoff_t next; + pgoff_t index; + pgoff_t end; int i; cleancache_flush_inode(mapping); @@ -224,24 +221,21 @@ void truncate_inode_pages_range(struct address_space *mapping, end = (lend >> PAGE_CACHE_SHIFT); pagevec_init(&pvec, 0); - next = start; - while (next <= end && - pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + index = start; + while (index <= end && pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - pgoff_t page_index = page->index; - if (page_index > end) { - next = page_index; + /* We rely upon deletion not changing page->index */ + index = page->index; + if (index > end) break; - } - if (page_index > next) - next = page_index; - next++; if (!trylock_page(page)) continue; + WARN_ON(page->index != index); if (PageWriteback(page)) { unlock_page(page); continue; @@ -252,6 +246,7 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); + index++; } if (partial) { @@ -264,13 +259,14 @@ void truncate_inode_pages_range(struct address_space *mapping, } } - next = start; + index = start; for ( ; ; ) { cond_resched(); - if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { - if (next == start) + if (!pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + if (index == start) break; - next = start; + index = start; continue; } if (pvec.pages[0]->index > end) { @@ -281,18 +277,20 @@ void truncate_inode_pages_range(struct address_space *mapping, for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = 
pvec.pages[i]; - if (page->index > end) + /* We rely upon deletion not changing page->index */ + index = page->index; + if (index > end) break; + lock_page(page); + WARN_ON(page->index != index); wait_on_page_writeback(page); truncate_inode_page(mapping, page); - if (page->index > next) - next = page->index; - next++; unlock_page(page); } pagevec_release(&pvec); mem_cgroup_uncharge_end(); + index++; } cleancache_flush_inode(mapping); } @@ -333,35 +331,26 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { struct pagevec pvec; - pgoff_t next = start; + pgoff_t index = start; unsigned long ret; unsigned long count = 0; int i; pagevec_init(&pvec, 0); - while (next <= end && - pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + while (index <= end && pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - pgoff_t index; - int lock_failed; - - lock_failed = !trylock_page(page); - /* - * We really shouldn't be looking at the ->index of an - * unlocked page. But we're not allowed to lock these - * pages. So we rely upon nobody altering the ->index - * of this (pinned-by-us) page. 
- */ + /* We rely upon deletion not changing page->index */ index = page->index; - if (index > next) - next = index; - next++; - if (lock_failed) - continue; + if (index > end) + break; + if (!trylock_page(page)) + continue; + WARN_ON(page->index != index); ret = invalidate_inode_page(page); unlock_page(page); /* @@ -371,12 +360,11 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, if (!ret) deactivate_page(page); count += ret; - if (next > end) - break; } pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); + index++; } return count; } @@ -442,37 +430,32 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { struct pagevec pvec; - pgoff_t next; + pgoff_t index; int i; int ret = 0; int ret2 = 0; int did_range_unmap = 0; - int wrapped = 0; cleancache_flush_inode(mapping); pagevec_init(&pvec, 0); - next = start; - while (next <= end && !wrapped && - pagevec_lookup(&pvec, mapping, next, - min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + index = start; + while (index <= end && pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - pgoff_t page_index; + + /* We rely upon deletion not changing page->index */ + index = page->index; + if (index > end) + break; lock_page(page); + WARN_ON(page->index != index); if (page->mapping != mapping) { unlock_page(page); continue; } - page_index = page->index; - next = page_index + 1; - if (next == 0) - wrapped = 1; - if (page_index > end) { - unlock_page(page); - break; - } wait_on_page_writeback(page); if (page_mapped(page)) { if (!did_range_unmap) { @@ -480,9 +463,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping, * Zap the rest of the file in one hit. 
*/ unmap_mapping_range(mapping, - (loff_t)page_index< Date: Mon, 25 Jul 2011 17:12:25 -0700 Subject: mm: pincer in truncate_inode_pages_range truncate_inode_pages_range()'s final loop has a nice pincer property, bringing start and end together, squeezing out the last pages. But the range handling missed out on that, just sliding up the range, perhaps letting pages come in behind it. Add one more test to give it the same pincer effect. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/truncate.c') diff --git a/mm/truncate.c b/mm/truncate.c index dc459014f777..232eb2736a79 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -269,7 +269,7 @@ void truncate_inode_pages_range(struct address_space *mapping, index = start; continue; } - if (pvec.pages[0]->index > end) { + if (index == start && pvec.pages[0]->index > end) { pagevec_release(&pvec); break; } -- cgit v1.2.3 From 31475dd611209413bace21651a400afb91d0bd9d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 3 Aug 2011 16:21:27 -0700 Subject: mm: a few small updates for radix-swap Remove PageSwapBacked (!page_is_file_cache) cases from add_to_page_cache_locked() and add_to_page_cache_lru(): those pages now go through shmem_add_to_page_cache(). Remove a comment on maximum tmpfs size from fsstack_copy_inode_size(), and add a comment on swap entries to invalidate_mapping_pages(). And mincore_page() uses find_get_page() on what might be shmem or a tmpfs file: allow for a radix_tree_exceptional_entry(), and proceed to find_get_page() on swapper_space if so (oh, swapper_space needs #ifdef). 
Signed-off-by: Hugh Dickins Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm/truncate.c') diff --git a/mm/truncate.c b/mm/truncate.c index 232eb2736a79..b40ac6d4e86e 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -336,6 +336,14 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, unsigned long count = 0; int i; + /* + * Note: this function may get called on a shmem/tmpfs mapping: + * pagevec_lookup() might then return 0 prematurely (because it + * got a gangful of swap entries); but it's hardly worth worrying + * about - it can rarely have anything to free from such a mapping + * (most pages are dirty), and already skips over any difficulties. + */ + pagevec_init(&pvec, 0); while (index <= end && pagevec_lookup(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { -- cgit v1.2.3