From 6328650bb4d854a7dc1498d1c0048b838b0d340c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 3 Aug 2011 16:21:18 -0700 Subject: radix_tree: exceptional entries and indices A patchset to extend tmpfs to MAX_LFS_FILESIZE by abandoning its peculiar swap vector, instead keeping a file's swap entries in the same radix tree as its struct page pointers: thus saving memory, and simplifying its code and locking. This patch: The radix_tree is used by several subsystems for different purposes. A major use is to store the struct page pointers of a file's pagecache for memory management. But what if mm wanted to store something other than page pointers there too? The low bit of a radix_tree entry is already used to denote an indirect pointer, for internal use, and the unlikely radix_tree_deref_retry() case. Define the next bit as denoting an exceptional entry, and supply inline functions radix_tree_exception() to return non-0 in either unlikely case, and radix_tree_exceptional_entry() to return non-0 in the second case. If a subsystem already uses radix_tree with that bit set, no problem: it does not affect internal workings at all, but is defined for the convenience of those storing well-aligned pointers in the radix_tree. The radix_tree_gang_lookups have an implicit assumption that the caller can deduce the offset of each entry returned e.g. by the page->index of a struct page. But that may not be feasible for some kinds of item to be stored there. radix_tree_gang_lookup_slot() allow for an optional indices argument, output array in which to return those offsets. The same could be added to other radix_tree_gang_lookups, but for now keep it to the only one for which we need it. Signed-off-by: Hugh Dickins Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 867d40222ec7..b83aebfd0a00 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -840,7 +840,7 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, rcu_read_lock(); restart: nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, - (void ***)pages, start, nr_pages); + (void ***)pages, NULL, start, nr_pages); ret = 0; for (i = 0; i < nr_found; i++) { struct page *page; @@ -903,7 +903,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, rcu_read_lock(); restart: nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, - (void ***)pages, index, nr_pages); + (void ***)pages, NULL, index, nr_pages); ret = 0; for (i = 0; i < nr_found; i++) { struct page *page; -- cgit v1.2.3 From a2c16d6cb0e478812829ca84aeabd02e36af35eb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 3 Aug 2011 16:21:19 -0700 Subject: mm: let swap use exceptional entries If swap entries are to be stored along with struct page pointers in a radix tree, they need to be distinguished as exceptional entries. Most of the handling of swap entries in radix tree will be contained in shmem.c, but a few functions in filemap.c's common code need to check for their appearance: find_get_page(), find_lock_page(), find_get_pages() and find_get_pages_contig(). So as not to slow their fast paths, tuck those checks inside the existing checks for unlikely radix_tree_deref_slot(); except for find_lock_page(), where it is an added test. And make it a BUG in find_get_pages_tag(), which is not applied to tmpfs files. A part of the reason for eliminating shmem_readpage() earlier, was to minimize the places where common code would need to allow for swap entries. The swp_entry_t known to swapfile.c must be massaged into a slightly different form when stored in the radix tree, just as it gets massaged into a pte_t when stored in page tables. In an i386 kernel this limits its information (type and page offset) to 30 bits: given 32 "types" of swapfile and 4kB pagesize, that's a maximum swapfile size of 128GB. Which is less than the 512GB we previously allowed with X86_PAE (where the swap entry can occupy the entire upper 32 bits of a pte_t), but not a new limitation on 32-bit without PAE; and there's not a new limitation on 64-bit (where swap filesize is already limited to 16TB by a 32-bit page offset). Thirty areas of 128GB is probably still enough swap for a 64GB 32-bit machine. Provide swp_to_radix_entry() and radix_to_swp_entry() conversions, and enforce filesize limit in read_swap_header(), just as for ptes. Signed-off-by: Hugh Dickins Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 49 +++++++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 18 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index b83aebfd0a00..76bfb6460f57 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -714,9 +714,12 @@ repeat: page = radix_tree_deref_slot(pagep); if (unlikely(!page)) goto out; - if (radix_tree_deref_retry(page)) + if (radix_tree_exception(page)) { + if (radix_tree_exceptional_entry(page)) + goto out; + /* radix_tree_deref_retry(page) */ goto repeat; - + } if (!page_cache_get_speculative(page)) goto repeat; @@ -753,7 +756,7 @@ struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) repeat: page = find_get_page(mapping, offset); - if (page) { + if (page && !radix_tree_exception(page)) { lock_page(page); /* Has the page been truncated? */ if (unlikely(page->mapping != mapping)) { @@ -849,11 +852,14 @@ repeat: if (unlikely(!page)) continue; - /* - * This can only trigger when the entry at index 0 moves out - * of or back to the root: none yet gotten, safe to restart. - */ - if (radix_tree_deref_retry(page)) { + if (radix_tree_exception(page)) { + if (radix_tree_exceptional_entry(page)) + continue; + /* + * radix_tree_deref_retry(page): + * can only trigger when entry at index 0 moves out of + * or back to root: none yet gotten, safe to restart. + */ WARN_ON(start | i); goto restart; } @@ -912,12 +918,16 @@ repeat: if (unlikely(!page)) continue; - /* - * This can only trigger when the entry at index 0 moves out - * of or back to the root: none yet gotten, safe to restart. - */ - if (radix_tree_deref_retry(page)) + if (radix_tree_exception(page)) { + if (radix_tree_exceptional_entry(page)) + break; + /* + * radix_tree_deref_retry(page): + * can only trigger when entry at index 0 moves out of + * or back to root: none yet gotten, safe to restart. + */ goto restart; + } if (!page_cache_get_speculative(page)) goto repeat; @@ -977,12 +987,15 @@ repeat: if (unlikely(!page)) continue; - /* - * This can only trigger when the entry at index 0 moves out - * of or back to the root: none yet gotten, safe to restart. - */ - if (radix_tree_deref_retry(page)) + if (radix_tree_exception(page)) { + BUG_ON(radix_tree_exceptional_entry(page)); + /* + * radix_tree_deref_retry(page): + * can only trigger when entry at index 0 moves out of + * or back to root: none yet gotten, safe to restart. + */ goto restart; + } if (!page_cache_get_speculative(page)) goto repeat; -- cgit v1.2.3 From 31475dd611209413bace21651a400afb91d0bd9d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 3 Aug 2011 16:21:27 -0700 Subject: mm: a few small updates for radix-swap Remove PageSwapBacked (!page_is_file_cache) cases from add_to_page_cache_locked() and add_to_page_cache_lru(): those pages now go through shmem_add_to_page_cache(). Remove a comment on maximum tmpfs size from fsstack_copy_inode_size(), and add a comment on swap entries to invalidate_mapping_pages(). And mincore_page() uses find_get_page() on what might be shmem or a tmpfs file: allow for a radix_tree_exceptional_entry(), and proceed to find_get_page() on swapper_space if so (oh, swapper_space needs #ifdef). Signed-off-by: Hugh Dickins Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 76bfb6460f57..96778faf82d5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -33,7 +33,6 @@ #include #include /* for BUG_ON(!in_atomic()) only */ #include -#include /* for page_is_file_cache() */ #include #include "internal.h" @@ -462,6 +461,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, int error; VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(PageSwapBacked(page)); error = mem_cgroup_cache_charge(page, current->mm, gfp_mask & GFP_RECLAIM_MASK); @@ -479,8 +479,6 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, if (likely(!error)) { mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); - if (PageSwapBacked(page)) - __inc_zone_page_state(page, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); } else { page->mapping = NULL; @@ -502,22 +500,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, { int ret; - /* - * Splice_read and readahead add shmem/tmpfs pages into the page cache - * before shmem_readpage has a chance to mark them as SwapBacked: they - * need to go on the anon lru below, and mem_cgroup_cache_charge - * (called in add_to_page_cache) needs to know where they're going too. - */ - if (mapping_cap_swap_backed(mapping)) - SetPageSwapBacked(page); - ret = add_to_page_cache(page, mapping, offset, gfp_mask); - if (ret == 0) { - if (page_is_file_cache(page)) - lru_cache_add_file(page); - else - lru_cache_add_anon(page); - } + if (ret == 0) + lru_cache_add_file(page); return ret; } EXPORT_SYMBOL_GPL(add_to_page_cache_lru); -- cgit v1.2.3 From 8079b1c859c44f27d63da4951f5038a16589a563 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 3 Aug 2011 16:21:28 -0700 Subject: mm: clarify the radix_tree exceptional cases Make the radix_tree exceptional cases, mostly in filemap.c, clearer. It's hard to devise a suitable snappy name that illuminates the use by shmem/tmpfs for swap, while keeping filemap/pagecache/radix_tree generality. And akpm points out that /* radix_tree_deref_retry(page) */ comments look like calls that have been commented out for unknown reason. Skirt the naming difficulty by rearranging these blocks to handle the transient radix_tree_deref_retry(page) case first; then just explain the remaining shmem/tmpfs swap case in a comment. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 66 ++++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 22 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 96778faf82d5..645a080ba4df 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -700,10 +700,14 @@ repeat: if (unlikely(!page)) goto out; if (radix_tree_exception(page)) { - if (radix_tree_exceptional_entry(page)) - goto out; - /* radix_tree_deref_retry(page) */ - goto repeat; + if (radix_tree_deref_retry(page)) + goto repeat; + /* + * Otherwise, shmem/tmpfs must be storing a swap entry + * here as an exceptional entry: so return it without + * attempting to raise page count. + */ + goto out; } if (!page_cache_get_speculative(page)) goto repeat; @@ -838,15 +842,21 @@ repeat: continue; if (radix_tree_exception(page)) { - if (radix_tree_exceptional_entry(page)) - continue; + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + WARN_ON(start | i); + goto restart; + } /* - * radix_tree_deref_retry(page): - * can only trigger when entry at index 0 moves out of - * or back to root: none yet gotten, safe to restart. + * Otherwise, shmem/tmpfs must be storing a swap entry + * here as an exceptional entry: so skip over it - + * we only reach this from invalidate_mapping_pages(). */ - WARN_ON(start | i); - goto restart; + continue; } if (!page_cache_get_speculative(page)) @@ -904,14 +914,20 @@ repeat: continue; if (radix_tree_exception(page)) { - if (radix_tree_exceptional_entry(page)) - break; + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + goto restart; + } /* - * radix_tree_deref_retry(page): - * can only trigger when entry at index 0 moves out of - * or back to root: none yet gotten, safe to restart. + * Otherwise, shmem/tmpfs must be storing a swap entry + * here as an exceptional entry: so stop looking for + * contiguous pages. */ - goto restart; + break; } if (!page_cache_get_speculative(page)) @@ -973,13 +989,19 @@ repeat: continue; if (radix_tree_exception(page)) { - BUG_ON(radix_tree_exceptional_entry(page)); + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + goto restart; + } /* - * radix_tree_deref_retry(page): - * can only trigger when entry at index 0 moves out of - * or back to root: none yet gotten, safe to restart. + * This function is never used on a shmem/tmpfs + * mapping, so a swap entry won't be found here. */ - goto restart; + BUG(); } if (!page_cache_get_speculative(page)) -- cgit v1.2.3