From d47992f86b307985b3215bcf141d56d1849d71df Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Tue, 21 May 2013 23:17:23 -0400
Subject: mm: change invalidatepage prototype to accept length

Currently there is no way to truncate a partial page where the end
truncate point is not at the end of the page. This is because it was not
needed and the existing functionality was enough for the file system
truncate operation to work properly. However, more file systems now support
the punch hole feature, and it can benefit from mm supporting truncating a
page just up to a certain point.

Specifically, with this functionality truncate_inode_pages_range() can
be changed so it supports truncating partial page at the end of the
range (currently it will BUG_ON() if 'end' is not at the end of the
page).

This commit changes the invalidatepage() address space operation
prototype to accept range to be invalidated and update all the instances
for it.

We also change the block_invalidatepage() in the same way and actually
make a use of the new length argument implementing range invalidation.

Actual file system implementations will follow except the file systems
where the changes are really simple and should not change the behaviour
in any way. Implementation for truncate_page_range() which will be able
to accept page unaligned ranges will follow as well.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Hugh Dickins <hughd@google.com>
---
 Documentation/filesystems/Locking |  6 +++---
 Documentation/filesystems/vfs.txt | 20 ++++++++++----------
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'Documentation/filesystems')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 0706d32a61e6..cbbac3fa0eb4 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -189,7 +189,7 @@ prototypes:
 				loff_t pos, unsigned len, unsigned copied,
 				struct page *page, void *fsdata);
 	sector_t (*bmap)(struct address_space *, sector_t);
-	int (*invalidatepage) (struct page *, unsigned long);
+	void (*invalidatepage) (struct page *, unsigned int, unsigned int);
 	int (*releasepage) (struct page *, int);
 	void (*freepage)(struct page *);
 	int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
@@ -310,8 +310,8 @@ filesystems and by the swapper. The latter will eventually go away.  Please,
 keep it that way and don't breed new callers.
 
 	->invalidatepage() is called when the filesystem must attempt to drop
-some or all of the buffers from the page when it is being truncated.  It
-returns zero on success.  If ->invalidatepage is zero, the kernel uses
+some or all of the buffers from the page when it is being truncated. It
+has no return value. If ->invalidatepage is zero, the kernel uses
 block_invalidatepage() instead.
 
 	->releasepage() is called when the kernel is about to try to drop the
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index bc4b06b3160a..e445b95a002b 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -549,7 +549,7 @@ struct address_space_operations
 -------------------------------
 
 This describes how the VFS can manipulate mapping of a file to page cache in
-your filesystem. As of kernel 2.6.22, the following members are defined:
+your filesystem. The following members are defined:
 
 struct address_space_operations {
 	int (*writepage)(struct page *page, struct writeback_control *wbc);
@@ -566,7 +566,7 @@ struct address_space_operations {
 				loff_t pos, unsigned len, unsigned copied,
 				struct page *page, void *fsdata);
 	sector_t (*bmap)(struct address_space *, sector_t);
-	int (*invalidatepage) (struct page *, unsigned long);
+	void (*invalidatepage) (struct page *, unsigned int, unsigned int);
 	int (*releasepage) (struct page *, int);
 	void (*freepage)(struct page *);
 	ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
@@ -685,14 +685,14 @@ struct address_space_operations {
   invalidatepage: If a page has PagePrivate set, then invalidatepage
         will be called when part or all of the page is to be removed
 	from the address space.  This generally corresponds to either a
-	truncation or a complete invalidation of the address space
-	(in the latter case 'offset' will always be 0).
-	Any private data associated with the page should be updated
-	to reflect this truncation.  If offset is 0, then
-	the private data should be released, because the page
-	must be able to be completely discarded.  This may be done by
-        calling the ->releasepage function, but in this case the
-        release MUST succeed.
+	truncation, punch hole or a complete invalidation of the address
+	space (in the latter case 'offset' will always be 0 and 'length'
+	will be PAGE_CACHE_SIZE). Any private data associated with the page
+	should be updated to reflect this truncation.  If offset is 0 and
+	length is PAGE_CACHE_SIZE, then the private data should be released,
+	because the page must be able to be completely discarded.  This may
+	be done by calling the ->releasepage function, but in this case the
+	release MUST succeed.
 
   releasepage: releasepage is called on PagePrivate pages to indicate
         that the page should be freed if possible.  ->releasepage
-- 
cgit v1.2.3


From f884ab15afdc5514e88105c92a4e2e1e6539869a Mon Sep 17 00:00:00 2001
From: Anatol Pomozov <anatol.pomozov@gmail.com>
Date: Wed, 8 May 2013 16:56:16 -0700
Subject: doc: fix misspellings with 'codespell' tool

Signed-off-by: Anatol Pomozov <anatol.pomozov@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 Documentation/filesystems/jfs.txt  | 2 +-
 Documentation/filesystems/qnx6.txt | 2 +-
 Documentation/filesystems/vfat.txt | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'Documentation/filesystems')

diff --git a/Documentation/filesystems/jfs.txt b/Documentation/filesystems/jfs.txt
index f7433355394a..41fd757997b3 100644
--- a/Documentation/filesystems/jfs.txt
+++ b/Documentation/filesystems/jfs.txt
@@ -42,7 +42,7 @@ nodiscard(*)	block device when blocks are freed. This is useful for SSD
 		devices and sparse/thinly-provisioned LUNs.  The FITRIM ioctl
 		command is also available together with the nodiscard option.
 		The value of minlen specifies the minimum blockcount, when
-		a TRIM command to the block device is considered usefull.
+		a TRIM command to the block device is considered useful.
 		When no value is given to the discard option, it defaults to
 		64 blocks, which means 256KiB in JFS.
 		The minlen value of discard overrides the minlen value given
diff --git a/Documentation/filesystems/qnx6.txt b/Documentation/filesystems/qnx6.txt
index e59f2f09f56e..99e90184a72f 100644
--- a/Documentation/filesystems/qnx6.txt
+++ b/Documentation/filesystems/qnx6.txt
@@ -148,7 +148,7 @@ smaller than addressing space in the bitmap.
 Bitmap system area
 ------------------
 
-The bitmap itself is devided into three parts.
+The bitmap itself is divided into three parts.
 First the system area, that is split into two halfs.
 Then userspace.
 
diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt
index 4a93e98b290a..aa1f459fa6cf 100644
--- a/Documentation/filesystems/vfat.txt
+++ b/Documentation/filesystems/vfat.txt
@@ -307,7 +307,7 @@ the following:
 
                 <proceeding files...>
                 <slot #3, id = 0x43, characters = "h is long">
-                <slot #2, id = 0x02, characters = "xtension whic">
+                <slot #2, id = 0x02, characters = "xtension whic">
                 <slot #1, id = 0x01, characters = "My Big File.E">
                 <directory entry, name = "MYBIGFIL.EXT">
 
-- 
cgit v1.2.3


From d3eaace84e40bf946129e516dcbd617173c1cf14 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 5 Jun 2013 12:09:09 +1000
Subject: xfs: disable noattr2/attr2 mount options for CRC enabled filesystems

attr2 format is always enabled for v5 superblock filesystems, so the
mount options to enable or disable it need to cause mount errors.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 Documentation/filesystems/xfs.txt | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'Documentation/filesystems')

diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
index 3e4b3dd1e046..83577f0232a0 100644
--- a/Documentation/filesystems/xfs.txt
+++ b/Documentation/filesystems/xfs.txt
@@ -33,6 +33,9 @@ When mounting an XFS filesystem, the following options are accepted.
 	removing extended attributes) the on-disk superblock feature
 	bit field will be updated to reflect this format being in use.
 
+	CRC enabled filesystems always use the attr2 format, and so
+	will reject the noattr2 mount option if it is set.
+
   barrier
 	Enables the use of block layer write barriers for writes into
 	the journal and unwritten extent conversion.  This allows for
-- 
cgit v1.2.3


From 696c018c7718f5e33e1107da19c4d64a25018878 Mon Sep 17 00:00:00 2001
From: Namjae Jeon <namjae.jeon@samsung.com>
Date: Sun, 16 Jun 2013 09:48:48 +0900
Subject: f2fs: add remount_fs callback support

Add the f2fs_remount function call which will be used
during the filesystem remounting. This function
will help us to change the mount options specific to
f2fs.

Also modify the f2fs background_gc mount option, which
will allow the user to dynamically turn on/off the
garbage collection in f2fs based on the background_gc
value. If background_gc=on, garbage collection will
be turned on & if background_gc=off, garbage collection
will be turned off.

By default the garbage collection is on in f2fs.

Change Log:
v2: Incorporated the review comments by Gu Zheng.
    Removing the restore part for VFS flags
    Updating comments with proper flag conditions
    Display GC background option as ON/OFF
    Revised conditions to stop GC in case of remount

v1: Initial changes for adding remount_fs callback
support.

Cc: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
Signed-off-by: Pankaj Kumar <pankaj.km@samsung.com>
Reviewed-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
[Jaegeuk Kim: change /** with /* for the coding style]
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 Documentation/filesystems/f2fs.txt | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'Documentation/filesystems')

diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
index bd3c56c67380..b91e2f26b672 100644
--- a/Documentation/filesystems/f2fs.txt
+++ b/Documentation/filesystems/f2fs.txt
@@ -98,8 +98,13 @@ Cleaning Overhead
 MOUNT OPTIONS
 ================================================================================
 
-background_gc_off      Turn off cleaning operations, namely garbage collection,
-		       triggered in background when I/O subsystem is idle.
+background_gc=%s       Turn on/off cleaning operations, namely garbage
+                       collection, triggered in background when I/O subsystem is
+                       idle. If background_gc=on, it will turn on the garbage
+                       collection and if background_gc=off, garbage collection
+                       will be turned off.
+                       Default value for this option is on. So garbage
+                       collection is on by default.
 disable_roll_forward   Disable the roll-forward recovery routine
 discard                Issue discard/TRIM commands when a segment is cleaned.
 no_heap                Disable heap-style segment allocation which finds free
-- 
cgit v1.2.3


From 5c0ba4e0762e6dabd14a5c276652e2defec38de7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 15 May 2013 13:52:59 -0400
Subject: [readdir] introduce iterate_dir() and dir_context

iterate_dir(): new helper, replacing vfs_readdir().

struct dir_context: contains the readdir callback (and will get more stuff
in it), embedded into whatever data that callback wants to deal with;
eventually, we'll be passing it to ->readdir() replacement instead of
(data,filldir) pair.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'Documentation/filesystems')

diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 4db22f6491e0..85a4a033bae7 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -445,3 +445,6 @@ object doesn't exist.  It's remote/distributed ones that might care...
 [mandatory]
 	FS_REVAL_DOT is gone; if you used to have it, add ->d_weak_revalidate()
 in your dentry operations instead.
+--
+[mandatory]
+	vfs_readdir() is gone; switch to iterate_dir() instead
-- 
cgit v1.2.3


From 2233f31aade393641f0eaed43a71110e629bb900 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 22 May 2013 21:44:23 -0400
Subject: [readdir] ->readdir() is gone

everything's converted to ->iterate()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/Locking | 2 +-
 Documentation/filesystems/porting | 3 +++
 Documentation/filesystems/vfs.txt | 4 ++--
 3 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'Documentation/filesystems')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 0706d32a61e6..bdd82b2339d9 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -414,7 +414,7 @@ prototypes:
 	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
 	ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
 	ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
-	int (*readdir) (struct file *, void *, filldir_t);
+	int (*iterate) (struct file *, struct dir_context *);
 	unsigned int (*poll) (struct file *, struct poll_table_struct *);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 85a4a033bae7..206a1bdc7321 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -448,3 +448,6 @@ in your dentry operations instead.
 --
 [mandatory]
 	vfs_readdir() is gone; switch to iterate_dir() instead
+--
+[mandatory]
+	->readdir() is gone now; switch to ->iterate()
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index bc4b06b3160a..4a35f6614a66 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -777,7 +777,7 @@ struct file_operations {
 	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
 	ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
 	ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
-	int (*readdir) (struct file *, void *, filldir_t);
+	int (*iterate) (struct file *, struct dir_context *);
 	unsigned int (*poll) (struct file *, struct poll_table_struct *);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
@@ -815,7 +815,7 @@ otherwise noted.
 
   aio_write: called by io_submit(2) and other asynchronous I/O operations
 
-  readdir: called when the VFS needs to read the directory contents
+  iterate: called when the VFS needs to read the directory contents
 
   poll: called by the VFS when a process wants to check if there is
 	activity on this file and (optionally) go to sleep until there
-- 
cgit v1.2.3


From da53be12bbb4fabbe2e9f6f908de0cf478b5161d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 21 May 2013 15:22:44 -0700
Subject: Don't pass inode to ->d_hash() and ->d_compare()

Instances either don't look at it at all (the majority of cases) or
only want it to find the superblock (which can be had as dentry->d_sb).
A few cases that want more are actually safe with dentry->d_inode -
the only precaution needed is the check that it hadn't been replaced with
NULL by rmdir() or by overwriting rename(), in which case it should simply
be treated as a cache miss.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/Locking |  6 ++----
 Documentation/filesystems/vfs.txt | 19 ++++++++-----------
 2 files changed, 10 insertions(+), 15 deletions(-)

(limited to 'Documentation/filesystems')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index bdd82b2339d9..f94a362f408e 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -11,10 +11,8 @@ be able to use diff(1).
 prototypes:
 	int (*d_revalidate)(struct dentry *, unsigned int);
 	int (*d_weak_revalidate)(struct dentry *, unsigned int);
-	int (*d_hash)(const struct dentry *, const struct inode *,
-			struct qstr *);
-	int (*d_compare)(const struct dentry *, const struct inode *,
-			const struct dentry *, const struct inode *,
+	int (*d_hash)(const struct dentry *, struct qstr *);
+	int (*d_compare)(const struct dentry *, const struct dentry *,
 			unsigned int, const char *, const struct qstr *);
 	int (*d_delete)(struct dentry *);
 	void (*d_release)(struct dentry *);
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 4a35f6614a66..51ba44e3fc40 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -901,10 +901,8 @@ defined:
 struct dentry_operations {
 	int (*d_revalidate)(struct dentry *, unsigned int);
 	int (*d_weak_revalidate)(struct dentry *, unsigned int);
-	int (*d_hash)(const struct dentry *, const struct inode *,
-			struct qstr *);
-	int (*d_compare)(const struct dentry *, const struct inode *,
-			const struct dentry *, const struct inode *,
+	int (*d_hash)(const struct dentry *, struct qstr *);
+	int (*d_compare)(const struct dentry *, const struct dentry *,
 			unsigned int, const char *, const struct qstr *);
 	int (*d_delete)(const struct dentry *);
 	void (*d_release)(struct dentry *);
@@ -949,25 +947,24 @@ struct dentry_operations {
 
   d_hash: called when the VFS adds a dentry to the hash table. The first
 	dentry passed to d_hash is the parent directory that the name is
-	to be hashed into. The inode is the dentry's inode.
+	to be hashed into.
 
 	Same locking and synchronisation rules as d_compare regarding
 	what is safe to dereference etc.
 
   d_compare: called to compare a dentry name with a given name. The first
 	dentry is the parent of the dentry to be compared, the second is
-	the parent's inode, then the dentry and inode (may be NULL) of the
-	child dentry. len and name string are properties of the dentry to be
-	compared. qstr is the name to compare it with.
+	the child dentry. len and name string are properties of the dentry
+	to be compared. qstr is the name to compare it with.
 
 	Must be constant and idempotent, and should not take locks if
-	possible, and should not or store into the dentry or inodes.
-	Should not dereference pointers outside the dentry or inodes without
+	possible, and should not store into the dentry.
+	Should not dereference pointers outside the dentry without
 	lots of care (eg.  d_parent, d_inode, d_name should not be used).
 
 	However, our vfsmount is pinned, and RCU held, so the dentries and
 	inodes won't disappear, neither will our sb or filesystem module.
-	->i_sb and ->d_sb may be used.
+	->d_sb may be used.
 
 	It is a tricky calling convention because it needs to be called under
 	"rcu-walk", ie. without any locks or references on things.
-- 
cgit v1.2.3


From 1c8c601a8c0dc59fe64907dcd9d512a3d181ddc7 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 21 Jun 2013 08:58:15 -0400
Subject: locks: protect most of the file_lock handling with i_lock

Having a global lock that protects all of this code is a clear
scalability problem. Instead of doing that, move most of the code to be
protected by the i_lock instead. The exceptions are the global lists
that the ->fl_link sits on, and the ->fl_block list.

->fl_link is what connects these structures to the
global lists, so we must ensure that we hold those locks when iterating
over or updating these lists.

Furthermore, sound deadlock detection requires that we hold the
blocked_list state steady while checking for loops. We also must ensure
that the search and update to the list are atomic.

For the checking and insertion side of the blocked_list, push the
acquisition of the global lock into __posix_lock_file and ensure that
checking and update of the blocked_list is done without dropping the
lock in between.

On the removal side, when waking up blocked lock waiters, take the
global lock before walking the blocked list and dequeue the waiters from
the global list prior to removal from the fl_block list.

With this, deadlock detection should be race free while we minimize
excessive file_lock_lock thrashing.

Finally, in order to avoid a lock inversion problem when handling
/proc/locks output we must ensure that manipulations of the fl_block
list are also protected by the file_lock_lock.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/Locking | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'Documentation/filesystems')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index f94a362f408e..c2963a74fbc3 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -342,7 +342,7 @@ prototypes:
 
 
 locking rules:
-			file_lock_lock	may block
+			inode->i_lock	may block
 fl_copy_lock:		yes		no
 fl_release_private:	maybe		no
 
@@ -355,12 +355,19 @@ prototypes:
 	int (*lm_change)(struct file_lock **, int);
 
 locking rules:
-			file_lock_lock	may block
-lm_compare_owner:	yes		no
-lm_notify:		yes		no
-lm_grant:		no		no
-lm_break:		yes		no
-lm_change		yes		no
+
+			inode->i_lock	file_lock_lock	may block
+lm_compare_owner:	yes[1]		maybe		no
+lm_notify:		yes		yes		no
+lm_grant:		no		no		no
+lm_break:		yes		no		no
+lm_change		yes		no		no
+
+[1]:	->lm_compare_owner is generally called with *an* inode->i_lock held. It
+may not be the i_lock of the inode for either file_lock being compared! This is
+the case with deadlock detection, since the code has to chase down the owners
+of locks that may be entirely unrelated to the one on which the lock is being
+acquired. When doing a search for deadlocks, the file_lock_lock is also held.
 
 --------------------------- buffer_head -----------------------------------
 prototypes:
-- 
cgit v1.2.3


From 3999e49364193f7dbbba66e2be655fe91ba1fced Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 21 Jun 2013 08:58:19 -0400
Subject: locks: add a new "lm_owner_key" lock operation

Currently, the hashing that the locking code uses to add these values
to the blocked_hash is simply calculated using fl_owner field. That's
valid in most cases except for server-side lockd, which validates the
owner of a lock based on fl_owner and fl_pid.

In the case where you have a small number of NFS clients doing a lot
of locking between different processes, you could end up with all
the blocked requests sitting in a very small number of hash buckets.

Add a new lm_owner_key operation to the lock_manager_operations that
will generate an unsigned long to use as the key in the hashtable.
That function is only implemented for server-side lockd, and simply
XORs the fl_owner and fl_pid.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Acked-by: J. Bruce Fields <bfields@fieldses.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/Locking | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'Documentation/filesystems')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index c2963a74fbc3..2db7c9e492e9 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -349,6 +349,7 @@ fl_release_private:	maybe		no
 ----------------------- lock_manager_operations ---------------------------
 prototypes:
 	int (*lm_compare_owner)(struct file_lock *, struct file_lock *);
+	unsigned long (*lm_owner_key)(struct file_lock *);
 	void (*lm_notify)(struct file_lock *);  /* unblock callback */
 	int (*lm_grant)(struct file_lock *, struct file_lock *, int);
 	void (*lm_break)(struct file_lock *); /* break_lease callback */
@@ -358,16 +359,21 @@ locking rules:
 
 			inode->i_lock	file_lock_lock	may block
 lm_compare_owner:	yes[1]		maybe		no
+lm_owner_key		yes[1]		yes		no
 lm_notify:		yes		yes		no
 lm_grant:		no		no		no
 lm_break:		yes		no		no
 lm_change		yes		no		no
 
-[1]:	->lm_compare_owner is generally called with *an* inode->i_lock held. It
-may not be the i_lock of the inode for either file_lock being compared! This is
-the case with deadlock detection, since the code has to chase down the owners
-of locks that may be entirely unrelated to the one on which the lock is being
-acquired. When doing a search for deadlocks, the file_lock_lock is also held.
+[1]:	->lm_compare_owner and ->lm_owner_key are generally called with
+*an* inode->i_lock held. It may not be the i_lock of the inode
+associated with either file_lock argument! This is the case with deadlock
+detection, since the code has to chase down the owners of locks that may
+be entirely unrelated to the one on which the lock is being acquired.
+For deadlock detection however, the file_lock_lock is also held. The
+fact that these locks are held ensures that the file_locks do not
+disappear out from under you while doing the comparison or generating an
+owner key.
 
 --------------------------- buffer_head -----------------------------------
 prototypes:
-- 
cgit v1.2.3


From 7b2296afb392bc21a50f42e7c7f4b19d3fea8c6d Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 21 Jun 2013 08:58:20 -0400
Subject: locks: give the blocked_hash its own spinlock

There's no reason we have to protect the blocked_hash and file_lock_list
with the same spinlock. With the tests I have, breaking it in two gives
a barely measurable performance benefit, but it seems reasonable to make
this locking as granular as possible.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/Locking | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'Documentation/filesystems')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 2db7c9e492e9..7d9ca7a83fcc 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -357,20 +357,20 @@ prototypes:
 
 locking rules:
 
-			inode->i_lock	file_lock_lock	may block
-lm_compare_owner:	yes[1]		maybe		no
-lm_owner_key		yes[1]		yes		no
-lm_notify:		yes		yes		no
-lm_grant:		no		no		no
-lm_break:		yes		no		no
-lm_change		yes		no		no
+			inode->i_lock	blocked_lock_lock	may block
+lm_compare_owner:	yes[1]		maybe			no
+lm_owner_key		yes[1]		yes			no
+lm_notify:		yes		yes			no
+lm_grant:		no		no			no
+lm_break:		yes		no			no
+lm_change		yes		no			no
 
 [1]:	->lm_compare_owner and ->lm_owner_key are generally called with
 *an* inode->i_lock held. It may not be the i_lock of the inode
 associated with either file_lock argument! This is the case with deadlock
 detection, since the code has to chase down the owners of locks that may
 be entirely unrelated to the one on which the lock is being acquired.
-For deadlock detection however, the file_lock_lock is also held. The
+For deadlock detection however, the blocked_lock_lock is also held. The
 fact that these locks are held ensures that the file_locks do not
 disappear out from under you while doing the comparison or generating an
 owner key.
-- 
cgit v1.2.3


From 48bde8d3620f5f3c6ae9ff599eb404055ae51664 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 3 Jul 2013 16:19:23 +0400
Subject: Document ->tmpfile()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/Locking | 2 ++
 Documentation/filesystems/vfs.txt | 4 ++++
 2 files changed, 6 insertions(+)

(limited to 'Documentation/filesystems')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 7d9ca7a83fcc..e95d3131309e 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -64,6 +64,7 @@ prototypes:
 	int (*atomic_open)(struct inode *, struct dentry *,
 				struct file *, unsigned open_flag,
 				umode_t create_mode, int *opened);
+	int (*tmpfile) (struct inode *, struct dentry *, umode_t);
 
 locking rules:
 	all may block
@@ -91,6 +92,7 @@ removexattr:	yes
 fiemap:		no
 update_time:	no
 atomic_open:	yes
+tmpfile:	no
 
 	Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
 victim.
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 51ba44e3fc40..aeff462c7228 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -360,6 +360,7 @@ struct inode_operations {
 	int (*removexattr) (struct dentry *, const char *);
 	void (*update_time)(struct inode *, struct timespec *, int);
 	int (*atomic_open)(struct inode *, struct dentry *,
 				struct file *, unsigned open_flag,
 				umode_t create_mode, int *opened);
+	int (*tmpfile) (struct inode *, struct dentry *, umode_t);
 };
@@ -472,6 +473,9 @@ otherwise noted.
   	component is negative or needs lookup.  Cached positive dentries are
   	still handled by f_op->open().
 
+  tmpfile: called at the end of O_TMPFILE open().  Optional, equivalent to
+	atomically creating, opening and unlinking a file in given directory.
+
 The Address Space Object
 ========================
 
-- 
cgit v1.2.3


From 0f8975ec4db2c8b5bd111b211292ca9be0feb6b8 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Wed, 3 Jul 2013 15:01:20 -0700
Subject: mm: soft-dirty bits for user memory changes tracking

The soft-dirty is a bit on a PTE which helps to track which pages a task
writes to.  In order to do this tracking one should

  1. Clear soft-dirty bits from PTEs ("echo 4 > /proc/PID/clear_refs")
  2. Wait some time.
  3. Read soft-dirty bits (55'th in /proc/PID/pagemap2 entries)

To do this tracking, the writable bit is cleared from PTEs when the
soft-dirty bit is.  Thus, after this, when the task tries to modify a
page at some virtual address the #PF occurs and the kernel sets the
soft-dirty bit on the respective PTE.

Note, that although all the task's address space is marked as r/o after
the soft-dirty bits clear, the #PF-s that occur after that are processed
fast.  This is so, since the pages are still mapped to physical memory,
and thus all the kernel does is find this fact out and put back the
writable, dirty and soft-dirty bits on the PTE.

Another thing to note, is that when mremap moves PTEs they are marked
with soft-dirty as well, since from the user perspective mremap modifies
the virtual memory at mremap's new address.

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Glauber Costa <glommer@parallels.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/proc.txt | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'Documentation/filesystems')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index fd8d0d594fc7..fcc22c982a25 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -473,7 +473,8 @@ This file is only present if the CONFIG_MMU kernel configuration option is
 enabled.
 
 The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG
-bits on both physical and virtual pages associated with a process.
+bits on both physical and virtual pages associated with a process, and the
+soft-dirty bit on pte (see Documentation/vm/soft-dirty.txt for details).
 To clear the bits for all the pages associated with the process
     > echo 1 > /proc/PID/clear_refs
 
@@ -482,6 +483,10 @@ To clear the bits for the anonymous pages associated with the process
 
 To clear the bits for the file mapped pages associated with the process
     > echo 3 > /proc/PID/clear_refs
+
+To clear the soft-dirty bit
+    > echo 4 > /proc/PID/clear_refs
+
 Any other value written to /proc/PID/clear_refs will have no effect.
 
 The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags
-- 
cgit v1.2.3


From 26c0c5bf38159673f0ae28c38fc9f90dbeb4d4aa Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 3 Jul 2013 15:04:45 -0700
Subject: documentation: update address_space_operations

The documentation for address_space_operations is partially out of date.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/vfs.txt | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'Documentation/filesystems')

diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 1f0ba30ae47e..fc5d2a1d26c0 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -559,7 +559,6 @@ your filesystem. The following members are defined:
 struct address_space_operations {
 	int (*writepage)(struct page *page, struct writeback_control *wbc);
 	int (*readpage)(struct file *, struct page *);
-	int (*sync_page)(struct page *);
 	int (*writepages)(struct address_space *, struct writeback_control *);
 	int (*set_page_dirty)(struct page *page);
 	int (*readpages)(struct file *filp, struct address_space *mapping,
@@ -581,6 +580,8 @@ struct address_space_operations {
 	/* migrate the contents of a page to the specified target */
 	int (*migratepage) (struct page *, struct page *);
 	int (*launder_page) (struct page *);
+	int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
+					unsigned long);
 	int (*error_remove_page) (struct mapping *mapping, struct page *page);
 	int (*swap_activate)(struct file *);
 	int (*swap_deactivate)(struct file *);
@@ -612,13 +613,6 @@ struct address_space_operations {
        In this case, the page will be relocated, relocked and if
        that all succeeds, ->readpage will be called again.
 
-  sync_page: called by the VM to notify the backing store to perform all
-  	queued I/O operations for a page. I/O operations for other pages
-	associated with this address_space object may also be performed.
-
-	This function is optional and is called only for pages with
-  	PG_Writeback set while waiting for the writeback to complete.
-
   writepages: called by the VM to write out pages associated with the
   	address_space object.  If wbc->sync_mode is WBC_SYNC_ALL, then
   	the writeback_control will specify a range of pages that must be
@@ -747,6 +741,11 @@ struct address_space_operations {
   	prevent redirtying the page, it is kept locked during the whole
 	operation.
 
+  is_partially_uptodate: Called by the VM when reading a file through the
+	pagecache when the underlying blocksize != pagesize. If the required
+	block is up to date then the read can complete without needing the IO
+	to bring the whole page up to date.
+
   error_remove_page: normally set to generic_error_remove_page if truncation
 	is ok for this address space. Used for memory failure handling.
 	Setting this implies you deal with pages going away under you,
-- 
cgit v1.2.3


From 543cc115339baa44fbea877b3d8673aca652622f Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 3 Jul 2013 15:04:46 -0700
Subject: documentation: document the is_dirty_writeback aops callback

Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/vfs.txt | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'Documentation/filesystems')

diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index fc5d2a1d26c0..f93a88250a44 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -582,6 +582,7 @@ struct address_space_operations {
 	int (*launder_page) (struct page *);
 	int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
 					unsigned long);
+	void (*is_dirty_writeback) (struct page *, bool *, bool *);
 	int (*error_remove_page) (struct mapping *mapping, struct page *page);
 	int (*swap_activate)(struct file *);
 	int (*swap_deactivate)(struct file *);
@@ -746,6 +747,15 @@ struct address_space_operations {
 	block is up to date then the read can complete without needing the IO
 	to bring the whole page up to date.
 
+  is_dirty_writeback: Called by the VM when attempting to reclaim a page.
+	The VM uses dirty and writeback information to determine if it needs
+	to stall to allow flushers a chance to complete some IO. Ordinarily
+	it can use PageDirty and PageWriteback but some filesystems have
+	more complex state (unstable pages in NFS prevent reclaim) or
+	do not set those flags due to locking problems (jbd). This callback
+	allows a filesystem to indicate to the VM if a page should be
+	treated as dirty or writeback for the purposes of stalling.
+
   error_remove_page: normally set to generic_error_remove_page if truncation
 	is ok for this address space. Used for memory failure handling.
 	Setting this implies you deal with pages going away under you,
-- 
cgit v1.2.3


From 3e5b7d8b491c3710b7e007eab0a643f923932e3d Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 10 Jul 2013 07:03:59 +1000
Subject: xfs: update mount options documentation

Because it's horribly out of date.

And mark various deprecated options as deprecated and give them a
removal date.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 Documentation/filesystems/xfs.txt | 317 +++++++++++++++++++++++++-------------
 1 file changed, 209 insertions(+), 108 deletions(-)

(limited to 'Documentation/filesystems')

diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
index 83577f0232a0..12525b17d9ed 100644
--- a/Documentation/filesystems/xfs.txt
+++ b/Documentation/filesystems/xfs.txt
@@ -18,6 +18,8 @@ Mount Options
 =============
 
 When mounting an XFS filesystem, the following options are accepted.
+For boolean mount options, the names with the (*) suffix are the
+default behaviour.
 
   allocsize=size
 	Sets the buffered I/O end-of-file preallocation size when
@@ -25,97 +27,128 @@ When mounting an XFS filesystem, the following options are accepted.
 	Valid values for this option are page size (typically 4KiB)
 	through to 1GiB, inclusive, in power-of-2 increments.
 
-  attr2/noattr2
-	The options enable/disable (default is disabled for backward
-	compatibility on-disk) an "opportunistic" improvement to be
-	made in the way inline extended attributes are stored on-disk.
-	When the new form is used for the first time (by setting or
-	removing extended attributes) the on-disk superblock feature
-	bit field will be updated to reflect this format being in use.
+	The default behaviour is for dynamic end-of-file
+	preallocation size, which uses a set of heuristics to
+	optimise the preallocation size based on the current
+	allocation patterns within the file and the access patterns
+	to the file. Specifying a fixed allocsize value turns off
+	the dynamic behaviour.
+
+  attr2
+  noattr2
+	The options enable/disable an "opportunistic" improvement to
+	be made in the way inline extended attributes are stored
+	on-disk.  When the new form is used for the first time when
+	attr2 is selected (either when setting or removing extended
+	attributes) the on-disk superblock feature bit field will be
+	updated to reflect this format being in use.
+
+	The default behaviour is determined by the on-disk feature
+	bit indicating that attr2 behaviour is active. If either
+	mount option is set, then that becomes the new default used
+	by the filesystem.
 
 	CRC enabled filesystems always use the attr2 format, and so
 	will reject the noattr2 mount option if it is set.
 
-  barrier
-	Enables the use of block layer write barriers for writes into
-	the journal and unwritten extent conversion.  This allows for
-	drive level write caching to be enabled, for devices that
-	support write barriers.
+  barrier (*)
+  nobarrier
+	Enables/disables the use of block layer write barriers for
+	writes into the journal and for data integrity operations.
+	This allows for drive level write caching to be enabled, for
+	devices that support write barriers.
 
   discard
-	Issue command to let the block device reclaim space freed by the
-	filesystem.  This is useful for SSD devices, thinly provisioned
-	LUNs and virtual machine images, but may have a performance
-	impact.
-
-  dmapi
-	Enable the DMAPI (Data Management API) event callouts.
-	Use with the "mtpt" option.
-
-  grpid/bsdgroups and nogrpid/sysvgroups
-	These options define what group ID a newly created file gets.
-	When grpid is set, it takes the group ID of the directory in
-	which it is created; otherwise (the default) it takes the fsgid
-	of the current process, unless the directory has the setgid bit
-	set, in which case it takes the gid from the parent directory,
-	and also gets the setgid bit set if it is a directory itself.
-
-  ihashsize=value
-	In memory inode hashes have been removed, so this option has
-	no function as of August 2007. Option is deprecated.
-
-  ikeep/noikeep
-	When ikeep is specified, XFS does not delete empty inode clusters
-	and keeps them around on disk. ikeep is the traditional XFS
-	behaviour. When noikeep is specified, empty inode clusters
-	are returned to the free space pool. The default is noikeep for
-	non-DMAPI mounts, while ikeep is the default when DMAPI is in use.
-
-  inode64
-	Indicates that XFS is allowed to create inodes at any location
-	in the filesystem, including those which will result in inode
-	numbers occupying more than 32 bits of significance.  This is
-	the default allocation option. Applications which do not handle
-	inode numbers bigger than 32 bits, should use inode32 option.
+  nodiscard (*)
+	Enable/disable the issuing of commands to let the block
+	device reclaim space freed by the filesystem.  This is
+	useful for SSD devices, thinly provisioned LUNs and virtual
+	machine images, but may have a performance impact.
+
+	Note: It is currently recommended that you use the fstrim
+	application to discard unused blocks rather than the discard
+	mount option because the performance impact of this option
+	is quite severe.
+
+  grpid/bsdgroups
+  nogrpid/sysvgroups (*)
+	These options define what group ID a newly created file
+	gets.  When grpid is set, it takes the group ID of the
+	directory in which it is created; otherwise it takes the
+	fsgid of the current process, unless the directory has the
+	setgid bit set, in which case it takes the gid from the
+	parent directory, and also gets the setgid bit set if it is
+	a directory itself.
+
+  filestreams
+	Make the data allocator use the filestreams allocation mode
+	across the entire filesystem rather than just on directories
+	configured to use it.
+
+  ikeep
+  noikeep (*)
+	When ikeep is specified, XFS does not delete empty inode
+	clusters and keeps them around on disk.  When noikeep is
+	specified, empty inode clusters are returned to the free
+	space pool.
 
   inode32
-	Indicates that XFS is limited to create inodes at locations which
-	will not result in inode numbers with more than 32 bits of
-	significance. This is provided for backwards compatibility, since
-	64 bits inode numbers might cause problems for some applications
-	that cannot handle large inode numbers.
-
-  largeio/nolargeio
+  inode64 (*)
+	When inode32 is specified, it indicates that XFS limits
+	inode creation to locations which will not result in inode
+	numbers with more than 32 bits of significance.
+
+	When inode64 is specified, it indicates that XFS is allowed
+	to create inodes at any location in the filesystem,
+	including those which will result in inode numbers occupying
+	more than 32 bits of significance. 
+
+	inode32 is provided for backwards compatibility with older
+	systems and applications, since 64 bits inode numbers might
+	cause problems for some applications that cannot handle
+	large inode numbers.  If applications are in use which do
+	not handle inode numbers bigger than 32 bits, the inode32
+	option should be specified.
+
+
+  largeio
+  nolargeio (*)
 	If "nolargeio" is specified, the optimal I/O reported in
-	st_blksize by stat(2) will be as small as possible to allow user
-	applications to avoid inefficient read/modify/write I/O.
-	If "largeio" specified, a filesystem that has a "swidth" specified
-	will return the "swidth" value (in bytes) in st_blksize. If the
-	filesystem does not have a "swidth" specified but does specify
-	an "allocsize" then "allocsize" (in bytes) will be returned
-	instead.
-	If neither of these two options are specified, then filesystem
-	will behave as if "nolargeio" was specified.
+	st_blksize by stat(2) will be as small as possible to allow
+	user applications to avoid inefficient read/modify/write
+	I/O.  This is typically the page size of the machine, as
+	this is the granularity of the page cache.
+
+	If "largeio" is specified, a filesystem that was created with a
+	"swidth" specified will return the "swidth" value (in bytes)
+	in st_blksize. If the filesystem does not have a "swidth"
+	specified but does specify an "allocsize" then "allocsize"
+	(in bytes) will be returned instead. Otherwise the behaviour
+	is the same as if "nolargeio" was specified.
 
   logbufs=value
-	Set the number of in-memory log buffers.  Valid numbers range
-	from 2-8 inclusive.
-	The default value is 8 buffers for filesystems with a
-	blocksize of 64KiB, 4 buffers for filesystems with a blocksize
-	of 32KiB, 3 buffers for filesystems with a blocksize of 16KiB
-	and 2 buffers for all other configurations.  Increasing the
-	number of buffers may increase performance on some workloads
-	at the cost of the memory used for the additional log buffers
-	and their associated control structures.
+	Set the number of in-memory log buffers.  Valid numbers
+	range from 2-8 inclusive.
+
+	The default value is 8 buffers.
+
+	If the memory cost of 8 log buffers is too high on small
+	systems, then it may be reduced at some cost to performance
+	on metadata intensive workloads. The logbsize option below
+	controls the size of each buffer and so is also relevant to
+	this case.
 
   logbsize=value
-	Set the size of each in-memory log buffer.
-	Size may be specified in bytes, or in kilobytes with a "k" suffix.
-	Valid sizes for version 1 and version 2 logs are 16384 (16k) and
-	32768 (32k).  Valid sizes for version 2 logs also include
-	65536 (64k), 131072 (128k) and 262144 (256k).
-	The default value for machines with more than 32MiB of memory
-	is 32768, machines with less memory use 16384 by default.
+	Set the size of each in-memory log buffer.  The size may be
+	specified in bytes, or in kilobytes with a "k" suffix.
+	Valid sizes for version 1 and version 2 logs are 16384 (16k)
+	and 32768 (32k).  Valid sizes for version 2 logs also
+	include 65536 (64k), 131072 (128k) and 262144 (256k). The
+	logbsize must be an integer multiple of the log
+	stripe unit configured at mkfs time.
+
+	The default value for version 1 logs is 32768, while the
+	default value for version 2 logs is MAX(32768, log_sunit).
 
   logdev=device and rtdev=device
 	Use an external log (metadata journal) and/or real-time device.
@@ -124,16 +157,11 @@ When mounting an XFS filesystem, the following options are accepted.
 	optional, and the log section can be separate from the data
 	section or contained within it.
 
-  mtpt=mountpoint
-	Use with the "dmapi" option.  The value specified here will be
-	included in the DMAPI mount event, and should be the path of
-	the actual mountpoint that is used.
-
   noalign
-	Data allocations will not be aligned at stripe unit boundaries.
-
-  noatime
-	Access timestamps are not updated when a file is read.
+	Data allocations will not be aligned at stripe unit
+	boundaries. This is only relevant to filesystems created
+	with non-zero data alignment parameters (sunit, swidth) by
+	mkfs.
 
   norecovery
 	The filesystem will be mounted without running log recovery.
@@ -144,8 +172,14 @@ When mounting an XFS filesystem, the following options are accepted.
 	the mount will fail.
 
   nouuid
-	Don't check for double mounted file systems using the file system uuid.
-	This is useful to mount LVM snapshot volumes.
+	Don't check for double mounted file systems using the file
+	system uuid.  This is useful to mount LVM snapshot volumes,
+	and often used in combination with "norecovery" for mounting
+	read-only snapshots.
+
+  noquota
+	Forcibly turns off all quota accounting and enforcement
+	within the filesystem.
 
   uquota/usrquota/uqnoenforce/quota
 	User disk quota accounting enabled, and limits (optionally)
@@ -160,24 +194,64 @@ When mounting an XFS filesystem, the following options are accepted.
 	enforced.  Refer to xfs_quota(8) for further details.
 
   sunit=value and swidth=value
-	Used to specify the stripe unit and width for a RAID device or
-	a stripe volume.  "value" must be specified in 512-byte block
-	units.
-	If this option is not specified and the filesystem was made on
-	a stripe volume or the stripe width or unit were specified for
-	the RAID device at mkfs time, then the mount system call will
-	restore the value from the superblock.  For filesystems that
-	are made directly on RAID devices, these options can be used
-	to override the information in the superblock if the underlying
-	disk layout changes after the filesystem has been created.
-	The "swidth" option is required if the "sunit" option has been
-	specified, and must be a multiple of the "sunit" value.
+	Used to specify the stripe unit and width for a RAID device
+	or a stripe volume.  "value" must be specified in 512-byte
+	block units. These options are only relevant to filesystems
+	that were created with non-zero data alignment parameters.
+
+	The sunit and swidth parameters specified must be compatible
+	with the existing filesystem alignment characteristics.  In
+	general, that means the only valid changes to sunit are
+	increasing it by a power-of-2 multiple. Valid swidth values
+	are any integer multiple of a valid sunit value.
+
+	Typically the only time these mount options are necessary is
+	after an underlying RAID device has had its geometry
+	modified, such as adding a new disk to a RAID5 lun and
+	reshaping it.
 
   swalloc
 	Data allocations will be rounded up to stripe width boundaries
 	when the current end of file is being extended and the file
 	size is larger than the stripe width size.
 
+  wsync
+	When specified, all filesystem namespace operations are
+	executed synchronously. This ensures that when the namespace
+	operation (create, unlink, etc) completes, the change to the
+	namespace is on stable storage. This is useful in HA setups
+	where failover must not result in clients seeing
+	inconsistent namespace presentation during or after a
+	failover event.
+
+
+Deprecated Mount Options
+========================
+
+  delaylog/nodelaylog
+	Delayed logging is the only logging method that XFS supports
+	now, so these mount options are now ignored.
+
+	Due for removal in 3.12.
+
+  ihashsize=value
+	In memory inode hashes have been removed, so this option has
+	no function as of August 2007. Option is deprecated.
+
+	Due for removal in 3.12.
+
+  irixsgid
+	This behaviour is now controlled by a sysctl, so the mount
+	option is ignored.
+
+	Due for removal in 3.12.
+
+  osyncisdsync
+  osyncisosync
+	O_SYNC and O_DSYNC are fully supported, so there is no need
+	for these options any more.
+
+	Due for removal in 3.12.
 
 sysctls
 =======
@@ -189,15 +263,20 @@ The following sysctls are available for the XFS filesystem:
 	in /proc/fs/xfs/stat.  It then immediately resets to "0".
 
   fs.xfs.xfssyncd_centisecs	(Min: 100  Default: 3000  Max: 720000)
-  	The interval at which the xfssyncd thread flushes metadata
-  	out to disk.  This thread will flush log activity out, and
-  	do some processing on unlinked inodes.
+	The interval at which the filesystem flushes metadata
+	out to disk and runs internal cache cleanup routines.
 
-  fs.xfs.xfsbufd_centisecs	(Min: 50  Default: 100	Max: 3000)
-	The interval at which xfsbufd scans the dirty metadata buffers list.
+  fs.xfs.filestream_centisecs	(Min: 1  Default: 3000  Max: 360000)
+	The interval at which the filesystem ages filestreams cache
+	references and returns timed-out AGs back to the free stream
+	pool.
 
-  fs.xfs.age_buffer_centisecs	(Min: 100  Default: 1500  Max: 720000)
-	The age at which xfsbufd flushes dirty metadata buffers to disk.
+  fs.xfs.speculative_prealloc_lifetime
+		(Units: seconds   Min: 1  Default: 300  Max: 86400)
+	The interval at which the background scanning for inodes
+	with unused speculative preallocation runs. The scan
+	removes unused preallocation from clean inodes and releases
+	the unused space back to the free pool.
 
   fs.xfs.error_level		(Min: 0  Default: 3  Max: 11)
 	A volume knob for error reporting when internal errors occur.
@@ -254,9 +333,31 @@ The following sysctls are available for the XFS filesystem:
 	by the xfs_io(8) chattr command on a directory to be
 	inherited by files in that directory.
 
+  fs.xfs.inherit_nodefrag	(Min: 0  Default: 1  Max: 1)
+	Setting this to "1" will cause the "nodefrag" flag set
+	by the xfs_io(8) chattr command on a directory to be
+	inherited by files in that directory.
+
   fs.xfs.rotorstep		(Min: 1  Default: 1  Max: 256)
 	In "inode32" allocation mode, this option determines how many
 	files the allocator attempts to allocate in the same allocation
 	group before moving to the next allocation group.  The intent
 	is to control the rate at which the allocator moves between
 	allocation groups when allocating extents for new files.
+
+Deprecated Sysctls
+==================
+
+  fs.xfs.xfsbufd_centisecs	(Min: 50  Default: 100	Max: 3000)
+	Dirty metadata is now tracked by the log subsystem and
+	flushing is driven by log space and idling demands. The
+	xfsbufd no longer exists, so this sysctl does nothing.
+
+	Due for removal in 3.14.
+
+  fs.xfs.age_buffer_centisecs	(Min: 100  Default: 1500  Max: 720000)
+	Dirty metadata is now tracked by the log subsystem and
+	flushing is driven by log space and idling demands. The
+	xfsbufd no longer exists, so this sysctl does nothing.
+
+	Due for removal in 3.14.
-- 
cgit v1.2.3