From d01d4827858cdc2e1c437c87ab65ec0a00fd40f8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 21 Sep 2009 11:06:27 +0200 Subject: sched: Always show Cpus_allowed field in /proc//status The Cpus_allowed fields in /proc//status is currently only shown in case of CONFIG_CPUSETS. However their contents are also useful for the !CONFIG_CPUSETS case. So change the current behaviour and always show these fields. Signed-off-by: Heiko Carstens Cc: Andrew Morton Cc: Oleg Nesterov Cc: Peter Zijlstra LKML-Reference: <20090921090627.GD4649@osiris.boeblingen.de.ibm.com> Signed-off-by: Ingo Molnar --- fs/proc/array.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 725a650bbbb8..762aea9c9c71 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -321,6 +321,16 @@ static inline void task_context_switch_counts(struct seq_file *m, p->nivcsw); } +static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) +{ + seq_printf(m, "Cpus_allowed:\t"); + seq_cpumask(m, &task->cpus_allowed); + seq_printf(m, "\n"); + seq_printf(m, "Cpus_allowed_list:\t"); + seq_cpumask_list(m, &task->cpus_allowed); + seq_printf(m, "\n"); +} + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -335,6 +345,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, } task_sig(m, task); task_cap(m, task); + task_cpus_allowed(m, task); cpuset_task_status_allowed(m, task); #if defined(CONFIG_S390) task_show_regs(m, task); -- cgit v1.2.3 From 8347a5cdd1422eea0470ed586274c7f29e274b47 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 6 Oct 2009 18:31:29 +0000 Subject: [CIFS] Fixing to avoid invalid kfree() in cifs_get_tcp_session() trivial bug in fs/cifs/connect.c . The bug is caused by fail of extract_hostname() when mounting cifs file system. This is the situation when I noticed this bug. % sudo mount -t cifs //192.168.10.208 mountpoint -o options... Then my kernel says, [ 1461.807776] ------------[ cut here ]------------ [ 1461.807781] kernel BUG at mm/slab.c:521! [ 1461.807784] invalid opcode: 0000 [#2] PREEMPT SMP [ 1461.807790] last sysfs file: /sys/devices/pci0000:00/0000:00:1e.0/0000:09:02.0/resource [ 1461.807793] CPU 0 [ 1461.807796] Modules linked in: nls_iso8859_1 usbhid sbp2 uhci_hcd ehci_hcd i2c_i801 ohci1394 ieee1394 psmouse serio_raw pcspkr sky2 usbcore evdev [ 1461.807816] Pid: 3446, comm: mount Tainted: G D 2.6.32-rc2-vanilla [ 1461.807820] RIP: 0010:[] [] kfree+0x63/0x156 [ 1461.807829] RSP: 0018:ffff8800b4f7fbb8 EFLAGS: 00010046 [ 1461.807832] RAX: ffffea00033fff98 RBX: ffff8800afbae7e2 RCX: 0000000000000000 [ 1461.807836] RDX: ffffea0000000000 RSI: 000000000000005c RDI: ffffffffffffffea [ 1461.807839] RBP: ffff8800b4f7fbf8 R08: 0000000000000001 R09: 0000000000000000 [ 1461.807842] R10: 0000000000000000 R11: ffff8800b4f7fbf8 R12: 00000000ffffffea [ 1461.807845] R13: ffff8800afb23000 R14: ffff8800b4f87bc0 R15: ffffffffffffffea [ 1461.807849] FS: 00007f52b6f187c0(0000) GS:ffff880007600000(0000) knlGS:0000000000000000 [ 1461.807852] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 1461.807855] CR2: 0000000000613000 CR3: 00000000af8f9000 CR4: 00000000000006f0 [ 1461.807858] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1461.807861] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 1461.807865] Process mount (pid: 3446, threadinfo ffff8800b4f7e000, task ffff8800950e4380) [ 1461.807867] Stack: [ 1461.807869] 0000000000000202 0000000000000282 ffff8800b4f7fbf8 ffff8800afbae7e2 [ 1461.807876] <0> 00000000ffffffea ffff8800afb23000 ffff8800b4f87bc0 ffff8800b4f7fc28 [ 1461.807884] <0> ffff8800b4f7fcd8 ffffffff81159f6d ffffffff81147bc2 ffffffff816bfb48 [ 1461.807892] Call Trace: [ 1461.807899] [] cifs_get_tcp_session+0x440/0x44b [ 1461.807904] [] ? find_nls+0x1c/0xe9 [ 1461.807909] [] cifs_mount+0x16bc/0x2167 [ 1461.807917] [] ? _spin_unlock+0x30/0x4b [ 1461.807923] [] cifs_get_sb+0xa5/0x1a8 [ 1461.807928] [] vfs_kern_mount+0x56/0xc9 [ 1461.807933] [] do_kern_mount+0x47/0xe7 [ 1461.807938] [] do_mount+0x712/0x775 [ 1461.807943] [] ? copy_mount_options+0xcf/0x132 [ 1461.807948] [] sys_mount+0x7f/0xbf [ 1461.807953] [] ? lockdep_sys_exit_thunk+0x35/0x67 [ 1461.807960] [] system_call_fastpath+0x16/0x1b [ 1461.807963] Code: 00 00 00 00 ea ff ff 48 c1 e8 0c 48 6b c0 68 48 01 d0 66 83 38 00 79 04 48 8b 40 10 66 83 38 00 79 04 48 8b 40 10 80 38 00 78 04 <0f> 0b eb fe 4c 8b 70 58 4c 89 ff 41 8b 76 4c e8 b8 49 fb ff e8 [ 1461.808022] RIP [] kfree+0x63/0x156 [ 1461.808027] RSP [ 1461.808031] ---[ end trace ffe26fcdc72c0ce4 ]--- The reason of this bug is that the error handling code of cifs_get_tcp_session() calls kfree() when corresponding kmalloc() failed. (The kmalloc() is called by extract_hostname().) Signed-off-by: Hitoshi Mitake CC: Stable Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 43003e0bef18..b09098079916 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1577,7 +1577,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info) out_err: if (tcp_ses) { - kfree(tcp_ses->hostname); + if (!IS_ERR(tcp_ses->hostname)) + kfree(tcp_ses->hostname); if (tcp_ses->ssocket) sock_release(tcp_ses->ssocket); kfree(tcp_ses); -- cgit v1.2.3 From 89eda06837094ce9f34fae269b8773fcfd70f046 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 4 Oct 2009 21:49:47 +0900 Subject: LSM: Add security_path_chmod() and security_path_chown(). This patch allows pathname based LSM modules to check chmod()/chown() operations. Since notify_change() does not receive "struct vfsmount *", we add security_path_chmod() and security_path_chown() to the caller of notify_change(). These hooks are used by TOMOYO. Signed-off-by: Tetsuo Handa Signed-off-by: James Morris --- fs/open.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index 4f01e06227c6..b5c294d35bd1 100644 --- a/fs/open.c +++ b/fs/open.c @@ -616,6 +616,9 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode) err = mnt_want_write_file(file); if (err) goto out_putf; + err = security_path_chmod(dentry, file->f_vfsmnt, mode); + if (err) + goto out_drop_write; mutex_lock(&inode->i_mutex); if (mode == (mode_t) -1) mode = inode->i_mode; @@ -623,6 +626,7 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode) newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; err = notify_change(dentry, &newattrs); mutex_unlock(&inode->i_mutex); +out_drop_write: mnt_drop_write(file->f_path.mnt); out_putf: fput(file); @@ -645,6 +649,9 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode) error = mnt_want_write(path.mnt); if (error) goto dput_and_out; + error = security_path_chmod(path.dentry, path.mnt, mode); + if (error) + goto out_drop_write; mutex_lock(&inode->i_mutex); if (mode == (mode_t) -1) mode = inode->i_mode; @@ -652,6 +659,7 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode) newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; error = notify_change(path.dentry, &newattrs); mutex_unlock(&inode->i_mutex); +out_drop_write: mnt_drop_write(path.mnt); dput_and_out: path_put(&path); @@ -700,7 +708,9 @@ SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group) error = mnt_want_write(path.mnt); if (error) goto out_release; - error = chown_common(path.dentry, user, group); + error = security_path_chown(&path, user, group); + if (!error) + error = chown_common(path.dentry, user, group); mnt_drop_write(path.mnt); out_release: path_put(&path); @@ -725,7 +735,9 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, error = mnt_want_write(path.mnt); if (error) goto out_release; - error = chown_common(path.dentry, user, group); + error = security_path_chown(&path, user, group); + if (!error) + error = chown_common(path.dentry, user, group); mnt_drop_write(path.mnt); out_release: path_put(&path); @@ -744,7 +756,9 @@ SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group error = mnt_want_write(path.mnt); if (error) goto out_release; - error = chown_common(path.dentry, user, group); + error = security_path_chown(&path, user, group); + if (!error) + error = chown_common(path.dentry, user, group); mnt_drop_write(path.mnt); out_release: path_put(&path); @@ -767,7 +781,9 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) goto out_fput; dentry = file->f_path.dentry; audit_inode(NULL, dentry); - error = chown_common(dentry, user, group); + error = security_path_chown(&file->f_path, user, group); + if (!error) + error = chown_common(dentry, user, group); mnt_drop_write(file->f_path.mnt); out_fput: fput(file); -- cgit v1.2.3 From 8b8efb44033c7e86b3dc76f825c693ec92ae30e9 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 4 Oct 2009 21:49:48 +0900 Subject: LSM: Add security_path_chroot(). This patch allows pathname based LSM modules to check chroot() operations. This hook is used by TOMOYO. Signed-off-by: Tetsuo Handa Signed-off-by: James Morris --- fs/open.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index b5c294d35bd1..201041dfca57 100644 --- a/fs/open.c +++ b/fs/open.c @@ -587,6 +587,9 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename) error = -EPERM; if (!capable(CAP_SYS_CHROOT)) goto dput_and_out; + error = security_path_chroot(&path); + if (error) + goto dput_and_out; set_fs_root(current->fs, &path); error = 0; -- cgit v1.2.3 From a27ab9f26b729326778271c1efd895aef4fda1c4 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 4 Oct 2009 21:49:49 +0900 Subject: LSM: Pass original mount flags to security_sb_mount(). This patch allows LSM modules to determine based on original mount flags passed to mount(). A LSM module can get masked mount flags (if needed) by flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | MS_STRICTATIME); Signed-off-by: Tetsuo Handa Signed-off-by: James Morris --- fs/namespace.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index bdc3cb4fd222..7d70d63ceb29 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1921,6 +1921,16 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, if (data_page) ((char *)data_page)[PAGE_SIZE - 1] = 0; + /* ... and get the mountpoint */ + retval = kern_path(dir_name, LOOKUP_FOLLOW, &path); + if (retval) + return retval; + + retval = security_sb_mount(dev_name, &path, + type_page, flags, data_page); + if (retval) + goto dput_out; + /* Default to relatime unless overriden */ if (!(flags & MS_NOATIME)) mnt_flags |= MNT_RELATIME; @@ -1945,16 +1955,6 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | MS_STRICTATIME); - /* ... and get the mountpoint */ - retval = kern_path(dir_name, LOOKUP_FOLLOW, &path); - if (retval) - return retval; - - retval = security_sb_mount(dev_name, &path, - type_page, flags, data_page); - if (retval) - goto dput_out; - if (flags & MS_REMOUNT) retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, data_page); -- cgit v1.2.3 From 05277c75f6dea8ecf59138cd1b6781fb54ae08bd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Oct 2009 23:42:10 +0000 Subject: xfs: fix double IRELE in xfs_dqrele_inode xfs_dqrele_inode calls xfs_iput to release the ilock and a reference and then also calls IRELE which does a second decrement of the reference count. This leads to a premature freeing of inodes when quotas were turned off while the filesystem was mounted. Thanks to Utako Kusaka for reporting the bug and provinding a good testcase. Signed-off-by: Christoph Hellwig Reported-by: Utako Kusaka Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/quota/xfs_qm_syscalls.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 4e4276b956e8..5d1a3b98a6e6 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -876,7 +876,6 @@ xfs_dqrele_inode( ip->i_gdquot = NULL; } xfs_iput(ip, XFS_ILOCK_EXCL); - IRELE(ip); return 0; } -- cgit v1.2.3 From 9f0d793b52eb2266359661369ef6303838904855 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 11 Sep 2009 13:03:19 -0400 Subject: fsnotify: do not set group for a mark before it is on the i_list fsnotify_add_mark is supposed to add a mark to the g_list and i_list and to set the group and inode for the mark. fsnotify_destroy_mark_by_entry uses the fact that ->group != NULL to know if this group should be destroyed or if it's already been done. But fsnotify_add_mark sets the group and inode before it actually adds the mark to the i_list and g_list. This can result in a race in inotify, it requires 3 threads. sys_inotify_add_watch("file") sys_inotify_add_watch("file") sys_inotify_rm_watch([a]) inotify_update_watch() inotify_new_watch() inotify_add_to_idr() ^--- returns wd = [a] inotfiy_update_watch() inotify_new_watch() inotify_add_to_idr() fsnotify_add_mark() ^--- returns wd = [b] returns to userspace; inotify_idr_find([a]) ^--- gives us the pointer from task 1 fsnotify_add_mark() ^--- this is going to set the mark->group and mark->inode fields, but will return -EEXIST because of the race with [b]. fsnotify_destroy_mark() ^--- since ->group != NULL we call back into inotify_freeing_mark() which calls inotify_remove_from_idr([a]) since fsnotify_add_mark() failed we call: inotify_remove_from_idr([a]) <------WHOOPS it's not in the idr, this could have been any entry added later! The fix is to make sure we don't set mark->group until we are sure the mark is on the inode and fsnotify_add_mark will return success. Signed-off-by: Eric Paris --- fs/notify/inode_mark.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index c8a07c65482b..3165d85aada2 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -324,11 +324,11 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry, spin_lock(&group->mark_lock); spin_lock(&inode->i_lock); - entry->group = group; - entry->inode = inode; - lentry = fsnotify_find_mark_entry(group, inode); if (!lentry) { + entry->group = group; + entry->inode = inode; + hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries); list_add(&entry->g_list, &group->mark_entries); -- cgit v1.2.3 From 3de0ef4f2067da58fa5126d821a56dcb98cdb565 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 14 Oct 2009 20:54:03 +0800 Subject: inotify: fix coalesce duplicate events into a single event in special case If we do rename a dir entry, like this: rename("/tmp/ino7UrgoJ.rename1", "/tmp/ino7UrgoJ.rename2") rename("/tmp/ino7UrgoJ.rename2", "/tmp/ino7UrgoJ") The duplicate events should be coalesced into a single event. But those two events do not be coalesced into a single event, due to some bad check in event_compare(). It can not match the two NULL inodes as the same event. Signed-off-by: Wei Yongjun Signed-off-by: Eric Paris --- fs/notify/notification.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 3816d5750dd5..b8bf53b4c108 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -143,7 +143,7 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new /* remember, after old was put on the wait_q we aren't * allowed to look at the inode any more, only thing * left to check was if the file_name is the same */ - if (old->name_len && + if (!old->name_len || !strcmp(old->file_name, new->file_name)) return true; break; -- cgit v1.2.3 From 945526846a84c00adac1efd1c6befdaa77039623 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 15 Oct 2009 00:13:23 +0200 Subject: dnotify: ignore FS_EVENT_ON_CHILD Mask off FS_EVENT_ON_CHILD in dnotify_handle_event(). Otherwise, when there is more than one watch on a directory and dnotify_should_send_event() succeeds, events with FS_EVENT_ON_CHILD set will trigger all watches and cause spurious events. This case was overlooked in commit e42e2773. #define _GNU_SOURCE #include #include #include #include #include #include #include #include static void create_event(int s, siginfo_t* si, void* p) { printf("create\n"); } static void delete_event(int s, siginfo_t* si, void* p) { printf("delete\n"); } int main (void) { struct sigaction action; char *tmpdir, *file; int fd1, fd2; sigemptyset (&action.sa_mask); action.sa_flags = SA_SIGINFO; action.sa_sigaction = create_event; sigaction (SIGRTMIN + 0, &action, NULL); action.sa_sigaction = delete_event; sigaction (SIGRTMIN + 1, &action, NULL); # define TMPDIR "/tmp/test.XXXXXX" tmpdir = malloc(strlen(TMPDIR) + 1); strcpy(tmpdir, TMPDIR); mkdtemp(tmpdir); # define TMPFILE "/file" file = malloc(strlen(tmpdir) + strlen(TMPFILE) + 1); sprintf(file, "%s/%s", tmpdir, TMPFILE); fd1 = open (tmpdir, O_RDONLY); fcntl(fd1, F_SETSIG, SIGRTMIN); fcntl(fd1, F_NOTIFY, DN_MULTISHOT | DN_CREATE); fd2 = open (tmpdir, O_RDONLY); fcntl(fd2, F_SETSIG, SIGRTMIN + 1); fcntl(fd2, F_NOTIFY, DN_MULTISHOT | DN_DELETE); if (fork()) { /* This triggers a create event */ creat(file, 0600); /* This triggers a create and delete event (!) */ unlink(file); } else { sleep(1); rmdir(tmpdir); } return 0; } Signed-off-by: Andreas Gruenbacher Signed-off-by: Eric Paris --- fs/notify/dnotify/dnotify.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 828a889be909..7e54e52964dd 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -91,6 +91,7 @@ static int dnotify_handle_event(struct fsnotify_group *group, struct dnotify_struct *dn; struct dnotify_struct **prev; struct fown_struct *fown; + __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD; to_tell = event->to_tell; @@ -106,7 +107,7 @@ static int dnotify_handle_event(struct fsnotify_group *group, spin_lock(&entry->lock); prev = &dnentry->dn; while ((dn = *prev) != NULL) { - if ((dn->dn_mask & event->mask) == 0) { + if ((dn->dn_mask & test_mask) == 0) { prev = &dn->dn_next; continue; } -- cgit v1.2.3 From ad3960243e55320d74195fb85c975e0a8cc4466c Mon Sep 17 00:00:00 2001 From: Earl Chew Date: Mon, 19 Oct 2009 15:55:41 -0700 Subject: fs: pipe.c null pointer dereference This patch fixes a null pointer exception in pipe_rdwr_open() which generates the stack trace: > Unable to handle kernel NULL pointer dereference at 0000000000000028 RIP: > [] pipe_rdwr_open+0x35/0x70 > [] __dentry_open+0x13c/0x230 > [] do_filp_open+0x2d/0x40 > [] do_sys_open+0x5a/0x100 > [] sysenter_do_call+0x1b/0x67 The failure mode is triggered by an attempt to open an anonymous pipe via /proc/pid/fd/* as exemplified by this script: ============================================================= while : ; do { echo y ; sleep 1 ; } | { while read ; do echo z$REPLY; done ; } & PID=$! OUT=$(ps -efl | grep 'sleep 1' | grep -v grep | { read PID REST ; echo $PID; } ) OUT="${OUT%% *}" DELAY=$((RANDOM * 1000 / 32768)) usleep $((DELAY * 1000 + RANDOM % 1000 )) echo n > /proc/$OUT/fd/1 # Trigger defect done ============================================================= Note that the failure window is quite small and I could only reliably reproduce the defect by inserting a small delay in pipe_rdwr_open(). For example: static int pipe_rdwr_open(struct inode *inode, struct file *filp) { msleep(100); mutex_lock(&inode->i_mutex); Although the defect was observed in pipe_rdwr_open(), I think it makes sense to replicate the change through all the pipe_*_open() functions. The core of the change is to verify that inode->i_pipe has not been released before attempting to manipulate it. If inode->i_pipe is no longer present, return ENOENT to indicate so. The comment about potentially using atomic_t for i_pipe->readers and i_pipe->writers has also been removed because it is no longer relevant in this context. The inode->i_mutex lock must be used so that inode->i_pipe can be dealt with correctly. Signed-off-by: Earl Chew Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/pipe.c | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 52c415114838..ae17d026aaa3 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -777,36 +777,55 @@ pipe_rdwr_release(struct inode *inode, struct file *filp) static int pipe_read_open(struct inode *inode, struct file *filp) { - /* We could have perhaps used atomic_t, but this and friends - below are the only places. So it doesn't seem worthwhile. */ + int ret = -ENOENT; + mutex_lock(&inode->i_mutex); - inode->i_pipe->readers++; + + if (inode->i_pipe) { + ret = 0; + inode->i_pipe->readers++; + } + mutex_unlock(&inode->i_mutex); - return 0; + return ret; } static int pipe_write_open(struct inode *inode, struct file *filp) { + int ret = -ENOENT; + mutex_lock(&inode->i_mutex); - inode->i_pipe->writers++; + + if (inode->i_pipe) { + ret = 0; + inode->i_pipe->writers++; + } + mutex_unlock(&inode->i_mutex); - return 0; + return ret; } static int pipe_rdwr_open(struct inode *inode, struct file *filp) { + int ret = -ENOENT; + mutex_lock(&inode->i_mutex); - if (filp->f_mode & FMODE_READ) - inode->i_pipe->readers++; - if (filp->f_mode & FMODE_WRITE) - inode->i_pipe->writers++; + + if (inode->i_pipe) { + ret = 0; + if (filp->f_mode & FMODE_READ) + inode->i_pipe->readers++; + if (filp->f_mode & FMODE_WRITE) + inode->i_pipe->writers++; + } + mutex_unlock(&inode->i_mutex); - return 0; + return ret; } /* -- cgit v1.2.3 From 4223a4a155f245d41c350ed9eba4fc32e965c4da Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 20 Oct 2009 14:13:46 +0900 Subject: nfs: Fix nfs_parse_mount_options() kfree() leak Fix a (small) memory leak in one of the error paths of the NFS mount options parsing code. Regression introduced in 2.6.30 by commit a67d18f (NFS: load the rpc/rdma transport module automatically). Reported-by: Yinghai Lu Reported-by: Pekka Enberg Signed-off-by: Ingo Molnar Signed-off-by: Trond Myklebust Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/nfs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index a2c18acb8568..90be551b80c1 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1253,6 +1253,7 @@ static int nfs_parse_mount_options(char *raw, default: dfprintk(MOUNT, "NFS: unrecognized " "transport protocol\n"); + kfree(string); return 0; } break; -- cgit v1.2.3 From a8b40bc7e635831b61c43acc71a86d3a68b2dff0 Mon Sep 17 00:00:00 2001 From: Terry Loftin Date: Thu, 22 Oct 2009 21:36:01 -0400 Subject: nfs: Panic when commit fails Actually pass the NFS_FILE_SYNC option to the server to avoid a Panic in nfs_direct_write_complete() when a commit fails. At the end of an nfs write, if the nfs commit fails, all the writes will be rescheduled. They are supposed to be rescheduled as NFS_FILE_SYNC writes, but the rpc_task structure is not completely intialized and so the option is not passed. When the rescheduled writes complete, the return indicates that they are NFS_UNSTABLE and we try to do another commit. This leads to a Panic because the commit data structure pointer was set to null in the initial (failed) commit attempt. Signed-off-by: Terry Loftin Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 6c3210099d51..e1d415e97849 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -457,6 +457,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) }; struct rpc_task_setup task_setup_data = { .rpc_client = NFS_CLIENT(inode), + .rpc_message = &msg, .callback_ops = &nfs_write_direct_ops, .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, -- cgit v1.2.3 From 52567b03ca38b6e556ced450d64dba8d66e23b0e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 23 Oct 2009 14:46:42 -0400 Subject: NFSv4: Fix a bug when the server returns NFS4ERR_RESOURCE RFC 3530 states that when we recieve the error NFS4ERR_RESOURCE, we are not supposed to bump the sequence number on OPEN, LOCK, LOCKU, CLOSE, etc operations. The problem is that we map that error into EREMOTEIO in the XDR layer, and so the NFSv4 middle-layer routines like seqid_mutating_err(), and nfs_increment_seqid() don't recognise it. The fix is to defer the mapping until after the middle layers have processed the error. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 11 ++++++++--- fs/nfs/nfs4xdr.c | 1 - 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ed7c269e2514..65c252798861 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -72,12 +72,17 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, /* Prevent leaks of NFSv4 errors into userland */ static int nfs4_map_errors(int err) { - if (err < -1000) { + if (err >= -1000) + return err; + switch (err) { + case -NFS4ERR_RESOURCE: + return -EREMOTEIO; + default: dprintk("%s could not handle NFSv4 error %d\n", __func__, -err); - return -EIO; + break; } - return err; + return -EIO; } /* diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 83ad47cbdd8a..20b4e30e6c82 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -5681,7 +5681,6 @@ static struct { { NFS4ERR_SERVERFAULT, -ESERVERFAULT }, { NFS4ERR_BADTYPE, -EBADTYPE }, { NFS4ERR_LOCKED, -EAGAIN }, - { NFS4ERR_RESOURCE, -EREMOTEIO }, { NFS4ERR_SYMLINK, -ELOOP }, { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, { NFS4ERR_DEADLOCK, -EDEADLK }, -- cgit v1.2.3 From 6c21a7fb492bf7e2c4985937082ce58ddeca84bd Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Thu, 22 Oct 2009 17:30:13 -0400 Subject: LSM: imbed ima calls in the security hooks Based on discussions on LKML and LSM, where there are consecutive security_ and ima_ calls in the vfs layer, move the ima_ calls to the existing security_ hooks. Signed-off-by: Mimi Zohar Signed-off-by: James Morris --- fs/exec.c | 4 ---- fs/file_table.c | 2 -- fs/inode.c | 10 ---------- 3 files changed, 16 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index d49be6bc1793..d164342c2b69 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include #include @@ -1209,9 +1208,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) struct linux_binfmt *fmt; retval = security_bprm_check(bprm); - if (retval) - return retval; - retval = ima_bprm_check(bprm); if (retval) return retval; diff --git a/fs/file_table.c b/fs/file_table.c index 8eb44042e009..4bef4c01ec6f 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -280,7 +279,6 @@ void __fput(struct file *file) if (file->f_op && file->f_op->release) file->f_op->release(inode, file); security_file_free(file); - ima_file_free(file); if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL)) cdev_put(inode->i_cdev); fops_put(file->f_op); diff --git a/fs/inode.c b/fs/inode.c index 4d8e3be55976..06c1f02de611 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -157,11 +156,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode) if (security_inode_alloc(inode)) goto out; - - /* allocate and initialize an i_integrity */ - if (ima_inode_alloc(inode)) - goto out_free_security; - spin_lock_init(&inode->i_lock); lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); @@ -201,9 +195,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode) #endif return 0; - -out_free_security: - security_inode_free(inode); out: return -ENOMEM; } @@ -235,7 +226,6 @@ static struct inode *alloc_inode(struct super_block *sb) void __destroy_inode(struct inode *inode) { BUG_ON(inode_has_buffers(inode)); - ima_inode_free(inode); security_inode_free(inode); fsnotify_inode_delete(inode); #ifdef CONFIG_FS_POSIX_ACL -- cgit v1.2.3 From ce0e7b28fb75cb003cfc8d0238613aaf1c55e797 Mon Sep 17 00:00:00 2001 From: Ryota Ozaki Date: Sat, 24 Oct 2009 01:20:10 +0900 Subject: sched, cpuacct: Fix niced guest time accounting CPU time of a guest is always accounted in 'user' time without concern for the nice value of its counterpart process although the guest is scheduled under the nice value. This patch fixes the defect and accounts cpu time of a niced guest in 'nice' time as same as a niced process. And also the patch adds 'guest_nice' to cpuacct. The value provides niced guest cpu time which is like 'nice' to 'user'. The original discussions can be found here: http://www.mail-archive.com/kvm@vger.kernel.org/msg23982.html http://www.mail-archive.com/kvm@vger.kernel.org/msg23860.html Signed-off-by: Ryota Ozaki Acked-by: Avi Kivity Cc: Peter Zijlstra LKML-Reference: <1256314810-7897-1-git-send-email-ozaki.ryota@gmail.com> Signed-off-by: Ingo Molnar --- fs/proc/stat.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 7cc726c6d70a..b9b7aad2003d 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -27,7 +27,7 @@ static int show_stat(struct seq_file *p, void *v) int i, j; unsigned long jif; cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; - cputime64_t guest; + cputime64_t guest, guest_nice; u64 sum = 0; u64 sum_softirq = 0; unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; @@ -36,7 +36,7 @@ static int show_stat(struct seq_file *p, void *v) user = nice = system = idle = iowait = irq = softirq = steal = cputime64_zero; - guest = cputime64_zero; + guest = guest_nice = cputime64_zero; getboottime(&boottime); jif = boottime.tv_sec; @@ -51,6 +51,8 @@ static int show_stat(struct seq_file *p, void *v) softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); + guest_nice = cputime64_add(guest_nice, + kstat_cpu(i).cpustat.guest_nice); for_each_irq_nr(j) { sum += kstat_irqs_cpu(j, i); } @@ -65,7 +67,8 @@ static int show_stat(struct seq_file *p, void *v) } sum += arch_irq_stat(); - seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", + seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu " + "%llu\n", (unsigned long long)cputime64_to_clock_t(user), (unsigned long long)cputime64_to_clock_t(nice), (unsigned long long)cputime64_to_clock_t(system), @@ -74,7 +77,8 @@ static int show_stat(struct seq_file *p, void *v) (unsigned long long)cputime64_to_clock_t(irq), (unsigned long long)cputime64_to_clock_t(softirq), (unsigned long long)cputime64_to_clock_t(steal), - (unsigned long long)cputime64_to_clock_t(guest)); + (unsigned long long)cputime64_to_clock_t(guest), + (unsigned long long)cputime64_to_clock_t(guest_nice)); for_each_online_cpu(i) { /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ @@ -88,8 +92,10 @@ static int show_stat(struct seq_file *p, void *v) softirq = kstat_cpu(i).cpustat.softirq; steal = kstat_cpu(i).cpustat.steal; guest = kstat_cpu(i).cpustat.guest; + guest_nice = kstat_cpu(i).cpustat.guest_nice; seq_printf(p, - "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", + "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu " + "%llu\n", i, (unsigned long long)cputime64_to_clock_t(user), (unsigned long long)cputime64_to_clock_t(nice), @@ -99,7 +105,8 @@ static int show_stat(struct seq_file *p, void *v) (unsigned long long)cputime64_to_clock_t(irq), (unsigned long long)cputime64_to_clock_t(softirq), (unsigned long long)cputime64_to_clock_t(steal), - (unsigned long long)cputime64_to_clock_t(guest)); + (unsigned long long)cputime64_to_clock_t(guest), + (unsigned long long)cputime64_to_clock_t(guest_nice)); } seq_printf(p, "intr %llu", (unsigned long long)sum); -- cgit v1.2.3 From 141aeb9f26f9f12f1584c128ce8697cdffb046e7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 26 Oct 2009 08:09:46 -0400 Subject: NFSv4: Fix two unbalanced put_rpccred() issues. Commits 29fba38b (nfs41: lease renewal) and fc01cea9 (nfs41: sequence operation) introduce a couple of put_rpccred() calls on credentials for which there is no corresponding get_rpccred(). See http://bugzilla.kernel.org/show_bug.cgi?id=14249 Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 65c252798861..ff37454fa783 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3065,9 +3065,6 @@ static void nfs4_renew_done(struct rpc_task *task, void *data) if (time_before(clp->cl_last_renewal,timestamp)) clp->cl_last_renewal = timestamp; spin_unlock(&clp->cl_lock); - dprintk("%s calling put_rpccred on rpc_cred %p\n", __func__, - task->tk_msg.rpc_cred); - put_rpccred(task->tk_msg.rpc_cred); } static const struct rpc_call_ops nfs4_renew_ops = { @@ -4882,7 +4879,6 @@ void nfs41_sequence_call_done(struct rpc_task *task, void *data) nfs41_sequence_free_slot(clp, task->tk_msg.rpc_resp); dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); - put_rpccred(task->tk_msg.rpc_cred); kfree(task->tk_msg.rpc_argp); kfree(task->tk_msg.rpc_resp); -- cgit v1.2.3 From 9a3936aac133037f65124fcb2d676a6c201a90a4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 26 Oct 2009 08:09:46 -0400 Subject: NFSv4: The link() operation should return any delegation on the file Otherwise, we have to wait for the server to recall it. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 32062c33c859..7cb298525eef 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1536,6 +1536,8 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) old_dentry->d_parent->d_name.name, old_dentry->d_name.name, dentry->d_parent->d_name.name, dentry->d_name.name); + nfs_inode_return_delegation(inode); + d_drop(dentry); error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); if (error == 0) { -- cgit v1.2.3 From 960cc0f4fef607baabc2232fbd7cce5368a9dcfd Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Mon, 26 Oct 2009 08:59:17 +0100 Subject: block: use after free bug in __blkdev_get commit 0762b8bde9729f10f8e6249809660ff2ec3ad735 (from 14 months ago) introduced a use-after-free bug which has just recently started manifesting in my md testing. I tried git bisect to find out what caused the bug to start manifesting, and it could have been the recent change to blk_unregister_queue (48c0d4d4c04) but the results were inconclusive. This patch certainly fixes my symptoms and looks correct as the two calls are now in the same order as elsewhere in that function. Signed-off-by: NeilBrown Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/block_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 9cf4b926f8e4..8bed0557d88c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1248,8 +1248,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); } } else { - put_disk(disk); module_put(disk->fops->owner); + put_disk(disk); disk = NULL; if (bdev->bd_contains == bdev) { if (bdev->bd_disk->fops->open) { -- cgit v1.2.3 From ffb4a73d8906f71910e6c83ec2b499e70025ee8e Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Tue, 27 Oct 2009 07:22:37 +0900 Subject: sh: Fix hugetlbfs dependencies for SH-3 && MMU configurations. The hugetlb dependencies presently depend on SUPERH && MMU while the hugetlb page size definitions depend on CPU_SH4 or CPU_SH5. This unfortunately allows SH-3 + MMU configurations to enable hugetlbfs without a corresponding HPAGE_SHIFT definition, resulting in the build blowing up. As SH-3 doesn't support variable page sizes, we tighten up the dependenies a bit to prevent hugetlbfs from being enabled. These days we also have a shiny new SYS_SUPPORTS_HUGETLBFS, so switch to using that rather than adding to the list of corner cases in fs/Kconfig. Reported-by: Kristoffer Ericson Signed-off-by: Paul Mundt --- fs/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index d4bf8caad8d0..48b81f92a50e 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -135,8 +135,8 @@ config TMPFS_POSIX_ACL config HUGETLBFS bool "HugeTLB file system support" - depends on X86 || IA64 || PPC64 || SPARC64 || (SUPERH && MMU) || \ - (S390 && 64BIT) || SYS_SUPPORTS_HUGETLBFS || BROKEN + depends on X86 || IA64 || PPC64 || SPARC64 || (S390 && 64BIT) || \ + SYS_SUPPORTS_HUGETLBFS || BROKEN help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read -- cgit v1.2.3 From 0cd9ad73b8d181737005ff4e506b9b6bd043f4dd Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Fri, 16 Oct 2009 07:21:35 +0000 Subject: powerpc: Limit hugetlbfs support to PPC64 Book-3S machines Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- fs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index d4bf8caad8d0..f93d0ed4fe0c 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -135,7 +135,7 @@ config TMPFS_POSIX_ACL config HUGETLBFS bool "HugeTLB file system support" - depends on X86 || IA64 || PPC64 || SPARC64 || (SUPERH && MMU) || \ + depends on X86 || IA64 || PPC_BOOK3S_64 || SPARC64 || (SUPERH && MMU) || \ (S390 && 64BIT) || SYS_SUPPORTS_HUGETLBFS || BROKEN help hugetlbfs is a filesystem backing for HugeTLB pages, based on -- cgit v1.2.3 From fb5cbe9efd741b16e72133613747f76490bbecd3 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Wed, 28 Oct 2009 22:28:24 -0700 Subject: ocfs2: Return -EINVAL when a device is not ocfs2. In case of non-modular kernels the root filesystem is mounted by trying several filesystems. If ocfs2 was tried before the actual filesystem type, the mount would fail because ocfs2_sb_probe() returns -EAGAIN instead of -EINVAL. ocfs2 will now return -EINVAL properly. Signed-off-by: Joel Becker Reported-by: Laszlo Attila Toth --- fs/ocfs2/super.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c0e48aeebb1c..960673004df1 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -773,18 +773,20 @@ static int ocfs2_sb_probe(struct super_block *sb, if (tmpstat < 0) { status = tmpstat; mlog_errno(status); - goto bail; + break; } di = (struct ocfs2_dinode *) (*bh)->b_data; memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats)); spin_lock_init(&stats->b_lock); - status = ocfs2_verify_volume(di, *bh, blksize, stats); - if (status >= 0) - goto bail; - brelse(*bh); - *bh = NULL; - if (status != -EAGAIN) + tmpstat = ocfs2_verify_volume(di, *bh, blksize, stats); + if (tmpstat < 0) { + brelse(*bh); + *bh = NULL; + } + if (tmpstat != -EAGAIN) { + status = tmpstat; break; + } } bail: -- cgit v1.2.3 From 87f4b1bb98696e6cf84f57df7de41f28c2a7dbeb Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 15 Oct 2009 11:10:48 +0800 Subject: ocfs2: Move ocfs2_complete_reflink to the right place. As its name ocfs2_complete_reflink indicates, it should be called after all the work for reflink is done, so it really should be called after we reflink xattr successfully. Signed-off-by: Tao Ma Signed-off-by: Joel Becker Tested-by: Tristan Ye --- fs/ocfs2/refcounttree.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 60287fc56bcb..9d439b277304 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4013,10 +4013,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode, goto out_unlock_refcount; } - ret = ocfs2_complete_reflink(s_inode, s_bh, t_inode, t_bh, preserve); - if (ret) - mlog_errno(ret); - out_unlock_refcount: ocfs2_unlock_refcount_tree(osb, ref_tree, 1); brelse(ref_root_bh); @@ -4068,9 +4064,17 @@ static int __ocfs2_reflink(struct dentry *old_dentry, ret = ocfs2_reflink_xattrs(inode, old_bh, new_inode, new_bh, preserve); - if (ret) + if (ret) { mlog_errno(ret); + goto inode_unlock; + } } + + ret = ocfs2_complete_reflink(inode, old_bh, + new_inode, new_bh, preserve); + if (ret) + mlog_errno(ret); + inode_unlock: ocfs2_inode_unlock(new_inode, 1); brelse(new_bh); -- cgit v1.2.3 From 2f48d593b6ceb7bb63d34124ceba77d33be298cf Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 15 Oct 2009 11:10:49 +0800 Subject: ocfs2: duplicate inline data properly during reflink. The old reflink fails to handle inodes with inline data and will oops if it encounters them. This patch copies inline data to the new inode. Extended attributes may still be refcounted. Signed-off-by: Tao Ma Signed-off-by: Joel Becker Tested-by: Tristan Ye --- fs/ocfs2/file.c | 3 ++- fs/ocfs2/refcounttree.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 89fc8ee1f5a5..de059f490586 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1712,7 +1712,8 @@ int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, struct super_block *sb = inode->i_sb; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) || - !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) + !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) || + OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) return 0; cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 9d439b277304..3a0df7a1b810 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3743,6 +3743,9 @@ static int ocfs2_attach_refcount_tree(struct inode *inode, goto out; } + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) + goto attach_xattr; + ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh); size = i_size_read(inode); @@ -3769,6 +3772,7 @@ static int ocfs2_attach_refcount_tree(struct inode *inode, cpos += num_clusters; } +attach_xattr: if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) { ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh, &ref_tree->rf_ci, @@ -3858,6 +3862,49 @@ out: return ret; } +static int ocfs2_duplicate_inline_data(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh) +{ + int ret; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); + struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data; + struct ocfs2_dinode *t_di = (struct ocfs2_dinode *)t_bh->b_data; + + BUG_ON(!(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + t_di->id2.i_data.id_count = s_di->id2.i_data.id_count; + memcpy(t_di->id2.i_data.id_data, s_di->id2.i_data.id_data, + le16_to_cpu(s_di->id2.i_data.id_count)); + spin_lock(&OCFS2_I(t_inode)->ip_lock); + OCFS2_I(t_inode)->ip_dyn_features |= OCFS2_INLINE_DATA_FL; + t_di->i_dyn_features = cpu_to_le16(OCFS2_I(t_inode)->ip_dyn_features); + spin_unlock(&OCFS2_I(t_inode)->ip_lock); + + ocfs2_journal_dirty(handle, t_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + static int ocfs2_duplicate_extent_list(struct inode *s_inode, struct inode *t_inode, struct buffer_head *t_bh, @@ -3997,6 +4044,14 @@ static int ocfs2_create_reflink_node(struct inode *s_inode, goto out; } + if (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_duplicate_inline_data(s_inode, s_bh, + t_inode, t_bh); + if (ret) + mlog_errno(ret); + goto out; + } + ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), 1, &ref_tree, &ref_root_bh); if (ret) { -- cgit v1.2.3 From 370c28def65b3a5765192753a164403619b41d01 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 26 Oct 2009 16:49:32 -0700 Subject: hwpoison: fix/proc/meminfo alignment Given such a long name, the kB count in /proc/meminfo's HardwareCorrupted line is being shown too far right (it does align with x86_64's VmallocChunk above, but I hope nobody will ever have that much corrupted!). Align it. Signed-off-by: Hugh Dickins Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index c7bff4f603ff..a65239cfd97e 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -99,7 +99,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) "VmallocUsed: %8lu kB\n" "VmallocChunk: %8lu kB\n" #ifdef CONFIG_MEMORY_FAILURE - "HardwareCorrupted: %8lu kB\n" + "HardwareCorrupted: %5lu kB\n" #endif , K(i.totalram), -- cgit v1.2.3 From 5c36fe3d87b3f0c85894a49193c66096a3d6b26f Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 26 Oct 2009 16:49:51 -0700 Subject: hfsplus: refuse to mount volumes larger than 2TB As found in , hfsplus is using type u32 rather than sector_t for some sector number calculations. In particular, hfsplus_get_block() does: u32 ablock, dblock, mask; ... map_bh(bh_result, sb, (dblock << HFSPLUS_SB(sb).fs_shift) + HFSPLUS_SB(sb).blockoffset + (iblock & mask)); I am not confident that I can find and fix all cases where a sector number may be truncated. For now, avoid data loss by refusing to mount HFS+ volumes with more than 2^32 sectors (2TB). [akpm@linux-foundation.org: fix 32 and 64-bit issues] Signed-off-by: Ben Hutchings Cc: Eric Sesterhenn Cc: Roman Zippel Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/wrapper.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 175d08eacc86..bed78ac8f6d1 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -99,6 +99,10 @@ int hfsplus_read_wrapper(struct super_block *sb) if (hfsplus_get_last_session(sb, &part_start, &part_size)) return -EINVAL; + if ((u64)part_start + part_size > 0x100000000ULL) { + pr_err("hfs: volumes larger than 2TB are not supported yet\n"); + return -EINVAL; + } while (1) { bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr); if (!bh) -- cgit v1.2.3 From 47f365eb575735c6b2edf5d08e0d16d26a9c23bd Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 26 Oct 2009 16:49:56 -0700 Subject: hfs: fix oops on mount with corrupted btree extent records A particular fsfuzzer run caused an hfs file system to crash on mount. This is due to a corrupted MDB extent record causing a miscalculation of HFS_I(inode)->first_blocks for the extent tree. If the extent records are zereod out, it won't trigger the first_blocks special case. Instead it falls through to the extent code which we're still in the middle of initializing. This patch catches the 0 size extent records, reports the corruption, and fails the mount. Reported-by: Ramon de Carvalho Valle Signed-off-by: Jeff Mahoney Cc: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfs/btree.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 9b9d6395bad3..052f214ea6f0 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -58,6 +58,11 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke } unlock_new_inode(tree->inode); + if (!HFS_I(tree->inode)->first_blocks) { + printk(KERN_ERR "hfs: invalid btree extent records (0 size).\n"); + goto free_inode; + } + mapping = tree->inode->i_mapping; page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) -- cgit v1.2.3 From 837711f862bb71ac263837a0f0714dd8cc4ef7ea Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 16 Jan 2009 16:33:05 +0800 Subject: ocfs2: return f_fsid info in ocfs2_statfs() Currently the f_fsid of struct kstatfs returned from ocfs2_statfs() is undefined (vfs layer fills in 0 as default). Since in some conditions, f_fsid value might be used in a (f_fsid, ino) pair to uniquely identify a file, ocfs2 should return a unique defined f_fsid value from ocfs2_statfs(). Because uuid_str is the same on big or litlle endian machine, it's endian consistent to use osb->uuid_str to generate f_fsid value. Signed-off-by: Coly Li Cc: Sunil Mushran Cc: Mark Fasheh Signed-off-by: Joel Becker --- fs/ocfs2/super.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 960673004df1..14f47d2bfe02 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1647,6 +1647,10 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = buf->f_bfree; buf->f_files = numbits; buf->f_ffree = freebits; + buf->f_fsid.val[0] = crc32_le(0, osb->uuid_str, OCFS2_VOL_UUID_LEN) + & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = crc32_le(0, osb->uuid_str + OCFS2_VOL_UUID_LEN, + OCFS2_VOL_UUID_LEN) & 0xFFFFFFFFUL; brelse(bh); -- cgit v1.2.3 From 5a1eb5c4453207ad9e7f6e8ca4f8db289743c993 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 30 Oct 2009 15:03:54 +1100 Subject: powerpc: Cleanup Kconfig selection of hugetlbfs support Signed-off-by: Benjamin Herrenschmidt --- fs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 2126078a38ed..64d44efad7a5 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -135,7 +135,7 @@ config TMPFS_POSIX_ACL config HUGETLBFS bool "HugeTLB file system support" - depends on X86 || IA64 || PPC_BOOK3S_64 || SPARC64 || (S390 && 64BIT) || \ + depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \ SYS_SUPPORTS_HUGETLBFS || BROKEN help hugetlbfs is a filesystem backing for HugeTLB pages, based on -- cgit v1.2.3 From 3b826386d376e5545d2e92b2da5ebd965cafae97 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 30 Oct 2009 09:27:07 +0100 Subject: xfs: free temporary cursor in xfs_dialloc Commit bd169565993b39b9b4b102cdac8b13e0a259ce2f seems to have a slight regression where this code path: if (!--searchdistance) { /* * Not in range - save last search * location and allocate a new inode */ ... goto newino; } doesn't free the temporary cursor (tcur) that got dup'd in this function. This leaks an item in the xfs_btree_cur zone, and it's caught on module unload: =========================================================== BUG xfs_btree_cur: Objects remaining on kmem_cache_close() ----------------------------------------------------------- It seems like maybe a single free at the end of the function might be cleaner, but for now put a del_cursor right in this code block similar to the handling in the rest of the function. Signed-off-by: Eric Sandeen Signed-off-by: Christoph Hellwig --- fs/xfs/xfs_ialloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index ab64f3efb43b..0785797db828 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -880,6 +880,7 @@ nextag: * Not in range - save last search * location and allocate a new inode */ + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); pag->pagl_leftrec = trec.ir_startino; pag->pagl_rightrec = rec.ir_startino; pag->pagl_pagino = pagino; -- cgit v1.2.3 From c7ff91d722e44c98504e6e2c357b47e1988dfbbd Mon Sep 17 00:00:00 2001 From: Ryota Yamauchi Date: Fri, 30 Oct 2009 09:27:44 +0100 Subject: xfs: fix xfs_quota remove error The xfs_quota returns ENOSYS when remove command is executed. Reproducable with following steps. # mount -t xfs -o uquota /dev/sda7 /mnt/mp1 # xfs_quota -x -c off -c remove XFS_QUOTARM: Function not implemented. The remove command is allowed during quotaoff, but xfs_fs_set_xstate() checks whether quota is running, and it leads to ENOSYS. To solve this problem, add a check for X_QUOTARM. Signed-off-by: Ryota Yamauchi Signed-off-by: Utako Kusaka Signed-off-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_quotaops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c index cb6e2cca214f..13cc7b524d6e 100644 --- a/fs/xfs/linux-2.6/xfs_quotaops.c +++ b/fs/xfs/linux-2.6/xfs_quotaops.c @@ -80,7 +80,7 @@ xfs_fs_set_xstate( if (sb->s_flags & MS_RDONLY) return -EROFS; - if (!XFS_IS_QUOTA_RUNNING(mp)) + if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp)) return -ENOSYS; if (!capable(CAP_SYS_ADMIN)) return -EPERM; -- cgit v1.2.3 From ad0bf11070ebb3c95f8ce82e6219dbd79c8e8b69 Mon Sep 17 00:00:00 2001 From: Alberto Bertogli Date: Mon, 2 Nov 2009 11:39:22 +0100 Subject: bio_put(): add bio_clone() to the list of functions in the comment In bio_put()'s comment, add bio_clone() to the list of functions that can give you a bio reference. Signed-off-by: Alberto Bertogli Signed-off-by: Jens Axboe --- fs/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 402cb84a92a1..bcab9e93d3a7 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -407,7 +407,7 @@ EXPORT_SYMBOL(zero_fill_bio); * * Description: * Put a reference to a &struct bio, either one you have gotten with - * bio_alloc or bio_get. The last put of a bio will free it. + * bio_alloc, bio_get or bio_clone. The last put of a bio will free it. **/ void bio_put(struct bio *bio) { -- cgit v1.2.3 From 5f04eeb8a76521dec371ceb05e8263889a8af2bc Mon Sep 17 00:00:00 2001 From: Alberto Bertogli Date: Mon, 2 Nov 2009 11:39:42 +0100 Subject: Fix bio_alloc() and bio_kmalloc() documentation Commit 451a9ebf accidentally broke bio_alloc() and bio_kmalloc() comments by (almost) swapping them. This patch fixes that, by placing the comments in the right place. Signed-off-by: Alberto Bertogli Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/bio.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index bcab9e93d3a7..12da5db8682c 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -325,8 +325,16 @@ static void bio_fs_destructor(struct bio *bio) * @gfp_mask: allocation mask to use * @nr_iovecs: number of iovecs * - * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask - * contains __GFP_WAIT, the allocation is guaranteed to succeed. + * bio_alloc will allocate a bio and associated bio_vec array that can hold + * at least @nr_iovecs entries. Allocations will be done from the + * fs_bio_set. Also see @bio_alloc_bioset and @bio_kmalloc. + * + * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate + * a bio. This is due to the mempool guarantees. To make this work, callers + * must never allocate more than 1 bio at a time from this pool. Callers + * that need to allocate more than 1 bio must always submit the previously + * allocated bio for IO before attempting to allocate a new one. Failure to + * do so can cause livelocks under memory pressure. * * RETURNS: * Pointer to new bio on success, NULL on failure. @@ -350,21 +358,13 @@ static void bio_kmalloc_destructor(struct bio *bio) } /** - * bio_alloc - allocate a bio for I/O + * bio_kmalloc - allocate a bio for I/O using kmalloc() * @gfp_mask: the GFP_ mask given to the slab allocator * @nr_iovecs: number of iovecs to pre-allocate * * Description: - * bio_alloc will allocate a bio and associated bio_vec array that can hold - * at least @nr_iovecs entries. Allocations will be done from the - * fs_bio_set. Also see @bio_alloc_bioset. - * - * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate - * a bio. This is due to the mempool guarantees. To make this work, callers - * must never allocate more than 1 bio at a time from this pool. Callers - * that need to allocate more than 1 bio must always submit the previously - * allocated bio for IO before attempting to allocate a new one. Failure to - * do so can cause livelocks under memory pressure. + * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask contains + * %__GFP_WAIT, the allocation is guaranteed to succeed. * **/ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) -- cgit v1.2.3 From f91b90993f0d286be89f06c2f547ced8cfe291c6 Mon Sep 17 00:00:00 2001 From: Martin Stava Date: Mon, 2 Nov 2009 08:39:35 -0600 Subject: 9p: fix a small bug in readdir for long directories Here is a proposed patch for bug in readdir. Listing of dirs with many files fails without this patch. Signed-off-by: Martin Stava Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_dir.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 873cd31baa47..cae53d405f21 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -90,6 +90,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) if (err <= 0) break; + i = 0; n = err; while (i < n) { err = p9stat_read(statbuf + i, buflen-i, &st, -- cgit v1.2.3 From 2511cd0b3b9e9b1c3e9360cc565c3745ac3f3f3f Mon Sep 17 00:00:00 2001 From: Martin Stava Date: Mon, 2 Nov 2009 08:39:34 -0600 Subject: 9p: fix readlink I do not know if you've looked on the patch, but unfortunately it is incorrect. A suggested better version is in this email (the old version didn't work in case the user provided buffer was not long enough - it incorrectly appended null byte on a position of last char, and thus broke the contract of the readlink method). However, I'm still not sure this is 100% correct thing to do, I think readlink is supposed to return buffer without last null byte in all cases, but we do return last null byte (even the old version).. on the other hand it is likely unspecified what is in the remaining part of the buffer, so null character may be fine there ;): Signed-off-by: Martin Stava Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 5947628aefef..18f74ec4dce9 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -994,8 +994,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) P9_DPRINTK(P9_DEBUG_VFS, "%s -> %s (%s)\n", dentry->d_name.name, st->extension, buffer); - retval = buflen; - + retval = strnlen(buffer, buflen); done: kfree(st); return retval; @@ -1062,7 +1061,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd) __putname(link); link = ERR_PTR(len); } else - link[len] = 0; + link[min(len, PATH_MAX-1)] = 0; } nd_set_link(nd, link); -- cgit v1.2.3 From 3e2796a90cf349527e50b3bc4d0b2f4019b1ce7a Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Mon, 2 Nov 2009 08:39:28 -0600 Subject: 9p: fix readdir corner cases The patch below also addresses a couple of other corner cases in readdir seen with a large (e.g. 64k) msize. I'm not sure what people think of my co-opting of fid->aux here. I'd be happy to rework if there's a better way. When the size of the user supplied buffer passed to readdir is smaller than the data returned in one go by the 9P read request, v9fs_dir_readdir() currently discards extra data so that, on the next call, a 9P read request will be issued with offset < previous offset + bytes returned, which voilates the constraint described in paragraph 3 of read(5) description. This patch preseves the leftover data in fid->aux for use in the next call. Signed-off-by: Jim Garlick Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_dir.c | 94 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 67 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index cae53d405f21..15cce53bf61e 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -39,6 +39,24 @@ #include "v9fs_vfs.h" #include "fid.h" +/** + * struct p9_rdir - readdir accounting + * @mutex: mutex protecting readdir + * @head: start offset of current dirread buffer + * @tail: end offset of current dirread buffer + * @buf: dirread buffer + * + * private structure for keeping track of readdir + * allocated on demand + */ + +struct p9_rdir { + struct mutex mutex; + int head; + int tail; + uint8_t *buf; +}; + /** * dt_type - return file type * @mistat: mistat structure @@ -70,57 +88,79 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) { int over; struct p9_wstat st; - int err; + int err = 0; struct p9_fid *fid; int buflen; - char *statbuf; - int n, i = 0; + int reclen = 0; + struct p9_rdir *rdir; P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); fid = filp->private_data; buflen = fid->clnt->msize - P9_IOHDRSZ; - statbuf = kmalloc(buflen, GFP_KERNEL); - if (!statbuf) - return -ENOMEM; - - while (1) { - err = v9fs_file_readn(filp, statbuf, NULL, buflen, - fid->rdir_fpos); - if (err <= 0) - break; - - i = 0; - n = err; - while (i < n) { - err = p9stat_read(statbuf + i, buflen-i, &st, - fid->clnt->dotu); + + /* allocate rdir on demand */ + if (!fid->rdir) { + rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL); + + if (rdir == NULL) { + err = -ENOMEM; + goto exit; + } + spin_lock(&filp->f_dentry->d_lock); + if (!fid->rdir) { + rdir->buf = (uint8_t *)rdir + sizeof(struct p9_rdir); + mutex_init(&rdir->mutex); + rdir->head = rdir->tail = 0; + fid->rdir = (void *) rdir; + rdir = NULL; + } + spin_unlock(&filp->f_dentry->d_lock); + kfree(rdir); + } + rdir = (struct p9_rdir *) fid->rdir; + + err = mutex_lock_interruptible(&rdir->mutex); + while (err == 0) { + if (rdir->tail == rdir->head) { + err = v9fs_file_readn(filp, rdir->buf, NULL, + buflen, filp->f_pos); + if (err <= 0) + goto unlock_and_exit; + + rdir->head = 0; + rdir->tail = err; + } + + while (rdir->head < rdir->tail) { + err = p9stat_read(rdir->buf + rdir->head, + buflen - rdir->head, &st, + fid->clnt->dotu); if (err) { P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; p9stat_free(&st); - goto free_and_exit; + goto unlock_and_exit; } - - i += st.size+2; - fid->rdir_fpos += st.size+2; + reclen = st.size+2; over = filldir(dirent, st.name, strlen(st.name), filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st)); - filp->f_pos += st.size+2; - p9stat_free(&st); if (over) { err = 0; - goto free_and_exit; + goto unlock_and_exit; } + rdir->head += reclen; + filp->f_pos += reclen; } } -free_and_exit: - kfree(statbuf); +unlock_and_exit: + mutex_unlock(&rdir->mutex); +exit: return err; } -- cgit v1.2.3 From d4da6c9ccf648f3f1cb5bf9d981a62c253d30e28 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 2 Nov 2009 10:15:27 -0800 Subject: Revert "ext4: Remove journal_checksum mount option and enable it by default" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit d0646f7b636d067d715fab52a2ba9c6f0f46b0d7, as requested by Eric Sandeen. It can basically cause an ext4 filesystem to miss recovery (and thus get mounted with errors) if the journal checksum does not match. Quoth Eric: "My hand-wavy hunch about what is happening is that we're finding a bad checksum on the last partially-written transaction, which is not surprising, but if we have a wrapped log and we're doing the initial scan for head/tail, and we abort scanning on that bad checksum, then we are essentially running an unrecovered filesystem. But that's hand-wavy and I need to go look at the code. We lived without journal checksums on by default until now, and at this point they're doing more harm than good, so we should revert the default-changing commit until we can fix it and do some good power-fail testing with the fixes in place." See http://bugzilla.kernel.org/show_bug.cgi?id=14354 for all the gory details. Requested-by: Eric Sandeen Cc: Theodore Tso Cc: Alexey Fisher Cc: Maxim Levitsky Cc: Aneesh Kumar K.V Cc: Mathias Burén Signed-off-by: Linus Torvalds --- fs/ext4/ext4.h | 1 + fs/ext4/super.c | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 984ca0cb38c3..00d153f2f261 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -743,6 +743,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 312211ee05af..d4ca92aab514 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1300,9 +1300,11 @@ static int parse_options(char *options, struct super_block *sb, *journal_devnum = option; break; case Opt_journal_checksum: - break; /* Kept for backwards compatibility */ + set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); + break; case Opt_journal_async_commit: set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); + set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); break; case Opt_noload: set_opt(sbi->s_mount_opt, NOLOAD); @@ -2759,14 +2761,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount4; } - jbd2_journal_set_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); - if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) - jbd2_journal_set_features(sbi->s_journal, 0, 0, + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + jbd2_journal_set_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); - else + } else if (test_opt(sb, JOURNAL_CHECKSUM)) { + jbd2_journal_set_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); jbd2_journal_clear_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } else { + jbd2_journal_clear_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } /* We have now updated the journal if required, so we can * validate the data journaling mode. */ -- cgit v1.2.3 From fa5d11133b07053270e18fa9c18560e66e79217e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 2 Nov 2009 18:50:49 -0500 Subject: ext4: discard preallocation when restarting a transaction during truncate When restart a transaction during a truncate operation, we drop and reacquire i_data_sem. After reacquiring i_data_sem, we need to discard any inode-based preallocation that might have been grabbed while we released i_data_sem (for example, if pdflush is allocating blocks and racing against the truncate). Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5c5bc5dafff8..d1ec698a91d1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -193,7 +193,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) * so before we call here everything must be consistently dirtied against * this transaction. */ - int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, +int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, int nblocks) { int ret; @@ -209,6 +209,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) up_write(&EXT4_I(inode)->i_data_sem); ret = ext4_journal_restart(handle, blocks_for_truncate(inode)); down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); return ret; } -- cgit v1.2.3 From b1e19e5601277845b4f17ecd7c9ba04f73ee11aa Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Tue, 3 Nov 2009 00:25:53 +0900 Subject: nilfs2: fix dirty page accounting leak causing hang at write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bruno Prémont and Dunphy, Bill noticed me that NILFS will certainly hang on ARM-based targets. I found this was caused by an underflow of dirty pages counter. A b-tree cache routine was marking page dirty without adjusting page account information. This fixes the dirty page accounting leak and resolves the hang on arm-based targets. Reported-by: Bruno Prémont Reported-by: Dunphy, Bill Signed-off-by: Ryusuke Konishi Tested-by: Bruno Prémont Cc: stable --- fs/nilfs2/btnode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 5941958f1e47..435864ce06be 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -276,8 +276,7 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc, "invalid oldkey %lld (newkey=%lld)", (unsigned long long)oldkey, (unsigned long long)newkey); - if (!test_set_buffer_dirty(obh) && TestSetPageDirty(opage)) - BUG(); + nilfs_btnode_mark_dirty(obh); spin_lock_irq(&btnc->tree_lock); radix_tree_delete(&btnc->page_tree, oldkey); -- cgit v1.2.3 From aeda7f6343e6375a832e52ff5ed389c115023ca5 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 2 Nov 2009 15:08:13 +0900 Subject: nilfs2: fix irregular checkpoint creation due to data flush When nilfs flushes out dirty data to reduce memory pressure, creation of checkpoints is wrongly postponed. This bug causes irregular checkpoint creation especially in small footprint systems. To correct this issue, a timer for the checkpoint creation has to be continued if a log writer does not create a checkpoint. This will do the correction. Signed-off-by: Ryusuke Konishi --- fs/nilfs2/segment.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 683df89dbae5..6eff66a070d5 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2468,17 +2468,22 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, /* Clear requests (even when the construction failed) */ spin_lock(&sci->sc_state_lock); - sci->sc_state &= ~NILFS_SEGCTOR_COMMIT; - if (req->mode == SC_LSEG_SR) { + sci->sc_state &= ~NILFS_SEGCTOR_COMMIT; sci->sc_seq_done = req->seq_accepted; nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err); sci->sc_flush_request = 0; - } else if (req->mode == SC_FLUSH_FILE) - sci->sc_flush_request &= ~FLUSH_FILE_BIT; - else if (req->mode == SC_FLUSH_DAT) - sci->sc_flush_request &= ~FLUSH_DAT_BIT; + } else { + if (req->mode == SC_FLUSH_FILE) + sci->sc_flush_request &= ~FLUSH_FILE_BIT; + else if (req->mode == SC_FLUSH_DAT) + sci->sc_flush_request &= ~FLUSH_DAT_BIT; + /* re-enable timer if checkpoint creation was not done */ + if (sci->sc_timer && (sci->sc_state & NILFS_SEGCTOR_COMMIT) && + time_before(jiffies, sci->sc_timer->expires)) + add_timer(sci->sc_timer); + } spin_unlock(&sci->sc_state_lock); } -- cgit v1.2.3 From 05b4358ad564d7a6a51b3717afe771d36711e9c4 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 14 Sep 2009 01:20:35 +0900 Subject: nilfs2: add zero-fill for new btree node buffers Adds missing initialization of newly allocated b-tree node buffers. This avoids garbage data to be mixed in b-tree node blocks. Signed-off-by: Ryusuke Konishi --- fs/nilfs2/btnode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 435864ce06be..84c25382f8e3 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -87,6 +87,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, brelse(bh); BUG(); } + memset(bh->b_data, 0, 1 << inode->i_blkbits); bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; bh->b_blocknr = blocknr; set_buffer_mapped(bh); -- cgit v1.2.3 From 109f55651954def97fa41ee71c464d268c512ab0 Mon Sep 17 00:00:00 2001 From: Mingming Date: Tue, 10 Nov 2009 10:48:08 -0500 Subject: ext4: fix ext4_ext_direct_IO()'s return value after converting uninit extents After a direct I/O request covering an uninitalized extent (i.e., created using the fallocate system call) or a hole in a file, ext4 will convert the uninitialized extent so it is marked as initialized by calling ext4_convert_unwritten_extents(). This function returns zero on success. This return value was getting returned by ext4_direct_IO(); however the file system's direct_IO function is supposed to return the number of bytes read or written on a success. By returning zero, it confused the direct I/O code into falling back to buffered I/O unnecessarily. Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 1 + fs/ext4/inode.c | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 10539e364283..441716f33b4a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3519,6 +3519,7 @@ retry: * * This function is called from the direct IO end io call back * function, to convert the fallocated extents after IO is completed. + * Returns 0 on success. */ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, loff_t len) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d1ec698a91d1..12d727f8fedf 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3772,13 +3772,17 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { ext4_free_io_end(iocb->private); iocb->private = NULL; - } else if (ret > 0) + } else if (ret > 0) { + int err; /* * for non AIO case, since the IO is already * completed, we could do the convertion right here */ - ret = ext4_convert_unwritten_extents(inode, - offset, ret); + err = ext4_convert_unwritten_extents(inode, + offset, ret); + if (err < 0) + ret = err; + } return ret; } -- cgit v1.2.3 From 5f5249507e4b5c4fc0f9c93f33d133d8c95f47e1 Mon Sep 17 00:00:00 2001 From: Mingming Date: Tue, 10 Nov 2009 10:48:04 -0500 Subject: ext4: skip conversion of uninit extents after direct IO if there isn't any At the end of direct I/O operation, ext4_ext_direct_IO() always called ext4_convert_unwritten_extents(), regardless of whether there were any unwritten extents involved in the I/O or not. This commit adds a state flag so that ext4_ext_direct_IO() only calls ext4_convert_unwritten_extents() when necessary. Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 1 + fs/ext4/extents.c | 22 +++++++++++++++++----- fs/ext4/inode.c | 4 +++- 3 files changed, 21 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 00d153f2f261..8825515eeddd 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -322,6 +322,7 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) #define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ #define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ #define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */ +#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/ /* Used to pass group descriptor data when online resize is done */ struct ext4_new_group_input { diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 441716f33b4a..e991ae2b461c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3048,12 +3048,18 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ret = ext4_split_unwritten_extents(handle, inode, path, iblock, max_blocks, flags); - /* flag the io_end struct that we need convert when IO done */ + /* + * Flag the inode(non aio case) or end_io struct (aio case) + * that this IO needs to convertion to written when IO is + * completed + */ if (io) io->flag = DIO_AIO_UNWRITTEN; + else + EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN; goto out; } - /* DIO end_io complete, convert the filled extent to written */ + /* async DIO end_io complete, convert the filled extent to written */ if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { ret = ext4_convert_unwritten_extents_dio(handle, inode, path); @@ -3295,10 +3301,16 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * To avoid unecessary convertion for every aio dio rewrite * to the mid of file, here we flag the IO that is really * need the convertion. - * + * For non asycn direct IO case, flag the inode state + * that we need to perform convertion when IO is done. */ - if (io && flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) - io->flag = DIO_AIO_UNWRITTEN; + if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { + if (io) + io->flag = DIO_AIO_UNWRITTEN; + else + EXT4_I(inode)->i_state |= + EXT4_STATE_DIO_UNWRITTEN;; + } } err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 12d727f8fedf..a9ed2bce74d1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3772,7 +3772,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { ext4_free_io_end(iocb->private); iocb->private = NULL; - } else if (ret > 0) { + } else if (ret > 0 && (EXT4_I(inode)->i_state & + EXT4_STATE_DIO_UNWRITTEN)) { int err; /* * for non AIO case, since the IO is already @@ -3782,6 +3783,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, offset, ret); if (err < 0) ret = err; + EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN; } return ret; } -- cgit v1.2.3 From 4b70df181611012a3556f017b57dfcef7e1d279f Mon Sep 17 00:00:00 2001 From: Mingming Date: Tue, 3 Nov 2009 14:44:54 -0500 Subject: ext4: code clean up for dio fallocate handling The ext4_debug() call in ext4_end_io_dio() should be moved after the check to make sure that io_end is non-NULL. The comment above ext4_get_block_dio_write() ("Maximum number of blocks...") is a duplicate; the original and correct comment is above the #define DIO_MAX_BLOCKS up above. Based on review comments from Curt Wohlgemuth. Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a9ed2bce74d1..2c8caa51addb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3446,8 +3446,6 @@ out: return ret; } -/* Maximum number of blocks we map for direct IO at once. */ - static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -3655,13 +3653,14 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ext4_io_end_t *io_end = iocb->private; struct workqueue_struct *wq; + /* if not async direct IO or dio with 0 bytes write, just return */ + if (!io_end || !size) + return; + ext_debug("ext4_end_io_dio(): io_end 0x%p" "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", iocb->private, io_end->inode->i_ino, iocb, offset, size); - /* if not async direct IO or dio with 0 bytes write, just return */ - if (!io_end || !size) - return; /* if not aio dio with unwritten extents, just free io and return */ if (io_end->flag != DIO_AIO_UNWRITTEN){ -- cgit v1.2.3 From f60311d5f7670d9539b424e4ed8b5c0872fc9e83 Mon Sep 17 00:00:00 2001 From: "Anand V. Avati" Date: Thu, 22 Oct 2009 06:24:52 -0700 Subject: fuse: prevent fuse_put_request on invalid pointer fuse_direct_io() has a loop where requests are allocated in each iteration. if allocation fails, the loop is broken out and follows into an unconditional fuse_put_request() on that invalid pointer. Signed-off-by: Anand V. Avati Signed-off-by: Miklos Szeredi Cc: stable@kernel.org --- fs/fuse/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a3492f7d207c..5887a6395ad2 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1063,7 +1063,8 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, break; } } - fuse_put_request(fc, req); + if (!IS_ERR(req)) + fuse_put_request(fc, req); if (res > 0) *ppos = pos; -- cgit v1.2.3 From 0bd87182d3ab18a32a8e9175d3f68754c58e3432 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 3 Nov 2009 11:40:44 +0100 Subject: fuse: fix kunmap in fuse_ioctl_copy_user Looks like another victim of the confusing kmap() vs kmap_atomic() API differences. Reported-by: Todor Gyumyushev Signed-off-by: Jens Axboe Signed-off-by: Miklos Szeredi Cc: Tejun Heo Cc: stable@kernel.org --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5887a6395ad2..c18913a777ae 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1600,7 +1600,7 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, kaddr += copy; } - kunmap(map); + kunmap(page); } return 0; -- cgit v1.2.3 From 5219f346b0ea2a2a8821f1e966b190788c285b0b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 4 Nov 2009 10:24:52 +0100 Subject: fuse: invalidate target of rename Invalidate the target's attributes, which may have changed (such as nlink, change time) so that they are refreshed on the next getattr(). Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 992f6c9410bb..8ada78aade58 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -712,8 +712,10 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, fuse_invalidate_attr(newdir); /* newent will end up negative */ - if (newent->d_inode) + if (newent->d_inode) { + fuse_invalidate_attr(newent->d_inode); fuse_invalidate_entry_cache(newent); + } } else if (err == -EINTR) { /* If request was interrupted, DEITY only knows if the rename actually took place. If the invalidation -- cgit v1.2.3 From 4c3da2209b1261af9a948b7509a38904c8eee554 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 4 Nov 2009 02:50:06 -0800 Subject: sysfs: Don't leak secdata when a sysfs_dirent is freed. While refreshing my sysfs patches I noticed a leak in the secdata implementation. We don't free the secdata when we free the sysfs dirent. This is a bug in 2.6.32-rc5 that we really should close. Signed-off-by: Eric W. Biederman Acked-by: Serge Hallyn Signed-off-by: James Morris --- fs/sysfs/dir.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 5fad489ce5bc..e0201837d244 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "sysfs.h" DEFINE_MUTEX(sysfs_mutex); @@ -285,6 +286,9 @@ void release_sysfs_dirent(struct sysfs_dirent * sd) sysfs_put(sd->s_symlink.target_sd); if (sysfs_type(sd) & SYSFS_COPY_NAME) kfree(sd->s_name); + if (sd->s_iattr && sd->s_iattr->ia_secdata) + security_release_secctx(sd->s_iattr->ia_secdata, + sd->s_iattr->ia_secdata_len); kfree(sd->s_iattr); sysfs_free_ino(sd->s_ino); kmem_cache_free(sysfs_dir_cachep, sd); -- cgit v1.2.3 From ba230c3f6dc88ec008806adb27b12088486d508e Mon Sep 17 00:00:00 2001 From: Mingming Date: Fri, 6 Nov 2009 04:01:23 -0500 Subject: ext4: Fix return value of ext4_split_unwritten_extents() to fix direct I/O To prepare for a direct I/O write, we need to split the unwritten extents before submitting the I/O. When no extents needed to be split, ext4_split_unwritten_extents() was incorrectly returning 0 instead of the size of uninitialized extents. This bug caused the wrong return value sent back to VFS code when it gets called from async IO path, leading to an unnecessary fall back to buffered IO. This bug also hid the fact that the check to see whether or not a split would be necessary was incorrect; we can only skip splitting the extent if the write completely covers the uninitialized extent. Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e991ae2b461c..715264b4bae4 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2807,6 +2807,8 @@ fix_extent_len: * into three uninitialized extent(at most). After IO complete, the part * being filled will be convert to initialized by the end_io callback function * via ext4_convert_unwritten_extents(). + * + * Returns the size of uninitialized extent to be written on success. */ static int ext4_split_unwritten_extents(handle_t *handle, struct inode *inode, @@ -2824,7 +2826,6 @@ static int ext4_split_unwritten_extents(handle_t *handle, unsigned int allocated, ee_len, depth; ext4_fsblk_t newblock; int err = 0; - int ret = 0; ext_debug("ext4_split_unwritten_extents: inode %lu," "iblock %llu, max_blocks %u\n", inode->i_ino, @@ -2842,12 +2843,12 @@ static int ext4_split_unwritten_extents(handle_t *handle, ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); /* - * if the entire unintialized extent length less than - * the size of extent to write, there is no need to split - * uninitialized extent + * If the uninitialized extent begins at the same logical + * block where the write begins, and the write completely + * covers the extent, then we don't need to split it. */ - if (allocated <= max_blocks) - return ret; + if ((iblock == ee_block) && (allocated <= max_blocks)) + return allocated; err = ext4_ext_get_access(handle, inode, path + depth); if (err) -- cgit v1.2.3 From ec06aedd44541129840ed52e6165afa3796a27bf Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 6 Nov 2009 14:18:29 -0500 Subject: cifs: clean up handling when server doesn't consistently support inode numbers It's possible that a server will return a valid FileID when we query the FILE_INTERNAL_INFO for the root inode, but then zeroed out inode numbers when we do a FindFile with an infolevel of SMB_FIND_FILE_ID_FULL_DIR_INFO. In this situation turn off querying for server inode numbers, generate a warning for the user and just generate an inode number using iunique. Once we generate any inode number with iunique we can no longer use any server inode numbers or we risk collisions, so ensure that we don't do that in cifs_get_inode_info either. Cc: Stable Reported-by: Timothy Normand Miller Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 1 + fs/cifs/inode.c | 7 ++----- fs/cifs/misc.c | 14 ++++++++++++++ fs/cifs/readdir.c | 7 ++++--- 4 files changed, 21 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 6928c24d1d42..5646727e33f5 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -388,4 +388,5 @@ extern int CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon, const struct nls_table *nls_codepage, int remap_special_chars); extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon, const int netfid, __u64 *pExtAttrBits, __u64 *pMask); +extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb); #endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 5e2492535daa..cababd8a52df 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -512,13 +512,10 @@ int cifs_get_inode_info(struct inode **pinode, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc1) { + if (rc1 || !fattr.cf_uniqueid) { cFYI(1, ("GetSrvInodeNum rc %d", rc1)); fattr.cf_uniqueid = iunique(sb, ROOT_I); - /* disable serverino if call not supported */ - if (rc1 == -EINVAL) - cifs_sb->mnt_cifs_flags &= - ~CIFS_MOUNT_SERVER_INUM; + cifs_autodisable_serverino(cifs_sb); } } else { fattr.cf_uniqueid = iunique(sb, ROOT_I); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 0241b25ac33f..1e25efcb55c8 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -715,3 +715,17 @@ cifsConvertToUCS(__le16 *target, const char *source, int maxlen, ctoUCS_out: return i; } + +void +cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) +{ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { + cifs_sb->mnt_cifs_flags &= CIFS_MOUNT_SERVER_INUM; + cERROR(1, ("Autodisabling the use of server inode numbers on " + "%s. This server doesn't seem to support them " + "properly. Hardlinks will not be recognized on this " + "mount. Consider mounting with the \"noserverino\" " + "option to silence this message.", + cifs_sb->tcon->treeName)); + } +} diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 1f098ca71636..f84062f9a985 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -727,11 +727,12 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir, cifs_dir_info_to_fattr(&fattr, (FILE_DIRECTORY_INFO *) pfindEntry, cifs_sb); - /* FIXME: make _to_fattr functions fill this out */ - if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_ID_FULL_DIR_INFO) + if (inum && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { fattr.cf_uniqueid = inum; - else + } else { fattr.cf_uniqueid = iunique(sb, ROOT_I); + cifs_autodisable_serverino(cifs_sb); + } ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr); -- cgit v1.2.3 From f475f6775465283494346663f201ad04810d2e8a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 6 Nov 2009 14:18:49 -0500 Subject: cifs: don't use CIFSGetSrvInodeNumber in is_path_accessible Because it's lighter weight, CIFS tries to use CIFSGetSrvInodeNumber to verify the accessibility of the root inode and then falls back to doing a full QPathInfo if that fails with -EOPNOTSUPP. I have at least a report of a server that returns NT_STATUS_INTERNAL_ERROR rather than something that translates to EOPNOTSUPP. Rather than trying to be clever with that call, just have is_path_accessible do a normal QPathInfo. That call is widely supported and it shouldn't increase the overhead significantly. Cc: Stable Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index b09098079916..63ea83ff687f 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2220,16 +2220,8 @@ is_path_accessible(int xid, struct cifsTconInfo *tcon, struct cifs_sb_info *cifs_sb, const char *full_path) { int rc; - __u64 inode_num; FILE_ALL_INFO *pfile_info; - rc = CIFSGetSrvInodeNumber(xid, tcon, full_path, &inode_num, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc != -EOPNOTSUPP) - return rc; - pfile_info = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); if (pfile_info == NULL) return -ENOMEM; -- cgit v1.2.3 From 5399dd1fc8f5e812db931225ef5f67d89f3b1a56 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sat, 7 Nov 2009 18:45:16 +0900 Subject: nilfs2: fix kernel oops in error case of nilfs_ioctl_move_blocks This fixes a kernel oops reported by Markus Trippelsdorf in the email titled "[NILFS users] kernel Oops while running nilfs_cleanerd". The oops was caused by a bug of error path in nilfs_ioctl_move_blocks() function, which was inlined in nilfs_ioctl_clean_segments(). nilfs_ioctl_move_blocks checks duplication of blocks which will be moved in garbage collection. But, the check should have be done within nilfs_ioctl_move_inode_block() to prevent list corruption among buffers storing the target blocks. To fix the kernel oops, this moves forward the duplication check before the list insertion. I also tested this for stable trees [2.6.30, 2.6.31]. Reported-by: Markus Trippelsdorf Signed-off-by: Ryusuke Konishi Cc: stable --- fs/nilfs2/ioctl.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 6572ea4bc4df..89dd73ead9ac 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -297,7 +297,18 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode, (unsigned long long)vdesc->vd_vblocknr); return ret; } - bh->b_private = vdesc; + if (unlikely(!list_empty(&bh->b_assoc_buffers))) { + printk(KERN_CRIT "%s: conflicting %s buffer: ino=%llu, " + "cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu\n", + __func__, vdesc->vd_flags ? "node" : "data", + (unsigned long long)vdesc->vd_ino, + (unsigned long long)vdesc->vd_cno, + (unsigned long long)vdesc->vd_offset, + (unsigned long long)vdesc->vd_blocknr, + (unsigned long long)vdesc->vd_vblocknr); + brelse(bh); + return -EEXIST; + } list_add_tail(&bh->b_assoc_buffers, buffers); return 0; } @@ -335,24 +346,10 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs, list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { ret = nilfs_gccache_wait_and_mark_dirty(bh); if (unlikely(ret < 0)) { - if (ret == -EEXIST) { - vdesc = bh->b_private; - printk(KERN_CRIT - "%s: conflicting %s buffer: " - "ino=%llu, cno=%llu, offset=%llu, " - "blocknr=%llu, vblocknr=%llu\n", - __func__, - vdesc->vd_flags ? "node" : "data", - (unsigned long long)vdesc->vd_ino, - (unsigned long long)vdesc->vd_cno, - (unsigned long long)vdesc->vd_offset, - (unsigned long long)vdesc->vd_blocknr, - (unsigned long long)vdesc->vd_vblocknr); - } + WARN_ON(ret == -EEXIST); goto failed; } list_del_init(&bh->b_assoc_buffers); - bh->b_private = NULL; brelse(bh); } return nmembs; @@ -360,7 +357,6 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs, failed: list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { list_del_init(&bh->b_assoc_buffers); - bh->b_private = NULL; brelse(bh); } return ret; -- cgit v1.2.3 From c083234f1592ef3fad3d8083663c5e4a357ec77c Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sun, 8 Nov 2009 12:09:24 +0900 Subject: nilfs2: fix missing cleanup of gc cache on error cases This fixes an -rc1 regression brought by the commit: 1cf58fa840472ec7df6bf2312885949ebb308853 ("nilfs2: shorten freeze period due to GC in write operation v3"). Although the patch moved out a function call of nilfs_ioctl_move_blocks() to nilfs_ioctl_clean_segments() from nilfs_ioctl_prepare_clean_segments(), it didn't move corresponding cleanup job needed for the error case. This will move the missing cleanup job to the destination function. Signed-off-by: Ryusuke Konishi Acked-by: Jiro SEKIBA --- fs/nilfs2/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 89dd73ead9ac..d24057d58f17 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -467,7 +467,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, return 0; failed: - nilfs_remove_all_gcinode(nilfs); printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n", msg, ret); return ret; @@ -556,6 +555,8 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, else ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); + if (ret < 0) + nilfs_remove_all_gcinode(nilfs); clear_nilfs_gc_running(nilfs); out_free: -- cgit v1.2.3 From 1e424a348303694fabdf8b1efbfcb1a892dfa63a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 8 Nov 2009 15:45:44 -0500 Subject: ext4: partial revert to fix double brelse WARNING() This is a partial revert of commit 6487a9d (only the changes made to fs/ext4/namei.c), since it is causing the following brelse() double-free warning when running fsstress on a file system with 1k blocksize and we run into a block allocation failure while converting a single-block directory to a multi-block hash-tree indexed directory. WARNING: at fs/buffer.c:1197 __brelse+0x2e/0x33() Hardware name: VFS: brelse: Trying to free free buffer Modules linked in: Pid: 2226, comm: jbd2/sdd-8 Not tainted 2.6.32-rc6-00577-g0003f55 #101 Call Trace: [] warn_slowpath_common+0x65/0x95 [] warn_slowpath_fmt+0x29/0x2c [] __brelse+0x2e/0x33 [] jbd2_journal_refile_buffer+0x67/0x6c [] jbd2_journal_commit_transaction+0x319/0x14d8 [] ? try_to_del_timer_sync+0x58/0x60 [] ? sched_clock_cpu+0x12a/0x13e [] ? trace_hardirqs_off+0xb/0xd [] ? cpu_clock+0x3f/0x5b [] ? lock_release_holdtime+0x36/0x137 [] ? _spin_unlock_irqrestore+0x44/0x51 [] ? trace_hardirqs_on_caller+0x103/0x124 [] ? trace_hardirqs_on+0xb/0xd [] ? try_to_del_timer_sync+0x58/0x60 [] kjournald2+0x11a/0x310 [] ? autoremove_wake_function+0x0/0x38 [] ? kjournald2+0x0/0x310 [] kthread+0x66/0x6b [] ? kthread+0x0/0x6b [] kernel_thread_helper+0x7/0x10 ---[ end trace 5579351b86af61e3 ]--- Commit 6487a9d was an attempt some buffer head leaks in an ENOSPC error path, but in some cases it actually results in an excess ENOSPC, as shown above. Fixing this means cleaning up who is responsible for releasing the buffer heads from the callee to the caller of add_dirent_to_buf(). Since that's a relatively complex change, and we're late in the rcX development cycle, I'm reverting this now, and holding back a more complete fix until after 2.6.32 ships. We've lived with this buffer_head leak on ENOSPC in ext3 and ext4 for a very long time; a few more months won't kill us. Signed-off-by: "Theodore Ts'o" Cc: Curt Wohlgemuth --- fs/ext4/namei.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 7c8fe80bacdd..6d2c1b897fc7 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1518,12 +1518,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, return retval; if (blocks == 1 && !dx_fallback && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { - retval = make_indexed_dir(handle, dentry, inode, bh); - if (retval == -ENOSPC) - brelse(bh); - return retval; - } + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) + return make_indexed_dir(handle, dentry, inode, bh); brelse(bh); } bh = ext4_append(handle, dir, &block, &retval); @@ -1532,10 +1528,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); - retval = add_dirent_to_buf(handle, dentry, inode, de, bh); - if (retval == -ENOSPC) - brelse(bh); - return retval; + return add_dirent_to_buf(handle, dentry, inode, de, bh); } /* @@ -1664,8 +1657,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (!de) goto cleanup; err = add_dirent_to_buf(handle, dentry, inode, de, bh); - if (err != -ENOSPC) - bh = NULL; + bh = NULL; goto cleanup; journal_error: -- cgit v1.2.3 From 96d25e532234bec1a1989e6e1baf702d43a78b0d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Nov 2009 16:15:42 +0900 Subject: NFSv4: Fix a cache validation bug which causes getcwd() to return ENOENT Changeset a65318bf3afc93ce49227e849d213799b072c5fd (NFSv4: Simplify some cache consistency post-op GETATTRs) incorrectly changed the getattr bitmap for readdir(). This causes the readdir() function to fail to return a fileid/inode number, which again exposed a bug in the NFS readdir code that causes spurious ENOENT errors to appear in applications (see http://bugzilla.kernel.org/show_bug.cgi?id=14541). The immediate band aid is to revert the incorrect bitmap change, but more long term, we should change the NFS readdir code to cope with the fact that NFSv4 servers are not required to support fileids/inode numbers. Reported-by: Daniel J Blueman Cc: stable@kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ff37454fa783..741a562177fc 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2767,7 +2767,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, .pages = &page, .pgbase = 0, .count = count, - .bitmask = NFS_SERVER(dentry->d_inode)->cache_consistency_bitmask, + .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask, }; struct nfs4_readdir_res res; struct rpc_message msg = { -- cgit v1.2.3 From ea0174a7137c8ca9f130ca681f3a99c872da6778 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 12 Oct 2009 21:34:27 -0500 Subject: ext3: retry failed direct IO allocations On a 256M 4k block filesystem, doing this in a loop: dd if=/dev/zero of=test oflag=direct bs=1M count=64 rm -f test eventually leads to spurious ENOSPC: dd: writing `test': No space left on device As with other block allocation callers, it looks like we need to potentially retry the allocations on the initial ENOSPC. A similar patch went into ext4 (commit fbbf69456619de5d251cb9f1df609069178c62d5) Signed-off-by: Eric Sandeen Signed-off-by: Jan Kara --- fs/ext3/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index acf1b1423327..069a163393b4 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1735,6 +1735,7 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, ssize_t ret; int orphan = 0; size_t count = iov_length(iov, nr_segs); + int retries = 0; if (rw == WRITE) { loff_t final_size = offset + count; @@ -1757,9 +1758,12 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, } } +retry: ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ext3_get_block, NULL); + if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; if (orphan) { int err; -- cgit v1.2.3 From fe8bc91c4c30122b357d197117705cfd4fabaf28 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Oct 2009 19:26:15 +0200 Subject: ext3: Wait for proper transaction commit on fsync We cannot rely on buffer dirty bits during fsync because pdflush can come before fsync is called and clear dirty bits without forcing a transaction commit. What we do is that we track which transaction has last changed the inode and which transaction last changed allocation and force it to disk on fsync. Signed-off-by: Jan Kara Reviewed-by: Aneesh Kumar K.V --- fs/ext3/fsync.c | 36 ++++++++++++++++-------------------- fs/ext3/inode.c | 32 +++++++++++++++++++++++++++++++- fs/ext3/super.c | 2 ++ 3 files changed, 49 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index 451d166bbe93..8209f266e9ad 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -46,19 +46,21 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) { struct inode *inode = dentry->d_inode; + struct ext3_inode_info *ei = EXT3_I(inode); + journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; int ret = 0; + tid_t commit_tid; + + if (inode->i_sb->s_flags & MS_RDONLY) + return 0; J_ASSERT(ext3_journal_current_handle() == NULL); /* - * data=writeback: + * data=writeback,ordered: * The caller's filemap_fdatawrite()/wait will sync the data. - * sync_inode() will sync the metadata - * - * data=ordered: - * The caller's filemap_fdatawrite() will write the data and - * sync_inode() will write the inode if it is dirty. Then the caller's - * filemap_fdatawait() will wait on the pages. + * Metadata is in the journal, we wait for a proper transaction + * to commit here. * * data=journal: * filemap_fdatawrite won't do anything (the buffers are clean). @@ -73,22 +75,16 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) goto out; } - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - goto flush; + if (datasync) + commit_tid = atomic_read(&ei->i_datasync_tid); + else + commit_tid = atomic_read(&ei->i_sync_tid); - /* - * The VFS has written the file data. If the inode is unaltered - * then we need not start a commit. - */ - if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, /* sys_fsync did this */ - }; - ret = sync_inode(inode, &wbc); + if (log_start_commit(journal, commit_tid)) { + log_wait_commit(journal, commit_tid); goto out; } -flush: + /* * In case we didn't commit a transaction, we have to flush * disk caches manually so that data really is on persistent diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 069a163393b4..354ed3b47b30 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -699,8 +699,9 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, int err = 0; struct ext3_block_alloc_info *block_i; ext3_fsblk_t current_block; + struct ext3_inode_info *ei = EXT3_I(inode); - block_i = EXT3_I(inode)->i_block_alloc_info; + block_i = ei->i_block_alloc_info; /* * If we're splicing into a [td]indirect block (as opposed to the * inode) then we need to get write access to the [td]indirect block @@ -741,6 +742,8 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, inode->i_ctime = CURRENT_TIME_SEC; ext3_mark_inode_dirty(handle, inode); + /* ext3_mark_inode_dirty already updated i_sync_tid */ + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); /* had we spliced it onto indirect block? */ if (where->bh) { @@ -2754,6 +2757,8 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) struct ext3_inode_info *ei; struct buffer_head *bh; struct inode *inode; + journal_t *journal = EXT3_SB(sb)->s_journal; + transaction_t *transaction; long ret; int block; @@ -2831,6 +2836,30 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. + */ + if (journal) { + tid_t tid; + + spin_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + spin_unlock(&journal->j_state_lock); + atomic_set(&ei->i_sync_tid, tid); + atomic_set(&ei->i_datasync_tid, tid); + } + if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 && EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) { /* @@ -3015,6 +3044,7 @@ again: err = rc; ei->i_state &= ~EXT3_STATE_NEW; + atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); out_brelse: brelse (bh); ext3_std_error(inode->i_sb, err); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 7a520a862f49..427496c4767c 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -466,6 +466,8 @@ static struct inode *ext3_alloc_inode(struct super_block *sb) return NULL; ei->i_block_alloc_info = NULL; ei->vfs_inode.i_version = 1; + atomic_set(&ei->i_datasync_tid, 0); + atomic_set(&ei->i_sync_tid, 0); return &ei->vfs_inode; } -- cgit v1.2.3 From 7b02bec07efe1d6c7d48c786e0c1a38d28fe7245 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Tue, 10 Nov 2009 17:13:22 +0800 Subject: JBD/JBD2: free j_wbuf if journal init fails. If journal init fails, we need to free j_wbuf. Cc: Andrew Morton Cc: Jan Kara Signed-off-by: Tao Ma Signed-off-by: Jan Kara --- fs/jbd/journal.c | 2 ++ fs/jbd2/journal.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index bd3c073b485d..49d5cd6053c8 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -756,6 +756,7 @@ journal_t * journal_init_dev(struct block_device *bdev, return journal; out_err: + kfree(journal->j_wbuf); kfree(journal); return NULL; } @@ -820,6 +821,7 @@ journal_t * journal_init_inode (struct inode *inode) return journal; out_err: + kfree(journal->j_wbuf); kfree(journal); return NULL; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index b0ab5219becb..fed85388ee86 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -913,6 +913,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, return journal; out_err: + kfree(journal->j_wbuf); jbd2_stats_proc_exit(journal); kfree(journal); return NULL; @@ -986,6 +987,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) return journal; out_err: + kfree(journal->j_wbuf); jbd2_stats_proc_exit(journal); kfree(journal); return NULL; -- cgit v1.2.3 From 6346c93988caa3048bf4d81f9ba3608a7a195aa2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 10 Nov 2009 21:23:47 -0500 Subject: Btrfs: fix data allocation hint start Sometimes our start allocation hint when we cow a file can be either EXTENT_HOLE or some other such place holder, which is not optimal. So if we find that our em->block_start is one of these special values, check to see where the first block of the inode is stored, and use that as a hint. If that block is also a special value, just fallback on a hint of 0 and let the allocator figure out a good place to put the data. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 78139efe41fc..d8393ddc72a3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -743,8 +743,22 @@ static noinline int cow_file_range(struct inode *inode, em = search_extent_mapping(&BTRFS_I(inode)->extent_tree, start, num_bytes); if (em) { - alloc_hint = em->block_start; - free_extent_map(em); + /* + * if block start isn't an actual block number then find the + * first block in this inode and use that as a hint. If that + * block is also bogus then just don't worry about it. + */ + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + free_extent_map(em); + em = search_extent_mapping(em_tree, 0, 0); + if (em && em->block_start < EXTENT_MAP_LAST_BYTE) + alloc_hint = em->block_start; + if (em) + free_extent_map(em); + } else { + alloc_hint = em->block_start; + free_extent_map(em); + } } read_unlock(&BTRFS_I(inode)->extent_tree.lock); btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); -- cgit v1.2.3 From 249ac1e55c642c670f47aacdc57629bbbf10a8db Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 10 Nov 2009 21:23:48 -0500 Subject: Btrfs: cleanup transaction starting and fix journal_info usage We use journal_info to tell if we're in a nested transaction to make sure we don't commit the transaction within a nested transaction. We use another method to see if there are any outstanding ioctl trans handles, so if we're starting one do not set current->journal_info, since it will screw with other filesystems. This patch also cleans up the starting stuff so there aren't any magic numbers. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bca82a4ca8e6..c207e8c32c9b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -163,8 +163,14 @@ static void wait_current_trans(struct btrfs_root *root) } } +enum btrfs_trans_type { + TRANS_START, + TRANS_JOIN, + TRANS_USERSPACE, +}; + static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, - int num_blocks, int wait) + int num_blocks, int type) { struct btrfs_trans_handle *h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); @@ -172,7 +178,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, mutex_lock(&root->fs_info->trans_mutex); if (!root->fs_info->log_root_recovering && - ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2)) + ((type == TRANS_START && !root->fs_info->open_ioctl_trans) || + type == TRANS_USERSPACE)) wait_current_trans(root); ret = join_transaction(root); BUG_ON(ret); @@ -186,7 +193,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, h->alloc_exclude_start = 0; h->delayed_ref_updates = 0; - if (!current->journal_info) + if (!current->journal_info && type != TRANS_USERSPACE) current->journal_info = h; root->fs_info->running_transaction->use_count++; @@ -198,18 +205,18 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, int num_blocks) { - return start_transaction(root, num_blocks, 1); + return start_transaction(root, num_blocks, TRANS_START); } struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, int num_blocks) { - return start_transaction(root, num_blocks, 0); + return start_transaction(root, num_blocks, TRANS_JOIN); } struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, int num_blocks) { - return start_transaction(r, num_blocks, 2); + return start_transaction(r, num_blocks, TRANS_USERSPACE); } /* wait for a transaction commit to be fully complete */ -- cgit v1.2.3 From 01dea1efc23b511d3b58bb94da07ddb6d6db9895 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 10 Nov 2009 21:23:48 -0500 Subject: Btrfs: fix how we set max_size for free space clusters This patch fixes a problem where max_size can be set to 0 even though we filled the cluster properly. We set max_size to 0 if we restart the cluster window, but if the new start entry is big enough to be our new cluster then we could return with a max_size set to 0, which will mean the next time we try to allocate from this cluster it will fail. So set max_extent to the entry's size. Tested this on my box and now we actually allocate from the cluster after we fill it. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 5c2caad76212..cb2849f03251 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1296,7 +1296,7 @@ again: window_start = entry->offset; window_free = entry->bytes; last = entry; - max_extent = 0; + max_extent = entry->bytes; } else { last = next; window_free += next->bytes; -- cgit v1.2.3 From 5df6a9f606bf2ee25ab8031bff124ed883b823be Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 10 Nov 2009 21:23:48 -0500 Subject: Btrfs: fix some metadata enospc issues We weren't reserving metadata space for rename, rmdir and unlink, which could cause problems. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d8393ddc72a3..bb7fd8072802 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2488,7 +2488,19 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) root = BTRFS_I(dir)->root; + /* + * 5 items for unlink inode + * 1 for orphan + */ + ret = btrfs_reserve_metadata_space(root, 6); + if (ret) + return ret; + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + btrfs_unreserve_metadata_space(root, 6); + return PTR_ERR(trans); + } btrfs_set_trans_block_group(trans, dir); @@ -2503,6 +2515,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); + btrfs_unreserve_metadata_space(root, 6); btrfs_btree_balance_dirty(root, nr); return ret; } @@ -2583,7 +2596,16 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) return -ENOTEMPTY; + ret = btrfs_reserve_metadata_space(root, 5); + if (ret) + return ret; + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + btrfs_unreserve_metadata_space(root, 5); + return PTR_ERR(trans); + } + btrfs_set_trans_block_group(trans, dir); if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { @@ -2606,6 +2628,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) out: nr = trans->blocks_used; ret = btrfs_end_transaction_throttle(trans, root); + btrfs_unreserve_metadata_space(root, 5); btrfs_btree_balance_dirty(root, nr); if (ret && !err) @@ -5297,11 +5320,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, return -ENOTEMPTY; /* - * 2 items for dir items - * 1 item for orphan entry - * 1 item for ref + * We want to reserve the absolute worst case amount of items. So if + * both inodes are subvols and we need to unlink them then that would + * require 4 item modifications, but if they are both normal inodes it + * would require 5 item modifications, so we'll assume their normal + * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items + * should cover the worst case number of items we'll modify. */ - ret = btrfs_reserve_metadata_space(root, 4); + ret = btrfs_reserve_metadata_space(root, 11); if (ret) return ret; @@ -5417,7 +5443,7 @@ out_fail: if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&root->fs_info->subvol_sem); - btrfs_unreserve_metadata_space(root, 4); + btrfs_unreserve_metadata_space(root, 11); return ret; } -- cgit v1.2.3 From df66916e71231e9f2377cac9c5c1e2d190f9a427 Mon Sep 17 00:00:00 2001 From: Li Dongyang Date: Fri, 6 Nov 2009 14:33:01 +0000 Subject: Btrfs: skip btrfs_release_path in btrfs_update_root and btrfs_del_root We don't need to call btrfs_release_path because btrfs_free_path will do that for us. Signed-off-by: Li Dongyang Signed-off-by: Chris Mason --- fs/btrfs/root-tree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 9351428f30e2..67fa2d29d663 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -159,7 +159,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root write_extent_buffer(l, item, ptr, sizeof(*item)); btrfs_mark_buffer_dirty(path->nodes[0]); out: - btrfs_release_path(root, path); btrfs_free_path(path); return ret; } @@ -332,7 +331,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, BUG_ON(refs != 0); ret = btrfs_del_item(trans, root, path); out: - btrfs_release_path(root, path); btrfs_free_path(path); return ret; } -- cgit v1.2.3 From 4eb3991c5def39bcf553c14ebe2618fcb47b627f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 10 Nov 2009 09:01:43 +0000 Subject: Btrfs: avoid null deref in unpin_extent_cache() I re-orderred the checks to avoid dereferencing "em" if it was null. Found by smatch static checker. Signed-off-by: Dan Carpenter Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 2c726b7b9faa..ccbdcb54ec5d 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -208,7 +208,7 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) write_lock(&tree->lock); em = lookup_extent_mapping(tree, start, len); - WARN_ON(em->start != start || !em); + WARN_ON(!em || em->start != start); if (!em) goto out; -- cgit v1.2.3 From ccf0e72537a9f68611ca575121afd08e2b4d0fb0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 10 Nov 2009 21:23:48 -0500 Subject: Btrfs: find ideal block group for caching This patch changes a few things. Hopefully the comments are helpfull, but I'll try and be as verbose here. Problem: My fedora box was taking 1 minute and 21 seconds to boot with btrfs as root. Part of this problem was we pick the first block group we can find and start caching it, even if it may not have enough free space. The other problem is we only search for cached block groups the first time around, which we won't find any cached block groups because this is a newly mounted fs, so we end up caching several block groups during bootup, which with alot of fragmentation takes around 30-45 seconds to complete, which bogs down the system. So Solution: 1) Don't cache block groups willy-nilly at first. Instead try and figure out which block group has the most free, and therefore will take the least amount of time to cache. 2) Don't be so picky about cached block groups. The other problem is once we've filled up a cluster, if the block group isn't finished caching the next time we try and do the allocation we'll completely ignore the cluster and start searching from the beginning of the space, which makes us cache more block groups, which slows us down even more. So instead of skipping block groups that are not finished caching when we have a hint, only skip the block group if it hasn't started caching yet. There is one other tweak in here. Before if we allocated a chunk and still couldn't find new space, we'd end up switching the space info to force another chunk allocation. This could make us end up with way too many chunks, so keep track of this particular case. With this patch and my previous cluster fixes my fedora box now boots in 43 seconds, and according to the bootchart is not held up by our block group caching at all. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 109 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 86 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c56f91639dc1..2a4cdceeb575 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4101,7 +4101,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) } enum btrfs_loop_type { - LOOP_CACHED_ONLY = 0, + LOOP_FIND_IDEAL = 0, LOOP_CACHING_NOWAIT = 1, LOOP_CACHING_WAIT = 2, LOOP_ALLOC_CHUNK = 3, @@ -4130,12 +4130,15 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group = NULL; int empty_cluster = 2 * 1024 * 1024; int allowed_chunk_alloc = 0; + int done_chunk_alloc = 0; struct btrfs_space_info *space_info; int last_ptr_loop = 0; int loop = 0; bool found_uncached_bg = false; bool failed_cluster_refill = false; bool failed_alloc = false; + u64 ideal_cache_percent = 0; + u64 ideal_cache_offset = 0; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); @@ -4171,14 +4174,19 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, empty_cluster = 0; if (search_start == hint_byte) { +ideal_cache: block_group = btrfs_lookup_block_group(root->fs_info, search_start); /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. + * + * However if we are re-searching with an ideal block group + * picked out then we don't care that the block group is cached. */ if (block_group && block_group_bits(block_group, data) && - block_group_cache_done(block_group)) { + (block_group->cached != BTRFS_CACHE_NO || + search_start == ideal_cache_offset)) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || block_group->ro) { @@ -4190,13 +4198,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, */ btrfs_put_block_group(block_group); up_read(&space_info->groups_sem); - } else + } else { goto have_block_group; + } } else if (block_group) { btrfs_put_block_group(block_group); } } - search: down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups, list) { @@ -4208,28 +4216,45 @@ search: have_block_group: if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { + u64 free_percent; + + free_percent = btrfs_block_group_used(&block_group->item); + free_percent *= 100; + free_percent = div64_u64(free_percent, + block_group->key.offset); + free_percent = 100 - free_percent; + if (free_percent > ideal_cache_percent && + likely(!block_group->ro)) { + ideal_cache_offset = block_group->key.objectid; + ideal_cache_percent = free_percent; + } + /* - * we want to start caching kthreads, but not too many - * right off the bat so we don't overwhelm the system, - * so only start them if there are less than 2 and we're - * in the initial allocation phase. + * We only want to start kthread caching if we are at + * the point where we will wait for caching to make + * progress, or if our ideal search is over and we've + * found somebody to start caching. */ if (loop > LOOP_CACHING_NOWAIT || - atomic_read(&space_info->caching_threads) < 2) { + (loop > LOOP_FIND_IDEAL && + atomic_read(&space_info->caching_threads) < 2)) { ret = cache_block_group(block_group); BUG_ON(ret); } - } - - cached = block_group_cache_done(block_group); - if (unlikely(!cached)) { found_uncached_bg = true; - /* if we only want cached bgs, loop */ - if (loop == LOOP_CACHED_ONLY) + /* + * If loop is set for cached only, try the next block + * group. + */ + if (loop == LOOP_FIND_IDEAL) goto loop; } + cached = block_group_cache_done(block_group); + if (unlikely(!cached)) + found_uncached_bg = true; + if (unlikely(block_group->ro)) goto loop; @@ -4409,9 +4434,11 @@ loop: } up_read(&space_info->groups_sem); - /* LOOP_CACHED_ONLY, only search fully cached block groups - * LOOP_CACHING_NOWAIT, search partially cached block groups, but - * dont wait foR them to finish caching + /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for + * for them to make caching progress. Also + * determine the best possible bg to cache + * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking + * caching kthreads as we move along * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching * LOOP_ALLOC_CHUNK, force a chunk allocation and try again * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try @@ -4420,12 +4447,47 @@ loop: if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && (found_uncached_bg || empty_size || empty_cluster || allowed_chunk_alloc)) { - if (found_uncached_bg) { + if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { found_uncached_bg = false; - if (loop < LOOP_CACHING_WAIT) { - loop++; + loop++; + if (!ideal_cache_percent && + atomic_read(&space_info->caching_threads)) goto search; - } + + /* + * 1 of the following 2 things have happened so far + * + * 1) We found an ideal block group for caching that + * is mostly full and will cache quickly, so we might + * as well wait for it. + * + * 2) We searched for cached only and we didn't find + * anything, and we didn't start any caching kthreads + * either, so chances are we will loop through and + * start a couple caching kthreads, and then come back + * around and just wait for them. This will be slower + * because we will have 2 caching kthreads reading at + * the same time when we could have just started one + * and waited for it to get far enough to give us an + * allocation, so go ahead and go to the wait caching + * loop. + */ + loop = LOOP_CACHING_WAIT; + search_start = ideal_cache_offset; + ideal_cache_percent = 0; + goto ideal_cache; + } else if (loop == LOOP_FIND_IDEAL) { + /* + * Didn't find a uncached bg, wait on anything we find + * next. + */ + loop = LOOP_CACHING_WAIT; + goto search; + } + + if (loop < LOOP_CACHING_WAIT) { + loop++; + goto search; } if (loop == LOOP_ALLOC_CHUNK) { @@ -4437,7 +4499,8 @@ loop: ret = do_chunk_alloc(trans, root, num_bytes + 2 * 1024 * 1024, data, 1); allowed_chunk_alloc = 0; - } else { + done_chunk_alloc = 1; + } else if (!done_chunk_alloc) { space_info->force_alloc = 1; } -- cgit v1.2.3 From f5a84ee3cdd88d96b7bcede10af58598ad8d52a7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 10 Nov 2009 21:23:48 -0500 Subject: Btrfs: fallback on uncompressed io if compressed io fails Currently compressed IO does not deal with not having its entire extent able to be allocated. So if we have enough free space to allocate for the extent, but its not contiguous, it will fail spectacularly. This patch fixes this by falling back on uncompressed IO which lets us spread the delalloc extent across multiple extents. I tested this by making us randomly think the reservation had failed to make it fallback on the uncompressed io way and it seemed to work fine. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bb7fd8072802..d3d7d46a6af2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -538,7 +538,7 @@ static noinline int submit_compressed_extents(struct inode *inode, struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree; - int ret; + int ret = 0; if (list_empty(&async_cow->extents)) return 0; @@ -552,6 +552,7 @@ static noinline int submit_compressed_extents(struct inode *inode, io_tree = &BTRFS_I(inode)->io_tree; +retry: /* did the compression code fall back to uncompressed IO? */ if (!async_extent->pages) { int page_started = 0; @@ -562,11 +563,11 @@ static noinline int submit_compressed_extents(struct inode *inode, async_extent->ram_size - 1, GFP_NOFS); /* allocate blocks */ - cow_file_range(inode, async_cow->locked_page, - async_extent->start, - async_extent->start + - async_extent->ram_size - 1, - &page_started, &nr_written, 0); + ret = cow_file_range(inode, async_cow->locked_page, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + &page_started, &nr_written, 0); /* * if page_started, cow_file_range inserted an @@ -574,7 +575,7 @@ static noinline int submit_compressed_extents(struct inode *inode, * and IO for us. Otherwise, we need to submit * all those pages down to the drive. */ - if (!page_started) + if (!page_started && !ret) extent_write_locked_range(io_tree, inode, async_extent->start, async_extent->start + @@ -602,7 +603,21 @@ static noinline int submit_compressed_extents(struct inode *inode, async_extent->compressed_size, 0, alloc_hint, (u64)-1, &ins, 1); - BUG_ON(ret); + if (ret) { + int i; + for (i = 0; i < async_extent->nr_pages; i++) { + WARN_ON(async_extent->pages[i]->mapping); + page_cache_release(async_extent->pages[i]); + } + kfree(async_extent->pages); + async_extent->nr_pages = 0; + async_extent->pages = NULL; + unlock_extent(io_tree, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, GFP_NOFS); + goto retry; + } + em = alloc_extent_map(GFP_NOFS); em->start = async_extent->start; em->len = async_extent->ram_size; -- cgit v1.2.3 From 33b258086441dd07e00133c79fcd8cbc6a76d737 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 11 Nov 2009 10:16:57 -0500 Subject: Btrfs: allow more metadata chunk preallocation On an FS where all of the space has not been allocated into chunks yet, the enospc can return enospc just because the existing metadata chunks are full. We get around this by allowing more metadata chunks to be allocated up to a certain limit, and finding the right limit is a little fuzzy. The problem is the reservations for delalloc would preallocate way too much of the FS as metadata. We need to start saying no and just force some IO to happen. But we also need to let a reasonable amount of the FS become metadata. This bumps the hard limit up, later releases will have a better system. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2a4cdceeb575..8d1fd6dc22ac 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2976,10 +2976,10 @@ static int maybe_allocate_chunk(struct btrfs_root *root, free_space = btrfs_super_total_bytes(disk_super); /* - * we allow the metadata to grow to a max of either 5gb or 5% of the + * we allow the metadata to grow to a max of either 10gb or 5% of the * space in the volume. */ - min_metadata = min((u64)5 * 1024 * 1024 * 1024, + min_metadata = min((u64)10 * 1024 * 1024 * 1024, div64_u64(free_space * 5, 100)); if (info->total_bytes >= min_metadata) { spin_unlock(&info->lock); -- cgit v1.2.3 From a6dbd429d8dd3382bbd9594b8d2ec74843a260d9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 11 Nov 2009 15:53:34 -0500 Subject: Btrfs: fix panic when trying to destroy a newly allocated There is a problem where iget5_locked will look for an inode, not find it, and then subsequently try to allocate it. Another CPU will have raced in and allocated the inode instead, so when iget5_locked gets the inode spin lock again and does a search, it finds the new inode. So it goes ahead and calls destroy_inode on the inode it just allocated. The problem is we don't set BTRFS_I(inode)->root until the new inode is completely initialized. This patch makes us set root to NULL when alloc'ing a new inode, so when we get to btrfs_destroy_inode and we see that root is NULL we can just free up the memory and continue on. This fixes the panic http://www.kerneloops.org/submitresult.php?number=812690 Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d3d7d46a6af2..ee92801fc5db 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5180,6 +5180,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->logged_trans = 0; ei->outstanding_extents = 0; ei->reserved_extents = 0; + ei->root = NULL; spin_lock_init(&ei->accounting_lock); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->i_orphan); @@ -5195,6 +5196,14 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(!list_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); + /* + * This can happen where we create an inode, but somebody else also + * created the same inode and we need to destroy the one we already + * created. + */ + if (!root) + goto free; + /* * Make sure we're properly removed from the ordered operation * lists. @@ -5230,6 +5239,7 @@ void btrfs_destroy_inode(struct inode *inode) } inode_tree_del(inode); btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); +free: kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } -- cgit v1.2.3 From ff5e4b51a397568c6a2901903c35a4bfaaf752a4 Mon Sep 17 00:00:00 2001 From: Stefan Schmidt Date: Thu, 12 Nov 2009 09:53:50 +0100 Subject: fs/jbd: Export log_start_commit to fix ext3 build. This fixes: ERROR: "log_start_commit" [fs/ext3/ext3.ko] undefined! Signed-off-by: Stefan Schmidt --- fs/jbd/journal.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 49d5cd6053c8..4160afad6d00 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -73,6 +73,7 @@ EXPORT_SYMBOL(journal_errno); EXPORT_SYMBOL(journal_ack_err); EXPORT_SYMBOL(journal_clear_err); EXPORT_SYMBOL(log_wait_commit); +EXPORT_SYMBOL(log_start_commit); EXPORT_SYMBOL(journal_start_commit); EXPORT_SYMBOL(journal_force_commit_nested); EXPORT_SYMBOL(journal_wipe); -- cgit v1.2.3 From 29f12ca32122db98481150be09d35bd72b68045e Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Wed, 11 Nov 2009 14:26:32 -0800 Subject: pidns: fix a leak in /proc dentries and inodes with pid namespaces. Daniel Lezcano reported a leak in 'struct pid' and 'struct pid_namespace' that is discussed in: http://lkml.org/lkml/2009/10/2/159. To summarize the thread, when container-init is terminated, it sets the PF_EXITING flag, zaps other processes in the container and waits to reap them. As a part of reaping, the container-init should flush any /proc dentries associated with the processes. But because the container-init is itself exiting and the following PF_EXITING check, the dentries are not flushed, resulting in leak in /proc inodes and dentries. This fix reverts the commit 7766755a2f249e7e0 ("Fix /proc dcache deadlock in do_exit") which introduced the check for PF_EXITING. At the time of the commit, shrink_dcache_parent() flushed dentries from other filesystems also and could have caused a deadlock which the commit fixed. But as pointed out by Eric Biederman, after commit 0feae5c47aabdde59, shrink_dcache_parent() no longer affects other filesystems. So reverting the commit is now safe. As pointed out by Jan Kara, the leak is not as critical since the unclaimed space will be reclaimed under memory pressure or by: echo 3 > /proc/sys/vm/drop_caches But since this check is no longer required, its best to remove it. Signed-off-by: Sukadev Bhattiprolu Reported-by: Daniel Lezcano Acked-by: Eric W. Biederman Acked-by: Jan Kara Cc: Andrea Arcangeli Cc: Serge Hallyn Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 837469a96598..af643b5aefe8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2597,8 +2597,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) name.len = snprintf(buf, sizeof(buf), "%d", pid); dentry = d_hash_and_lookup(mnt->mnt_root, &name); if (dentry) { - if (!(current->flags & PF_EXITING)) - shrink_dcache_parent(dentry); + shrink_dcache_parent(dentry); d_drop(dentry); dput(dentry); } -- cgit v1.2.3 From 7779d7bed950a7fb1af4f540c2f82a6b81b65901 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 11 Nov 2009 14:26:34 -0800 Subject: fs: add missing compat_ptr handling for FS_IOC_RESVSP ioctl For FS_IOC_RESVSP and FS_IOC_RESVSP64 compat_sys_ioctl() uses its arg argument as a pointer to userspace. However it is missing a a call to compat_ptr() which will do a proper pointer conversion. This was introduced with 3e63cbb1 "fs: Add new pre-allocation ioctls to vfs for compatibility with legacy xfs ioctls". Signed-off-by: Heiko Carstens Cc: Ankit Jain Acked-by: Christoph Hellwig Cc: Al Viro Acked-by: Arnd Bergmann Acked-by: David S. Miller Cc: [2.6.31.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat_ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index f91fd51b32e3..d84e7058c298 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1800,7 +1800,7 @@ struct space_resv_32 { /* just account for different alignment */ static int compat_ioctl_preallocate(struct file *file, unsigned long arg) { - struct space_resv_32 __user *p32 = (void __user *)arg; + struct space_resv_32 __user *p32 = compat_ptr(arg); struct space_resv __user *p = compat_alloc_user_space(sizeof(*p)); if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) || @@ -2802,7 +2802,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, #else case FS_IOC_RESVSP: case FS_IOC_RESVSP64: - error = ioctl_preallocate(filp, (void __user *)arg); + error = ioctl_preallocate(filp, compat_ptr(arg)); goto out_fput; #endif -- cgit v1.2.3 From fc63cf237078c86214abcb2ee9926d8ad289da9b Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 11 Nov 2009 14:26:48 -0800 Subject: exec: setup_arg_pages() fails to return errors In setup_arg_pages we work hard to assign a value to ret, but on exit we always return 0. Also remove a now duplicated exit path and branch to out_unlock instead. Signed-off-by: Anton Blanchard Acked-by: Serge Hallyn Reviewed-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index d49be6bc1793..ba112bd4a339 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -624,10 +624,8 @@ int setup_arg_pages(struct linux_binprm *bprm, /* Move stack pages down in memory. */ if (stack_shift) { ret = shift_arg_pages(vma, stack_shift); - if (ret) { - up_write(&mm->mmap_sem); - return ret; - } + if (ret) + goto out_unlock; } #ifdef CONFIG_STACK_GROWSUP @@ -641,7 +639,7 @@ int setup_arg_pages(struct linux_binprm *bprm, out_unlock: up_write(&mm->mmap_sem); - return 0; + return ret; } EXPORT_SYMBOL(setup_arg_pages); -- cgit v1.2.3 From e04b5ef8b49db87d01a9b3a47fe41a918a0c0ff5 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Wed, 11 Nov 2009 14:26:55 -0800 Subject: __generic_block_fiemap(): fix for files bigger than 4GB Because of an integer overflow on start_blk, various kind of wrong results would be returned by the generic_block_fiemap() handler, such as no extents when there is a 4GB+ hole at the beginning of the file, or wrong fe_logical when an extent starts after the first 4GB. Signed-off-by: Mike Hommey Cc: Alexander Viro Cc: Steven Whitehouse Cc: Theodore Ts'o Cc: Eric Sandeen Cc: Josef Bacik Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ioctl.c b/fs/ioctl.c index 7b17a14396ff..6c751106c2e5 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -254,7 +254,7 @@ int __generic_block_fiemap(struct inode *inode, u64 len, get_block_t *get_block) { struct buffer_head tmp; - unsigned int start_blk; + unsigned long long start_blk; long long length = 0, map_len = 0; u64 logical = 0, phys = 0, size = 0; u32 flags = FIEMAP_EXTENT_MERGED; -- cgit v1.2.3 From c1ea985c710f41e97f1c72c29bbf367375370f0b Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 12 Nov 2009 00:13:32 +0900 Subject: nilfs2: fix lock order reversal in chcp operation Will fix the following lock order reversal lockdep detected: ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.32-rc6 #7 ------------------------------------------------------- chcp/30157 is trying to acquire lock: (&nilfs->ns_mount_mutex){+.+.+.}, at: [] nilfs_cpfile_change_cpmode+0x46/0x752 [nilfs2] but task is already holding lock: (&nilfs->ns_segctor_sem){++++.+}, at: [] nilfs_transaction_begin+0xba/0x110 [nilfs2] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (&nilfs->ns_segctor_sem){++++.+}: [] __lock_acquire+0x109c/0x139d [] lock_acquire+0x89/0xa0 [] down_read+0x31/0x45 [] nilfs_attach_checkpoint+0x8f/0x16b [nilfs2] [] nilfs_get_sb+0x3e7/0x653 [nilfs2] [] vfs_kern_mount+0x8b/0x124 [] do_kern_mount+0x37/0xc3 [] do_mount+0x64d/0x69d [] sys_mount+0x66/0x95 [] sysenter_do_call+0x12/0x32 -> #1 (&type->s_umount_key#31/1){+.+.+.}: [] __lock_acquire+0x109c/0x139d [] lock_acquire+0x89/0xa0 [] down_write_nested+0x34/0x52 [] sget+0x22e/0x389 [] nilfs_get_sb+0x187/0x653 [nilfs2] [] vfs_kern_mount+0x8b/0x124 [] do_kern_mount+0x37/0xc3 [] do_mount+0x64d/0x69d [] sys_mount+0x66/0x95 [] sysenter_do_call+0x12/0x32 -> #0 (&nilfs->ns_mount_mutex){+.+.+.}: [] __lock_acquire+0xe27/0x139d [] lock_acquire+0x89/0xa0 [] mutex_lock_nested+0x41/0x23e [] nilfs_cpfile_change_cpmode+0x46/0x752 [nilfs2] [] nilfs_ioctl+0x11a/0x7da [nilfs2] [] vfs_ioctl+0x27/0x6e [] do_vfs_ioctl+0x491/0x4db [] sys_ioctl+0x45/0x5f [] sysenter_do_call+0x12/0x32 Signed-off-by: Ryusuke Konishi --- fs/nilfs2/cpfile.c | 2 -- fs/nilfs2/ioctl.c | 6 +++++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 1c6cfb59128d..3f5d5d06f53c 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -871,7 +871,6 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) * exclusive with a new mount job. Though it doesn't cover * umount, it's enough for the purpose. */ - mutex_lock(&nilfs->ns_mount_mutex); if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) { /* Current implementation does not have to protect plain read-only mounts since they are exclusive @@ -880,7 +879,6 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) ret = -EBUSY; } else ret = nilfs_cpfile_clear_snapshot(cpfile, cno); - mutex_unlock(&nilfs->ns_mount_mutex); return ret; case NILFS_SNAPSHOT: return nilfs_cpfile_set_snapshot(cpfile, cno); diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index d24057d58f17..f6af76042d80 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -99,7 +99,8 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { - struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile; + struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + struct inode *cpfile = nilfs->ns_cpfile; struct nilfs_transaction_info ti; struct nilfs_cpmode cpmode; int ret; @@ -109,14 +110,17 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, if (copy_from_user(&cpmode, argp, sizeof(cpmode))) return -EFAULT; + mutex_lock(&nilfs->ns_mount_mutex); nilfs_transaction_begin(inode->i_sb, &ti, 0); ret = nilfs_cpfile_change_cpmode( cpfile, cpmode.cm_cno, cpmode.cm_mode); if (unlikely(ret < 0)) { nilfs_transaction_abort(inode->i_sb); + mutex_unlock(&nilfs->ns_mount_mutex); return ret; } nilfs_transaction_commit(inode->i_sb); /* never fails */ + mutex_unlock(&nilfs->ns_mount_mutex); return ret; } -- cgit v1.2.3 From 7aee47b0bb9f93baecdbea205e878fe0f155f7da Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Fri, 6 Nov 2009 14:50:22 -0800 Subject: ocfs2: Trivial cleanup of jbd compatibility layer removal Mainline commit 53ef99cad9878f02f27bb30bc304fc42af8bdd6e removed the JBD compatibility layer from OCFS2. This patch removes the last remaining remnants of that. Signed-off-by: Sunil Mushran Signed-off-by: Joel Becker --- fs/ocfs2/ocfs2.h | 7 +------ fs/ocfs2/uptodate.c | 5 ----- 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index eae404602424..d963d8638709 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -35,12 +35,7 @@ #include #include #include -#ifndef CONFIG_OCFS2_COMPAT_JBD -# include -#else -# include -# include "ocfs2_jbd_compat.h" -#endif +#include /* For union ocfs2_dlm_lksb */ #include "stackglue.h" diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index b6284f235d2f..c61369342a27 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -53,11 +53,6 @@ #include #include #include -#ifndef CONFIG_OCFS2_COMPAT_JBD -# include -#else -# include -#endif #define MLOG_MASK_PREFIX ML_UPTODATE -- cgit v1.2.3 From 479c2553af9a176a0613894b9f1ec73425fd56a3 Mon Sep 17 00:00:00 2001 From: Petr Vandrovec Date: Sat, 14 Nov 2009 10:47:07 +0100 Subject: Fix memory corruption caused by nfsd readdir+ Commit 8177e6d6dfb9cd03d9bdeb647c32161f8f58f686 ("nfsd: clean up readdirplus encoding") introduced single character typo in nfs3 readdir+ implementation. Unfortunately that typo has quite bad side effects: random memory corruption, followed (on my box) with immediate spontaneous box reboot. Using 'p1' instead of 'p' fixes my Linux box rebooting whenever VMware ESXi box tries to list contents of my home directory. Signed-off-by: Petr Vandrovec Cc: "J. Bruce Fields" Cc: Neil Brown Signed-off-by: Linus Torvalds --- fs/nfsd/nfs3xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index edf926e1062f..d0a2ce1b4324 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -958,7 +958,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen, p1 = encode_entry_baggage(cd, p1, name, namlen, ino); if (plus) - p = encode_entryplus_baggage(cd, p1, name, namlen); + p1 = encode_entryplus_baggage(cd, p1, name, namlen); /* determine entry word length and lengths to go in pages */ num_entry_words = p1 - tmp; -- cgit v1.2.3 From 18dafac1a4c6c88867a50f9a82492976f20383d6 Mon Sep 17 00:00:00 2001 From: Jiro SEKIBA Date: Sun, 15 Nov 2009 13:49:45 +0900 Subject: nilfs2: deleted inconsistent comment in nilfs_load_inode_block() The comment says, "Caller of this function MUST lock s_inode_lock", however just above the comment, it locks s_inode_lock in the function. Signed-off-by: Jiro SEKIBA Signed-off-by: Ryusuke Konishi --- fs/nilfs2/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 5040220c3732..2a0a5a3ac134 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -664,7 +664,6 @@ int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode, int err; spin_lock(&sbi->s_inode_lock); - /* Caller of this function MUST lock s_inode_lock */ if (ii->i_bh == NULL) { spin_unlock(&sbi->s_inode_lock); err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino, -- cgit v1.2.3 From f534dc994397560343be4a3223b9bbaa8e739e1f Mon Sep 17 00:00:00 2001 From: Suresh Jayaraman Date: Mon, 16 Nov 2009 12:03:16 +0530 Subject: cifs: clear server inode number flag while autodisabling Fix the commit ec06aedd44 that intended to turn off querying for server inode numbers when server doesn't consistently support inode numbers. Presumably the commit didn't actually clear the CIFS_MOUNT_SERVER_INUM flag, perhaps a typo. Signed-off-by: Suresh Jayaraman Acked-by: Jeff Layton Cc: Stable Signed-off-by: Steve French --- fs/cifs/misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 1e25efcb55c8..d27d4ec6579b 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -720,7 +720,7 @@ void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { - cifs_sb->mnt_cifs_flags &= CIFS_MOUNT_SERVER_INUM; + cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; cERROR(1, ("Autodisabling the use of server inode numbers on " "%s. This server doesn't seem to support them " "properly. Hardlinks will not be recognized on this " -- cgit v1.2.3 From 8ec6dba2581754e375be66f7bedd708d856d8b30 Mon Sep 17 00:00:00 2001 From: Jan Rekorajski Date: Mon, 16 Nov 2009 11:57:02 +0000 Subject: XFS bug in log recover with quota (bugzilla id 855) Hi, I was hit by a bug in linux 2.6.31 when XFS is not able to recover the log after a crash if fs was mounted with quotas. Gory details in XFS bugzilla: http://oss.sgi.com/bugzilla/show_bug.cgi?id=855. It looks like wrong struct is used in buffer length check, and the following patch should fix the problem. xfs_dqblk_t has a size of 104+32 bytes, while xfs_disk_dquot_t is 104 bytes long, and this is exactly what I see in system logs - "XFS: dquot too small (104) in xlog_recover_do_dquot_trans." Signed-off-by: Jan Rekorajski Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_log_recover.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 1099395d7d6c..fb17f8226b09 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1980,7 +1980,7 @@ xlog_recover_do_reg_buffer( "XFS: NULL dquot in %s.", __func__); goto next; } - if (item->ri_buf[i].i_len < sizeof(xfs_dqblk_t)) { + if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) { cmn_err(CE_ALERT, "XFS: dquot too small (%d) in %s.", item->ri_buf[i].i_len, __func__); @@ -2635,7 +2635,7 @@ xlog_recover_do_dquot_trans( "XFS: NULL dquot in %s.", __func__); return XFS_ERROR(EIO); } - if (item->ri_buf[1].i_len < sizeof(xfs_dqblk_t)) { + if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { cmn_err(CE_ALERT, "XFS: dquot too small (%d) in %s.", item->ri_buf[1].i_len, __func__); -- cgit v1.2.3 From 6c06f072c2d797ddbb2270363de97c53ebbe0385 Mon Sep 17 00:00:00 2001 From: "Nathaniel W. Turner" Date: Mon, 16 Nov 2009 19:51:48 +0000 Subject: xfs: copy li_lsn before dropping AIL lock Access to log items on the AIL is generally protected by m_ail_lock; this is particularly needed when we're getting or setting the 64-bit li_lsn on a 32-bit platform. This patch fixes a couple places where we were accessing the log item after dropping the AIL lock on 32-bit machines. This can result in a partially-zeroed log->l_tail_lsn if xfs_trans_ail_delete is racing with xfs_trans_ail_update, and in at least some cases, this can leave the l_tail_lsn with a zero cycle number, which means xlog_space_left will think the log is full (unless CONFIG_XFS_DEBUG is set, in which case we'll trip an ASSERT), leading to processes stuck forever in xlog_grant_log_space. Thanks to Adrian VanderSpek for first spotting the race potential and to Dave Chinner for debug assistance. Signed-off-by: Nathaniel W. Turner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_trans_ail.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index f31271c30de9..2ffc570679be 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -467,6 +467,7 @@ xfs_trans_ail_update( { xfs_log_item_t *dlip = NULL; xfs_log_item_t *mlip; /* ptr to minimum lip */ + xfs_lsn_t tail_lsn; mlip = xfs_ail_min(ailp); @@ -483,8 +484,16 @@ xfs_trans_ail_update( if (mlip == dlip) { mlip = xfs_ail_min(ailp); + /* + * It is not safe to access mlip after the AIL lock is + * dropped, so we must get a copy of li_lsn before we do + * so. This is especially important on 32-bit platforms + * where accessing and updating 64-bit values like li_lsn + * is not atomic. + */ + tail_lsn = mlip->li_lsn; spin_unlock(&ailp->xa_lock); - xfs_log_move_tail(ailp->xa_mount, mlip->li_lsn); + xfs_log_move_tail(ailp->xa_mount, tail_lsn); } else { spin_unlock(&ailp->xa_lock); } @@ -514,6 +523,7 @@ xfs_trans_ail_delete( { xfs_log_item_t *dlip; xfs_log_item_t *mlip; + xfs_lsn_t tail_lsn; if (lip->li_flags & XFS_LI_IN_AIL) { mlip = xfs_ail_min(ailp); @@ -527,9 +537,16 @@ xfs_trans_ail_delete( if (mlip == dlip) { mlip = xfs_ail_min(ailp); + /* + * It is not safe to access mlip after the AIL lock + * is dropped, so we must get a copy of li_lsn + * before we do so. This is especially important + * on 32-bit platforms where accessing and updating + * 64-bit values like li_lsn is not atomic. + */ + tail_lsn = mlip ? mlip->li_lsn : 0; spin_unlock(&ailp->xa_lock); - xfs_log_move_tail(ailp->xa_mount, - (mlip ? mlip->li_lsn : 0)); + xfs_log_move_tail(ailp->xa_mount, tail_lsn); } else { spin_unlock(&ailp->xa_lock); } -- cgit v1.2.3 From 9ebd4eba761b624a6a6c9189335adeddcb1fa0e0 Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Tue, 17 Nov 2009 14:06:23 -0800 Subject: procfs: fix /proc//stat stack pointer for kernel threads Fix a small issue for the stack pointer in /proc//stat. In case of a kernel thread the value of the printed stack pointer should be 0. Signed-off-by: Stefani Seibold Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 07f77a7945c3..822c2d506518 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -571,7 +571,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, rsslim, mm ? mm->start_code : 0, mm ? mm->end_code : 0, - (permitted) ? task->stack_start : 0, + (permitted && mm) ? task->stack_start : 0, esp, eip, /* The signal information here is obsolete. -- cgit v1.2.3 From 978b4053aefd422713f289f2a315ce2acba62018 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Nov 2009 14:06:24 -0800 Subject: fcntl: rename F_OWNER_GID to F_OWNER_PGRP This is for consistency with various ioctl() operations that include the suffix "PGRP" in their names, and also for consistency with PRIO_PGRP, used with setpriority() and getpriority(). Also, using PGRP instead of GID avoids confusion with the common abbreviation of "group ID". I'm fine with anything that makes it more consistent, and if PGRP is what is the predominant abbreviation then I see no need to further confuse matters by adding a third one. Signed-off-by: Peter Zijlstra Acked-by: Michael Kerrisk Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fcntl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fcntl.c b/fs/fcntl.c index fc089f2f7f56..2cf93ec40a67 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -284,7 +284,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg) type = PIDTYPE_PID; break; - case F_OWNER_GID: + case F_OWNER_PGRP: type = PIDTYPE_PGID; break; @@ -321,7 +321,7 @@ static int f_getown_ex(struct file *filp, unsigned long arg) break; case PIDTYPE_PGID: - owner.type = F_OWNER_GID; + owner.type = F_OWNER_PGRP; break; default: -- cgit v1.2.3 From 3d7a641e544e428191667e8b1f83f96fa46dbd65 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:10:23 +0000 Subject: SLOW_WORK: Wait for outstanding work items belonging to a module to clear Wait for outstanding slow work items belonging to a module to clear when unregistering that module as a user of the facility. This prevents the put_ref code of a work item from being taken away before it returns. Signed-off-by: David Howells --- fs/fscache/main.c | 6 +++--- fs/fscache/object.c | 1 + fs/fscache/operation.c | 1 + fs/gfs2/main.c | 4 ++-- fs/gfs2/recovery.c | 1 + 5 files changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 4de41b597499..add6bdb53f04 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -48,7 +48,7 @@ static int __init fscache_init(void) { int ret; - ret = slow_work_register_user(); + ret = slow_work_register_user(THIS_MODULE); if (ret < 0) goto error_slow_work; @@ -80,7 +80,7 @@ error_kobj: error_cookie_jar: fscache_proc_cleanup(); error_proc: - slow_work_unregister_user(); + slow_work_unregister_user(THIS_MODULE); error_slow_work: return ret; } @@ -97,7 +97,7 @@ static void __exit fscache_exit(void) kobject_put(fscache_root); kmem_cache_destroy(fscache_cookie_jar); fscache_proc_cleanup(); - slow_work_unregister_user(); + slow_work_unregister_user(THIS_MODULE); printk(KERN_NOTICE "FS-Cache: Unloaded\n"); } diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 392a41b1b79d..d236eb1d6f37 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -45,6 +45,7 @@ static void fscache_enqueue_dependents(struct fscache_object *); static void fscache_dequeue_object(struct fscache_object *); const struct slow_work_ops fscache_object_slow_work_ops = { + .owner = THIS_MODULE, .get_ref = fscache_object_slow_work_get_ref, .put_ref = fscache_object_slow_work_put_ref, .execute = fscache_object_slow_work_execute, diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index e7f8d53b8b6b..f1a2857b2ff5 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -453,6 +453,7 @@ static void fscache_op_execute(struct slow_work *work) } const struct slow_work_ops fscache_op_slow_work_ops = { + .owner = THIS_MODULE, .get_ref = fscache_op_get_ref, .put_ref = fscache_op_put_ref, .execute = fscache_op_execute, diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index eacd78a5d082..5b31f7741a8f 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -114,7 +114,7 @@ static int __init init_gfs2_fs(void) if (error) goto fail_unregister; - error = slow_work_register_user(); + error = slow_work_register_user(THIS_MODULE); if (error) goto fail_slow; @@ -163,7 +163,7 @@ static void __exit exit_gfs2_fs(void) gfs2_unregister_debugfs(); unregister_filesystem(&gfs2_fs_type); unregister_filesystem(&gfs2meta_fs_type); - slow_work_unregister_user(); + slow_work_unregister_user(THIS_MODULE); kmem_cache_destroy(gfs2_quotad_cachep); kmem_cache_destroy(gfs2_rgrpd_cachep); diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 59d2695509d3..b2bb779f09ed 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -593,6 +593,7 @@ fail: } struct slow_work_ops gfs2_recover_ops = { + .owner = THIS_MODULE, .get_ref = gfs2_recover_get_ref, .put_ref = gfs2_recover_put_ref, .execute = gfs2_recover_work, -- cgit v1.2.3 From 440f0affe247e9990c8f8778f1861da4fd7d5e50 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:01 +0000 Subject: FS-Cache: Annotate slow-work runqueue proc lines for FS-Cache work items Annotate slow-work runqueue proc lines for FS-Cache work items. Objects include the object ID and the state. Operations include the object ID, the operation ID and the operation type and state. Signed-off-by: David Howells --- fs/fscache/object.c | 41 ++++++++++++++++++++++++++++++++++++++++- fs/fscache/operation.c | 29 +++++++++++++++++++++++++++++ fs/fscache/page.c | 27 ++++++++++++++++++++++----- 3 files changed, 91 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/fscache/object.c b/fs/fscache/object.c index d236eb1d6f37..615b63dd9ecc 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -14,9 +14,10 @@ #define FSCACHE_DEBUG_LEVEL COOKIE #include +#include #include "internal.h" -const char *fscache_object_states[] = { +const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = { [FSCACHE_OBJECT_INIT] = "OBJECT_INIT", [FSCACHE_OBJECT_LOOKING_UP] = "OBJECT_LOOKING_UP", [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING", @@ -33,9 +34,28 @@ const char *fscache_object_states[] = { }; EXPORT_SYMBOL(fscache_object_states); +static const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = { + [FSCACHE_OBJECT_INIT] = "INIT", + [FSCACHE_OBJECT_LOOKING_UP] = "LOOK", + [FSCACHE_OBJECT_CREATING] = "CRTN", + [FSCACHE_OBJECT_AVAILABLE] = "AVBL", + [FSCACHE_OBJECT_ACTIVE] = "ACTV", + [FSCACHE_OBJECT_UPDATING] = "UPDT", + [FSCACHE_OBJECT_DYING] = "DYNG", + [FSCACHE_OBJECT_LC_DYING] = "LCDY", + [FSCACHE_OBJECT_ABORT_INIT] = "ABTI", + [FSCACHE_OBJECT_RELEASING] = "RELS", + [FSCACHE_OBJECT_RECYCLING] = "RCYC", + [FSCACHE_OBJECT_WITHDRAWING] = "WTHD", + [FSCACHE_OBJECT_DEAD] = "DEAD", +}; + static void fscache_object_slow_work_put_ref(struct slow_work *); static int fscache_object_slow_work_get_ref(struct slow_work *); static void fscache_object_slow_work_execute(struct slow_work *); +#ifdef CONFIG_SLOW_WORK_PROC +static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *); +#endif static void fscache_initialise_object(struct fscache_object *); static void fscache_lookup_object(struct fscache_object *); static void fscache_object_available(struct fscache_object *); @@ -49,6 +69,9 @@ const struct slow_work_ops fscache_object_slow_work_ops = { .get_ref = fscache_object_slow_work_get_ref, .put_ref = fscache_object_slow_work_put_ref, .execute = fscache_object_slow_work_execute, +#ifdef CONFIG_SLOW_WORK_PROC + .desc = fscache_object_slow_work_desc, +#endif }; EXPORT_SYMBOL(fscache_object_slow_work_ops); @@ -326,6 +349,22 @@ static void fscache_object_slow_work_execute(struct slow_work *work) fscache_enqueue_object(object); } +/* + * describe an object for slow-work debugging + */ +#ifdef CONFIG_SLOW_WORK_PROC +static void fscache_object_slow_work_desc(struct slow_work *work, + struct seq_file *m) +{ + struct fscache_object *object = + container_of(work, struct fscache_object, work); + + seq_printf(m, "FSC: OBJ%x: %s", + object->debug_id, + fscache_object_states_short[object->state]); +} +#endif + /* * initialise an object * - check the specified object's parent to see if we can make use of it diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index f1a2857b2ff5..91bbe6f0377c 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -13,6 +13,7 @@ #define FSCACHE_DEBUG_LEVEL OPERATION #include +#include #include "internal.h" atomic_t fscache_op_debug_id; @@ -31,6 +32,8 @@ void fscache_enqueue_operation(struct fscache_operation *op) _enter("{OBJ%x OP%x,%u}", op->object->debug_id, op->debug_id, atomic_read(&op->usage)); + fscache_set_op_state(op, "EnQ"); + ASSERT(op->processor != NULL); ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE); ASSERTCMP(atomic_read(&op->usage), >, 0); @@ -67,6 +70,8 @@ EXPORT_SYMBOL(fscache_enqueue_operation); static void fscache_run_op(struct fscache_object *object, struct fscache_operation *op) { + fscache_set_op_state(op, "Run"); + object->n_in_progress++; if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) wake_up_bit(&op->flags, FSCACHE_OP_WAITING); @@ -87,6 +92,8 @@ int fscache_submit_exclusive_op(struct fscache_object *object, _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); + fscache_set_op_state(op, "SubmitX"); + spin_lock(&object->lock); ASSERTCMP(object->n_ops, >=, object->n_in_progress); ASSERTCMP(object->n_ops, >=, object->n_exclusive); @@ -190,6 +197,8 @@ int fscache_submit_op(struct fscache_object *object, ASSERTCMP(atomic_read(&op->usage), >, 0); + fscache_set_op_state(op, "Submit"); + spin_lock(&object->lock); ASSERTCMP(object->n_ops, >=, object->n_in_progress); ASSERTCMP(object->n_ops, >=, object->n_exclusive); @@ -298,6 +307,8 @@ void fscache_put_operation(struct fscache_operation *op) if (!atomic_dec_and_test(&op->usage)) return; + fscache_set_op_state(op, "Put"); + _debug("PUT OP"); if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags)) BUG(); @@ -452,9 +463,27 @@ static void fscache_op_execute(struct slow_work *work) _leave(""); } +/* + * describe an operation for slow-work debugging + */ +#ifdef CONFIG_SLOW_WORK_PROC +static void fscache_op_desc(struct slow_work *work, struct seq_file *m) +{ + struct fscache_operation *op = + container_of(work, struct fscache_operation, slow_work); + + seq_printf(m, "FSC: OBJ%x OP%x: %s/%s fl=%lx", + op->object->debug_id, op->debug_id, + op->name, op->state, op->flags); +} +#endif + const struct slow_work_ops fscache_op_slow_work_ops = { .owner = THIS_MODULE, .get_ref = fscache_op_get_ref, .put_ref = fscache_op_put_ref, .execute = fscache_op_execute, +#ifdef CONFIG_SLOW_WORK_PROC + .desc = fscache_op_desc, +#endif }; diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 2568e0eb644f..e8bbc395cef6 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -63,14 +63,19 @@ static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *p static void fscache_attr_changed_op(struct fscache_operation *op) { struct fscache_object *object = op->object; + int ret; _enter("{OBJ%x OP%x}", object->debug_id, op->debug_id); fscache_stat(&fscache_n_attr_changed_calls); - if (fscache_object_is_active(object) && - object->cache->ops->attr_changed(object) < 0) - fscache_abort_object(object); + if (fscache_object_is_active(object)) { + fscache_set_op_state(op, "CallFS"); + ret = object->cache->ops->attr_changed(object); + fscache_set_op_state(op, "Done"); + if (ret < 0) + fscache_abort_object(object); + } _leave(""); } @@ -99,6 +104,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) fscache_operation_init(op, NULL); fscache_operation_init_slow(op, fscache_attr_changed_op); op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE); + fscache_set_op_name(op, "Attr"); spin_lock(&cookie->lock); @@ -184,6 +190,7 @@ static struct fscache_retrieval *fscache_alloc_retrieval( op->start_time = jiffies; INIT_WORK(&op->op.fast_work, fscache_retrieval_work); INIT_LIST_HEAD(&op->to_do); + fscache_set_op_name(&op->op, "Retr"); return op; } @@ -257,6 +264,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, _leave(" = -ENOMEM"); return -ENOMEM; } + fscache_set_op_name(&op->op, "RetrRA1"); spin_lock(&cookie->lock); @@ -369,6 +377,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, op = fscache_alloc_retrieval(mapping, end_io_func, context); if (!op) return -ENOMEM; + fscache_set_op_name(&op->op, "RetrRAN"); spin_lock(&cookie->lock); @@ -461,6 +470,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, op = fscache_alloc_retrieval(page->mapping, NULL, NULL); if (!op) return -ENOMEM; + fscache_set_op_name(&op->op, "RetrAL1"); spin_lock(&cookie->lock); @@ -529,6 +539,8 @@ static void fscache_write_op(struct fscache_operation *_op) _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage)); + fscache_set_op_state(&op->op, "GetPage"); + spin_lock(&cookie->lock); spin_lock(&object->lock); @@ -559,13 +571,17 @@ static void fscache_write_op(struct fscache_operation *_op) spin_unlock(&cookie->lock); if (page) { + fscache_set_op_state(&op->op, "Store"); ret = object->cache->ops->write_page(op, page); + fscache_set_op_state(&op->op, "EndWrite"); fscache_end_page_write(cookie, page); page_cache_release(page); - if (ret < 0) + if (ret < 0) { + fscache_set_op_state(&op->op, "Abort"); fscache_abort_object(object); - else + } else { fscache_enqueue_operation(&op->op); + } } _leave(""); @@ -634,6 +650,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, fscache_operation_init(&op->op, fscache_release_write_op); fscache_operation_init_slow(&op->op, fscache_write_op); op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING); + fscache_set_op_name(&op->op, "Write1"); ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); if (ret < 0) -- cgit v1.2.3 From 4fbf4291aa15926cd4fdca0ffe9122e89d0459db Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:04 +0000 Subject: FS-Cache: Allow the current state of all objects to be dumped Allow the current state of all fscache objects to be dumped by doing: cat /proc/fs/fscache/objects By default, all objects and all fields will be shown. This can be restricted by adding a suitable key to one of the caller's keyrings (such as the session keyring): keyctl add user fscache:objlist "" @s The are: K Show hexdump of object key (don't show if not given) A Show hexdump of object aux data (don't show if not given) And paired restrictions: C Show objects that have a cookie c Show objects that don't have a cookie B Show objects that are busy b Show objects that aren't busy W Show objects that have pending writes w Show objects that don't have pending writes R Show objects that have outstanding reads r Show objects that don't have outstanding reads S Show objects that have slow work queued s Show objects that don't have slow work queued If neither side of a restriction pair is given, then both are implied. For example: keyctl add user fscache:objlist KB @s shows objects that are busy, and lists their object keys, but does not dump their auxiliary data. It also implies "CcWwRrSs", but as 'B' is given, 'b' is not implied. Signed-off-by: David Howells --- fs/cachefiles/interface.c | 1 + fs/cachefiles/rdwr.c | 6 +- fs/fscache/Kconfig | 7 + fs/fscache/Makefile | 1 + fs/fscache/cache.c | 1 + fs/fscache/cookie.c | 2 + fs/fscache/internal.h | 13 ++ fs/fscache/object-list.c | 432 ++++++++++++++++++++++++++++++++++++++++++++++ fs/fscache/object.c | 2 +- fs/fscache/operation.c | 3 + fs/fscache/page.c | 6 + fs/fscache/proc.c | 13 ++ 12 files changed, 484 insertions(+), 3 deletions(-) create mode 100644 fs/fscache/object-list.c (limited to 'fs') diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 431accd475a7..dd7f852746cb 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -331,6 +331,7 @@ static void cachefiles_put_object(struct fscache_object *_object) } cache = object->fscache.cache; + fscache_object_destroy(&object->fscache); kmem_cache_free(cachefiles_object_jar, object); fscache_object_destroyed(cache); } diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index a69787e7dd96..3304646dae84 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -333,7 +333,8 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; - op->op.flags = FSCACHE_OP_FAST; + op->op.flags &= FSCACHE_OP_KEEP_FLAGS; + op->op.flags |= FSCACHE_OP_FAST; op->op.processor = cachefiles_read_copier; pagevec_init(&pagevec, 0); @@ -639,7 +640,8 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, pagevec_init(&pagevec, 0); - op->op.flags = FSCACHE_OP_FAST; + op->op.flags &= FSCACHE_OP_KEEP_FLAGS; + op->op.flags |= FSCACHE_OP_FAST; op->op.processor = cachefiles_read_copier; INIT_LIST_HEAD(&backpages); diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig index 9bbb8ce7bea0..864dac20a242 100644 --- a/fs/fscache/Kconfig +++ b/fs/fscache/Kconfig @@ -54,3 +54,10 @@ config FSCACHE_DEBUG enabled by setting bits in /sys/modules/fscache/parameter/debug. See Documentation/filesystems/caching/fscache.txt for more information. + +config FSCACHE_OBJECT_LIST + bool "Maintain global object list for debugging purposes" + depends on FSCACHE && PROC_FS + help + Maintain a global list of active fscache objects that can be + retrieved through /proc/fs/fscache/objects for debugging purposes diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile index 91571b95aacc..6d561531cb36 100644 --- a/fs/fscache/Makefile +++ b/fs/fscache/Makefile @@ -15,5 +15,6 @@ fscache-y := \ fscache-$(CONFIG_PROC_FS) += proc.o fscache-$(CONFIG_FSCACHE_STATS) += stats.o fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o +fscache-$(CONFIG_FSCACHE_OBJECT_LIST) += object-list.o obj-$(CONFIG_FSCACHE) := fscache.o diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index e21985bbb1fb..724384ef96de 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -263,6 +263,7 @@ int fscache_add_cache(struct fscache_cache *cache, spin_lock(&cache->object_list_lock); list_add_tail(&ifsdef->cache_link, &cache->object_list); spin_unlock(&cache->object_list_lock); + fscache_objlist_add(ifsdef); /* add the cache's netfs definition index object to the top level index * cookie as a known backing object */ diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 72fd18f6c71f..9b5187328230 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -349,6 +349,8 @@ static int fscache_attach_object(struct fscache_cookie *cookie, object->cookie = cookie; atomic_inc(&cookie->usage); hlist_add_head(&object->cookie_link, &cookie->backing_objects); + + fscache_objlist_add(object); ret = 0; cant_attach_object: diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 1c341304621f..fe02973a9516 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -88,10 +88,23 @@ extern int fscache_wait_bit_interruptible(void *); /* * object.c */ +extern const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5]; + extern void fscache_withdrawing_object(struct fscache_cache *, struct fscache_object *); extern void fscache_enqueue_object(struct fscache_object *); +/* + * object-list.c + */ +#ifdef CONFIG_FSCACHE_OBJECT_LIST +extern const struct file_operations fscache_objlist_fops; + +extern void fscache_objlist_add(struct fscache_object *); +#else +#define fscache_objlist_add(object) do {} while(0) +#endif + /* * operation.c */ diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c new file mode 100644 index 000000000000..e590242fa41a --- /dev/null +++ b/fs/fscache/object-list.c @@ -0,0 +1,432 @@ +/* Global fscache object list maintainer and viewer + * + * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL COOKIE +#include +#include +#include +#include +#include "internal.h" + +static struct rb_root fscache_object_list; +static DEFINE_RWLOCK(fscache_object_list_lock); + +struct fscache_objlist_data { + unsigned long config; /* display configuration */ +#define FSCACHE_OBJLIST_CONFIG_KEY 0x00000001 /* show object keys */ +#define FSCACHE_OBJLIST_CONFIG_AUX 0x00000002 /* show object auxdata */ +#define FSCACHE_OBJLIST_CONFIG_COOKIE 0x00000004 /* show objects with cookies */ +#define FSCACHE_OBJLIST_CONFIG_NOCOOKIE 0x00000008 /* show objects without cookies */ +#define FSCACHE_OBJLIST_CONFIG_BUSY 0x00000010 /* show busy objects */ +#define FSCACHE_OBJLIST_CONFIG_IDLE 0x00000020 /* show idle objects */ +#define FSCACHE_OBJLIST_CONFIG_PENDWR 0x00000040 /* show objects with pending writes */ +#define FSCACHE_OBJLIST_CONFIG_NOPENDWR 0x00000080 /* show objects without pending writes */ +#define FSCACHE_OBJLIST_CONFIG_READS 0x00000100 /* show objects with active reads */ +#define FSCACHE_OBJLIST_CONFIG_NOREADS 0x00000200 /* show objects without active reads */ +#define FSCACHE_OBJLIST_CONFIG_EVENTS 0x00000400 /* show objects with events */ +#define FSCACHE_OBJLIST_CONFIG_NOEVENTS 0x00000800 /* show objects without no events */ +#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with slow work */ +#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without slow work */ + + u8 buf[512]; /* key and aux data buffer */ +}; + +/* + * Add an object to the object list + * - we use the address of the fscache_object structure as the key into the + * tree + */ +void fscache_objlist_add(struct fscache_object *obj) +{ + struct fscache_object *xobj; + struct rb_node **p = &fscache_object_list.rb_node, *parent = NULL; + + write_lock(&fscache_object_list_lock); + + while (*p) { + parent = *p; + xobj = rb_entry(parent, struct fscache_object, objlist_link); + + if (obj < xobj) + p = &(*p)->rb_left; + else if (obj > xobj) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&obj->objlist_link, parent, p); + rb_insert_color(&obj->objlist_link, &fscache_object_list); + + write_unlock(&fscache_object_list_lock); +} + +/** + * fscache_object_destroy - Note that a cache object is about to be destroyed + * @object: The object to be destroyed + * + * Note the imminent destruction and deallocation of a cache object record. + */ +void fscache_object_destroy(struct fscache_object *obj) +{ + write_lock(&fscache_object_list_lock); + + BUG_ON(RB_EMPTY_ROOT(&fscache_object_list)); + rb_erase(&obj->objlist_link, &fscache_object_list); + + write_unlock(&fscache_object_list_lock); +} +EXPORT_SYMBOL(fscache_object_destroy); + +/* + * find the object in the tree on or after the specified index + */ +static struct fscache_object *fscache_objlist_lookup(loff_t *_pos) +{ + struct fscache_object *pobj, *obj, *minobj = NULL; + struct rb_node *p; + unsigned long pos; + + if (*_pos >= (unsigned long) ERR_PTR(-ENOENT)) + return NULL; + pos = *_pos; + + /* banners (can't represent line 0 by pos 0 as that would involve + * returning a NULL pointer) */ + if (pos == 0) + return (struct fscache_object *) ++(*_pos); + if (pos < 3) + return (struct fscache_object *)pos; + + pobj = (struct fscache_object *)pos; + p = fscache_object_list.rb_node; + while (p) { + obj = rb_entry(p, struct fscache_object, objlist_link); + if (pobj < obj) { + if (!minobj || minobj > obj) + minobj = obj; + p = p->rb_left; + } else if (pobj > obj) { + p = p->rb_right; + } else { + minobj = obj; + break; + } + obj = NULL; + } + + if (!minobj) + *_pos = (unsigned long) ERR_PTR(-ENOENT); + else if (minobj != obj) + *_pos = (unsigned long) minobj; + return minobj; +} + +/* + * set up the iterator to start reading from the first line + */ +static void *fscache_objlist_start(struct seq_file *m, loff_t *_pos) + __acquires(&fscache_object_list_lock) +{ + read_lock(&fscache_object_list_lock); + return fscache_objlist_lookup(_pos); +} + +/* + * move to the next line + */ +static void *fscache_objlist_next(struct seq_file *m, void *v, loff_t *_pos) +{ + (*_pos)++; + return fscache_objlist_lookup(_pos); +} + +/* + * clean up after reading + */ +static void fscache_objlist_stop(struct seq_file *m, void *v) + __releases(&fscache_object_list_lock) +{ + read_unlock(&fscache_object_list_lock); +} + +/* + * display an object + */ +static int fscache_objlist_show(struct seq_file *m, void *v) +{ + struct fscache_objlist_data *data = m->private; + struct fscache_object *obj = v; + unsigned long config = data->config; + uint16_t keylen, auxlen; + char _type[3], *type; + bool no_cookie; + u8 *buf = data->buf, *p; + + if ((unsigned long) v == 1) { + seq_puts(m, "OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS" + " EM EV F S" + " | NETFS_COOKIE_DEF TY FL NETFS_DATA"); + if (config & (FSCACHE_OBJLIST_CONFIG_KEY | + FSCACHE_OBJLIST_CONFIG_AUX)) + seq_puts(m, " "); + if (config & FSCACHE_OBJLIST_CONFIG_KEY) + seq_puts(m, "OBJECT_KEY"); + if ((config & (FSCACHE_OBJLIST_CONFIG_KEY | + FSCACHE_OBJLIST_CONFIG_AUX)) == + (FSCACHE_OBJLIST_CONFIG_KEY | FSCACHE_OBJLIST_CONFIG_AUX)) + seq_puts(m, ", "); + if (config & FSCACHE_OBJLIST_CONFIG_AUX) + seq_puts(m, "AUX_DATA"); + seq_puts(m, "\n"); + return 0; + } + + if ((unsigned long) v == 2) { + seq_puts(m, "======== ======== ==== ===== === === === == =====" + " == == = =" + " | ================ == == ================"); + if (config & (FSCACHE_OBJLIST_CONFIG_KEY | + FSCACHE_OBJLIST_CONFIG_AUX)) + seq_puts(m, " ================"); + seq_puts(m, "\n"); + return 0; + } + + /* filter out any unwanted objects */ +#define FILTER(criterion, _yes, _no) \ + do { \ + unsigned long yes = FSCACHE_OBJLIST_CONFIG_##_yes; \ + unsigned long no = FSCACHE_OBJLIST_CONFIG_##_no; \ + if (criterion) { \ + if (!(config & yes)) \ + return 0; \ + } else { \ + if (!(config & no)) \ + return 0; \ + } \ + } while(0) + + if (~config) { + FILTER(obj->cookie, + COOKIE, NOCOOKIE); + FILTER(obj->state != FSCACHE_OBJECT_ACTIVE || + obj->n_ops != 0 || + obj->n_obj_ops != 0 || + obj->flags || + !list_empty(&obj->dependents), + BUSY, IDLE); + FILTER(test_bit(FSCACHE_OBJECT_PENDING_WRITE, &obj->flags), + PENDWR, NOPENDWR); + FILTER(atomic_read(&obj->n_reads), + READS, NOREADS); + FILTER(obj->events & obj->event_mask, + EVENTS, NOEVENTS); + FILTER(obj->work.flags & ~(1UL << SLOW_WORK_VERY_SLOW), + WORK, NOWORK); + } + + seq_printf(m, + "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1lx | ", + obj->debug_id, + obj->parent ? obj->parent->debug_id : -1, + fscache_object_states_short[obj->state], + obj->n_children, + obj->n_ops, + obj->n_obj_ops, + obj->n_in_progress, + obj->n_exclusive, + atomic_read(&obj->n_reads), + obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK, + obj->events, + obj->flags, + obj->work.flags); + + no_cookie = true; + keylen = auxlen = 0; + if (obj->cookie) { + spin_lock(&obj->lock); + if (obj->cookie) { + switch (obj->cookie->def->type) { + case 0: + type = "IX"; + break; + case 1: + type = "DT"; + break; + default: + sprintf(_type, "%02u", + obj->cookie->def->type); + type = _type; + break; + } + + seq_printf(m, "%-16s %s %2lx %16p", + obj->cookie->def->name, + type, + obj->cookie->flags, + obj->cookie->netfs_data); + + if (obj->cookie->def->get_key && + config & FSCACHE_OBJLIST_CONFIG_KEY) + keylen = obj->cookie->def->get_key( + obj->cookie->netfs_data, + buf, 400); + + if (obj->cookie->def->get_aux && + config & FSCACHE_OBJLIST_CONFIG_AUX) + auxlen = obj->cookie->def->get_aux( + obj->cookie->netfs_data, + buf + keylen, 512 - keylen); + + no_cookie = false; + } + spin_unlock(&obj->lock); + + if (!no_cookie && (keylen > 0 || auxlen > 0)) { + seq_printf(m, " "); + for (p = buf; keylen > 0; keylen--) + seq_printf(m, "%02x", *p++); + if (auxlen > 0) { + if (config & FSCACHE_OBJLIST_CONFIG_KEY) + seq_printf(m, ", "); + for (; auxlen > 0; auxlen--) + seq_printf(m, "%02x", *p++); + } + } + } + + if (no_cookie) + seq_printf(m, "\n"); + else + seq_printf(m, "\n"); + return 0; +} + +static const struct seq_operations fscache_objlist_ops = { + .start = fscache_objlist_start, + .stop = fscache_objlist_stop, + .next = fscache_objlist_next, + .show = fscache_objlist_show, +}; + +/* + * get the configuration for filtering the list + */ +static void fscache_objlist_config(struct fscache_objlist_data *data) +{ +#ifdef CONFIG_KEYS + struct user_key_payload *confkey; + unsigned long config; + struct key *key; + const char *buf; + int len; + + key = request_key(&key_type_user, "fscache:objlist", NULL); + if (IS_ERR(key)) + goto no_config; + + config = 0; + rcu_read_lock(); + + confkey = key->payload.data; + buf = confkey->data; + + for (len = confkey->datalen - 1; len >= 0; len--) { + switch (buf[len]) { + case 'K': config |= FSCACHE_OBJLIST_CONFIG_KEY; break; + case 'A': config |= FSCACHE_OBJLIST_CONFIG_AUX; break; + case 'C': config |= FSCACHE_OBJLIST_CONFIG_COOKIE; break; + case 'c': config |= FSCACHE_OBJLIST_CONFIG_NOCOOKIE; break; + case 'B': config |= FSCACHE_OBJLIST_CONFIG_BUSY; break; + case 'b': config |= FSCACHE_OBJLIST_CONFIG_IDLE; break; + case 'W': config |= FSCACHE_OBJLIST_CONFIG_PENDWR; break; + case 'w': config |= FSCACHE_OBJLIST_CONFIG_NOPENDWR; break; + case 'R': config |= FSCACHE_OBJLIST_CONFIG_READS; break; + case 'r': config |= FSCACHE_OBJLIST_CONFIG_NOREADS; break; + case 'S': config |= FSCACHE_OBJLIST_CONFIG_WORK; break; + case 's': config |= FSCACHE_OBJLIST_CONFIG_NOWORK; break; + } + } + + rcu_read_unlock(); + key_put(key); + + if (!(config & (FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE))) + config |= FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE; + if (!(config & (FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE))) + config |= FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE; + if (!(config & (FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR))) + config |= FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR; + if (!(config & (FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS))) + config |= FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS; + if (!(config & (FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS))) + config |= FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS; + if (!(config & (FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK))) + config |= FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK; + + data->config = config; + return; + +no_config: +#endif + data->config = ULONG_MAX; +} + +/* + * open "/proc/fs/fscache/objects" to provide a list of active objects + * - can be configured by a user-defined key added to the caller's keyrings + */ +static int fscache_objlist_open(struct inode *inode, struct file *file) +{ + struct fscache_objlist_data *data; + struct seq_file *m; + int ret; + + ret = seq_open(file, &fscache_objlist_ops); + if (ret < 0) + return ret; + + m = file->private_data; + + /* buffer for key extraction */ + data = kmalloc(sizeof(struct fscache_objlist_data), GFP_KERNEL); + if (!data) { + seq_release(inode, file); + return -ENOMEM; + } + + /* get the configuration key */ + fscache_objlist_config(data); + + m->private = data; + return 0; +} + +/* + * clean up on close + */ +static int fscache_objlist_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + + kfree(m->private); + m->private = NULL; + return seq_release(inode, file); +} + +const struct file_operations fscache_objlist_fops = { + .owner = THIS_MODULE, + .open = fscache_objlist_open, + .read = seq_read, + .llseek = seq_lseek, + .release = fscache_objlist_release, +}; diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 615b63dd9ecc..ad1644f073bd 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -34,7 +34,7 @@ const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = { }; EXPORT_SYMBOL(fscache_object_states); -static const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = { +const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = { [FSCACHE_OBJECT_INIT] = "INIT", [FSCACHE_OBJECT_LOOKING_UP] = "LOOK", [FSCACHE_OBJECT_CREATING] = "CRTN", diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 91bbe6f0377c..09e43b6e822f 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -322,6 +322,9 @@ void fscache_put_operation(struct fscache_operation *op) object = op->object; + if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) + atomic_dec(&object->n_reads); + /* now... we may get called with the object spinlock held, so we * complete the cleanup here only if we can immediately acquire the * lock, and defer it otherwise */ diff --git a/fs/fscache/page.c b/fs/fscache/page.c index e8bbc395cef6..c5973e38ce39 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -275,6 +275,9 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP); + atomic_inc(&object->n_reads); + set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); + if (fscache_submit_op(object, &op->op) < 0) goto nobufs_unlock; spin_unlock(&cookie->lock); @@ -386,6 +389,9 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); + atomic_inc(&object->n_reads); + set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); + if (fscache_submit_op(object, &op->op) < 0) goto nobufs_unlock; spin_unlock(&cookie->lock); diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c index beeab44bc31a..1d9e4951a597 100644 --- a/fs/fscache/proc.c +++ b/fs/fscache/proc.c @@ -37,10 +37,20 @@ int __init fscache_proc_init(void) goto error_histogram; #endif +#ifdef CONFIG_FSCACHE_OBJECT_LIST + if (!proc_create("fs/fscache/objects", S_IFREG | 0444, NULL, + &fscache_objlist_fops)) + goto error_objects; +#endif + _leave(" = 0"); return 0; +#ifdef CONFIG_FSCACHE_OBJECT_LIST +error_objects: +#endif #ifdef CONFIG_FSCACHE_HISTOGRAM + remove_proc_entry("fs/fscache/histogram", NULL); error_histogram: #endif #ifdef CONFIG_FSCACHE_STATS @@ -58,6 +68,9 @@ error_dir: */ void fscache_proc_cleanup(void) { +#ifdef CONFIG_FSCACHE_OBJECT_LIST + remove_proc_entry("fs/fscache/objects", NULL); +#endif #ifdef CONFIG_FSCACHE_HISTOGRAM remove_proc_entry("fs/fscache/histogram", NULL); #endif -- cgit v1.2.3 From 52bd75fdb135d6133d878ae60c6e7e3f4ebc1cfc Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:08 +0000 Subject: FS-Cache: Add counters for entry/exit to/from cache operation functions Count entries to and exits from cache operation table functions. Maintain these as a single counter that's added to or removed from as appropriate. Signed-off-by: David Howells --- fs/fscache/cache.c | 4 ++++ fs/fscache/cookie.c | 9 ++++++++- fs/fscache/internal.h | 22 ++++++++++++++++++++++ fs/fscache/object.c | 26 ++++++++++++++++++++++++-- fs/fscache/page.c | 29 +++++++++++++++++++++++------ fs/fscache/stats.c | 37 +++++++++++++++++++++++++++++++++++++ 6 files changed, 118 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index 724384ef96de..6a3c48abd677 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -381,11 +381,15 @@ void fscache_withdraw_cache(struct fscache_cache *cache) /* make sure all pages pinned by operations on behalf of the netfs are * written to disk */ + fscache_stat(&fscache_n_cop_sync_cache); cache->ops->sync_cache(cache); + fscache_stat_d(&fscache_n_cop_sync_cache); /* dissociate all the netfs pages backed by this cache from the block * mappings in the cache */ + fscache_stat(&fscache_n_cop_dissociate_pages); cache->ops->dissociate_pages(cache); + fscache_stat_d(&fscache_n_cop_dissociate_pages); /* we now have to destroy all the active objects pertaining to this * cache - which we do by passing them off to thread pool to be diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 9b5187328230..432482edc738 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -249,7 +249,9 @@ static int fscache_alloc_object(struct fscache_cache *cache, /* ask the cache to allocate an object (we may end up with duplicate * objects at this stage, but we sort that out later) */ + fscache_stat(&fscache_n_cop_alloc_object); object = cache->ops->alloc_object(cache, cookie); + fscache_stat_d(&fscache_n_cop_alloc_object); if (IS_ERR(object)) { fscache_stat(&fscache_n_object_no_alloc); ret = PTR_ERR(object); @@ -270,8 +272,11 @@ static int fscache_alloc_object(struct fscache_cache *cache, /* only attach if we managed to allocate all we needed, otherwise * discard the object we just allocated and instead use the one * attached to the cookie */ - if (fscache_attach_object(cookie, object) < 0) + if (fscache_attach_object(cookie, object) < 0) { + fscache_stat(&fscache_n_cop_put_object); cache->ops->put_object(object); + fscache_stat_d(&fscache_n_cop_put_object); + } _leave(" = 0"); return 0; @@ -287,7 +292,9 @@ object_already_extant: return 0; error_put: + fscache_stat(&fscache_n_cop_put_object); cache->ops->put_object(object); + fscache_stat_d(&fscache_n_cop_put_object); error: _leave(" = %d", ret); return ret; diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index fe02973a9516..b85cc8906818 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -208,11 +208,33 @@ extern atomic_t fscache_n_checkaux_okay; extern atomic_t fscache_n_checkaux_update; extern atomic_t fscache_n_checkaux_obsolete; +extern atomic_t fscache_n_cop_alloc_object; +extern atomic_t fscache_n_cop_lookup_object; +extern atomic_t fscache_n_cop_lookup_complete; +extern atomic_t fscache_n_cop_grab_object; +extern atomic_t fscache_n_cop_update_object; +extern atomic_t fscache_n_cop_drop_object; +extern atomic_t fscache_n_cop_put_object; +extern atomic_t fscache_n_cop_sync_cache; +extern atomic_t fscache_n_cop_attr_changed; +extern atomic_t fscache_n_cop_read_or_alloc_page; +extern atomic_t fscache_n_cop_read_or_alloc_pages; +extern atomic_t fscache_n_cop_allocate_page; +extern atomic_t fscache_n_cop_allocate_pages; +extern atomic_t fscache_n_cop_write_page; +extern atomic_t fscache_n_cop_uncache_page; +extern atomic_t fscache_n_cop_dissociate_pages; + static inline void fscache_stat(atomic_t *stat) { atomic_inc(stat); } +static inline void fscache_stat_d(atomic_t *stat) +{ + atomic_dec(stat); +} + extern const struct file_operations fscache_stats_fops; #else diff --git a/fs/fscache/object.c b/fs/fscache/object.c index ad1644f073bd..0d65c0c92b46 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -144,13 +144,17 @@ static void fscache_object_state_machine(struct fscache_object *object) case FSCACHE_OBJECT_UPDATING: clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events); fscache_stat(&fscache_n_updates_run); + fscache_stat(&fscache_n_cop_update_object); object->cache->ops->update_object(object); + fscache_stat_d(&fscache_n_cop_update_object); goto active_transit; /* handle an object dying during lookup or creation */ case FSCACHE_OBJECT_LC_DYING: object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE); + fscache_stat(&fscache_n_cop_lookup_complete); object->cache->ops->lookup_complete(object); + fscache_stat_d(&fscache_n_cop_lookup_complete); spin_lock(&object->lock); object->state = FSCACHE_OBJECT_DYING; @@ -416,7 +420,9 @@ static void fscache_initialise_object(struct fscache_object *object) * binding on to us, so we need to make sure we don't * add ourself to the list multiple times */ if (list_empty(&object->dep_link)) { + fscache_stat(&fscache_n_cop_grab_object); object->cache->ops->grab_object(object); + fscache_stat_d(&fscache_n_cop_grab_object); list_add(&object->dep_link, &parent->dependents); @@ -478,7 +484,9 @@ static void fscache_lookup_object(struct fscache_object *object) object->cache->tag->name); fscache_stat(&fscache_n_object_lookups); + fscache_stat(&fscache_n_cop_lookup_object); object->cache->ops->lookup_object(object); + fscache_stat_d(&fscache_n_cop_lookup_object); if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events)) set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); @@ -602,7 +610,9 @@ static void fscache_object_available(struct fscache_object *object) } spin_unlock(&object->lock); + fscache_stat(&fscache_n_cop_lookup_complete); object->cache->ops->lookup_complete(object); + fscache_stat_d(&fscache_n_cop_lookup_complete); fscache_enqueue_dependents(object); fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif); @@ -625,7 +635,9 @@ static void fscache_drop_object(struct fscache_object *object) list_del_init(&object->cache_link); spin_unlock(&cache->object_list_lock); + fscache_stat(&fscache_n_cop_drop_object); cache->ops->drop_object(object); + fscache_stat_d(&fscache_n_cop_drop_object); if (parent) { _debug("release parent OBJ%x {%d}", @@ -640,7 +652,9 @@ static void fscache_drop_object(struct fscache_object *object) } /* this just shifts the object release to the slow work processor */ + fscache_stat(&fscache_n_cop_put_object); object->cache->ops->put_object(object); + fscache_stat_d(&fscache_n_cop_put_object); _leave(""); } @@ -730,8 +744,12 @@ static int fscache_object_slow_work_get_ref(struct slow_work *work) { struct fscache_object *object = container_of(work, struct fscache_object, work); + int ret; - return object->cache->ops->grab_object(object) ? 0 : -EAGAIN; + fscache_stat(&fscache_n_cop_grab_object); + ret = object->cache->ops->grab_object(object) ? 0 : -EAGAIN; + fscache_stat_d(&fscache_n_cop_grab_object); + return ret; } /* @@ -742,7 +760,9 @@ static void fscache_object_slow_work_put_ref(struct slow_work *work) struct fscache_object *object = container_of(work, struct fscache_object, work); - return object->cache->ops->put_object(object); + fscache_stat(&fscache_n_cop_put_object); + object->cache->ops->put_object(object); + fscache_stat_d(&fscache_n_cop_put_object); } /* @@ -779,7 +799,9 @@ static void fscache_enqueue_dependents(struct fscache_object *object) /* sort onto appropriate lists */ fscache_enqueue_object(dep); + fscache_stat(&fscache_n_cop_put_object); dep->cache->ops->put_object(dep); + fscache_stat_d(&fscache_n_cop_put_object); if (!list_empty(&object->dependents)) cond_resched_lock(&object->lock); diff --git a/fs/fscache/page.c b/fs/fscache/page.c index c5973e38ce39..250dfd34c07b 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -71,7 +71,9 @@ static void fscache_attr_changed_op(struct fscache_operation *op) if (fscache_object_is_active(object)) { fscache_set_op_state(op, "CallFS"); + fscache_stat(&fscache_n_cop_attr_changed); ret = object->cache->ops->attr_changed(object); + fscache_stat_d(&fscache_n_cop_attr_changed); fscache_set_op_state(op, "Done"); if (ret < 0) fscache_abort_object(object); @@ -300,11 +302,15 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, /* ask the cache to honour the operation */ if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) { + fscache_stat(&fscache_n_cop_allocate_page); ret = object->cache->ops->allocate_page(op, page, gfp); + fscache_stat_d(&fscache_n_cop_allocate_page); if (ret == 0) ret = -ENODATA; } else { + fscache_stat(&fscache_n_cop_read_or_alloc_page); ret = object->cache->ops->read_or_alloc_page(op, page, gfp); + fscache_stat_d(&fscache_n_cop_read_or_alloc_page); } if (ret == -ENOMEM) @@ -358,7 +364,6 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, void *context, gfp_t gfp) { - fscache_pages_retrieval_func_t func; struct fscache_retrieval *op; struct fscache_object *object; int ret; @@ -413,11 +418,17 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, } /* ask the cache to honour the operation */ - if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) - func = object->cache->ops->allocate_pages; - else - func = object->cache->ops->read_or_alloc_pages; - ret = func(op, pages, nr_pages, gfp); + if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) { + fscache_stat(&fscache_n_cop_allocate_pages); + ret = object->cache->ops->allocate_pages( + op, pages, nr_pages, gfp); + fscache_stat_d(&fscache_n_cop_allocate_pages); + } else { + fscache_stat(&fscache_n_cop_read_or_alloc_pages); + ret = object->cache->ops->read_or_alloc_pages( + op, pages, nr_pages, gfp); + fscache_stat_d(&fscache_n_cop_read_or_alloc_pages); + } if (ret == -ENOMEM) fscache_stat(&fscache_n_retrievals_nomem); @@ -500,7 +511,9 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, } /* ask the cache to honour the operation */ + fscache_stat(&fscache_n_cop_allocate_page); ret = object->cache->ops->allocate_page(op, page, gfp); + fscache_stat_d(&fscache_n_cop_allocate_page); if (ret < 0) fscache_stat(&fscache_n_allocs_nobufs); @@ -578,7 +591,9 @@ static void fscache_write_op(struct fscache_operation *_op) if (page) { fscache_set_op_state(&op->op, "Store"); + fscache_stat(&fscache_n_cop_write_page); ret = object->cache->ops->write_page(op, page); + fscache_stat_d(&fscache_n_cop_write_page); fscache_set_op_state(&op->op, "EndWrite"); fscache_end_page_write(cookie, page); page_cache_release(page); @@ -786,7 +801,9 @@ void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page) if (TestClearPageFsCache(page) && object->cache->ops->uncache_page) { /* the cache backend releases the cookie lock */ + fscache_stat(&fscache_n_cop_uncache_page); object->cache->ops->uncache_page(object, page); + fscache_stat_d(&fscache_n_cop_uncache_page); goto done; } diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 65deb99e756b..20233fb44bfd 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -93,6 +93,23 @@ atomic_t fscache_n_checkaux_okay; atomic_t fscache_n_checkaux_update; atomic_t fscache_n_checkaux_obsolete; +atomic_t fscache_n_cop_alloc_object; +atomic_t fscache_n_cop_lookup_object; +atomic_t fscache_n_cop_lookup_complete; +atomic_t fscache_n_cop_grab_object; +atomic_t fscache_n_cop_update_object; +atomic_t fscache_n_cop_drop_object; +atomic_t fscache_n_cop_put_object; +atomic_t fscache_n_cop_sync_cache; +atomic_t fscache_n_cop_attr_changed; +atomic_t fscache_n_cop_read_or_alloc_page; +atomic_t fscache_n_cop_read_or_alloc_pages; +atomic_t fscache_n_cop_allocate_page; +atomic_t fscache_n_cop_allocate_pages; +atomic_t fscache_n_cop_write_page; +atomic_t fscache_n_cop_uncache_page; +atomic_t fscache_n_cop_dissociate_pages; + /* * display the general statistics */ @@ -192,6 +209,26 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_op_deferred_release), atomic_read(&fscache_n_op_release), atomic_read(&fscache_n_op_gc)); + + seq_printf(m, "CacheOp: alo=%d luo=%d luc=%d gro=%d\n", + atomic_read(&fscache_n_cop_alloc_object), + atomic_read(&fscache_n_cop_lookup_object), + atomic_read(&fscache_n_cop_lookup_complete), + atomic_read(&fscache_n_cop_grab_object)); + seq_printf(m, "CacheOp: upo=%d dro=%d pto=%d atc=%d syn=%d\n", + atomic_read(&fscache_n_cop_update_object), + atomic_read(&fscache_n_cop_drop_object), + atomic_read(&fscache_n_cop_put_object), + atomic_read(&fscache_n_cop_attr_changed), + atomic_read(&fscache_n_cop_sync_cache)); + seq_printf(m, "CacheOp: rap=%d ras=%d alp=%d als=%d wrp=%d ucp=%d dsp=%d\n", + atomic_read(&fscache_n_cop_read_or_alloc_page), + atomic_read(&fscache_n_cop_read_or_alloc_pages), + atomic_read(&fscache_n_cop_allocate_page), + atomic_read(&fscache_n_cop_allocate_pages), + atomic_read(&fscache_n_cop_write_page), + atomic_read(&fscache_n_cop_uncache_page), + atomic_read(&fscache_n_cop_dissociate_pages)); return 0; } -- cgit v1.2.3 From 7e311a207d596b9273d811149d6e3e14f05ac4c0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:11 +0000 Subject: FS-Cache: Clear netfs pointers in cookie after detaching object, not before Clear the pointers from the fscache_cookie struct to netfs private data after clearing the pointer to the cookie from the fscache_object struct and releasing the object lock, rather than before. This allows the netfs private data pointers to be relied on simply by holding the object lock, rather than having to hold the cookie lock. This is makes things simpler as the cookie lock has to be taken before the object lock, but sometimes the object pointer is all that the code has. Signed-off-by: David Howells --- fs/fscache/cookie.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 432482edc738..b1870a6c2cd3 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -437,12 +437,8 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE; - /* detach pointers back to the netfs */ spin_lock(&cookie->lock); - cookie->netfs_data = NULL; - cookie->def = NULL; - /* break links with all the active objects */ while (!hlist_empty(&cookie->backing_objects)) { object = hlist_entry(cookie->backing_objects.first, @@ -465,6 +461,10 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) BUG(); } + /* detach pointers back to the netfs */ + cookie->netfs_data = NULL; + cookie->def = NULL; + spin_unlock(&cookie->lock); if (cookie->parent) { -- cgit v1.2.3 From b34df792b4e9e311db47fad27949095d0629c197 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:14 +0000 Subject: FS-Cache: Use radix tree preload correctly in tracking of pages to be stored __fscache_write_page() attempts to load the radix tree preallocation pool for the CPU it is on before calling radix_tree_insert(), as the insertion must be done inside a pair of spinlocks. Use of the preallocation pool, however, is contingent on the radix tree being initialised without __GFP_WAIT specified. __fscache_acquire_cookie() was passing GFP_NOFS to INIT_RADIX_TREE() - but that includes __GFP_WAIT. The solution is to AND out __GFP_WAIT. Additionally, the banner comment to radix_tree_preload() is altered to make note of this prerequisite. Possibly there should be a WARN_ON() too. Without this fix, I have seen the following recursive deadlock caused by radix_tree_insert() attempting to allocate memory inside the spinlocked region, which resulted in FS-Cache being called back into to release memory - which required the spinlock already held. ============================================= [ INFO: possible recursive locking detected ] 2.6.32-rc6-cachefs #24 --------------------------------------------- nfsiod/7916 is trying to acquire lock: (&cookie->lock){+.+.-.}, at: [] __fscache_uncache_page+0xdb/0x160 [fscache] but task is already holding lock: (&cookie->lock){+.+.-.}, at: [] __fscache_write_page+0x15c/0x3f3 [fscache] other info that might help us debug this: 5 locks held by nfsiod/7916: #0: (nfsiod){+.+.+.}, at: [] worker_thread+0x19a/0x2e2 #1: (&task->u.tk_work#2){+.+.+.}, at: [] worker_thread+0x19a/0x2e2 #2: (&cookie->lock){+.+.-.}, at: [] __fscache_write_page+0x15c/0x3f3 [fscache] #3: (&object->lock#2){+.+.-.}, at: [] __fscache_write_page+0x197/0x3f3 [fscache] #4: (&cookie->stores_lock){+.+...}, at: [] __fscache_write_page+0x19f/0x3f3 [fscache] stack backtrace: Pid: 7916, comm: nfsiod Not tainted 2.6.32-rc6-cachefs #24 Call Trace: [] __lock_acquire+0x1649/0x16e3 [] ? __lock_acquire+0x7b7/0x16e3 [] ? dump_trace+0x248/0x257 [] lock_acquire+0x57/0x6d [] ? __fscache_uncache_page+0xdb/0x160 [fscache] [] _spin_lock+0x2c/0x3b [] ? __fscache_uncache_page+0xdb/0x160 [fscache] [] __fscache_uncache_page+0xdb/0x160 [fscache] [] ? __fscache_check_page_write+0x0/0x71 [fscache] [] nfs_fscache_release_page+0x86/0xc4 [nfs] [] nfs_release_page+0x3c/0x41 [nfs] [] try_to_release_page+0x32/0x3b [] shrink_page_list+0x316/0x4ac [] ? mark_held_locks+0x52/0x70 [] ? _spin_unlock_irq+0x2b/0x31 [] shrink_inactive_list+0x392/0x67c [] ? mark_held_locks+0x52/0x70 [] shrink_list+0x8d/0x8f [] shrink_zone+0x278/0x33c [] ? ktime_get_ts+0xad/0xba [] try_to_free_pages+0x22e/0x392 [] ? isolate_pages_global+0x0/0x212 [] __alloc_pages_nodemask+0x3dc/0x5cf [] cache_alloc_refill+0x34d/0x6c1 [] ? radix_tree_node_alloc+0x52/0x5c [] kmem_cache_alloc+0xb2/0x118 [] radix_tree_node_alloc+0x52/0x5c [] radix_tree_insert+0x57/0x19c [] __fscache_write_page+0x1e3/0x3f3 [fscache] [] __nfs_readpage_to_fscache+0x58/0x11e [nfs] [] nfs_readpage_release+0x34/0x9b [nfs] [] nfs_readpage_release_full+0x32/0x4b [nfs] [] rpc_release_calldata+0x12/0x14 [sunrpc] [] rpc_free_task+0x59/0x61 [sunrpc] [] rpc_async_release+0x10/0x12 [sunrpc] [] worker_thread+0x1ef/0x2e2 [] ? worker_thread+0x19a/0x2e2 [] ? thread_return+0x3e/0x101 [] ? rpc_async_release+0x0/0x12 [sunrpc] [] ? autoremove_wake_function+0x0/0x34 [] ? trace_hardirqs_on+0xd/0xf [] ? worker_thread+0x0/0x2e2 [] kthread+0x7a/0x82 [] child_rip+0xa/0x20 [] ? restore_args+0x0/0x30 [] ? add_wait_queue+0x15/0x44 [] ? kthread+0x0/0x82 [] ? child_rip+0x0/0x20 Signed-off-by: David Howells --- fs/fscache/cookie.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index b1870a6c2cd3..e6854f5222f5 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -102,7 +102,9 @@ struct fscache_cookie *__fscache_acquire_cookie( cookie->netfs_data = netfs_data; cookie->flags = 0; - INIT_RADIX_TREE(&cookie->stores, GFP_NOFS); + /* radix tree insertion won't use the preallocation pool unless it's + * told it may not wait */ + INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_WAIT); switch (cookie->def->type) { case FSCACHE_COOKIE_TYPE_INDEX: -- cgit v1.2.3 From 5753c441889253e4323eee85f791a1d64cf08196 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:19 +0000 Subject: FS-Cache: Permit cache retrieval ops to be interrupted in the initial wait phase Permit the operations to retrieve data from the cache or to allocate space in the cache for future writes to be interrupted whilst they're waiting for permission for the operation to proceed. Typically this wait occurs whilst the cache object is being looked up on disk in the background. If an interruption occurs, and the operation has not yet been given the go-ahead to run, the operation is dequeued and cancelled, and control returns to the read operation of the netfs routine with none of the requested pages having been read or in any way marked as known by the cache. This means that the initial wait is done interruptibly rather than uninterruptibly. In addition, extra stats values are made available to show the number of ops cancelled and the number of cache space allocations interrupted. Signed-off-by: David Howells --- fs/fscache/internal.h | 3 ++ fs/fscache/operation.c | 82 +++++++++++++++++++++++++++++++++----------------- fs/fscache/page.c | 55 ++++++++++++++++++++++++++++----- fs/fscache/stats.c | 12 +++++--- 4 files changed, 113 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index b85cc8906818..50324ad2b194 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -112,6 +112,7 @@ extern int fscache_submit_exclusive_op(struct fscache_object *, struct fscache_operation *); extern int fscache_submit_op(struct fscache_object *, struct fscache_operation *); +extern int fscache_cancel_op(struct fscache_operation *); extern void fscache_abort_object(struct fscache_object *); extern void fscache_start_operations(struct fscache_object *); extern void fscache_operation_gc(struct work_struct *); @@ -140,6 +141,7 @@ extern atomic_t fscache_n_op_enqueue; extern atomic_t fscache_n_op_deferred_release; extern atomic_t fscache_n_op_release; extern atomic_t fscache_n_op_gc; +extern atomic_t fscache_n_op_cancelled; extern atomic_t fscache_n_attr_changed; extern atomic_t fscache_n_attr_changed_ok; @@ -151,6 +153,7 @@ extern atomic_t fscache_n_allocs; extern atomic_t fscache_n_allocs_ok; extern atomic_t fscache_n_allocs_wait; extern atomic_t fscache_n_allocs_nobufs; +extern atomic_t fscache_n_allocs_intr; extern atomic_t fscache_n_alloc_ops; extern atomic_t fscache_n_alloc_op_waits; diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 09e43b6e822f..296492efb81b 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -34,32 +34,31 @@ void fscache_enqueue_operation(struct fscache_operation *op) fscache_set_op_state(op, "EnQ"); + ASSERT(list_empty(&op->pend_link)); ASSERT(op->processor != NULL); ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE); ASSERTCMP(atomic_read(&op->usage), >, 0); - if (list_empty(&op->pend_link)) { - switch (op->flags & FSCACHE_OP_TYPE) { - case FSCACHE_OP_FAST: - _debug("queue fast"); - atomic_inc(&op->usage); - if (!schedule_work(&op->fast_work)) - fscache_put_operation(op); - break; - case FSCACHE_OP_SLOW: - _debug("queue slow"); - slow_work_enqueue(&op->slow_work); - break; - case FSCACHE_OP_MYTHREAD: - _debug("queue for caller's attention"); - break; - default: - printk(KERN_ERR "FS-Cache: Unexpected op type %lx", - op->flags); - BUG(); - break; - } - fscache_stat(&fscache_n_op_enqueue); + fscache_stat(&fscache_n_op_enqueue); + switch (op->flags & FSCACHE_OP_TYPE) { + case FSCACHE_OP_FAST: + _debug("queue fast"); + atomic_inc(&op->usage); + if (!schedule_work(&op->fast_work)) + fscache_put_operation(op); + break; + case FSCACHE_OP_SLOW: + _debug("queue slow"); + slow_work_enqueue(&op->slow_work); + break; + case FSCACHE_OP_MYTHREAD: + _debug("queue for caller's attention"); + break; + default: + printk(KERN_ERR "FS-Cache: Unexpected op type %lx", + op->flags); + BUG(); + break; } } EXPORT_SYMBOL(fscache_enqueue_operation); @@ -97,6 +96,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object, spin_lock(&object->lock); ASSERTCMP(object->n_ops, >=, object->n_in_progress); ASSERTCMP(object->n_ops, >=, object->n_exclusive); + ASSERT(list_empty(&op->pend_link)); ret = -ENOBUFS; if (fscache_object_is_active(object)) { @@ -202,6 +202,7 @@ int fscache_submit_op(struct fscache_object *object, spin_lock(&object->lock); ASSERTCMP(object->n_ops, >=, object->n_in_progress); ASSERTCMP(object->n_ops, >=, object->n_exclusive); + ASSERT(list_empty(&op->pend_link)); ostate = object->state; smp_rmb(); @@ -273,12 +274,7 @@ void fscache_start_operations(struct fscache_object *object) stop = true; } list_del_init(&op->pend_link); - object->n_in_progress++; - - if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) - wake_up_bit(&op->flags, FSCACHE_OP_WAITING); - if (op->processor) - fscache_enqueue_operation(op); + fscache_run_op(object, op); /* the pending queue was holding a ref on the object */ fscache_put_operation(op); @@ -290,6 +286,36 @@ void fscache_start_operations(struct fscache_object *object) object->n_in_progress, object->debug_id); } +/* + * cancel an operation that's pending on an object + */ +int fscache_cancel_op(struct fscache_operation *op) +{ + struct fscache_object *object = op->object; + int ret; + + _enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id); + + spin_lock(&object->lock); + + ret = -EBUSY; + if (!list_empty(&op->pend_link)) { + fscache_stat(&fscache_n_op_cancelled); + list_del_init(&op->pend_link); + object->n_ops--; + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) + object->n_exclusive--; + if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) + wake_up_bit(&op->flags, FSCACHE_OP_WAITING); + fscache_put_operation(op); + ret = 0; + } + + spin_unlock(&object->lock); + _leave(" = %d", ret); + return ret; +} + /* * release an operation * - queues pending ops if this is the last in-progress op diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 250dfd34c07b..e6f2e61133a1 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -295,8 +295,20 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { _debug(">>> WT"); fscache_stat(&fscache_n_retrieval_op_waits); - wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); + if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit_interruptible, + TASK_INTERRUPTIBLE) < 0) { + ret = fscache_cancel_op(&op->op); + if (ret == 0) { + ret = -ERESTARTSYS; + goto error; + } + + /* it's been removed from the pending queue by another + * party, so we should get to run shortly */ + wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + } _debug("<<< GO"); } @@ -313,6 +325,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, fscache_stat_d(&fscache_n_cop_read_or_alloc_page); } +error: if (ret == -ENOMEM) fscache_stat(&fscache_n_retrievals_nomem); else if (ret == -ERESTARTSYS) @@ -412,8 +425,20 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { _debug(">>> WT"); fscache_stat(&fscache_n_retrieval_op_waits); - wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); + if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit_interruptible, + TASK_INTERRUPTIBLE) < 0) { + ret = fscache_cancel_op(&op->op); + if (ret == 0) { + ret = -ERESTARTSYS; + goto error; + } + + /* it's been removed from the pending queue by another + * party, so we should get to run shortly */ + wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + } _debug("<<< GO"); } @@ -430,6 +455,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, fscache_stat_d(&fscache_n_cop_read_or_alloc_pages); } +error: if (ret == -ENOMEM) fscache_stat(&fscache_n_retrievals_nomem); else if (ret == -ERESTARTSYS) @@ -505,8 +531,20 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { _debug(">>> WT"); fscache_stat(&fscache_n_alloc_op_waits); - wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); + if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit_interruptible, + TASK_INTERRUPTIBLE) < 0) { + ret = fscache_cancel_op(&op->op); + if (ret == 0) { + ret = -ERESTARTSYS; + goto error; + } + + /* it's been removed from the pending queue by another + * party, so we should get to run shortly */ + wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + } _debug("<<< GO"); } @@ -515,7 +553,10 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, ret = object->cache->ops->allocate_page(op, page, gfp); fscache_stat_d(&fscache_n_cop_allocate_page); - if (ret < 0) +error: + if (ret == -ERESTARTSYS) + fscache_stat(&fscache_n_allocs_intr); + else if (ret < 0) fscache_stat(&fscache_n_allocs_nobufs); else fscache_stat(&fscache_n_allocs_ok); diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 20233fb44bfd..4c07439d1307 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -25,6 +25,7 @@ atomic_t fscache_n_op_requeue; atomic_t fscache_n_op_deferred_release; atomic_t fscache_n_op_release; atomic_t fscache_n_op_gc; +atomic_t fscache_n_op_cancelled; atomic_t fscache_n_attr_changed; atomic_t fscache_n_attr_changed_ok; @@ -36,6 +37,7 @@ atomic_t fscache_n_allocs; atomic_t fscache_n_allocs_ok; atomic_t fscache_n_allocs_wait; atomic_t fscache_n_allocs_nobufs; +atomic_t fscache_n_allocs_intr; atomic_t fscache_n_alloc_ops; atomic_t fscache_n_alloc_op_waits; @@ -169,11 +171,12 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_attr_changed_nomem), atomic_read(&fscache_n_attr_changed_calls)); - seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u\n", + seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u int=%u\n", atomic_read(&fscache_n_allocs), atomic_read(&fscache_n_allocs_ok), atomic_read(&fscache_n_allocs_wait), - atomic_read(&fscache_n_allocs_nobufs)); + atomic_read(&fscache_n_allocs_nobufs), + atomic_read(&fscache_n_allocs_intr)); seq_printf(m, "Allocs : ops=%u owt=%u\n", atomic_read(&fscache_n_alloc_ops), atomic_read(&fscache_n_alloc_op_waits)); @@ -201,10 +204,11 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_store_ops), atomic_read(&fscache_n_store_calls)); - seq_printf(m, "Ops : pend=%u run=%u enq=%u\n", + seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u\n", atomic_read(&fscache_n_op_pend), atomic_read(&fscache_n_op_run), - atomic_read(&fscache_n_op_enqueue)); + atomic_read(&fscache_n_op_enqueue), + atomic_read(&fscache_n_op_cancelled)); seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n", atomic_read(&fscache_n_op_deferred_release), atomic_read(&fscache_n_op_release), -- cgit v1.2.3 From 6897e3df8fc37bd4a58bbcdef8306da7fc175584 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:22 +0000 Subject: FS-Cache: The object-available state can't rely on the cookie to be available The object-available state in the object processing state machine (as processed by fscache_object_available()) can't rely on the cookie to be available because the FSCACHE_COOKIE_CREATING bit may have been cleared by fscache_obtained_object() prior to the object being put into the FSCACHE_OBJECT_AVAILABLE state. Clearing the FSCACHE_COOKIE_CREATING bit on a cookie permits __fscache_relinquish_cookie() to proceed and detach the cookie from the object. To deal with this, we don't dereference object->cookie in fscache_object_available() if the object has already been detached. In addition, a couple of assertions are added into fscache_drop_object() to make sure the object is unbound from the cookie before it gets there. Signed-off-by: David Howells --- fs/fscache/object.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 0d65c0c92b46..1a1afa82f798 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -158,7 +158,8 @@ static void fscache_object_state_machine(struct fscache_object *object) spin_lock(&object->lock); object->state = FSCACHE_OBJECT_DYING; - if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, + if (object->cookie && + test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags)) wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING); @@ -594,7 +595,8 @@ static void fscache_object_available(struct fscache_object *object) spin_lock(&object->lock); - if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags)) + if (object->cookie && + test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags)) wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING); fscache_done_parent_op(object); @@ -631,6 +633,9 @@ static void fscache_drop_object(struct fscache_object *object) _enter("{OBJ%x,%d}", object->debug_id, object->n_children); + ASSERTCMP(object->cookie, ==, NULL); + ASSERT(hlist_unhashed(&object->cookie_link)); + spin_lock(&cache->object_list_lock); list_del_init(&object->cache_link); spin_unlock(&cache->object_list_lock); -- cgit v1.2.3 From 1bccf513ac49d44604ba1cddcc29f5886e70f1b6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:25 +0000 Subject: FS-Cache: Fix lock misorder in fscache_write_op() FS-Cache has two structs internally for keeping track of the internal state of a cached file: the fscache_cookie struct, which represents the netfs's state, and fscache_object struct, which represents the cache's state. Each has a pointer that points to the other (when both are in existence), and each has a spinlock for pointer maintenance. Since netfs operations approach these structures from the cookie side, they get the cookie lock first, then the object lock. Cache operations, on the other hand, approach from the object side, and get the object lock first. It is not then permitted for a cache operation to get the cookie lock whilst it is holding the object lock lest deadlock occur; instead, it must do one of two things: (1) increment the cookie usage counter, drop the object lock and then get both locks in order, or (2) simply hold the object lock as certain parts of the cookie may not be altered whilst the object lock is held. It is also not permitted to follow either pointer without holding the lock at the end you start with. To break the pointers between the cookie and the object, both locks must be held. fscache_write_op(), however, violates the locking rules: It attempts to get the cookie lock without (a) checking that the cookie pointer is a valid pointer, and (b) holding the object lock to protect the cookie pointer whilst it follows it. This is so that it can access the pending page store tree without interference from __fscache_write_page(). This is fixed by splitting the cookie lock, such that the page store tracking tree is protected by its own lock, and checking that the cookie pointer is non-NULL before we attempt to follow it whilst holding the object lock. The new lock is subordinate to both the cookie lock and the object lock, and so should be taken after those. Signed-off-by: David Howells --- fs/fscache/cookie.c | 1 + fs/fscache/internal.h | 4 ++++ fs/fscache/page.c | 52 ++++++++++++++++++++++++++++++++++----------------- fs/fscache/stats.c | 10 ++++++++-- 4 files changed, 48 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index e6854f5222f5..f979659c1b3f 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -36,6 +36,7 @@ void fscache_cookie_init_once(void *_cookie) memset(cookie, 0, sizeof(*cookie)); spin_lock_init(&cookie->lock); + spin_lock_init(&cookie->stores_lock); INIT_HLIST_HEAD(&cookie->backing_objects); } diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 50324ad2b194..ba1853fa1ff9 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -17,6 +17,7 @@ * - cache->object_list_lock * - object->lock * - object->parent->lock + * - cookie->stores_lock * - fscache_thread_lock * */ @@ -174,6 +175,9 @@ extern atomic_t fscache_n_stores_nobufs; extern atomic_t fscache_n_stores_oom; extern atomic_t fscache_n_store_ops; extern atomic_t fscache_n_store_calls; +extern atomic_t fscache_n_store_pages; +extern atomic_t fscache_n_store_radix_deletes; +extern atomic_t fscache_n_store_pages_over_limit; extern atomic_t fscache_n_marks; extern atomic_t fscache_n_uncaches; diff --git a/fs/fscache/page.c b/fs/fscache/page.c index e6f2e61133a1..3ea8897bc217 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -45,16 +45,26 @@ EXPORT_SYMBOL(__fscache_wait_on_page_write); /* * note that a page has finished being written to the cache */ -static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *page) +static void fscache_end_page_write(struct fscache_object *object, + struct page *page) { - struct page *xpage; + struct fscache_cookie *cookie; + struct page *xpage = NULL; - spin_lock(&cookie->lock); - xpage = radix_tree_delete(&cookie->stores, page->index); - spin_unlock(&cookie->lock); - ASSERT(xpage != NULL); - - wake_up_bit(&cookie->flags, 0); + spin_lock(&object->lock); + cookie = object->cookie; + if (cookie) { + /* delete the page from the tree if it is now no longer + * pending */ + spin_lock(&cookie->stores_lock); + fscache_stat(&fscache_n_store_radix_deletes); + xpage = radix_tree_delete(&cookie->stores, page->index); + spin_unlock(&cookie->stores_lock); + wake_up_bit(&cookie->flags, 0); + } + spin_unlock(&object->lock); + if (xpage) + page_cache_release(xpage); } /* @@ -591,7 +601,7 @@ static void fscache_write_op(struct fscache_operation *_op) struct fscache_storage *op = container_of(_op, struct fscache_storage, op); struct fscache_object *object = op->op.object; - struct fscache_cookie *cookie = object->cookie; + struct fscache_cookie *cookie; struct page *page; unsigned n; void *results[1]; @@ -601,16 +611,17 @@ static void fscache_write_op(struct fscache_operation *_op) fscache_set_op_state(&op->op, "GetPage"); - spin_lock(&cookie->lock); spin_lock(&object->lock); + cookie = object->cookie; - if (!fscache_object_is_active(object)) { + if (!fscache_object_is_active(object) || !cookie) { spin_unlock(&object->lock); - spin_unlock(&cookie->lock); _leave(""); return; } + spin_lock(&cookie->stores_lock); + fscache_stat(&fscache_n_store_calls); /* find a page to store */ @@ -621,23 +632,25 @@ static void fscache_write_op(struct fscache_operation *_op) goto superseded; page = results[0]; _debug("gang %d [%lx]", n, page->index); - if (page->index > op->store_limit) + if (page->index > op->store_limit) { + fscache_stat(&fscache_n_store_pages_over_limit); goto superseded; + } radix_tree_tag_clear(&cookie->stores, page->index, FSCACHE_COOKIE_PENDING_TAG); + spin_unlock(&cookie->stores_lock); spin_unlock(&object->lock); - spin_unlock(&cookie->lock); if (page) { fscache_set_op_state(&op->op, "Store"); + fscache_stat(&fscache_n_store_pages); fscache_stat(&fscache_n_cop_write_page); ret = object->cache->ops->write_page(op, page); fscache_stat_d(&fscache_n_cop_write_page); fscache_set_op_state(&op->op, "EndWrite"); - fscache_end_page_write(cookie, page); - page_cache_release(page); + fscache_end_page_write(object, page); if (ret < 0) { fscache_set_op_state(&op->op, "Abort"); fscache_abort_object(object); @@ -653,9 +666,9 @@ superseded: /* this writer is going away and there aren't any more things to * write */ _debug("cease"); + spin_unlock(&cookie->stores_lock); clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); spin_unlock(&object->lock); - spin_unlock(&cookie->lock); _leave(""); } @@ -731,6 +744,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, /* add the page to the pending-storage radix tree on the backing * object */ spin_lock(&object->lock); + spin_lock(&cookie->stores_lock); _debug("store limit %llx", (unsigned long long) object->store_limit); @@ -751,6 +765,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags)) goto already_pending; + spin_unlock(&cookie->stores_lock); spin_unlock(&object->lock); op->op.debug_id = atomic_inc_return(&fscache_op_debug_id); @@ -772,6 +787,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, already_queued: fscache_stat(&fscache_n_stores_again); already_pending: + spin_unlock(&cookie->stores_lock); spin_unlock(&object->lock); spin_unlock(&cookie->lock); radix_tree_preload_end(); @@ -781,7 +797,9 @@ already_pending: return 0; submit_failed: + spin_lock(&cookie->stores_lock); radix_tree_delete(&cookie->stores, page->index); + spin_unlock(&cookie->stores_lock); page_cache_release(page); ret = -ENOBUFS; goto nobufs; diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 4c07439d1307..1d53ea68409e 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -58,6 +58,9 @@ atomic_t fscache_n_stores_nobufs; atomic_t fscache_n_stores_oom; atomic_t fscache_n_store_ops; atomic_t fscache_n_store_calls; +atomic_t fscache_n_store_pages; +atomic_t fscache_n_store_radix_deletes; +atomic_t fscache_n_store_pages_over_limit; atomic_t fscache_n_marks; atomic_t fscache_n_uncaches; @@ -200,9 +203,12 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_stores_again), atomic_read(&fscache_n_stores_nobufs), atomic_read(&fscache_n_stores_oom)); - seq_printf(m, "Stores : ops=%u run=%u\n", + seq_printf(m, "Stores : ops=%u run=%u pgs=%u rxd=%u olm=%u\n", atomic_read(&fscache_n_store_ops), - atomic_read(&fscache_n_store_calls)); + atomic_read(&fscache_n_store_calls), + atomic_read(&fscache_n_store_pages), + atomic_read(&fscache_n_store_radix_deletes), + atomic_read(&fscache_n_store_pages_over_limit)); seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u\n", atomic_read(&fscache_n_op_pend), -- cgit v1.2.3 From 285e728b0ac55b53a673114096168d6f74930167 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:29 +0000 Subject: FS-Cache: Don't delete pending pages from the page-store tracking tree Don't delete pending pages from the page-store tracking tree, but rather send them for another write as they've presumably been updated. Signed-off-by: David Howells --- fs/fscache/page.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 3ea8897bc217..022a5da8e130 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -57,8 +57,11 @@ static void fscache_end_page_write(struct fscache_object *object, /* delete the page from the tree if it is now no longer * pending */ spin_lock(&cookie->stores_lock); - fscache_stat(&fscache_n_store_radix_deletes); - xpage = radix_tree_delete(&cookie->stores, page->index); + if (!radix_tree_tag_get(&cookie->stores, page->index, + FSCACHE_COOKIE_PENDING_TAG)) { + fscache_stat(&fscache_n_store_radix_deletes); + xpage = radix_tree_delete(&cookie->stores, page->index); + } spin_unlock(&cookie->stores_lock); wake_up_bit(&cookie->flags, 0); } -- cgit v1.2.3 From e3d4d28b1c8cc7c26536a50b43d86ccd39878550 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:32 +0000 Subject: FS-Cache: Handle read request vs lookup, creation or other cache failure FS-Cache doesn't correctly handle the netfs requesting a read from the cache on an object that failed or was withdrawn by the cache. A trace similar to the following might be seen: CacheFiles: Lookup failed error -105 [exe ] unexpected submission OP165afe [OBJ6cac OBJECT_LC_DYING] [exe ] objstate=OBJECT_LC_DYING [OBJECT_LC_DYING] [exe ] objflags=0 [exe ] objevent=9 [fffffffffffffffb] [exe ] ops=0 inp=0 exc=0 Pid: 6970, comm: exe Not tainted 2.6.32-rc6-cachefs #50 Call Trace: [] fscache_submit_op+0x3ff/0x45a [fscache] [] __fscache_read_or_alloc_pages+0x187/0x3c4 [fscache] [] ? nfs_readpage_from_fscache_complete+0x0/0x66 [nfs] [] __nfs_readpages_from_fscache+0x7e/0x176 [nfs] [] ? __alloc_pages_nodemask+0x11c/0x5cf [] nfs_readpages+0x114/0x1d7 [nfs] [] __do_page_cache_readahead+0x15f/0x1ec [] ? __do_page_cache_readahead+0x73/0x1ec [] ra_submit+0x1c/0x20 [] ondemand_readahead+0x227/0x23a [] page_cache_sync_readahead+0x17/0x19 [] generic_file_aio_read+0x236/0x5a0 [] nfs_file_read+0xe4/0xf3 [nfs] [] do_sync_read+0xe3/0x120 [] ? _spin_unlock_irq+0x2b/0x31 [] ? autoremove_wake_function+0x0/0x34 [] ? selinux_file_permission+0x5d/0x10f [] ? thread_return+0x3e/0x101 [] ? security_file_permission+0x11/0x13 [] vfs_read+0xaa/0x16f [] ? trace_hardirqs_on_caller+0x10c/0x130 [] sys_read+0x45/0x6c [] system_call_fastpath+0x16/0x1b The object state might also be OBJECT_DYING or OBJECT_WITHDRAWING. This should be handled by simply rejecting the new operation with ENOBUFS. There's no need to log an error for it. Events of this type now appear in the stats file under Ops:rej. Signed-off-by: David Howells --- fs/fscache/internal.h | 1 + fs/fscache/operation.c | 5 +++++ fs/fscache/stats.c | 6 ++++-- 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index ba1853fa1ff9..a0769872b19c 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -143,6 +143,7 @@ extern atomic_t fscache_n_op_deferred_release; extern atomic_t fscache_n_op_release; extern atomic_t fscache_n_op_gc; extern atomic_t fscache_n_op_cancelled; +extern atomic_t fscache_n_op_rejected; extern atomic_t fscache_n_attr_changed; extern atomic_t fscache_n_attr_changed_ok; diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 296492efb81b..313e79a14266 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -232,6 +232,11 @@ int fscache_submit_op(struct fscache_object *object, list_add_tail(&op->pend_link, &object->pending_ops); fscache_stat(&fscache_n_op_pend); ret = 0; + } else if (object->state == FSCACHE_OBJECT_DYING || + object->state == FSCACHE_OBJECT_LC_DYING || + object->state == FSCACHE_OBJECT_WITHDRAWING) { + fscache_stat(&fscache_n_op_rejected); + ret = -ENOBUFS; } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) { fscache_report_unexpected_submission(object, op, ostate); ASSERT(!fscache_object_is_active(object)); diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 1d53ea68409e..045ba396dbf2 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -26,6 +26,7 @@ atomic_t fscache_n_op_deferred_release; atomic_t fscache_n_op_release; atomic_t fscache_n_op_gc; atomic_t fscache_n_op_cancelled; +atomic_t fscache_n_op_rejected; atomic_t fscache_n_attr_changed; atomic_t fscache_n_attr_changed_ok; @@ -210,11 +211,12 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_store_radix_deletes), atomic_read(&fscache_n_store_pages_over_limit)); - seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u\n", + seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u rej=%u\n", atomic_read(&fscache_n_op_pend), atomic_read(&fscache_n_op_run), atomic_read(&fscache_n_op_enqueue), - atomic_read(&fscache_n_op_cancelled)); + atomic_read(&fscache_n_op_cancelled), + atomic_read(&fscache_n_op_rejected)); seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n", atomic_read(&fscache_n_op_deferred_release), atomic_read(&fscache_n_op_release), -- cgit v1.2.3 From 201a15428bd54f83eccec8b7c64a04b8f9431204 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:35 +0000 Subject: FS-Cache: Handle pages pending storage that get evicted under OOM conditions Handle netfs pages that the vmscan algorithm wants to evict from the pagecache under OOM conditions, but that are waiting for write to the cache. Under these conditions, vmscan calls the releasepage() function of the netfs, asking if a page can be discarded. The problem is typified by the following trace of a stuck process: kslowd005 D 0000000000000000 0 4253 2 0x00000080 ffff88001b14f370 0000000000000046 ffff880020d0d000 0000000000000007 0000000000000006 0000000000000001 ffff88001b14ffd8 ffff880020d0d2a8 000000000000ddf0 00000000000118c0 00000000000118c0 ffff880020d0d2a8 Call Trace: [] __fscache_wait_on_page_write+0x8b/0xa7 [fscache] [] ? autoremove_wake_function+0x0/0x34 [] ? __fscache_check_page_write+0x63/0x70 [fscache] [] nfs_fscache_release_page+0x4e/0xc4 [nfs] [] nfs_release_page+0x3c/0x41 [nfs] [] try_to_release_page+0x32/0x3b [] shrink_page_list+0x316/0x4ac [] shrink_inactive_list+0x392/0x67c [] ? __mutex_unlock_slowpath+0x100/0x10b [] ? trace_hardirqs_on_caller+0x10c/0x130 [] ? mutex_unlock+0x9/0xb [] shrink_list+0x8d/0x8f [] shrink_zone+0x278/0x33c [] ? ktime_get_ts+0xad/0xba [] try_to_free_pages+0x22e/0x392 [] ? isolate_pages_global+0x0/0x212 [] __alloc_pages_nodemask+0x3dc/0x5cf [] grab_cache_page_write_begin+0x65/0xaa [] ext3_write_begin+0x78/0x1eb [] generic_file_buffered_write+0x109/0x28c [] ? current_fs_time+0x22/0x29 [] __generic_file_aio_write+0x350/0x385 [] ? generic_file_aio_write+0x4a/0xae [] generic_file_aio_write+0x60/0xae [] do_sync_write+0xe3/0x120 [] ? autoremove_wake_function+0x0/0x34 [] ? __dentry_open+0x1a5/0x2b8 [] ? dentry_open+0x82/0x89 [] cachefiles_write_page+0x298/0x335 [cachefiles] [] fscache_write_op+0x178/0x2c2 [fscache] [] fscache_op_execute+0x7a/0xd1 [fscache] [] slow_work_execute+0x18f/0x2d1 [] slow_work_thread+0x1c5/0x308 [] ? autoremove_wake_function+0x0/0x34 [] ? slow_work_thread+0x0/0x308 [] kthread+0x7a/0x82 [] child_rip+0xa/0x20 [] ? restore_args+0x0/0x30 [] ? tg_shares_up+0x171/0x227 [] ? kthread+0x0/0x82 [] ? child_rip+0x0/0x20 In the above backtrace, the following is happening: (1) A page storage operation is being executed by a slow-work thread (fscache_write_op()). (2) FS-Cache farms the operation out to the cache to perform (cachefiles_write_page()). (3) CacheFiles is then calling Ext3 to perform the actual write, using Ext3's standard write (do_sync_write()) under KERNEL_DS directly from the netfs page. (4) However, for Ext3 to perform the write, it must allocate some memory, in particular, it must allocate at least one page cache page into which it can copy the data from the netfs page. (5) Under OOM conditions, the memory allocator can't immediately come up with a page, so it uses vmscan to find something to discard (try_to_free_pages()). (6) vmscan finds a clean netfs page it might be able to discard (possibly the one it's trying to write out). (7) The netfs is called to throw the page away (nfs_release_page()) - but it's called with __GFP_WAIT, so the netfs decides to wait for the store to complete (__fscache_wait_on_page_write()). (8) This blocks a slow-work processing thread - possibly against itself. The system ends up stuck because it can't write out any netfs pages to the cache without allocating more memory. To avoid this, we make FS-Cache cancel some writes that aren't in the middle of actually being performed. This means that some data won't make it into the cache this time. To support this, a new FS-Cache function is added fscache_maybe_release_page() that replaces what the netfs releasepage() functions used to do with respect to the cache. The decisions fscache_maybe_release_page() makes are counted and displayed through /proc/fs/fscache/stats on a line labelled "VmScan". There are four counters provided: "nos=N" - pages that weren't pending storage; "gon=N" - pages that were pending storage when we first looked, but weren't by the time we got the object lock; "bsy=N" - pages that we ignored as they were actively being written when we looked; and "can=N" - pages that we cancelled the storage of. What I'd really like to do is alter the behaviour of the cancellation heuristics, depending on how necessary it is to expel pages. If there are plenty of other pages that aren't waiting to be written to the cache that could be ejected first, then it would be nice to hold up on immediate cancellation of cache writes - but I don't see a way of doing that. Signed-off-by: David Howells --- fs/9p/cache.c | 14 +-------- fs/afs/file.c | 15 ++-------- fs/fscache/internal.h | 5 ++++ fs/fscache/page.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++-- fs/fscache/stats.c | 11 +++++++ fs/nfs/fscache.c | 10 ++----- 6 files changed, 100 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 51c94e26a346..bcc5357a9069 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -343,18 +343,7 @@ int __v9fs_fscache_release_page(struct page *page, gfp_t gfp) BUG_ON(!vcookie->fscache); - if (PageFsCache(page)) { - if (fscache_check_page_write(vcookie->fscache, page)) { - if (!(gfp & __GFP_WAIT)) - return 0; - fscache_wait_on_page_write(vcookie->fscache, page); - } - - fscache_uncache_page(vcookie->fscache, page); - ClearPageFsCache(page); - } - - return 1; + return fscache_maybe_release_page(vnode->cache, page, gfp); } void __v9fs_fscache_invalidate_page(struct page *page) @@ -368,7 +357,6 @@ void __v9fs_fscache_invalidate_page(struct page *page) fscache_wait_on_page_write(vcookie->fscache, page); BUG_ON(!PageLocked(page)); fscache_uncache_page(vcookie->fscache, page); - ClearPageFsCache(page); } } diff --git a/fs/afs/file.c b/fs/afs/file.c index 681c2a7b013f..39b301662f22 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -315,7 +315,6 @@ static void afs_invalidatepage(struct page *page, unsigned long offset) struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); fscache_wait_on_page_write(vnode->cache, page); fscache_uncache_page(vnode->cache, page); - ClearPageFsCache(page); } #endif @@ -349,17 +348,9 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags) /* deny if page is being written to the cache and the caller hasn't * elected to wait */ #ifdef CONFIG_AFS_FSCACHE - if (PageFsCache(page)) { - if (fscache_check_page_write(vnode->cache, page)) { - if (!(gfp_flags & __GFP_WAIT)) { - _leave(" = F [cache busy]"); - return 0; - } - fscache_wait_on_page_write(vnode->cache, page); - } - - fscache_uncache_page(vnode->cache, page); - ClearPageFsCache(page); + if (!fscache_maybe_release_page(vnode->cache, page, gfp_flags)) { + _leave(" = F [cache busy]"); + return 0; } #endif diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index a0769872b19c..e5046519b153 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -180,6 +180,11 @@ extern atomic_t fscache_n_store_pages; extern atomic_t fscache_n_store_radix_deletes; extern atomic_t fscache_n_store_pages_over_limit; +extern atomic_t fscache_n_store_vmscan_not_storing; +extern atomic_t fscache_n_store_vmscan_gone; +extern atomic_t fscache_n_store_vmscan_busy; +extern atomic_t fscache_n_store_vmscan_cancelled; + extern atomic_t fscache_n_marks; extern atomic_t fscache_n_uncaches; diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 022a5da8e130..fc76798bd968 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -42,6 +42,75 @@ void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *pa } EXPORT_SYMBOL(__fscache_wait_on_page_write); +/* + * decide whether a page can be released, possibly by cancelling a store to it + * - we're allowed to sleep if __GFP_WAIT is flagged + */ +bool __fscache_maybe_release_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp) +{ + struct page *xpage; + void *val; + + _enter("%p,%p,%x", cookie, page, gfp); + + rcu_read_lock(); + val = radix_tree_lookup(&cookie->stores, page->index); + if (!val) { + rcu_read_unlock(); + fscache_stat(&fscache_n_store_vmscan_not_storing); + __fscache_uncache_page(cookie, page); + return true; + } + + /* see if the page is actually undergoing storage - if so we can't get + * rid of it till the cache has finished with it */ + if (radix_tree_tag_get(&cookie->stores, page->index, + FSCACHE_COOKIE_STORING_TAG)) { + rcu_read_unlock(); + goto page_busy; + } + + /* the page is pending storage, so we attempt to cancel the store and + * discard the store request so that the page can be reclaimed */ + spin_lock(&cookie->stores_lock); + rcu_read_unlock(); + + if (radix_tree_tag_get(&cookie->stores, page->index, + FSCACHE_COOKIE_STORING_TAG)) { + /* the page started to undergo storage whilst we were looking, + * so now we can only wait or return */ + spin_unlock(&cookie->stores_lock); + goto page_busy; + } + + xpage = radix_tree_delete(&cookie->stores, page->index); + spin_unlock(&cookie->stores_lock); + + if (xpage) { + fscache_stat(&fscache_n_store_vmscan_cancelled); + fscache_stat(&fscache_n_store_radix_deletes); + ASSERTCMP(xpage, ==, page); + } else { + fscache_stat(&fscache_n_store_vmscan_gone); + } + + wake_up_bit(&cookie->flags, 0); + if (xpage) + page_cache_release(xpage); + __fscache_uncache_page(cookie, page); + return true; + +page_busy: + /* we might want to wait here, but that could deadlock the allocator as + * the slow-work threads writing to the cache may all end up sleeping + * on memory allocation */ + fscache_stat(&fscache_n_store_vmscan_busy); + return false; +} +EXPORT_SYMBOL(__fscache_maybe_release_page); + /* * note that a page has finished being written to the cache */ @@ -57,6 +126,8 @@ static void fscache_end_page_write(struct fscache_object *object, /* delete the page from the tree if it is now no longer * pending */ spin_lock(&cookie->stores_lock); + radix_tree_tag_clear(&cookie->stores, page->index, + FSCACHE_COOKIE_STORING_TAG); if (!radix_tree_tag_get(&cookie->stores, page->index, FSCACHE_COOKIE_PENDING_TAG)) { fscache_stat(&fscache_n_store_radix_deletes); @@ -640,8 +711,12 @@ static void fscache_write_op(struct fscache_operation *_op) goto superseded; } - radix_tree_tag_clear(&cookie->stores, page->index, - FSCACHE_COOKIE_PENDING_TAG); + if (page) { + radix_tree_tag_set(&cookie->stores, page->index, + FSCACHE_COOKIE_STORING_TAG); + radix_tree_tag_clear(&cookie->stores, page->index, + FSCACHE_COOKIE_PENDING_TAG); + } spin_unlock(&cookie->stores_lock); spin_unlock(&object->lock); diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 045ba396dbf2..cda69994e06d 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -63,6 +63,11 @@ atomic_t fscache_n_store_pages; atomic_t fscache_n_store_radix_deletes; atomic_t fscache_n_store_pages_over_limit; +atomic_t fscache_n_store_vmscan_not_storing; +atomic_t fscache_n_store_vmscan_gone; +atomic_t fscache_n_store_vmscan_busy; +atomic_t fscache_n_store_vmscan_cancelled; + atomic_t fscache_n_marks; atomic_t fscache_n_uncaches; @@ -211,6 +216,12 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_store_radix_deletes), atomic_read(&fscache_n_store_pages_over_limit)); + seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u\n", + atomic_read(&fscache_n_store_vmscan_not_storing), + atomic_read(&fscache_n_store_vmscan_gone), + atomic_read(&fscache_n_store_vmscan_busy), + atomic_read(&fscache_n_store_vmscan_cancelled)); + seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u rej=%u\n", atomic_read(&fscache_n_op_pend), atomic_read(&fscache_n_op_run), diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 70fad69eb959..fa588006588d 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -359,17 +359,13 @@ int nfs_fscache_release_page(struct page *page, gfp_t gfp) BUG_ON(!cookie); - if (fscache_check_page_write(cookie, page)) { - if (!(gfp & __GFP_WAIT)) - return 0; - fscache_wait_on_page_write(cookie, page); - } - if (PageFsCache(page)) { dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n", cookie, page, nfsi); - fscache_uncache_page(cookie, page); + if (!fscache_maybe_release_page(cookie, page, gfp)) + return 0; + nfs_add_fscache_stats(page->mapping->host, NFSIOS_FSCACHE_PAGES_UNCACHED, 1); } -- cgit v1.2.3 From 2175bb06dc6cf2af9c098a1770561f9e63edae4e Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:38 +0000 Subject: FS-Cache: Add a retirement stat counter Add a stat counter to count retirement events rather than ordinary release events (the retire argument to fscache_relinquish_cookie()). Signed-off-by: David Howells --- fs/fscache/cookie.c | 2 ++ fs/fscache/internal.h | 1 + fs/fscache/stats.c | 6 ++++-- 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index f979659c1b3f..990535071a8a 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -415,6 +415,8 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) unsigned long event; fscache_stat(&fscache_n_relinquishes); + if (retire) + fscache_stat(&fscache_n_relinquishes_retire); if (!cookie) { fscache_stat(&fscache_n_relinquishes_null); diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index e5046519b153..2bf463d26080 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -202,6 +202,7 @@ extern atomic_t fscache_n_updates_run; extern atomic_t fscache_n_relinquishes; extern atomic_t fscache_n_relinquishes_null; extern atomic_t fscache_n_relinquishes_waitcrt; +extern atomic_t fscache_n_relinquishes_retire; extern atomic_t fscache_n_cookie_index; extern atomic_t fscache_n_cookie_data; diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index cda69994e06d..9e15289eb5c1 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -85,6 +85,7 @@ atomic_t fscache_n_updates_run; atomic_t fscache_n_relinquishes; atomic_t fscache_n_relinquishes_null; atomic_t fscache_n_relinquishes_waitcrt; +atomic_t fscache_n_relinquishes_retire; atomic_t fscache_n_cookie_index; atomic_t fscache_n_cookie_data; @@ -168,10 +169,11 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_updates_null), atomic_read(&fscache_n_updates_run)); - seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u\n", + seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u rtr=%u\n", atomic_read(&fscache_n_relinquishes), atomic_read(&fscache_n_relinquishes_null), - atomic_read(&fscache_n_relinquishes_waitcrt)); + atomic_read(&fscache_n_relinquishes_waitcrt), + atomic_read(&fscache_n_relinquishes_retire)); seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n", atomic_read(&fscache_n_attr_changed), -- cgit v1.2.3 From d461d26dde901b0523c46b0317e7fccf574a3933 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:41 +0000 Subject: FS-Cache: Make sure FSCACHE_COOKIE_LOOKING_UP cleared on lookup failure We must make sure that FSCACHE_COOKIE_LOOKING_UP is cleared on lookup failure (if an object reaches the LC_DYING state), and we should clear it before clearing FSCACHE_COOKIE_CREATING. If this doesn't happen then fscache_wait_for_deferred_lookup() may hold allocation and retrieval operations indefinitely until they're interrupted by signals - which in turn pins the dying object until they go away. Signed-off-by: David Howells --- fs/fscache/object.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 1a1afa82f798..74bc562a2cbc 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -105,6 +105,7 @@ static inline void fscache_done_parent_op(struct fscache_object *object) static void fscache_object_state_machine(struct fscache_object *object) { enum fscache_object_state new_state; + struct fscache_cookie *cookie; ASSERT(object != NULL); @@ -158,11 +159,17 @@ static void fscache_object_state_machine(struct fscache_object *object) spin_lock(&object->lock); object->state = FSCACHE_OBJECT_DYING; - if (object->cookie && - test_and_clear_bit(FSCACHE_COOKIE_CREATING, - &object->cookie->flags)) - wake_up_bit(&object->cookie->flags, - FSCACHE_COOKIE_CREATING); + cookie = object->cookie; + if (cookie) { + if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, + &cookie->flags)) + wake_up_bit(&cookie->flags, + FSCACHE_COOKIE_LOOKING_UP); + if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, + &cookie->flags)) + wake_up_bit(&cookie->flags, + FSCACHE_COOKIE_CREATING); + } spin_unlock(&object->lock); fscache_done_parent_op(object); -- cgit v1.2.3 From 60d543ca724be155c2b6166e36a00c80b21bd810 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:45 +0000 Subject: FS-Cache: Start processing an object's operations on that object's death Start processing an object's operations when that object moves into the DYING state as the object cannot be destroyed until all its outstanding operations have completed. Furthermore, make sure that read and allocation operations handle being woken up on a dead object. Such events are recorded in the Allocs.abt and Retrvls.abt statistics as viewable through /proc/fs/fscache/stats. The code for waiting for object activation for the read and allocation operations is also extracted into its own function as it is much the same in all cases, differing only in the stats incremented. Signed-off-by: David Howells --- fs/fscache/internal.h | 5 +++ fs/fscache/object.c | 1 + fs/fscache/page.c | 112 +++++++++++++++++++++++++------------------------- fs/fscache/stats.c | 12 ++++-- 4 files changed, 69 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 2bf463d26080..5b49a373689b 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -156,6 +156,7 @@ extern atomic_t fscache_n_allocs_ok; extern atomic_t fscache_n_allocs_wait; extern atomic_t fscache_n_allocs_nobufs; extern atomic_t fscache_n_allocs_intr; +extern atomic_t fscache_n_allocs_object_dead; extern atomic_t fscache_n_alloc_ops; extern atomic_t fscache_n_alloc_op_waits; @@ -166,6 +167,7 @@ extern atomic_t fscache_n_retrievals_nodata; extern atomic_t fscache_n_retrievals_nobufs; extern atomic_t fscache_n_retrievals_intr; extern atomic_t fscache_n_retrievals_nomem; +extern atomic_t fscache_n_retrievals_object_dead; extern atomic_t fscache_n_retrieval_ops; extern atomic_t fscache_n_retrieval_op_waits; @@ -249,9 +251,12 @@ static inline void fscache_stat_d(atomic_t *stat) atomic_dec(stat); } +#define __fscache_stat(stat) (stat) + extern const struct file_operations fscache_stats_fops; #else +#define __fscache_stat(stat) (NULL) #define fscache_stat(stat) do {} while (0) #endif diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 74bc562a2cbc..c85c9f582166 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -201,6 +201,7 @@ static void fscache_object_state_machine(struct fscache_object *object) } spin_unlock(&object->lock); fscache_enqueue_dependents(object); + fscache_start_operations(object); goto terminal_transit; /* handle an abort during initialisation */ diff --git a/fs/fscache/page.c b/fs/fscache/page.c index fc76798bd968..c598ea4c4e7d 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -313,6 +313,43 @@ static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) return 0; } +/* + * wait for an object to become active (or dead) + */ +static int fscache_wait_for_retrieval_activation(struct fscache_object *object, + struct fscache_retrieval *op, + atomic_t *stat_op_waits, + atomic_t *stat_object_dead) +{ + int ret; + + if (!test_bit(FSCACHE_OP_WAITING, &op->op.flags)) + goto check_if_dead; + + _debug(">>> WT"); + fscache_stat(stat_op_waits); + if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit_interruptible, + TASK_INTERRUPTIBLE) < 0) { + ret = fscache_cancel_op(&op->op); + if (ret == 0) + return -ERESTARTSYS; + + /* it's been removed from the pending queue by another party, + * so we should get to run shortly */ + wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + } + _debug("<<< GO"); + +check_if_dead: + if (unlikely(fscache_object_is_dead(object))) { + fscache_stat(stat_object_dead); + return -ENOBUFS; + } + return 0; +} + /* * read a page from the cache or allocate a block in which to store it * - we return: @@ -376,25 +413,12 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, /* we wait for the operation to become active, and then process it * *here*, in this thread, and not in the thread pool */ - if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { - _debug(">>> WT"); - fscache_stat(&fscache_n_retrieval_op_waits); - if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, - fscache_wait_bit_interruptible, - TASK_INTERRUPTIBLE) < 0) { - ret = fscache_cancel_op(&op->op); - if (ret == 0) { - ret = -ERESTARTSYS; - goto error; - } - - /* it's been removed from the pending queue by another - * party, so we should get to run shortly */ - wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); - } - _debug("<<< GO"); - } + ret = fscache_wait_for_retrieval_activation( + object, op, + __fscache_stat(&fscache_n_retrieval_op_waits), + __fscache_stat(&fscache_n_retrievals_object_dead)); + if (ret < 0) + goto error; /* ask the cache to honour the operation */ if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) { @@ -506,25 +530,12 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, /* we wait for the operation to become active, and then process it * *here*, in this thread, and not in the thread pool */ - if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { - _debug(">>> WT"); - fscache_stat(&fscache_n_retrieval_op_waits); - if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, - fscache_wait_bit_interruptible, - TASK_INTERRUPTIBLE) < 0) { - ret = fscache_cancel_op(&op->op); - if (ret == 0) { - ret = -ERESTARTSYS; - goto error; - } - - /* it's been removed from the pending queue by another - * party, so we should get to run shortly */ - wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); - } - _debug("<<< GO"); - } + ret = fscache_wait_for_retrieval_activation( + object, op, + __fscache_stat(&fscache_n_retrieval_op_waits), + __fscache_stat(&fscache_n_retrievals_object_dead)); + if (ret < 0) + goto error; /* ask the cache to honour the operation */ if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) { @@ -612,25 +623,12 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, fscache_stat(&fscache_n_alloc_ops); - if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { - _debug(">>> WT"); - fscache_stat(&fscache_n_alloc_op_waits); - if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, - fscache_wait_bit_interruptible, - TASK_INTERRUPTIBLE) < 0) { - ret = fscache_cancel_op(&op->op); - if (ret == 0) { - ret = -ERESTARTSYS; - goto error; - } - - /* it's been removed from the pending queue by another - * party, so we should get to run shortly */ - wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); - } - _debug("<<< GO"); - } + ret = fscache_wait_for_retrieval_activation( + object, op, + __fscache_stat(&fscache_n_alloc_op_waits), + __fscache_stat(&fscache_n_allocs_object_dead)); + if (ret < 0) + goto error; /* ask the cache to honour the operation */ fscache_stat(&fscache_n_cop_allocate_page); diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 9e15289eb5c1..05f77caf4a2d 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -39,6 +39,7 @@ atomic_t fscache_n_allocs_ok; atomic_t fscache_n_allocs_wait; atomic_t fscache_n_allocs_nobufs; atomic_t fscache_n_allocs_intr; +atomic_t fscache_n_allocs_object_dead; atomic_t fscache_n_alloc_ops; atomic_t fscache_n_alloc_op_waits; @@ -49,6 +50,7 @@ atomic_t fscache_n_retrievals_nodata; atomic_t fscache_n_retrievals_nobufs; atomic_t fscache_n_retrievals_intr; atomic_t fscache_n_retrievals_nomem; +atomic_t fscache_n_retrievals_object_dead; atomic_t fscache_n_retrieval_ops; atomic_t fscache_n_retrieval_op_waits; @@ -188,9 +190,10 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_allocs_wait), atomic_read(&fscache_n_allocs_nobufs), atomic_read(&fscache_n_allocs_intr)); - seq_printf(m, "Allocs : ops=%u owt=%u\n", + seq_printf(m, "Allocs : ops=%u owt=%u abt=%u\n", atomic_read(&fscache_n_alloc_ops), - atomic_read(&fscache_n_alloc_op_waits)); + atomic_read(&fscache_n_alloc_op_waits), + atomic_read(&fscache_n_allocs_object_dead)); seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u" " int=%u oom=%u\n", @@ -201,9 +204,10 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_retrievals_nobufs), atomic_read(&fscache_n_retrievals_intr), atomic_read(&fscache_n_retrievals_nomem)); - seq_printf(m, "Retrvls: ops=%u owt=%u\n", + seq_printf(m, "Retrvls: ops=%u owt=%u abt=%u\n", atomic_read(&fscache_n_retrieval_ops), - atomic_read(&fscache_n_retrieval_op_waits)); + atomic_read(&fscache_n_retrieval_op_waits), + atomic_read(&fscache_n_retrievals_object_dead)); seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n", atomic_read(&fscache_n_stores), -- cgit v1.2.3 From 868411be3f445a83fafbd734f3e426400138add5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:48 +0000 Subject: FS-Cache: Actually requeue an object when requested FS-Cache objects have an FSCACHE_OBJECT_EV_REQUEUE event that can theoretically be raised to ask the state machine to requeue the object for further processing before the work function returns to the slow-work facility. However, fscache_object_work_execute() was clearing that bit before checking the event mask to see whether the object has any pending events that require it to be requeued immediately. Instead, the bit should be cleared after the check and enqueue. Signed-off-by: David Howells --- fs/fscache/object.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fscache/object.c b/fs/fscache/object.c index c85c9f582166..f3f952cf887e 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -353,13 +353,12 @@ static void fscache_object_slow_work_execute(struct slow_work *work) _enter("{OBJ%x}", object->debug_id); - clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); - start = jiffies; fscache_object_state_machine(object); fscache_hist(fscache_objs_histogram, start); if (object->events & object->event_mask) fscache_enqueue_object(object); + clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); } /* -- cgit v1.2.3 From a17754fb8c28af19cd70dcbec6d5b0773b94e0c1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:52 +0000 Subject: CacheFiles: Don't write a full page if there's only a partial page to cache cachefiles_write_page() writes a full page to the backing file for the last page of the netfs file, even if the netfs file's last page is only a partial page. This causes the EOF on the backing file to be extended beyond the EOF of the netfs, and thus the backing file will be truncated by cachefiles_attr_changed() called from cachefiles_lookup_object(). So we need to limit the write we make to the backing file on that last page such that it doesn't push the EOF too far. Also, if a backing file that has a partial page at the end is expanded, we discard the partial page and refetch it on the basis that we then have a hole in the file with invalid data, and should the power go out... A better way to deal with this could be to record a note that the partial page contains invalid data until the correct data is written into it. This isn't a problem for netfs's that discard the whole backing file if the file size changes (such as NFS). Signed-off-by: David Howells --- fs/cachefiles/interface.c | 20 +++++++++++++++++--- fs/cachefiles/rdwr.c | 23 +++++++++++++++++++---- 2 files changed, 36 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index dd7f852746cb..8e67abf05985 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -404,12 +404,26 @@ static int cachefiles_attr_changed(struct fscache_object *_object) if (oi_size == ni_size) return 0; - newattrs.ia_size = ni_size; - newattrs.ia_valid = ATTR_SIZE; - cachefiles_begin_secure(cache, &saved_cred); mutex_lock(&object->backer->d_inode->i_mutex); + + /* if there's an extension to a partial page at the end of the backing + * file, we need to discard the partial page so that we pick up new + * data after it */ + if (oi_size & ~PAGE_MASK && ni_size > oi_size) { + _debug("discard tail %llx", oi_size); + newattrs.ia_valid = ATTR_SIZE; + newattrs.ia_size = oi_size & PAGE_MASK; + ret = notify_change(object->backer, &newattrs); + if (ret < 0) + goto truncate_failed; + } + + newattrs.ia_valid = ATTR_SIZE; + newattrs.ia_size = ni_size; ret = notify_change(object->backer, &newattrs); + +truncate_failed: mutex_unlock(&object->backer->d_inode->i_mutex); cachefiles_end_secure(cache, saved_cred); diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 3304646dae84..5a84fd7109ad 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -803,7 +803,8 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page) struct cachefiles_cache *cache; mm_segment_t old_fs; struct file *file; - loff_t pos; + loff_t pos, eof; + size_t len; void *data; int ret; @@ -837,15 +838,29 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page) ret = -EIO; if (file->f_op->write) { pos = (loff_t) page->index << PAGE_SHIFT; + + /* we mustn't write more data than we have, so we have + * to beware of a partial page at EOF */ + eof = object->fscache.store_limit_l; + len = PAGE_SIZE; + if (eof & ~PAGE_MASK) { + ASSERTCMP(pos, <, eof); + if (eof - pos < PAGE_SIZE) { + _debug("cut short %llx to %llx", + pos, eof); + len = eof - pos; + ASSERTCMP(pos + len, ==, eof); + } + } + data = kmap(page); old_fs = get_fs(); set_fs(KERNEL_DS); ret = file->f_op->write( - file, (const void __user *) data, PAGE_SIZE, - &pos); + file, (const void __user *) data, len, &pos); set_fs(old_fs); kunmap(page); - if (ret != PAGE_SIZE) + if (ret != len) ret = -EIO; } fput(file); -- cgit v1.2.3 From 5e929b33c3935ecb029b3e495356b2b8af432efa Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:55 +0000 Subject: CacheFiles: Handle truncate unlocking the page we're reading Handle truncate unlocking the page we're attempting to read from the backing device before the read has completed. This was causing reports like the following to occur: Pid: 4765, comm: kslowd Not tainted 2.6.30.1 #1 Call Trace: [] ? cachefiles_read_waiter+0xd9/0x147 [cachefiles] [] ? __wait_on_bit+0x60/0x6f [] ? __wake_up_common+0x3f/0x71 [] ? __wake_up+0x30/0x44 [] ? __wake_up_bit+0x28/0x2d [] ? ext3_truncate+0x4d7/0x8ed [ext3] [] ? pagevec_lookup+0x17/0x1f [] ? unmap_mapping_range+0x59/0x1ff [] ? __wake_up+0x30/0x44 [] ? vmtruncate+0xc2/0xe2 [] ? inode_setattr+0x22/0x10a [] ? ext3_setattr+0x17b/0x1e6 [ext3] [] ? notify_change+0x186/0x2c9 [] ? cachefiles_attr_changed+0x133/0x1cd [cachefiles] [] ? cachefiles_lookup_object+0xcf/0x12a [cachefiles] [] ? fscache_lookup_object+0x110/0x122 [fscache] [] ? fscache_object_slow_work_execute+0x590/0x6bc [fscache] [] ? slow_work_thread+0x285/0x43a [] ? autoremove_wake_function+0x0/0x2e [] ? slow_work_thread+0x0/0x43a [] ? kthread+0x54/0x81 [] ? child_rip+0xa/0x20 [] ? kthread+0x0/0x81 [] ? child_rip+0x0/0x20 CacheFiles: I/O Error: Readpage failed on backing file 200000000000810 FS-Cache: Cache cachefiles stopped due to I/O error Reported-by: Christian Kujau Reported-by: Takashi Iwai Reported-by: Duc Le Minh Signed-off-by: David Howells --- fs/cachefiles/rdwr.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 93 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 5a84fd7109ad..1d8332563863 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -40,8 +40,10 @@ static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode, _debug("--- monitor %p %lx ---", page, page->flags); - if (!PageUptodate(page) && !PageError(page)) - dump_stack(); + if (!PageUptodate(page) && !PageError(page)) { + /* unlocked, not uptodate and not erronous? */ + _debug("page probably truncated"); + } /* remove from the waitqueue */ list_del(&wait->task_list); @@ -60,6 +62,84 @@ static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode, return 0; } +/* + * handle a probably truncated page + * - check to see if the page is still relevant and reissue the read if + * possible + * - return -EIO on error, -ENODATA if the page is gone, -EINPROGRESS if we + * must wait again and 0 if successful + */ +static int cachefiles_read_reissue(struct cachefiles_object *object, + struct cachefiles_one_read *monitor) +{ + struct address_space *bmapping = object->backer->d_inode->i_mapping; + struct page *backpage = monitor->back_page, *backpage2; + int ret; + + kenter("{ino=%lx},{%lx,%lx}", + object->backer->d_inode->i_ino, + backpage->index, backpage->flags); + + /* skip if the page was truncated away completely */ + if (backpage->mapping != bmapping) { + kleave(" = -ENODATA [mapping]"); + return -ENODATA; + } + + backpage2 = find_get_page(bmapping, backpage->index); + if (!backpage2) { + kleave(" = -ENODATA [gone]"); + return -ENODATA; + } + + if (backpage != backpage2) { + put_page(backpage2); + kleave(" = -ENODATA [different]"); + return -ENODATA; + } + + /* the page is still there and we already have a ref on it, so we don't + * need a second */ + put_page(backpage2); + + INIT_LIST_HEAD(&monitor->op_link); + add_page_wait_queue(backpage, &monitor->monitor); + + if (trylock_page(backpage)) { + ret = -EIO; + if (PageError(backpage)) + goto unlock_discard; + ret = 0; + if (PageUptodate(backpage)) + goto unlock_discard; + + kdebug("reissue read"); + ret = bmapping->a_ops->readpage(NULL, backpage); + if (ret < 0) + goto unlock_discard; + } + + /* but the page may have been read before the monitor was installed, so + * the monitor may miss the event - so we have to ensure that we do get + * one in such a case */ + if (trylock_page(backpage)) { + _debug("jumpstart %p {%lx}", backpage, backpage->flags); + unlock_page(backpage); + } + + /* it'll reappear on the todo list */ + kleave(" = -EINPROGRESS"); + return -EINPROGRESS; + +unlock_discard: + unlock_page(backpage); + spin_lock_irq(&object->work_lock); + list_del(&monitor->op_link); + spin_unlock_irq(&object->work_lock); + kleave(" = %d", ret); + return ret; +} + /* * copy data from backing pages to netfs pages to complete a read operation * - driven by FS-Cache's thread pool @@ -92,20 +172,26 @@ static void cachefiles_read_copier(struct fscache_operation *_op) _debug("- copy {%lu}", monitor->back_page->index); - error = -EIO; + recheck: if (PageUptodate(monitor->back_page)) { copy_highpage(monitor->netfs_page, monitor->back_page); pagevec_add(&pagevec, monitor->netfs_page); fscache_mark_pages_cached(monitor->op, &pagevec); error = 0; - } - - if (error) + } else if (!PageError(monitor->back_page)) { + /* the page has probably been truncated */ + error = cachefiles_read_reissue(object, monitor); + if (error == -EINPROGRESS) + goto next; + goto recheck; + } else { cachefiles_io_error_obj( object, "Readpage failed on backing file %lx", (unsigned long) monitor->back_page->flags); + error = -EIO; + } page_cache_release(monitor->back_page); @@ -114,6 +200,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op) fscache_put_retrieval(op); kfree(monitor); + next: /* let the thread pool have some air occasionally */ max--; if (max < 0 || need_resched()) { -- cgit v1.2.3 From 6511de33c877a53b3df545bc06c29e0f272837ff Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:58 +0000 Subject: CacheFiles: Mark parent directory locks as I_MUTEX_PARENT to keep lockdep happy Mark parent directory locks as I_MUTEX_PARENT in the callers of cachefiles_bury_object() so that lockdep doesn't complain when that invokes vfs_unlink(): ============================================= [ INFO: possible recursive locking detected ] 2.6.32-rc6-cachefs #47 --------------------------------------------- kslowd002/3089 is trying to acquire lock: (&sb->s_type->i_mutex_key#7){+.+.+.}, at: [] vfs_unlink+0x8b/0x128 but task is already holding lock: (&sb->s_type->i_mutex_key#7){+.+.+.}, at: [] cachefiles_walk_to_object+0x1b0/0x831 [cachefiles] other info that might help us debug this: 1 lock held by kslowd002/3089: #0: (&sb->s_type->i_mutex_key#7){+.+.+.}, at: [] cachefiles_walk_to_object+0x1b0/0x831 [cachefiles] stack backtrace: Pid: 3089, comm: kslowd002 Not tainted 2.6.32-rc6-cachefs #47 Call Trace: [] __lock_acquire+0x1649/0x16e3 [] ? inode_has_perm+0x5f/0x61 [] lock_acquire+0x57/0x6d [] ? vfs_unlink+0x8b/0x128 [] mutex_lock_nested+0x54/0x292 [] ? vfs_unlink+0x8b/0x128 [] ? selinux_inode_permission+0x8e/0x90 [] ? security_inode_permission+0x1c/0x1e [] ? inode_permission+0x99/0xa5 [] vfs_unlink+0x8b/0x128 [] ? kfree+0xed/0xf9 [] cachefiles_bury_object+0xb6/0x420 [cachefiles] [] ? trace_hardirqs_on+0xd/0xf [] ? cachefiles_check_object_xattr+0x233/0x293 [cachefiles] [] cachefiles_walk_to_object+0x4ff/0x831 [cachefiles] [] ? finish_task_switch+0x0/0xb2 [] cachefiles_lookup_object+0xac/0x12a [cachefiles] [] fscache_lookup_object+0x1c7/0x214 [fscache] [] fscache_object_state_machine+0xa5/0x52d [fscache] [] fscache_object_slow_work_execute+0x5f/0xa0 [fscache] [] slow_work_execute+0x18f/0x2d1 [] slow_work_thread+0x1c5/0x308 [] ? autoremove_wake_function+0x0/0x34 [] ? slow_work_thread+0x0/0x308 [] kthread+0x7a/0x82 [] child_rip+0xa/0x20 [] ? restore_args+0x0/0x30 [] ? kthread+0x0/0x82 [] ? child_rip+0x0/0x20 Signed-off-by: Daivd Howells --- fs/cachefiles/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 4ce818ae39ea..3df86952ca64 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -254,7 +254,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, dir = dget_parent(object->dentry); - mutex_lock(&dir->d_inode->i_mutex); + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); ret = cachefiles_bury_object(cache, dir, object->dentry); dput(dir); @@ -307,7 +307,7 @@ lookup_again: /* search the current directory for the element name */ _debug("lookup '%s'", name); - mutex_lock(&dir->d_inode->i_mutex); + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); start = jiffies; next = lookup_one_len(name, dir, nlen); -- cgit v1.2.3 From d0e27b7808dc667f3015be0b6888f6d680e222c8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:12:02 +0000 Subject: CacheFiles: Better showing of debugging information in active object problems Show more debugging information if cachefiles_mark_object_active() is asked to activate an active object. This may happen, for instance, if the netfs tries to register an object with the same key multiple times. The code is changed to (a) get the appropriate object lock to protect the cookie pointer whilst we dereference it, and (b) get and display the cookie key if available. Signed-off-by: David Howells --- fs/cachefiles/namei.c | 102 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 75 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 3df86952ca64..00a0cda8f47a 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -27,6 +27,76 @@ static int cachefiles_wait_bit(void *flags) return 0; } +#define CACHEFILES_KEYBUF_SIZE 512 + +/* + * dump debugging info about an object + */ +static noinline +void __cachefiles_printk_object(struct cachefiles_object *object, + const char *prefix, + u8 *keybuf) +{ + struct fscache_cookie *cookie; + unsigned keylen, loop; + + printk(KERN_ERR "%sobject: OBJ%x\n", + prefix, object->fscache.debug_id); + printk(KERN_ERR "%sobjstate=%s fl=%lx swfl=%lx ev=%lx[%lx]\n", + prefix, fscache_object_states[object->fscache.state], + object->fscache.flags, object->fscache.work.flags, + object->fscache.events, + object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK); + printk(KERN_ERR "%sops=%u inp=%u exc=%u\n", + prefix, object->fscache.n_ops, object->fscache.n_in_progress, + object->fscache.n_exclusive); + printk(KERN_ERR "%sparent=%p\n", + prefix, object->fscache.parent); + + spin_lock(&object->fscache.lock); + cookie = object->fscache.cookie; + if (cookie) { + printk(KERN_ERR "%scookie=%p [pr=%p nd=%p fl=%lx]\n", + prefix, + object->fscache.cookie, + object->fscache.cookie->parent, + object->fscache.cookie->netfs_data, + object->fscache.cookie->flags); + if (keybuf) + keylen = cookie->def->get_key(cookie->netfs_data, keybuf, + CACHEFILES_KEYBUF_SIZE); + else + keylen = 0; + } else { + printk(KERN_ERR "%scookie=NULL\n", prefix); + keylen = 0; + } + spin_unlock(&object->fscache.lock); + + if (keylen) { + printk(KERN_ERR "%skey=[%u] '", prefix, keylen); + for (loop = 0; loop < keylen; loop++) + printk("%02x", keybuf[loop]); + printk("'\n"); + } +} + +/* + * dump debugging info about a pair of objects + */ +static noinline void cachefiles_printk_object(struct cachefiles_object *object, + struct cachefiles_object *xobject) +{ + u8 *keybuf; + + keybuf = kmalloc(CACHEFILES_KEYBUF_SIZE, GFP_NOIO); + if (object) + __cachefiles_printk_object(object, "", keybuf); + if (xobject) + __cachefiles_printk_object(xobject, "x", keybuf); + kfree(keybuf); +} + /* * record the fact that an object is now active */ @@ -42,8 +112,11 @@ static void cachefiles_mark_object_active(struct cachefiles_cache *cache, try_again: write_lock(&cache->active_lock); - if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) + if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { + printk(KERN_ERR "CacheFiles: Error: Object already active\n"); + cachefiles_printk_object(object, NULL); BUG(); + } dentry = object->dentry; _p = &cache->active_nodes.rb_node; @@ -76,32 +149,7 @@ wait_for_old_object: printk(KERN_ERR "\n"); printk(KERN_ERR "CacheFiles: Error:" " Unexpected object collision\n"); - printk(KERN_ERR "xobject: OBJ%x\n", - xobject->fscache.debug_id); - printk(KERN_ERR "xobjstate=%s\n", - fscache_object_states[xobject->fscache.state]); - printk(KERN_ERR "xobjflags=%lx\n", xobject->fscache.flags); - printk(KERN_ERR "xobjevent=%lx [%lx]\n", - xobject->fscache.events, xobject->fscache.event_mask); - printk(KERN_ERR "xops=%u inp=%u exc=%u\n", - xobject->fscache.n_ops, xobject->fscache.n_in_progress, - xobject->fscache.n_exclusive); - printk(KERN_ERR "xcookie=%p [pr=%p nd=%p fl=%lx]\n", - xobject->fscache.cookie, - xobject->fscache.cookie->parent, - xobject->fscache.cookie->netfs_data, - xobject->fscache.cookie->flags); - printk(KERN_ERR "xparent=%p\n", - xobject->fscache.parent); - printk(KERN_ERR "object: OBJ%x\n", - object->fscache.debug_id); - printk(KERN_ERR "cookie=%p [pr=%p nd=%p fl=%lx]\n", - object->fscache.cookie, - object->fscache.cookie->parent, - object->fscache.cookie->netfs_data, - object->fscache.cookie->flags); - printk(KERN_ERR "parent=%p\n", - object->fscache.parent); + cachefiles_printk_object(object, xobject); BUG(); } atomic_inc(&xobject->usage); -- cgit v1.2.3 From fee096deb4f33897937b974cb2c5168bab7935be Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:12:05 +0000 Subject: CacheFiles: Catch an overly long wait for an old active object Catch an overly long wait for an old, dying active object when we want to replace it with a new one. The probability is that all the slow-work threads are hogged, and the delete can't get a look in. What we do instead is: (1) if there's nothing in the slow work queue, we sleep until either the dying object has finished dying or there is something in the slow work queue behind which we can queue our object. (2) if there is something in the slow work queue, we return ETIMEDOUT to fscache_lookup_object(), which then puts us back on the slow work queue, presumably behind the deletion that we're blocked by. We are then deferred for a while until we work our way back through the queue - without blocking a slow-work thread unnecessarily. A backtrace similar to the following may appear in the log without this patch: INFO: task kslowd004:5711 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. kslowd004 D 0000000000000000 0 5711 2 0x00000080 ffff88000340bb80 0000000000000046 ffff88002550d000 0000000000000000 ffff88002550d000 0000000000000007 ffff88000340bfd8 ffff88002550d2a8 000000000000ddf0 00000000000118c0 00000000000118c0 ffff88002550d2a8 Call Trace: [] ? trace_hardirqs_on+0xd/0xf [] ? cachefiles_wait_bit+0x0/0xd [cachefiles] [] cachefiles_wait_bit+0x9/0xd [cachefiles] [] __wait_on_bit+0x43/0x76 [] ? ext3_xattr_get+0x1ec/0x270 [] out_of_line_wait_on_bit+0x69/0x74 [] ? cachefiles_wait_bit+0x0/0xd [cachefiles] [] ? wake_bit_function+0x0/0x2e [] cachefiles_mark_object_active+0x203/0x23b [cachefiles] [] cachefiles_walk_to_object+0x558/0x827 [cachefiles] [] cachefiles_lookup_object+0xac/0x12a [cachefiles] [] fscache_lookup_object+0x1c7/0x214 [fscache] [] fscache_object_state_machine+0xa5/0x52d [fscache] [] fscache_object_slow_work_execute+0x5f/0xa0 [fscache] [] slow_work_execute+0x18f/0x2d1 [] slow_work_thread+0x1c5/0x308 [] ? autoremove_wake_function+0x0/0x34 [] ? slow_work_thread+0x0/0x308 [] kthread+0x7a/0x82 [] child_rip+0xa/0x20 [] ? restore_args+0x0/0x30 [] ? kthread+0x0/0x82 [] ? child_rip+0x0/0x20 1 lock held by kslowd004/5711: #0: (&sb->s_type->i_mutex_key#7/1){+.+.+.}, at: [] cachefiles_walk_to_object+0x1b3/0x827 [cachefiles] Signed-off-by: David Howells --- fs/cachefiles/interface.c | 6 ++-- fs/cachefiles/namei.c | 87 ++++++++++++++++++++++++++++++++++++----------- fs/fscache/internal.h | 1 + fs/fscache/object.c | 10 +++++- fs/fscache/stats.c | 4 ++- 5 files changed, 85 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 8e67abf05985..9d3c426044ae 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -114,8 +114,9 @@ nomem_lookup_data: /* * attempt to look up the nominated node in this cache + * - return -ETIMEDOUT to be scheduled again */ -static void cachefiles_lookup_object(struct fscache_object *_object) +static int cachefiles_lookup_object(struct fscache_object *_object) { struct cachefiles_lookup_data *lookup_data; struct cachefiles_object *parent, *object; @@ -145,13 +146,14 @@ static void cachefiles_lookup_object(struct fscache_object *_object) object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) cachefiles_attr_changed(&object->fscache); - if (ret < 0) { + if (ret < 0 && ret != -ETIMEDOUT) { printk(KERN_WARNING "CacheFiles: Lookup failed error %d\n", ret); fscache_object_lookup_error(&object->fscache); } _leave(" [%d]", ret); + return ret; } /* diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 00a0cda8f47a..14ac4806e291 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -21,12 +21,6 @@ #include #include "internal.h" -static int cachefiles_wait_bit(void *flags) -{ - schedule(); - return 0; -} - #define CACHEFILES_KEYBUF_SIZE 512 /* @@ -100,8 +94,8 @@ static noinline void cachefiles_printk_object(struct cachefiles_object *object, /* * record the fact that an object is now active */ -static void cachefiles_mark_object_active(struct cachefiles_cache *cache, - struct cachefiles_object *object) +static int cachefiles_mark_object_active(struct cachefiles_cache *cache, + struct cachefiles_object *object) { struct cachefiles_object *xobject; struct rb_node **_p, *_parent = NULL; @@ -139,8 +133,8 @@ try_again: rb_insert_color(&object->active_node, &cache->active_nodes); write_unlock(&cache->active_lock); - _leave(""); - return; + _leave(" = 0"); + return 0; /* an old object from a previous incarnation is hogging the slot - we * need to wait for it to be destroyed */ @@ -155,13 +149,64 @@ wait_for_old_object: atomic_inc(&xobject->usage); write_unlock(&cache->active_lock); - _debug(">>> wait"); - wait_on_bit(&xobject->flags, CACHEFILES_OBJECT_ACTIVE, - cachefiles_wait_bit, TASK_UNINTERRUPTIBLE); - _debug("<<< waited"); + if (test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) { + wait_queue_head_t *wq; + + signed long timeout = 60 * HZ; + wait_queue_t wait; + bool requeue; + + /* if the object we're waiting for is queued for processing, + * then just put ourselves on the queue behind it */ + if (slow_work_is_queued(&xobject->fscache.work)) { + _debug("queue OBJ%x behind OBJ%x immediately", + object->fscache.debug_id, + xobject->fscache.debug_id); + goto requeue; + } + + /* otherwise we sleep until either the object we're waiting for + * is done, or the slow-work facility wants the thread back to + * do other work */ + wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE); + init_wait(&wait); + requeue = false; + do { + prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); + if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) + break; + requeue = slow_work_sleep_till_thread_needed( + &object->fscache.work, &timeout); + } while (timeout > 0 && !requeue); + finish_wait(wq, &wait); + + if (requeue && + test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) { + _debug("queue OBJ%x behind OBJ%x after wait", + object->fscache.debug_id, + xobject->fscache.debug_id); + goto requeue; + } + + if (timeout <= 0) { + printk(KERN_ERR "\n"); + printk(KERN_ERR "CacheFiles: Error: Overlong" + " wait for old active object to go away\n"); + cachefiles_printk_object(object, xobject); + goto requeue; + } + } + + ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)); cache->cache.ops->put_object(&xobject->fscache); goto try_again; + +requeue: + clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); + cache->cache.ops->put_object(&xobject->fscache); + _leave(" = -ETIMEDOUT"); + return -ETIMEDOUT; } /* @@ -466,12 +511,15 @@ lookup_again: } /* note that we're now using this object */ - cachefiles_mark_object_active(cache, object); + ret = cachefiles_mark_object_active(cache, object); mutex_unlock(&dir->d_inode->i_mutex); dput(dir); dir = NULL; + if (ret == -ETIMEDOUT) + goto mark_active_timed_out; + _debug("=== OBTAINED_OBJECT ==="); if (object->new) { @@ -515,6 +563,10 @@ create_error: cachefiles_io_error(cache, "Create/mkdir failed"); goto error; +mark_active_timed_out: + _debug("mark active timed out"); + goto release_dentry; + check_error: _debug("check error %d", ret); write_lock(&cache->active_lock); @@ -522,7 +574,7 @@ check_error: clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); write_unlock(&cache->active_lock); - +release_dentry: dput(object->dentry); object->dentry = NULL; goto error_out; @@ -543,9 +595,6 @@ error: error_out2: dput(dir); error_out: - if (ret == -ENOSPC) - ret = -ENOBUFS; - _leave(" = error %d", -ret); return ret; } diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 5b49a373689b..0ca2566e038c 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -215,6 +215,7 @@ extern atomic_t fscache_n_object_no_alloc; extern atomic_t fscache_n_object_lookups; extern atomic_t fscache_n_object_lookups_negative; extern atomic_t fscache_n_object_lookups_positive; +extern atomic_t fscache_n_object_lookups_timed_out; extern atomic_t fscache_n_object_created; extern atomic_t fscache_n_object_avail; extern atomic_t fscache_n_object_dead; diff --git a/fs/fscache/object.c b/fs/fscache/object.c index f3f952cf887e..e513ac599c8e 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -468,6 +468,7 @@ static void fscache_lookup_object(struct fscache_object *object) { struct fscache_cookie *cookie = object->cookie; struct fscache_object *parent; + int ret; _enter(""); @@ -493,12 +494,19 @@ static void fscache_lookup_object(struct fscache_object *object) fscache_stat(&fscache_n_object_lookups); fscache_stat(&fscache_n_cop_lookup_object); - object->cache->ops->lookup_object(object); + ret = object->cache->ops->lookup_object(object); fscache_stat_d(&fscache_n_cop_lookup_object); if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events)) set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); + if (ret == -ETIMEDOUT) { + /* probably stuck behind another object, so move this one to + * the back of the queue */ + fscache_stat(&fscache_n_object_lookups_timed_out); + set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + } + _leave(""); } diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 05f77caf4a2d..46435f3aae68 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -98,6 +98,7 @@ atomic_t fscache_n_object_no_alloc; atomic_t fscache_n_object_lookups; atomic_t fscache_n_object_lookups_negative; atomic_t fscache_n_object_lookups_positive; +atomic_t fscache_n_object_lookups_timed_out; atomic_t fscache_n_object_created; atomic_t fscache_n_object_avail; atomic_t fscache_n_object_dead; @@ -160,10 +161,11 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_acquires_nobufs), atomic_read(&fscache_n_acquires_oom)); - seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u\n", + seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u tmo=%u\n", atomic_read(&fscache_n_object_lookups), atomic_read(&fscache_n_object_lookups_negative), atomic_read(&fscache_n_object_lookups_positive), + atomic_read(&fscache_n_object_lookups_timed_out), atomic_read(&fscache_n_object_created)); seq_printf(m, "Updates: n=%u nul=%u run=%u\n", -- cgit v1.2.3 From 14e69647c868459bcb910f771851ca7c699efd21 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:12:08 +0000 Subject: CacheFiles: Don't log lookup/create failing with ENOBUFS Don't log the CacheFiles lookup/create object routined failing with ENOBUFS as under high memory load or high cache load they can do this quite a lot. This error simply means that the requested object cannot be created on disk due to lack of space, or due to failure of the backing filesystem to find sufficient resources. Signed-off-by: David Howells --- fs/cachefiles/interface.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 9d3c426044ae..27089311fbea 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -147,8 +147,9 @@ static int cachefiles_lookup_object(struct fscache_object *_object) cachefiles_attr_changed(&object->fscache); if (ret < 0 && ret != -ETIMEDOUT) { - printk(KERN_WARNING "CacheFiles: Lookup failed error %d\n", - ret); + if (ret != -ENOBUFS) + printk(KERN_WARNING + "CacheFiles: Lookup failed error %d\n", ret); fscache_object_lookup_error(&object->fscache); } -- cgit v1.2.3 From 0109d7e614e016a842569628116f54847a177f6e Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 20 Nov 2009 21:50:36 +0000 Subject: SLOW_WORK: Fix CIFS to pass THIS_MODULE to slow_work_register_user() As of the patch: SLOW_WORK: Wait for outstanding work items belonging to a module to clear Wait for outstanding slow work items belonging to a module to clear when unregistering that module as a user of the facility. This prevents the put_ref code of a work item from being taken away before it returns. slow_work_register_user() takes a module pointer as an argument. CIFS must now pass THIS_MODULE as that argument, lest the following error be observed: fs/cifs/cifsfs.c: In function 'init_cifs': fs/cifs/cifsfs.c:1040: error: too few arguments to function 'slow_work_register_user' Signed-off-by: David Howells --- fs/cifs/cifsfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 9a5e4f5f3122..29f1da761bbf 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1037,7 +1037,7 @@ init_cifs(void) if (rc) goto out_unregister_key_type; #endif - rc = slow_work_register_user(); + rc = slow_work_register_user(THIS_MODULE); if (rc) goto out_unregister_resolver_key; -- cgit v1.2.3 From 1c2ea8a2c0b71cc5e07f518370d458d692c9b21a Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 20 Nov 2009 21:50:40 +0000 Subject: SLOW_WORK: Fix GFS2 to #include before using THIS_MODULE GFS2 has been altered to pass THIS_MODULE to slow_work_register_user(), but hasn't been altered to #include to provide it, resulting in the following error: fs/gfs2/recovery.c:596: error: 'THIS_MODULE' undeclared here (not in a function) Add the missing #include. Signed-off-by: David Howells --- fs/gfs2/recovery.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index b2bb779f09ed..09fa31965576 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -7,6 +7,7 @@ * of the GNU General Public License version 2. */ +#include #include #include #include -- cgit v1.2.3 From 4fa9f4ede88b4e2ff135b6e5717499d734508c62 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 20 Nov 2009 21:50:44 +0000 Subject: FS-Cache: Provide nop fscache_stat_d() if CONFIG_FSCACHE_STATS=n Provide nop fscache_stat_d() macro if CONFIG_FSCACHE_STATS=n lest errors like the following occur: fs/fscache/cache.c: In function 'fscache_withdraw_cache': fs/fscache/cache.c:386: error: implicit declaration of function 'fscache_stat_d' fs/fscache/cache.c:386: error: 'fscache_n_cop_sync_cache' undeclared (first use in this function) fs/fscache/cache.c:386: error: (Each undeclared identifier is reported only once fs/fscache/cache.c:386: error: for each function it appears in.) fs/fscache/cache.c:392: error: 'fscache_n_cop_dissociate_pages' undeclared (first use in this function) Signed-off-by: David Howells --- fs/fscache/internal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 0ca2566e038c..edd7434ab6e5 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -259,6 +259,7 @@ extern const struct file_operations fscache_stats_fops; #define __fscache_stat(stat) (NULL) #define fscache_stat(stat) do {} while (0) +#define fscache_stat_d(stat) do {} while (0) #endif /* -- cgit v1.2.3 From fe542cf59bf0b31afe72b9e9749c0f6645419fa0 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 22 Nov 2009 11:49:55 +0900 Subject: LSM: Move security_path_chmod()/security_path_chown() to after mutex_lock(). We should call security_path_chmod()/security_path_chown() after mutex_lock() in order to avoid races. Signed-off-by: Tetsuo Handa Acked-by: John Johansen Signed-off-by: James Morris --- fs/open.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index 201041dfca57..b4b31d277f3a 100644 --- a/fs/open.c +++ b/fs/open.c @@ -619,17 +619,17 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode) err = mnt_want_write_file(file); if (err) goto out_putf; + mutex_lock(&inode->i_mutex); err = security_path_chmod(dentry, file->f_vfsmnt, mode); if (err) - goto out_drop_write; - mutex_lock(&inode->i_mutex); + goto out_unlock; if (mode == (mode_t) -1) mode = inode->i_mode; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; err = notify_change(dentry, &newattrs); +out_unlock: mutex_unlock(&inode->i_mutex); -out_drop_write: mnt_drop_write(file->f_path.mnt); out_putf: fput(file); @@ -652,17 +652,17 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode) error = mnt_want_write(path.mnt); if (error) goto dput_and_out; + mutex_lock(&inode->i_mutex); error = security_path_chmod(path.dentry, path.mnt, mode); if (error) - goto out_drop_write; - mutex_lock(&inode->i_mutex); + goto out_unlock; if (mode == (mode_t) -1) mode = inode->i_mode; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; error = notify_change(path.dentry, &newattrs); +out_unlock: mutex_unlock(&inode->i_mutex); -out_drop_write: mnt_drop_write(path.mnt); dput_and_out: path_put(&path); @@ -675,9 +675,9 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, mode_t, mode) return sys_fchmodat(AT_FDCWD, filename, mode); } -static int chown_common(struct dentry * dentry, uid_t user, gid_t group) +static int chown_common(struct path *path, uid_t user, gid_t group) { - struct inode *inode = dentry->d_inode; + struct inode *inode = path->dentry->d_inode; int error; struct iattr newattrs; @@ -694,7 +694,9 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; mutex_lock(&inode->i_mutex); - error = notify_change(dentry, &newattrs); + error = security_path_chown(path, user, group); + if (!error) + error = notify_change(path->dentry, &newattrs); mutex_unlock(&inode->i_mutex); return error; @@ -711,9 +713,7 @@ SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group) error = mnt_want_write(path.mnt); if (error) goto out_release; - error = security_path_chown(&path, user, group); - if (!error) - error = chown_common(path.dentry, user, group); + error = chown_common(&path, user, group); mnt_drop_write(path.mnt); out_release: path_put(&path); @@ -738,9 +738,7 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, error = mnt_want_write(path.mnt); if (error) goto out_release; - error = security_path_chown(&path, user, group); - if (!error) - error = chown_common(path.dentry, user, group); + error = chown_common(&path, user, group); mnt_drop_write(path.mnt); out_release: path_put(&path); @@ -759,9 +757,7 @@ SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group error = mnt_want_write(path.mnt); if (error) goto out_release; - error = security_path_chown(&path, user, group); - if (!error) - error = chown_common(path.dentry, user, group); + error = chown_common(&path, user, group); mnt_drop_write(path.mnt); out_release: path_put(&path); @@ -784,9 +780,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) goto out_fput; dentry = file->f_path.dentry; audit_inode(NULL, dentry); - error = security_path_chown(&file->f_path, user, group); - if (!error) - error = chown_common(dentry, user, group); + error = chown_common(&file->f_path, user, group); mnt_drop_write(file->f_path.mnt); out_fput: fput(file); -- cgit v1.2.3 From 8e6c0332d5032aef2d3bc8f41771f999112c8c66 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 24 Nov 2009 22:17:59 +0000 Subject: [CIFS] fix oops in cifs_lookup during net boot Fixes bugzilla.kernel.org bug number 14641 Lookup called during network boot (network root filesystem for diskless workstation) has case where nd is null in lookup. This patch fixes that in cifs_lookup. (Shirish noted that 2.6.30 and 2.6.31 stable need the same check) Signed-off-by: Shirish Pargaonkar Acked-by: Jeff Layton Tested-by: Vladimir Stavrinov CC: Stable Signed-off-by: Steve French --- fs/cifs/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 627a60a6c1b1..32771f581b67 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -643,7 +643,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, * O_EXCL: optimize away the lookup, but don't hash the dentry. Let * the VFS handle the create. */ - if (nd->flags & LOOKUP_EXCL) { + if (nd && (nd->flags & LOOKUP_EXCL)) { d_instantiate(direntry, NULL); return 0; } @@ -675,7 +675,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, * reduction in network traffic in the other paths. */ if (pTcon->unix_ext) { - if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) && + if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) && (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open && (nd->intent.open.flags & O_CREAT)) { rc = cifs_posix_open(full_path, &newInode, nd->path.mnt, -- cgit v1.2.3 From cea62343956c24452700c06cf028b72414c58a74 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 24 Nov 2009 22:49:37 +0000 Subject: [CIFS] Duplicate data on appending to some Samba servers SMB writes are sent with a starting offset and length. When the server supports the newer SMB trans2 posix open (rather than using the SMB NTCreateX) a file can be opened with SMB_O_APPEND flag, and for that case Samba server assumes that the offset sent in SMBWriteX is unneeded since the write should go to the end of the file - which can cause problems if the write was cached (since the beginning part of a page could be written twice by the client mm). Jeff suggested that masking the flag on posix open on the client is easiest for the time being. Note that recent Samba server also had an unrelated problem with SMB NTCreateX and append (see samba bugzilla bug number 6898) which should not affect current Linux clients (unless cifs Unix Extensions are disabled). The cifs client did not send the O_APPEND flag on posix open before 2.6.29 so the fix is unneeded on early kernels. In the future, for the non-cached case (O_DIRECT, and forcedirectio mounts) it would be possible and useful to send O_APPEND on posix open (for Windows case: FILE_APPEND_DATA but not FILE_WRITE_DATA on SMB NTCreateX) but for cached writes although the vfs sets the offset to end of file it may fragment a write across pages - so we can't send O_APPEND on open (could result in sending part of a page twice). CC: Stable Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/dir.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 32771f581b67..d3a6b07e3355 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -214,8 +214,6 @@ int cifs_posix_open(char *full_path, struct inode **pinode, posix_flags |= SMB_O_EXCL; if (oflags & O_TRUNC) posix_flags |= SMB_O_TRUNC; - if (oflags & O_APPEND) - posix_flags |= SMB_O_APPEND; if (oflags & O_SYNC) posix_flags |= SMB_O_SYNC; if (oflags & O_DIRECTORY) -- cgit v1.2.3 From 2f81e752da4781fc276689fc14391346d0dbbe78 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 25 Nov 2009 00:11:31 +0000 Subject: [CIFS] Fix sparse warning Also update CHANGES file Signed-off-by: Steve French --- fs/cifs/CHANGES | 9 +++++++++ fs/cifs/dir.c | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 145540a316ab..094ea65afc85 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,3 +1,12 @@ +Version 1.61 +------------ +Fix append problem to Samba servers (files opened with O_APPEND could +have duplicated data). Fix oops in cifs_lookup. Workaround problem +mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session. +Disable use of server inode numbers when server only +partially supports them (e.g. for one server querying inode numbers on +FindFirst fails but QPathInfo queries works). + Version 1.60 ------------- Fix memory leak in reconnect. Fix oops in DFS mount error path. diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index d3a6b07e3355..1f42f772865a 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -643,7 +643,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, */ if (nd && (nd->flags & LOOKUP_EXCL)) { d_instantiate(direntry, NULL); - return 0; + return NULL; } /* can not grab the rename sem here since it would -- cgit v1.2.3 From d180c5bccec02612256fd8076ff3c1fac3429553 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 26 Nov 2009 14:48:30 +0900 Subject: sched: Introduce task_times() to replace task_{u,s}time() pair Functions task_{u,s}time() are called in pair in almost all cases. However task_stime() is implemented to call task_utime() from its inside, so such paired calls run task_utime() twice. It means we do heavy divisions (div_u64 + do_div) twice to get utime and stime which can be obtained at same time by one set of divisions. This patch introduces a function task_times(*tsk, *utime, *stime) to retrieve utime and stime at once in better, optimized way. Signed-off-by: Hidetoshi Seto Acked-by: Peter Zijlstra Cc: Stanislaw Gruszka Cc: Spencer Candland Cc: Oleg Nesterov Cc: Balbir Singh Cc: Americo Wang LKML-Reference: <4B0E16AE.906@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- fs/proc/array.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index e209f64ab27b..330deda70d08 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -535,8 +535,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, if (!whole) { min_flt = task->min_flt; maj_flt = task->maj_flt; - utime = task_utime(task); - stime = task_stime(task); + task_times(task, &utime, &stime); gtime = task_gtime(task); } -- cgit v1.2.3 From d5b7c78e975302a1bab28263266c39ecb71acad4 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 26 Nov 2009 14:49:05 +0900 Subject: sched: Remove task_{u,s,g}time() Now all task_{u,s}time() pairs are replaced by task_times(). And task_gtime() is too simple to be an inline function. Cleanup them all. Signed-off-by: Hidetoshi Seto Acked-by: Peter Zijlstra Cc: Stanislaw Gruszka Cc: Spencer Candland Cc: Oleg Nesterov Cc: Balbir Singh Cc: Americo Wang LKML-Reference: <4B0E16D1.70902@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- fs/proc/array.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 330deda70d08..ca61a88aed66 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -511,7 +511,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, do { min_flt += t->min_flt; maj_flt += t->maj_flt; - gtime = cputime_add(gtime, task_gtime(t)); + gtime = cputime_add(gtime, t->gtime); t = next_thread(t); } while (t != task); @@ -536,7 +536,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, min_flt = task->min_flt; maj_flt = task->maj_flt; task_times(task, &utime, &stime); - gtime = task_gtime(task); + gtime = task->gtime; } /* scale priority and nice values from timeslices to -20..20 */ -- cgit v1.2.3 From 1b7323965a8c6eee9dc4e345a7ae4bff1dc93149 Mon Sep 17 00:00:00 2001 From: Csaba Henk Date: Fri, 27 Nov 2009 19:30:14 +0530 Subject: fuse: reject O_DIRECT flag also in fuse_create The comment in fuse_open about O_DIRECT: "VFS checks this, but only _after_ ->open()" also holds for fuse_create, however, the same kind of check was missing there. As an impact of this bug, open(newfile, O_RDWR|O_CREAT|O_DIRECT) fails, but a stub newfile will remain if the fuse server handled the implied FUSE_CREATE request appropriately. Other impact: in the above situation ima_file_free() will complain to open/free imbalance if CONFIG_IMA is set. Signed-off-by: Csaba Henk Signed-off-by: Miklos Szeredi Cc: Harshavardhana Cc: stable@kernel.org --- fs/fuse/dir.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 8ada78aade58..4787ae6c5c1c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -385,6 +385,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, if (fc->no_create) return -ENOSYS; + if (flags & O_DIRECT) + return -EINVAL; + forget_req = fuse_get_req(fc); if (IS_ERR(forget_req)) return PTR_ERR(forget_req); -- cgit v1.2.3 From 199bc9ff5ca5e4b3bcaff8927b2983c65f34c263 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 30 Nov 2009 09:06:40 +0000 Subject: jffs2: Fix memory corruption in jffs2_read_inode_range() In 2.6.23 kernel, commit a32ea1e1f925399e0d81ca3f7394a44a6dafa12c ("Fix read/truncate race") fixed a race in the generic code, and as a side effect, now do_generic_file_read() can ask us to readpage() past the i_size. This seems to be correctly handled by the block routines (e.g. block_read_full_page() fills the page with zeroes in case if somebody is trying to read past the last inode's block). JFFS2 doesn't handle this; it assumes that it won't be asked to read pages which don't exist -- and thus that there will be at least _one_ valid 'frag' on the page it's being asked to read. It will fill any holes with the following memset: memset(buf, 0, min(end, frag->ofs + frag->size) - offset); When the 'closest smaller match' returned by jffs2_lookup_node_frag() is actually on a previous page and ends before 'offset', that results in: memset(buf, 0, ); Hopefully, in most cases the corruption is fatal, and quickly causing random oopses, like this: root@10.0.0.4:~/ltp-fs-20090531# ./testcases/kernel/fs/ftest/ftest01 Unable to handle kernel paging request for data at address 0x00000008 Faulting instruction address: 0xc01cd980 Oops: Kernel access of bad area, sig: 11 [#1] [...] NIP [c01cd980] rb_insert_color+0x38/0x184 LR [c0043978] enqueue_hrtimer+0x88/0xc4 Call Trace: [c6c63b60] [c004f9a8] tick_sched_timer+0xa0/0xe4 (unreliable) [c6c63b80] [c0043978] enqueue_hrtimer+0x88/0xc4 [c6c63b90] [c0043a48] __run_hrtimer+0x94/0xbc [c6c63bb0] [c0044628] hrtimer_interrupt+0x140/0x2b8 [c6c63c10] [c000f8e8] timer_interrupt+0x13c/0x254 [c6c63c30] [c001352c] ret_from_except+0x0/0x14 --- Exception: 901 at memset+0x38/0x5c LR = jffs2_read_inode_range+0x144/0x17c [c6c63cf0] [00000000] (null) (unreliable) This patch fixes the issue, plus fixes all LTP tests on NAND/UBI with JFFS2 filesystem that were failing since 2.6.23 (seems like the bug above also broke the truncation). Reported-By: Anton Vorontsov Tested-By: Anton Vorontsov Signed-off-by: David Woodhouse Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/jffs2/read.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/read.c b/fs/jffs2/read.c index cfe05c1966a5..3f39be1b0455 100644 --- a/fs/jffs2/read.c +++ b/fs/jffs2/read.c @@ -164,12 +164,15 @@ int jffs2_read_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, /* XXX FIXME: Where a single physical node actually shows up in two frags, we read it twice. Don't do that. */ - /* Now we're pointing at the first frag which overlaps our page */ + /* Now we're pointing at the first frag which overlaps our page + * (or perhaps is before it, if we've been asked to read off the + * end of the file). */ while(offset < end) { D2(printk(KERN_DEBUG "jffs2_read_inode_range: offset %d, end %d\n", offset, end)); - if (unlikely(!frag || frag->ofs > offset)) { + if (unlikely(!frag || frag->ofs > offset || + frag->ofs + frag->size <= offset)) { uint32_t holesize = end - offset; - if (frag) { + if (frag && frag->ofs > offset) { D1(printk(KERN_NOTICE "Eep. Hole in ino #%u fraglist. frag->ofs = 0x%08x, offset = 0x%08x\n", f->inocache->ino, frag->ofs, offset)); holesize = min(holesize, frag->ofs - offset); } -- cgit v1.2.3 From 6f054164322bc6c1233402b9ed6b40d4af39a98f Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 1 Dec 2009 13:38:45 +0000 Subject: 9p: fix build breakage introduced by FS-Cache While building 2.6.32-rc8-git2 for Fedora I noticed the following thinko in commit 201a15428bd54f83eccec8b7c64a04b8f9431204 ("FS-Cache: Handle pages pending storage that get evicted under OOM conditions"): fs/9p/cache.c: In function '__v9fs_fscache_release_page': fs/9p/cache.c:346: error: 'vnode' undeclared (first use in this function) fs/9p/cache.c:346: error: (Each undeclared identifier is reported only once fs/9p/cache.c:346: error: for each function it appears in.) make[2]: *** [fs/9p/cache.o] Error 1 Fix the 9P filesystem to correctly construct the argument to fscache_maybe_release_page(). Signed-off-by: Kyle McMartin Signed-off-by: Xiaotian Feng [from identical patch] Signed-off-by: Stefan Lippers-Hollmann [from identical patch] Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/9p/cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/9p/cache.c b/fs/9p/cache.c index bcc5357a9069..e777961939f3 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -343,7 +343,7 @@ int __v9fs_fscache_release_page(struct page *page, gfp_t gfp) BUG_ON(!vcookie->fscache); - return fscache_maybe_release_page(vnode->cache, page, gfp); + return fscache_maybe_release_page(vcookie->fscache, page, gfp); } void __v9fs_fscache_invalidate_page(struct page *page) -- cgit v1.2.3 From 3350b2acdd39d23db52710045536b943fe38a35c Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Tue, 1 Dec 2009 14:09:24 +0000 Subject: CacheFiles: Update IMA counters when using dentry_open When IMA is active, using dentry_open without updating the IMA counters will result in free/open imbalance errors when fput is eventually called. Signed-off-by: Marc Dionne Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/cachefiles/rdwr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 1d8332563863..a6c8c6fe8df9 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -11,6 +11,7 @@ #include #include +#include #include "internal.h" /* @@ -922,6 +923,7 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page) if (IS_ERR(file)) { ret = PTR_ERR(file); } else { + ima_counts_get(file); ret = -EIO; if (file->f_op->write) { pos = (loff_t) page->index << PAGE_SHIFT; -- cgit v1.2.3 From 0cf55e1ec08bb5a22e068309e2d8ba1180ab4239 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 2 Dec 2009 17:28:07 +0900 Subject: sched, cputime: Introduce thread_group_times() This is a real fix for problem of utime/stime values decreasing described in the thread: http://lkml.org/lkml/2009/11/3/522 Now cputime is accounted in the following way: - {u,s}time in task_struct are increased every time when the thread is interrupted by a tick (timer interrupt). - When a thread exits, its {u,s}time are added to signal->{u,s}time, after adjusted by task_times(). - When all threads in a thread_group exits, accumulated {u,s}time (and also c{u,s}time) in signal struct are added to c{u,s}time in signal struct of the group's parent. So {u,s}time in task struct are "raw" tick count, while {u,s}time and c{u,s}time in signal struct are "adjusted" values. And accounted values are used by: - task_times(), to get cputime of a thread: This function returns adjusted values that originates from raw {u,s}time and scaled by sum_exec_runtime that accounted by CFS. - thread_group_cputime(), to get cputime of a thread group: This function returns sum of all {u,s}time of living threads in the group, plus {u,s}time in the signal struct that is sum of adjusted cputimes of all exited threads belonged to the group. The problem is the return value of thread_group_cputime(), because it is mixed sum of "raw" value and "adjusted" value: group's {u,s}time = foreach(thread){{u,s}time} + exited({u,s}time) This misbehavior can break {u,s}time monotonicity. Assume that if there is a thread that have raw values greater than adjusted values (e.g. interrupted by 1000Hz ticks 50 times but only runs 45ms) and if it exits, cputime will decrease (e.g. -5ms). To fix this, we could do: group's {u,s}time = foreach(t){task_times(t)} + exited({u,s}time) But task_times() contains hard divisions, so applying it for every thread should be avoided. This patch fixes the above problem in the following way: - Modify thread's exit (= __exit_signal()) not to use task_times(). It means {u,s}time in signal struct accumulates raw values instead of adjusted values. As the result it makes thread_group_cputime() to return pure sum of "raw" values. - Introduce a new function thread_group_times(*task, *utime, *stime) that converts "raw" values of thread_group_cputime() to "adjusted" values, in same calculation procedure as task_times(). - Modify group's exit (= wait_task_zombie()) to use this introduced thread_group_times(). It make c{u,s}time in signal struct to have adjusted values like before this patch. - Replace some thread_group_cputime() by thread_group_times(). This replacements are only applied where conveys the "adjusted" cputime to users, and where already uses task_times() near by it. (i.e. sys_times(), getrusage(), and /proc//stat.) This patch have a positive side effect: - Before this patch, if a group contains many short-life threads (e.g. runs 0.9ms and not interrupted by ticks), the group's cputime could be invisible since thread's cputime was accumulated after adjusted: imagine adjustment function as adj(ticks, runtime), {adj(0, 0.9) + adj(0, 0.9) + ....} = {0 + 0 + ....} = 0. After this patch it will not happen because the adjustment is applied after accumulated. v2: - remove if()s, put new variables into signal_struct. Signed-off-by: Hidetoshi Seto Acked-by: Peter Zijlstra Cc: Spencer Candland Cc: Americo Wang Cc: Oleg Nesterov Cc: Balbir Singh Cc: Stanislaw Gruszka LKML-Reference: <4B162517.8040909@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- fs/proc/array.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index ca61a88aed66..2571da43c736 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -506,7 +506,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, /* add up live thread stats at the group level */ if (whole) { - struct task_cputime cputime; struct task_struct *t = task; do { min_flt += t->min_flt; @@ -517,9 +516,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, min_flt += sig->min_flt; maj_flt += sig->maj_flt; - thread_group_cputime(task, &cputime); - utime = cputime.utime; - stime = cputime.stime; + thread_group_times(task, &utime, &stime); gtime = cputime_add(gtime, sig->gtime); } -- cgit v1.2.3 From 7e71c55ee73988d0cb61045660b899eaac23bf8f Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 22 Sep 2009 10:56:16 +0100 Subject: GFS2: Fix potential race in glock code We need to be careful of the ordering between clearing the GLF_LOCK bit and scheduling the workqueue. Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 8b674b1f3a55..a3f90ad2af80 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -672,12 +672,17 @@ out: return; out_sched: + clear_bit(GLF_LOCK, &gl->gl_flags); + smp_mb__after_clear_bit(); gfs2_glock_hold(gl); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gfs2_glock_put_nolock(gl); + return; + out_unlock: clear_bit(GLF_LOCK, &gl->gl_flags); - goto out; + smp_mb__after_clear_bit(); + return; } static void delete_work_func(struct work_struct *work) @@ -1375,10 +1380,11 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) handle_callback(gl, LM_ST_UNLOCKED, 0); nr--; } + clear_bit(GLF_LOCK, &gl->gl_flags); + smp_mb__after_clear_bit(); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gfs2_glock_put_nolock(gl); spin_unlock(&gl->gl_spin); - clear_bit(GLF_LOCK, &gl->gl_flags); spin_lock(&lru_lock); continue; } -- cgit v1.2.3 From f55073ff1eaf99f6b3bc62134a456638bca043a3 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 28 Sep 2009 10:30:49 +0100 Subject: GFS2: Fix -o meta mounts for subsequent mounts (i.e. all but the first one) We have a long term plan to use the "-o meta" flag to GFS2 mounts to access the alternate root which is used to store metadata for a GFS2 filesystem. This will allow us to eventually remove support for the gfs2meta filesystem type (which is in any case just a "front end" to the gfs2 filesystem type with the meta/master root). Currently the "-o meta" option is only taken into account on the initial mount of the filesystem. Subsequent mounts of the same filesystem (i.e. on the same device) result in basically the same as bind mounting the root of the original mount. This patch changes that by using what is more or less a copy of get_sb_bdev() and extending it so that it will take into account the alternate root in all cases. The main difference is that we have to parse the mount options a bit earlier. We can then use them to select the appropriate root towards the end of the function. In addition this also fixes a bug where it was possible (but certainly not desirable) to set different ro/rw options for the meta root when mounted via the gfs2meta fs compared with the original mount. Signed-off-by: Steven Whitehouse Cc: Alexander Viro --- fs/gfs2/ops_fstype.c | 135 ++++++++++++++++++++++++++++++++++++++++++++------- fs/gfs2/super.c | 16 +++--- fs/gfs2/super.h | 2 +- 3 files changed, 127 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 52fb6c048981..e5ee06231ae5 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1114,7 +1114,7 @@ void gfs2_online_uevent(struct gfs2_sbd *sdp) * Returns: errno */ -static int fill_super(struct super_block *sb, void *data, int silent) +static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent) { struct gfs2_sbd *sdp; struct gfs2_holder mount_gh; @@ -1125,17 +1125,7 @@ static int fill_super(struct super_block *sb, void *data, int silent) printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n"); return -ENOMEM; } - - sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT; - sdp->sd_args.ar_data = GFS2_DATA_DEFAULT; - sdp->sd_args.ar_commit = 60; - sdp->sd_args.ar_errors = GFS2_ERRORS_DEFAULT; - - error = gfs2_mount_args(sdp, &sdp->sd_args, data); - if (error) { - printk(KERN_WARNING "GFS2: can't parse mount arguments\n"); - goto fail; - } + sdp->sd_args = *args; if (sdp->sd_args.ar_spectator) { sb->s_flags |= MS_RDONLY; @@ -1243,18 +1233,125 @@ fail: return error; } -static int gfs2_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt) +static int set_gfs2_super(struct super_block *s, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt); + s->s_bdev = data; + s->s_dev = s->s_bdev->bd_dev; + + /* + * We set the bdi here to the queue backing, file systems can + * overwrite this in ->fill_super() + */ + s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info; + return 0; } -static int test_meta_super(struct super_block *s, void *ptr) +static int test_gfs2_super(struct super_block *s, void *ptr) { struct block_device *bdev = ptr; return (bdev == s->s_bdev); } +/** + * gfs2_get_sb - Get the GFS2 superblock + * @fs_type: The GFS2 filesystem type + * @flags: Mount flags + * @dev_name: The name of the device + * @data: The mount arguments + * @mnt: The vfsmnt for this mount + * + * Q. Why not use get_sb_bdev() ? + * A. We need to select one of two root directories to mount, independent + * of whether this is the initial, or subsequent, mount of this sb + * + * Returns: 0 or -ve on error + */ + +static int gfs2_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) +{ + struct block_device *bdev; + struct super_block *s; + fmode_t mode = FMODE_READ; + int error; + struct gfs2_args args; + struct gfs2_sbd *sdp; + + if (!(flags & MS_RDONLY)) + mode |= FMODE_WRITE; + + bdev = open_bdev_exclusive(dev_name, mode, fs_type); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + + /* + * once the super is inserted into the list by sget, s_umount + * will protect the lockfs code from trying to start a snapshot + * while we are mounting + */ + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (bdev->bd_fsfreeze_count > 0) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + error = -EBUSY; + goto error_bdev; + } + s = sget(fs_type, test_gfs2_super, set_gfs2_super, bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + error = PTR_ERR(s); + if (IS_ERR(s)) + goto error_bdev; + + memset(&args, 0, sizeof(args)); + args.ar_quota = GFS2_QUOTA_DEFAULT; + args.ar_data = GFS2_DATA_DEFAULT; + args.ar_commit = 60; + args.ar_errors = GFS2_ERRORS_DEFAULT; + + error = gfs2_mount_args(&args, data); + if (error) { + printk(KERN_WARNING "GFS2: can't parse mount arguments\n"); + if (s->s_root) + goto error_super; + deactivate_locked_super(s); + return error; + } + + if (s->s_root) { + error = -EBUSY; + if ((flags ^ s->s_flags) & MS_RDONLY) + goto error_super; + close_bdev_exclusive(bdev, mode); + } else { + char b[BDEVNAME_SIZE]; + + s->s_flags = flags; + s->s_mode = mode; + strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + sb_set_blocksize(s, block_size(bdev)); + error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0); + if (error) { + deactivate_locked_super(s); + return error; + } + s->s_flags |= MS_ACTIVE; + bdev->bd_super = s; + } + + sdp = s->s_fs_info; + mnt->mnt_sb = s; + if (args.ar_meta) + mnt->mnt_root = dget(sdp->sd_master_dir); + else + mnt->mnt_root = dget(sdp->sd_root_dir); + return 0; + +error_super: + deactivate_locked_super(s); +error_bdev: + close_bdev_exclusive(bdev, mode); + return error; +} + static int set_meta_super(struct super_block *s, void *ptr) { return -EINVAL; @@ -1274,13 +1371,17 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, dev_name, error); return error; } - s = sget(&gfs2_fs_type, test_meta_super, set_meta_super, + s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, path.dentry->d_inode->i_sb->s_bdev); path_put(&path); if (IS_ERR(s)) { printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); return PTR_ERR(s); } + if ((flags ^ s->s_flags) & MS_RDONLY) { + deactivate_locked_super(s); + return -EBUSY; + } sdp = s->s_fs_info; mnt->mnt_sb = s; mnt->mnt_root = dget(sdp->sd_master_dir); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 0ec3ec672de1..42e5458703f0 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -106,13 +106,13 @@ static const match_table_t tokens = { /** * gfs2_mount_args - Parse mount options - * @sdp: - * @data: + * @args: The structure into which the parsed options will be written + * @options: The options to parse * * Return: errno */ -int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) +int gfs2_mount_args(struct gfs2_args *args, char *options) { char *o; int token; @@ -157,7 +157,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) break; case Opt_debug: if (args->ar_errors == GFS2_ERRORS_PANIC) { - fs_info(sdp, "-o debug and -o errors=panic " + printk(KERN_WARNING "GFS2: -o debug and -o errors=panic " "are mutually exclusive.\n"); return -EINVAL; } @@ -210,7 +210,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) case Opt_commit: rv = match_int(&tmp[0], &args->ar_commit); if (rv || args->ar_commit <= 0) { - fs_info(sdp, "commit mount option requires a positive numeric argument\n"); + printk(KERN_WARNING "GFS2: commit mount option requires a positive numeric argument\n"); return rv ? rv : -EINVAL; } break; @@ -219,7 +219,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) break; case Opt_err_panic: if (args->ar_debug) { - fs_info(sdp, "-o debug and -o errors=panic " + printk(KERN_WARNING "GFS2: -o debug and -o errors=panic " "are mutually exclusive.\n"); return -EINVAL; } @@ -227,7 +227,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) break; case Opt_error: default: - fs_info(sdp, "invalid mount option: %s\n", o); + printk(KERN_WARNING "GFS2: invalid mount option: %s\n", o); return -EINVAL; } } @@ -1062,7 +1062,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) spin_lock(>->gt_spin); args.ar_commit = gt->gt_log_flush_secs; spin_unlock(>->gt_spin); - error = gfs2_mount_args(sdp, &args, data); + error = gfs2_mount_args(&args, data); if (error) return error; diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 235db3682885..ed962ea68c3a 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -27,7 +27,7 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) extern void gfs2_jindex_free(struct gfs2_sbd *sdp); -extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data); +extern int gfs2_mount_args(struct gfs2_args *args, char *data); extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); extern int gfs2_jdesc_check(struct gfs2_jdesc *jd); -- cgit v1.2.3 From 2646a1f61a3b5525914757f10fa12b5b94713648 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 2 Oct 2009 11:50:54 +0100 Subject: GFS2: Fix up system xattrs This code has been shamelessly stolen from XFS at the suggestion of Christoph Hellwig. I've not added support for cached ACLs so far... watch for that in a later patch, although this is designed in such a way that they should be easy to add. Signed-off-by: Steven Whitehouse Cc: Christoph Hellwig --- fs/gfs2/acl.c | 170 ++++++++++++++++++++++++++++++++++++++------------------ fs/gfs2/acl.h | 24 ++------ fs/gfs2/xattr.c | 18 ------ 3 files changed, 120 insertions(+), 92 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 3fc4e3ac7d84..2168da121647 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -26,61 +27,6 @@ #include "trans.h" #include "util.h" -#define ACL_ACCESS 1 -#define ACL_DEFAULT 0 - -int gfs2_acl_validate_set(struct gfs2_inode *ip, int access, - struct gfs2_ea_request *er, int *remove, mode_t *mode) -{ - struct posix_acl *acl; - int error; - - error = gfs2_acl_validate_remove(ip, access); - if (error) - return error; - - if (!er->er_data) - return -EINVAL; - - acl = posix_acl_from_xattr(er->er_data, er->er_data_len); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (!acl) { - *remove = 1; - return 0; - } - - error = posix_acl_valid(acl); - if (error) - goto out; - - if (access) { - error = posix_acl_equiv_mode(acl, mode); - if (!error) - *remove = 1; - else if (error > 0) - error = 0; - } - -out: - posix_acl_release(acl); - return error; -} - -int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access) -{ - if (!GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl) - return -EOPNOTSUPP; - if (!is_owner_or_cap(&ip->i_inode)) - return -EPERM; - if (S_ISLNK(ip->i_inode.i_mode)) - return -EOPNOTSUPP; - if (!access && !S_ISDIR(ip->i_inode.i_mode)) - return -EACCES; - - return 0; -} - static int acl_get(struct gfs2_inode *ip, const char *name, struct posix_acl **acl, struct gfs2_ea_location *el, char **datap, unsigned int *lenp) @@ -277,3 +223,117 @@ out_brelse: return error; } +static int gfs2_acl_type(const char *name) +{ + if (strcmp(name, GFS2_POSIX_ACL_ACCESS) == 0) + return ACL_TYPE_ACCESS; + if (strcmp(name, GFS2_POSIX_ACL_DEFAULT) == 0) + return ACL_TYPE_DEFAULT; + return -EINVAL; +} + +static int gfs2_xattr_system_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + int type; + + type = gfs2_acl_type(name); + if (type < 0) + return type; + + return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size); +} + +static int gfs2_set_mode(struct inode *inode, mode_t mode) +{ + int error = 0; + + if (mode != inode->i_mode) { + struct iattr iattr; + + iattr.ia_valid = ATTR_MODE; + iattr.ia_mode = mode; + + error = gfs2_setattr_simple(GFS2_I(inode), &iattr); + } + + return error; +} + +static int gfs2_xattr_system_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct posix_acl *acl = NULL; + int error = 0, type; + + if (!sdp->sd_args.ar_posix_acl) + return -EOPNOTSUPP; + + type = gfs2_acl_type(name); + if (type < 0) + return type; + if (flags & XATTR_CREATE) + return -EINVAL; + if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) + return value ? -EACCES : 0; + if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER)) + return -EPERM; + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (!value) + goto set_acl; + + acl = posix_acl_from_xattr(value, size); + if (!acl) { + /* + * acl_set_file(3) may request that we set default ACLs with + * zero length -- defend (gracefully) against that here. + */ + goto out; + } + if (IS_ERR(acl)) { + error = PTR_ERR(acl); + goto out; + } + + error = posix_acl_valid(acl); + if (error) + goto out_release; + + error = -EINVAL; + if (acl->a_count > GFS2_ACL_MAX_ENTRIES) + goto out_release; + + if (type == ACL_TYPE_ACCESS) { + mode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + + if (error <= 0) { + posix_acl_release(acl); + acl = NULL; + + if (error < 0) + return error; + } + + error = gfs2_set_mode(inode, mode); + if (error) + goto out_release; + } + +set_acl: + error = gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, 0); +out_release: + posix_acl_release(acl); +out: + return error; +} + +struct xattr_handler gfs2_xattr_system_handler = { + .prefix = XATTR_SYSTEM_PREFIX, + .get = gfs2_xattr_system_get, + .set = gfs2_xattr_system_set, +}; + diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index 6751930bfb64..cc954390b6da 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -13,26 +13,12 @@ #include "incore.h" #define GFS2_POSIX_ACL_ACCESS "posix_acl_access" -#define GFS2_POSIX_ACL_ACCESS_LEN 16 #define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" -#define GFS2_POSIX_ACL_DEFAULT_LEN 17 +#define GFS2_ACL_MAX_ENTRIES 25 -#define GFS2_ACL_IS_ACCESS(name, len) \ - ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \ - !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len))) - -#define GFS2_ACL_IS_DEFAULT(name, len) \ - ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \ - !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len))) - -struct gfs2_ea_request; - -int gfs2_acl_validate_set(struct gfs2_inode *ip, int access, - struct gfs2_ea_request *er, - int *remove, mode_t *mode); -int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access); -int gfs2_check_acl(struct inode *inode, int mask); -int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip); -int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); +extern int gfs2_check_acl(struct inode *inode, int mask); +extern int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip); +extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); +extern struct xattr_handler gfs2_xattr_system_handler; #endif /* __ACL_DOT_H__ */ diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 8a0f8ef6ee27..6b803540951e 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1507,18 +1507,6 @@ static int gfs2_xattr_user_set(struct inode *inode, const char *name, return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags); } -static int gfs2_xattr_system_get(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size); -} - -static int gfs2_xattr_system_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - return gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, flags); -} - static int gfs2_xattr_security_get(struct inode *inode, const char *name, void *buffer, size_t size) { @@ -1543,12 +1531,6 @@ static struct xattr_handler gfs2_xattr_security_handler = { .set = gfs2_xattr_security_set, }; -static struct xattr_handler gfs2_xattr_system_handler = { - .prefix = XATTR_SYSTEM_PREFIX, - .get = gfs2_xattr_system_get, - .set = gfs2_xattr_system_set, -}; - struct xattr_handler *gfs2_xattr_handlers[] = { &gfs2_xattr_user_handler, &gfs2_xattr_security_handler, -- cgit v1.2.3 From c65f7fb5342ecb8cb85e9b676327b3a43a5a4735 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 2 Oct 2009 11:54:39 +0100 Subject: GFS2: Use forget_all_cached_acls() Invalidate all the cached ACLs when we drop the glock. Signed-off-by: Steven Whitehouse --- fs/gfs2/glops.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 6985eef06c39..78554acc0605 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -184,8 +185,10 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) if (flags & DIO_METADATA) { struct address_space *mapping = gl->gl_aspace->i_mapping; truncate_inode_pages(mapping, 0); - if (ip) + if (ip) { set_bit(GIF_INVALID, &ip->i_flags); + forget_all_cached_acls(&ip->i_inode); + } } if (ip == GFS2_I(gl->gl_sbd->sd_rindex)) -- cgit v1.2.3 From 69dca42464962d8d0989b7e09877ba644c9cba66 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 29 Sep 2009 12:40:19 +0100 Subject: GFS2: Use gfs2_set_mode() instead of munge_mode() These two functions do the same thing, so lets only use one of them. Signed-off-by: Steven Whitehouse --- fs/gfs2/acl.c | 46 +++++++++++----------------------------------- 1 file changed, 11 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 2168da121647..1be314837d31 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -104,29 +104,20 @@ int gfs2_check_acl(struct inode *inode, int mask) return -EAGAIN; } -static int munge_mode(struct gfs2_inode *ip, mode_t mode) +static int gfs2_set_mode(struct inode *inode, mode_t mode) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct buffer_head *dibh; - int error; + int error = 0; - error = gfs2_trans_begin(sdp, RES_DINODE, 0); - if (error) - return error; + if (mode != inode->i_mode) { + struct iattr iattr; - error = gfs2_meta_inode_buffer(ip, &dibh); - if (!error) { - gfs2_assert_withdraw(sdp, - (ip->i_inode.i_mode & S_IFMT) == (mode & S_IFMT)); - ip->i_inode.i_mode = mode; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - } + iattr.ia_valid = ATTR_MODE; + iattr.ia_mode = mode; - gfs2_trans_end(sdp); + error = gfs2_setattr_simple(GFS2_I(inode), &iattr); + } - return 0; + return error; } int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) @@ -151,7 +142,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) if (!acl) { mode &= ~current_umask(); if (mode != ip->i_inode.i_mode) - error = munge_mode(ip, mode); + error = gfs2_set_mode(&ip->i_inode, mode); return error; } @@ -181,7 +172,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) if (error) goto out; munge: - error = munge_mode(ip, mode); + error = gfs2_set_mode(&ip->i_inode, mode); out: posix_acl_release(acl); kfree(data); @@ -244,21 +235,6 @@ static int gfs2_xattr_system_get(struct inode *inode, const char *name, return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size); } -static int gfs2_set_mode(struct inode *inode, mode_t mode) -{ - int error = 0; - - if (mode != inode->i_mode) { - struct iattr iattr; - - iattr.ia_valid = ATTR_MODE; - iattr.ia_mode = mode; - - error = gfs2_setattr_simple(GFS2_I(inode), &iattr); - } - - return error; -} static int gfs2_xattr_system_set(struct inode *inode, const char *name, const void *value, size_t size, int flags) -- cgit v1.2.3 From 479c427dd60fe1aadbbf2e6cbf2f84942baeb210 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 2 Oct 2009 12:00:00 +0100 Subject: GFS2: Clean up ACLs To prepare for support for caching of ACLs, this cleans up the GFS2 ACL support by pushing the xattr code back into xattr.c and changing the acl_get function into one which only returns ACLs so that we can drop the caching function into it shortly. Signed-off-by: Steven Whitehouse --- fs/gfs2/acl.c | 164 ++++++++++++++++++++++++++++---------------------------- fs/gfs2/acl.h | 2 +- fs/gfs2/inode.c | 2 +- fs/gfs2/xattr.c | 56 +++++++++++++++---- fs/gfs2/xattr.h | 8 +-- 5 files changed, 132 insertions(+), 100 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 1be314837d31..bd0fce964c41 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -27,53 +27,40 @@ #include "trans.h" #include "util.h" -static int acl_get(struct gfs2_inode *ip, const char *name, - struct posix_acl **acl, struct gfs2_ea_location *el, - char **datap, unsigned int *lenp) +static const char *gfs2_acl_name(int type) { - char *data; - unsigned int len; - int error; + switch (type) { + case ACL_TYPE_ACCESS: + return GFS2_POSIX_ACL_ACCESS; + case ACL_TYPE_DEFAULT: + return GFS2_POSIX_ACL_DEFAULT; + } + return NULL; +} - el->el_bh = NULL; +static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type) +{ + struct posix_acl *acl; + const char *name; + char *data; + int len; if (!ip->i_eattr) - return 0; - - error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, el); - if (error) - return error; - if (!el->el_ea) - return 0; - if (!GFS2_EA_DATA_LEN(el->el_ea)) - goto out; - - len = GFS2_EA_DATA_LEN(el->el_ea); - data = kmalloc(len, GFP_NOFS); - error = -ENOMEM; - if (!data) - goto out; + return NULL; - error = gfs2_ea_get_copy(ip, el, data, len); - if (error < 0) - goto out_kfree; - error = 0; + name = gfs2_acl_name(type); + if (name == NULL) + return ERR_PTR(-EINVAL); - if (acl) { - *acl = posix_acl_from_xattr(data, len); - if (IS_ERR(*acl)) - error = PTR_ERR(*acl); - } + len = gfs2_xattr_acl_get(ip, name, &data); + if (len < 0) + return ERR_PTR(len); + if (len == 0) + return NULL; -out_kfree: - if (error || !datap) { - kfree(data); - } else { - *datap = data; - *lenp = len; - } -out: - return error; + acl = posix_acl_from_xattr(data, len); + kfree(data); + return acl; } /** @@ -86,14 +73,12 @@ out: int gfs2_check_acl(struct inode *inode, int mask) { - struct gfs2_ea_location el; - struct posix_acl *acl = NULL; + struct posix_acl *acl; int error; - error = acl_get(GFS2_I(inode), GFS2_POSIX_ACL_ACCESS, &acl, &el, NULL, NULL); - brelse(el.el_bh); - if (error) - return error; + acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); if (acl) { error = posix_acl_permission(inode, acl, mask); @@ -120,32 +105,57 @@ static int gfs2_set_mode(struct inode *inode, mode_t mode) return error; } -int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) +static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl) { - struct gfs2_ea_location el; - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct posix_acl *acl = NULL, *clone; - mode_t mode = ip->i_inode.i_mode; - char *data = NULL; - unsigned int len; int error; + int len; + char *data; + const char *name = gfs2_acl_name(type); + + BUG_ON(name == NULL); + len = posix_acl_to_xattr(acl, NULL, 0); + if (len == 0) + return 0; + data = kmalloc(len, GFP_NOFS); + if (data == NULL) + return -ENOMEM; + error = posix_acl_to_xattr(acl, data, len); + if (error < 0) + goto out; + error = gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, data, len, 0); +out: + kfree(data); + return error; +} + +int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct posix_acl *acl, *clone; + mode_t mode = inode->i_mode; + int error = 0; if (!sdp->sd_args.ar_posix_acl) return 0; - if (S_ISLNK(ip->i_inode.i_mode)) + if (S_ISLNK(inode->i_mode)) return 0; - error = acl_get(dip, GFS2_POSIX_ACL_DEFAULT, &acl, &el, &data, &len); - brelse(el.el_bh); - if (error) - return error; + acl = gfs2_acl_get(dip, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); if (!acl) { mode &= ~current_umask(); - if (mode != ip->i_inode.i_mode) - error = gfs2_set_mode(&ip->i_inode, mode); + if (mode != inode->i_mode) + error = gfs2_set_mode(inode, mode); return error; } + if (S_ISDIR(inode->i_mode)) { + error = gfs2_acl_set(inode, ACL_TYPE_DEFAULT, acl); + if (error) + goto out; + } + clone = posix_acl_clone(acl, GFP_NOFS); error = -ENOMEM; if (!clone) @@ -153,43 +163,32 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) posix_acl_release(acl); acl = clone; - if (S_ISDIR(ip->i_inode.i_mode)) { - error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS, - GFS2_POSIX_ACL_DEFAULT, data, len, 0); - if (error) - goto out; - } - error = posix_acl_create_masq(acl, &mode); if (error < 0) goto out; if (error == 0) goto munge; - posix_acl_to_xattr(acl, data, len); - error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS, - GFS2_POSIX_ACL_ACCESS, data, len, 0); + error = gfs2_acl_set(inode, ACL_TYPE_ACCESS, acl); if (error) goto out; munge: - error = gfs2_set_mode(&ip->i_inode, mode); + error = gfs2_set_mode(inode, mode); out: posix_acl_release(acl); - kfree(data); return error; } int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) { - struct posix_acl *acl = NULL, *clone; - struct gfs2_ea_location el; + struct posix_acl *acl, *clone; char *data; unsigned int len; int error; - error = acl_get(ip, GFS2_POSIX_ACL_ACCESS, &acl, &el, &data, &len); - if (error) - goto out_brelse; + acl = gfs2_acl_get(ip, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); if (!acl) return gfs2_setattr_simple(ip, attr); @@ -202,15 +201,18 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) error = posix_acl_chmod_masq(acl, attr->ia_mode); if (!error) { + len = posix_acl_to_xattr(acl, NULL, 0); + data = kmalloc(len, GFP_NOFS); + error = -ENOMEM; + if (data == NULL) + goto out; posix_acl_to_xattr(acl, data, len); - error = gfs2_ea_acl_chmod(ip, &el, attr, data); + error = gfs2_xattr_acl_chmod(ip, attr, data); + kfree(data); } out: posix_acl_release(acl); - kfree(data); -out_brelse: - brelse(el.el_bh); return error; } diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index cc954390b6da..9306a2e6620c 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -17,7 +17,7 @@ #define GFS2_ACL_MAX_ENTRIES 25 extern int gfs2_check_acl(struct inode *inode, int mask); -extern int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip); +extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); extern struct xattr_handler gfs2_xattr_system_handler; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index fb15d3b1f409..6380cd9314b0 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -871,7 +871,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, if (error) goto fail_gunlock2; - error = gfs2_acl_create(dip, GFS2_I(inode)); + error = gfs2_acl_create(dip, inode); if (error) goto fail_gunlock2; diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 6b803540951e..912f5cbc4740 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -186,8 +186,8 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh, return 0; } -int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name, - struct gfs2_ea_location *el) +static int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name, + struct gfs2_ea_location *el) { struct ea_find ef; int error; @@ -516,8 +516,8 @@ out: return error; } -int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, - char *data, size_t size) +static int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, + char *data, size_t size) { int ret; size_t len = GFS2_EA_DATA_LEN(el->el_ea); @@ -534,6 +534,36 @@ int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, return len; } +int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **ppdata) +{ + struct gfs2_ea_location el; + int error; + int len; + char *data; + + error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, &el); + if (error) + return error; + if (!el.el_ea) + goto out; + if (!GFS2_EA_DATA_LEN(el.el_ea)) + goto out; + + len = GFS2_EA_DATA_LEN(el.el_ea); + data = kmalloc(len, GFP_NOFS); + error = -ENOMEM; + if (data == NULL) + goto out; + + error = gfs2_ea_get_copy(ip, &el, data, len); + if (error == 0) + error = len; + *ppdata = data; +out: + brelse(el.el_bh); + return error; +} + /** * gfs2_xattr_get - Get a GFS2 extended attribute * @inode: The inode @@ -1259,22 +1289,26 @@ fail: return error; } -int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el, - struct iattr *attr, char *data) +int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) { + struct gfs2_ea_location el; struct buffer_head *dibh; int error; - if (GFS2_EA_IS_STUFFED(el->el_ea)) { + error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el); + if (error) + return error; + + if (GFS2_EA_IS_STUFFED(el.el_ea)) { error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0); if (error) return error; - gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); - memcpy(GFS2_EA2DATA(el->el_ea), data, - GFS2_EA_DATA_LEN(el->el_ea)); + gfs2_trans_add_bh(ip->i_gl, el.el_bh, 1); + memcpy(GFS2_EA2DATA(el.el_ea), data, + GFS2_EA_DATA_LEN(el.el_ea)); } else - error = ea_acl_chmod_unstuffed(ip, el->el_ea, data); + error = ea_acl_chmod_unstuffed(ip, el.el_ea, data); if (error) return error; diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h index cbdfd7743733..8d6ae5813c4d 100644 --- a/fs/gfs2/xattr.h +++ b/fs/gfs2/xattr.h @@ -62,11 +62,7 @@ extern int gfs2_ea_dealloc(struct gfs2_inode *ip); /* Exported to acl.c */ -extern int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name, - struct gfs2_ea_location *el); -extern int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, - char *data, size_t size); -extern int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el, - struct iattr *attr, char *data); +extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data); +extern int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data); #endif /* __EATTR_DOT_H__ */ -- cgit v1.2.3 From 106381bfba997b83b64f68f2210e154162fc38e6 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 29 Sep 2009 16:26:23 +0100 Subject: GFS2: Add cached ACLs support The other patches in this series have been building towards being able to support cached ACLs like other filesystems. The only real difference with GFS2 is that we have to invalidate the cache when we drop a glock, but that is dealt with in earlier patches. Signed-off-by: Steven Whitehouse --- fs/gfs2/acl.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index bd0fce964c41..3eb1ea846173 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -48,6 +48,10 @@ static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type) if (!ip->i_eattr) return NULL; + acl = get_cached_acl(&ip->i_inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + name = gfs2_acl_name(type); if (name == NULL) return ERR_PTR(-EINVAL); @@ -123,6 +127,8 @@ static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl) if (error < 0) goto out; error = gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, data, len, 0); + if (!error) + set_cached_acl(inode, type, acl); out: kfree(data); return error; @@ -209,6 +215,7 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) posix_acl_to_xattr(acl, data, len); error = gfs2_xattr_acl_chmod(ip, attr, data); kfree(data); + set_cached_acl(&ip->i_inode, ACL_TYPE_ACCESS, acl); } out: @@ -228,15 +235,25 @@ static int gfs2_acl_type(const char *name) static int gfs2_xattr_system_get(struct inode *inode, const char *name, void *buffer, size_t size) { + struct posix_acl *acl; int type; + int error; type = gfs2_acl_type(name); if (type < 0) return type; - return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size); -} + acl = gfs2_acl_get(GFS2_I(inode), type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + error = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return error; +} static int gfs2_xattr_system_set(struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -303,6 +320,12 @@ static int gfs2_xattr_system_set(struct inode *inode, const char *name, set_acl: error = gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, 0); + if (!error) { + if (acl) + set_cached_acl(inode, type, acl); + else + forget_cached_acl(inode, type); + } out_release: posix_acl_release(acl); out: -- cgit v1.2.3 From ab201832f75f58c8f5093436363f80ffa4a4c9a8 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 29 Sep 2009 16:31:03 +0100 Subject: VFS: Use GFP_NOFS in posix_acl_from_xattr() GFS2 needs to call this from under a glock, so we need GFP_NOFS and I suspect that other filesystems might require this too. Signed-off-by: Steven Whitehouse --- fs/xattr_acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c index c6ad7c7e3ee9..05ac0fe9c4d3 100644 --- a/fs/xattr_acl.c +++ b/fs/xattr_acl.c @@ -36,7 +36,7 @@ posix_acl_from_xattr(const void *value, size_t size) if (count == 0) return NULL; - acl = posix_acl_alloc(count, GFP_KERNEL); + acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) return ERR_PTR(-ENOMEM); acl_e = acl->a_entries; -- cgit v1.2.3 From 8c42d637f6f2859e0fb28b78d5add7f0dc6d0973 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 11 Sep 2009 14:36:44 +0100 Subject: GFS2: Alter arguments of gfs2_quota/statfs_sync These two functions are altered so that gfs2_quota_sync may in future be called directly from the VFS. The GFS2 superblock changes to a VFS super block and there is an addition of an int argument which is currently ignored. Signed-off-by: Steven Whitehouse --- fs/gfs2/quota.c | 7 ++++--- fs/gfs2/quota.h | 2 +- fs/gfs2/super.c | 7 ++++--- fs/gfs2/super.h | 2 +- fs/gfs2/sys.c | 4 ++-- 5 files changed, 12 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 2e9b9326bfc9..ed9e1971b2cd 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1069,8 +1069,9 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, } } -int gfs2_quota_sync(struct gfs2_sbd *sdp) +int gfs2_quota_sync(struct super_block *sb, int type) { + struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_quota_data **qda; unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync); unsigned int num_qd; @@ -1298,12 +1299,12 @@ static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error) } static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg, - int (*fxn)(struct gfs2_sbd *sdp), + int (*fxn)(struct super_block *sb, int type), unsigned long t, unsigned long *timeo, unsigned int *new_timeo) { if (t >= *timeo) { - int error = fxn(sdp); + int error = fxn(sdp->sd_vfs, 0); quotad_error(sdp, msg, error); *timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ; } else { diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 0fa5fa63d0e8..437afa77663f 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -25,7 +25,7 @@ extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid); extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, u32 uid, u32 gid); -extern int gfs2_quota_sync(struct gfs2_sbd *sdp); +extern int gfs2_quota_sync(struct super_block *sb, int type); extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); extern int gfs2_quota_init(struct gfs2_sbd *sdp); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 42e5458703f0..e7b24d59d4ff 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -484,8 +484,9 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); } -int gfs2_statfs_sync(struct gfs2_sbd *sdp) +int gfs2_statfs_sync(struct super_block *sb, int type) { + struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; @@ -712,8 +713,8 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) int error; flush_workqueue(gfs2_delete_workqueue); - gfs2_quota_sync(sdp); - gfs2_statfs_sync(sdp); + gfs2_quota_sync(sdp->sd_vfs, 0); + gfs2_statfs_sync(sdp->sd_vfs, 0); error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, &t_gh); diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index ed962ea68c3a..3df60f2d84e3 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -44,7 +44,7 @@ extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf); extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, struct buffer_head *l_bh); -extern int gfs2_statfs_sync(struct gfs2_sbd *sdp); +extern int gfs2_statfs_sync(struct super_block *sb, int type); extern int gfs2_freeze_fs(struct gfs2_sbd *sdp); extern void gfs2_unfreeze_fs(struct gfs2_sbd *sdp); diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 446329728d52..be1b8aca5906 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -158,7 +158,7 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf, if (simple_strtol(buf, NULL, 0) != 1) return -EINVAL; - gfs2_statfs_sync(sdp); + gfs2_statfs_sync(sdp->sd_vfs, 0); return len; } @@ -171,7 +171,7 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, if (simple_strtol(buf, NULL, 0) != 1) return -EINVAL; - gfs2_quota_sync(sdp); + gfs2_quota_sync(sdp->sd_vfs, 0); return len; } -- cgit v1.2.3 From cc632e7f93465597896862cf9e50baefb1999215 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 15 Sep 2009 09:59:02 +0100 Subject: GFS2: Hook gfs2_quota_sync into VFS via gfs2_quotactl_ops The plan is to add further operations to the gfs2_quotactl_ops in future patches. The sync operation is easy, so we start with that one. We plan to use the XFS quota control functions because they more closely match the GFS2 ones. Signed-off-by: Steven Whitehouse --- fs/gfs2/Kconfig | 2 ++ fs/gfs2/ops_fstype.c | 3 +++ fs/gfs2/quota.c | 4 ++++ fs/gfs2/quota.h | 1 + 4 files changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index 5971359d2090..4dcddf83326f 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -8,6 +8,8 @@ config GFS2_FS select FS_POSIX_ACL select CRC32 select SLOW_WORK + select QUOTA + select QUOTACTL help A cluster filesystem. diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index e5ee06231ae5..36b11cba57e3 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -1138,6 +1139,8 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent sb->s_op = &gfs2_super_ops; sb->s_export_op = &gfs2_export_ops; sb->s_xattr = gfs2_xattr_handlers; + sb->s_qcop = &gfs2_quotactl_ops; + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; sb->s_time_gran = 1; sb->s_maxbytes = MAX_LFS_FILESIZE; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index ed9e1971b2cd..73a43cee0ea3 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1378,3 +1378,7 @@ int gfs2_quotad(void *data) return 0; } +const struct quotactl_ops gfs2_quotactl_ops = { + .quota_sync = gfs2_quota_sync, +}; + diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 437afa77663f..025d15ba24c0 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -50,5 +50,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) } extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask); +extern const struct quotactl_ops gfs2_quotactl_ops; #endif /* __QUOTA_DOT_H__ */ -- cgit v1.2.3 From 91094d0fb650decd8bf48b85d86c892d7ca913ee Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 11 Sep 2009 15:21:56 +0100 Subject: GFS2: Remove obsolete code in quota.c There is no point in testing for GLF_DEMOTE here, we might as well always release the glock at that point. Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.h | 9 --------- fs/gfs2/quota.c | 13 +++++-------- 2 files changed, 5 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index c609894ec0d0..13f0bd228132 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -180,15 +180,6 @@ static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl) return gl->gl_state == LM_ST_SHARED; } -static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl) -{ - int ret; - spin_lock(&gl->gl_spin); - ret = test_bit(GLF_DEMOTE, &gl->gl_flags); - spin_unlock(&gl->gl_spin); - return ret; -} - int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, int create, struct gfs2_glock **glp); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 73a43cee0ea3..6aaa6c5e21bc 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -843,9 +843,8 @@ restart: if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) { loff_t pos; gfs2_glock_dq_uninit(q_gh); - error = gfs2_glock_nq_init(qd->qd_gl, - LM_ST_EXCLUSIVE, GL_NOCACHE, - q_gh); + error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, + GL_NOCACHE, q_gh); if (error) return error; @@ -871,11 +870,9 @@ restart: qlvb->qb_value = cpu_to_be64(q.qu_value); qd->qd_qb = *qlvb; - if (gfs2_glock_is_blocking(qd->qd_gl)) { - gfs2_glock_dq_uninit(q_gh); - force_refresh = 0; - goto restart; - } + gfs2_glock_dq_uninit(q_gh); + force_refresh = 0; + goto restart; } return 0; -- cgit v1.2.3 From 1d371b5e179d943491a5fddad211cb317f38a142 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 11 Sep 2009 15:57:27 +0100 Subject: GFS2: Add get_xstate quota function This allows querying of the quota state via the XFS quota API. Signed-off-by: Steven Whitehouse --- fs/gfs2/quota.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 6aaa6c5e21bc..e7114be7b449 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -47,6 +47,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -1375,7 +1376,29 @@ int gfs2_quotad(void *data) return 0; } +static int gfs2_quota_get_xstate(struct super_block *sb, + struct fs_quota_stat *fqs) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + + memset(fqs, 0, sizeof(struct fs_quota_stat)); + fqs->qs_version = FS_QSTAT_VERSION; + if (sdp->sd_args.ar_quota == GFS2_QUOTA_ON) + fqs->qs_flags = (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD); + else if (sdp->sd_args.ar_quota == GFS2_QUOTA_ACCOUNT) + fqs->qs_flags = (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT); + if (sdp->sd_quota_inode) { + fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr; + fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks; + } + fqs->qs_uquota.qfs_nextents = 1; /* unsupported */ + fqs->qs_gquota = fqs->qs_uquota; /* its the same inode in both cases */ + fqs->qs_incoredqs = atomic_read(&qd_lru_count); + return 0; +} + const struct quotactl_ops gfs2_quotactl_ops = { .quota_sync = gfs2_quota_sync, + .get_xstate = gfs2_quota_get_xstate, }; -- cgit v1.2.3 From ea7623385930c63e2a35749cff9db72094cd06ad Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 15 Sep 2009 16:20:30 +0100 Subject: GFS2: Add proper error reporting to quota sync via sysfs For some reason, the errors were not making it to userspace. Signed-off-by: Steven Whitehouse --- fs/gfs2/sys.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index be1b8aca5906..c5dad1eb7b91 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -178,6 +178,7 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { + int error; u32 id; if (!capable(CAP_SYS_ADMIN)) @@ -185,13 +186,14 @@ static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf, id = simple_strtoul(buf, NULL, 0); - gfs2_quota_refresh(sdp, 1, id); - return len; + error = gfs2_quota_refresh(sdp, 1, id); + return error ? error : len; } static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { + int error; u32 id; if (!capable(CAP_SYS_ADMIN)) @@ -199,8 +201,8 @@ static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf, id = simple_strtoul(buf, NULL, 0); - gfs2_quota_refresh(sdp, 0, id); - return len; + error = gfs2_quota_refresh(sdp, 0, id); + return error ? error : len; } static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len) -- cgit v1.2.3 From 33a82529e7007ed7beceebc6b3f3cddadb5b67f0 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 15 Sep 2009 16:25:40 +0100 Subject: GFS2: Remove constant argument from qdsb_get() The "create" argument to qdsb_get() was only ever set to true, so this patch removes that argument. Signed-off-by: Steven Whitehouse --- fs/gfs2/quota.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index e7114be7b449..f790f5af74e3 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -462,12 +462,12 @@ static void qd_unlock(struct gfs2_quota_data *qd) qd_put(qd); } -static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id, int create, +static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id, struct gfs2_quota_data **qdp) { int error; - error = qd_get(sdp, user, id, create, qdp); + error = qd_get(sdp, user, id, CREATE, qdp); if (error) return error; @@ -509,20 +509,20 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return 0; - error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, CREATE, qd); + error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, qd); if (error) goto out; al->al_qd_num++; qd++; - error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, CREATE, qd); + error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, qd); if (error) goto out; al->al_qd_num++; qd++; if (uid != NO_QUOTA_CHANGE && uid != ip->i_inode.i_uid) { - error = qdsb_get(sdp, QUOTA_USER, uid, CREATE, qd); + error = qdsb_get(sdp, QUOTA_USER, uid, qd); if (error) goto out; al->al_qd_num++; @@ -530,7 +530,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) } if (gid != NO_QUOTA_CHANGE && gid != ip->i_inode.i_gid) { - error = qdsb_get(sdp, QUOTA_GROUP, gid, CREATE, qd); + error = qdsb_get(sdp, QUOTA_GROUP, gid, qd); if (error) goto out; al->al_qd_num++; -- cgit v1.2.3 From 6a6ada81e4ffc222bf7e54ea7503c7cc98b4f0d8 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 15 Sep 2009 16:30:38 +0100 Subject: GFS2: Remove constant argument from qd_get() This function was only ever called with the "create" argument set to true, so we can remove it. Signed-off-by: Steven Whitehouse --- fs/gfs2/quota.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index f790f5af74e3..db124af8998e 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -165,7 +165,7 @@ fail: return error; } -static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create, +static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, struct gfs2_quota_data **qdp) { struct gfs2_quota_data *qd = NULL, *new_qd = NULL; @@ -203,7 +203,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create, spin_unlock(&qd_lru_lock); - if (qd || !create) { + if (qd) { if (new_qd) { gfs2_glock_put(new_qd->qd_gl); kmem_cache_free(gfs2_quotad_cachep, new_qd); @@ -467,7 +467,7 @@ static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id, { int error; - error = qd_get(sdp, user, id, CREATE, qdp); + error = qd_get(sdp, user, id, qdp); if (error) return error; @@ -1117,7 +1117,7 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id) struct gfs2_holder q_gh; int error; - error = qd_get(sdp, user, id, CREATE, &qd); + error = qd_get(sdp, user, id, &qd); if (error) return error; -- cgit v1.2.3 From 1e72c0f7c40e665d2ed40014750fdd2fa9968bcf Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 15 Sep 2009 20:42:56 +0100 Subject: GFS2: Clean up gfs2_adjust_quota() and do_glock() Both of these functions contained confusing and in one case duplicate code. This patch adds a new check in do_glock() so that we report -ENOENT if we are asked to sync a quota entry which doesn't exist. Due to the previous patch this is now reported correctly to userspace. Also there are a few new comments, and I hope that the code is easier to understand now. Signed-off-by: Steven Whitehouse --- fs/gfs2/quota.c | 82 ++++++++++++++++++--------------------------------------- 1 file changed, 26 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index db124af8998e..33e369f108b3 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -15,7 +15,7 @@ * fuzziness in the current usage value of IDs that are being used on different * nodes in the cluster simultaneously. So, it is possible for a user on * multiple nodes to overrun their quota, but that overrun is controlable. - * Since quota tags are part of transactions, there is no need to a quota check + * Since quota tags are part of transactions, there is no need for a quota check * program to be run on node crashes or anything like that. * * There are couple of knobs that let the administrator manage the quota @@ -66,13 +66,6 @@ #define QUOTA_USER 1 #define QUOTA_GROUP 0 -struct gfs2_quota_host { - u64 qu_limit; - u64 qu_warn; - s64 qu_value; - u32 qu_ll_next; -}; - struct gfs2_quota_change_host { u64 qc_change; u32 qc_flags; /* GFS2_QCF_... */ @@ -618,33 +611,19 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change) mutex_unlock(&sdp->sd_quota_mutex); } -static void gfs2_quota_in(struct gfs2_quota_host *qu, const void *buf) -{ - const struct gfs2_quota *str = buf; - - qu->qu_limit = be64_to_cpu(str->qu_limit); - qu->qu_warn = be64_to_cpu(str->qu_warn); - qu->qu_value = be64_to_cpu(str->qu_value); - qu->qu_ll_next = be32_to_cpu(str->qu_ll_next); -} - -static void gfs2_quota_out(const struct gfs2_quota_host *qu, void *buf) -{ - struct gfs2_quota *str = buf; - - str->qu_limit = cpu_to_be64(qu->qu_limit); - str->qu_warn = cpu_to_be64(qu->qu_warn); - str->qu_value = cpu_to_be64(qu->qu_value); - str->qu_ll_next = cpu_to_be32(qu->qu_ll_next); - memset(&str->qu_reserved, 0, sizeof(str->qu_reserved)); -} - /** - * gfs2_adjust_quota + * gfs2_adjust_quota - adjust record of current block usage + * @ip: The quota inode + * @loc: Offset of the entry in the quota file + * @change: The amount of change to record + * @qd: The quota data * * This function was mostly borrowed from gfs2_block_truncate_page which was * in turn mostly borrowed from ext3 + * + * Returns: 0 or -ve on error */ + static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, s64 change, struct gfs2_quota_data *qd) { @@ -656,8 +635,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, struct buffer_head *bh; struct page *page; void *kaddr; - char *ptr; - struct gfs2_quota_host qp; + struct gfs2_quota *qp; s64 value; int err = -EIO; @@ -701,18 +679,13 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, gfs2_trans_add_bh(ip->i_gl, bh, 0); kaddr = kmap_atomic(page, KM_USER0); - ptr = kaddr + offset; - gfs2_quota_in(&qp, ptr); - qp.qu_value += change; - value = qp.qu_value; - gfs2_quota_out(&qp, ptr); + qp = kaddr + offset; + value = (s64)be64_to_cpu(qp->qu_value) + change; + qp->qu_value = cpu_to_be64(value); + qd->qd_qb.qb_value = qp->qu_value; flush_dcache_page(page); kunmap_atomic(kaddr, KM_USER0); err = 0; - qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC); - qd->qd_qb.qb_value = cpu_to_be64(value); - ((struct gfs2_quota_lvb*)(qd->qd_gl->gl_lvb))->qb_magic = cpu_to_be32(GFS2_MAGIC); - ((struct gfs2_quota_lvb*)(qd->qd_gl->gl_lvb))->qb_value = cpu_to_be64(value); unlock: unlock_page(page); page_cache_release(page); @@ -741,8 +714,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL); for (qx = 0; qx < num_qd; qx++) { - error = gfs2_glock_nq_init(qda[qx]->qd_gl, - LM_ST_EXCLUSIVE, + error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, &ghs[qx]); if (error) goto out; @@ -797,8 +769,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) qd = qda[x]; offset = qd2offset(qd); error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, - (struct gfs2_quota_data *) - qd); + (struct gfs2_quota_data *)qd); if (error) goto out_end_trans; @@ -829,8 +800,7 @@ static int do_glock(struct gfs2_quota_data *qd, int force_refresh, struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct gfs2_holder i_gh; - struct gfs2_quota_host q; - char buf[sizeof(struct gfs2_quota)]; + struct gfs2_quota q; int error; struct gfs2_quota_lvb *qlvb; @@ -853,22 +823,23 @@ restart: if (error) goto fail; - memset(buf, 0, sizeof(struct gfs2_quota)); + memset(&q, 0, sizeof(struct gfs2_quota)); pos = qd2offset(qd); - error = gfs2_internal_read(ip, NULL, buf, &pos, - sizeof(struct gfs2_quota)); + error = gfs2_internal_read(ip, NULL, (char *)&q, &pos, sizeof(q)); if (error < 0) goto fail_gunlock; - + if ((error < sizeof(q)) && force_refresh) { + error = -ENOENT; + goto fail_gunlock; + } gfs2_glock_dq_uninit(&i_gh); - gfs2_quota_in(&q, buf); qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC); qlvb->__pad = 0; - qlvb->qb_limit = cpu_to_be64(q.qu_limit); - qlvb->qb_warn = cpu_to_be64(q.qu_warn); - qlvb->qb_value = cpu_to_be64(q.qu_value); + qlvb->qb_limit = q.qu_limit; + qlvb->qb_warn = q.qu_warn; + qlvb->qb_value = q.qu_value; qd->qd_qb = *qlvb; gfs2_glock_dq_uninit(q_gh); @@ -1126,7 +1097,6 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id) gfs2_glock_dq_uninit(&q_gh); qd_put(qd); - return error; } -- cgit v1.2.3 From 113d6b3c99bf30d8083068d00e3c7304d91d4845 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 28 Sep 2009 11:52:16 +0100 Subject: GFS2: Add get_xquota support This adds support for viewing the current GFS2 quota settings via the XFS quota API. The setting of quotas will be addressed in a later patch. Fields which are not supported here are left set to zero. Signed-off-by: Steven Whitehouse Reviewed-by: Bob Peterson --- fs/gfs2/quota.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 33e369f108b3..6c5d6aa7d532 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1367,8 +1367,51 @@ static int gfs2_quota_get_xstate(struct super_block *sb, return 0; } +static int gfs2_xquota_get(struct super_block *sb, int type, qid_t id, + struct fs_disk_quota *fdq) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_quota_lvb *qlvb; + struct gfs2_quota_data *qd; + struct gfs2_holder q_gh; + int error; + + memset(fdq, 0, sizeof(struct fs_disk_quota)); + + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return -ESRCH; /* Crazy XFS error code */ + + if (type == USRQUOTA) + type = QUOTA_USER; + else if (type == GRPQUOTA) + type = QUOTA_GROUP; + else + return -EINVAL; + + error = qd_get(sdp, type, id, &qd); + if (error) + return error; + error = do_glock(qd, FORCE, &q_gh); + if (error) + goto out; + + qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; + fdq->d_version = FS_DQUOT_VERSION; + fdq->d_flags = (type == QUOTA_USER) ? XFS_USER_QUOTA : XFS_GROUP_QUOTA; + fdq->d_id = id; + fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit); + fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn); + fdq->d_bcount = be64_to_cpu(qlvb->qb_value); + + gfs2_glock_dq_uninit(&q_gh); +out: + qd_put(qd); + return error; +} + const struct quotactl_ops gfs2_quotactl_ops = { .quota_sync = gfs2_quota_sync, .get_xstate = gfs2_quota_get_xstate, + .get_xquota = gfs2_xquota_get, }; -- cgit v1.2.3 From e285c100362762f7440643be637dd332460fdc75 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 23 Sep 2009 13:50:49 +0100 Subject: GFS2: Add set_xquota support This patch adds the ability to set GFS2 quota limit and warning levels via the XFS quota API. Signed-off-by: Steven Whitehouse --- fs/gfs2/quota.c | 198 ++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 172 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 6c5d6aa7d532..e8db5346a942 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -615,8 +615,9 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change) * gfs2_adjust_quota - adjust record of current block usage * @ip: The quota inode * @loc: Offset of the entry in the quota file - * @change: The amount of change to record + * @change: The amount of usage change to record * @qd: The quota data + * @fdq: The updated limits to record * * This function was mostly borrowed from gfs2_block_truncate_page which was * in turn mostly borrowed from ext3 @@ -625,19 +626,21 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change) */ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, - s64 change, struct gfs2_quota_data *qd) + s64 change, struct gfs2_quota_data *qd, + struct fs_disk_quota *fdq) { struct inode *inode = &ip->i_inode; struct address_space *mapping = inode->i_mapping; unsigned long index = loc >> PAGE_CACHE_SHIFT; unsigned offset = loc & (PAGE_CACHE_SIZE - 1); unsigned blocksize, iblock, pos; - struct buffer_head *bh; + struct buffer_head *bh, *dibh; struct page *page; void *kaddr; struct gfs2_quota *qp; s64 value; int err = -EIO; + u64 size; if (gfs2_is_stuffed(ip)) gfs2_unstuff_dinode(ip, NULL); @@ -683,9 +686,34 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, value = (s64)be64_to_cpu(qp->qu_value) + change; qp->qu_value = cpu_to_be64(value); qd->qd_qb.qb_value = qp->qu_value; + if (fdq) { + if (fdq->d_fieldmask & FS_DQ_BSOFT) { + qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit); + qd->qd_qb.qb_warn = qp->qu_warn; + } + if (fdq->d_fieldmask & FS_DQ_BHARD) { + qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit); + qd->qd_qb.qb_limit = qp->qu_limit; + } + } flush_dcache_page(page); kunmap_atomic(kaddr, KM_USER0); - err = 0; + + err = gfs2_meta_inode_buffer(ip, &dibh); + if (err) + goto unlock; + + size = loc + sizeof(struct gfs2_quota); + if (size > inode->i_size) { + ip->i_disksize = size; + i_size_write(inode, size); + } + inode->i_mtime = inode->i_atime = CURRENT_TIME; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + mark_inode_dirty(inode); + unlock: unlock_page(page); page_cache_release(page); @@ -713,6 +741,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) return -ENOMEM; sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL); + mutex_lock_nested(&ip->i_inode.i_mutex, I_MUTEX_QUOTA); for (qx = 0; qx < num_qd; qx++) { error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, &ghs[qx]); @@ -768,8 +797,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) for (x = 0; x < num_qd; x++) { qd = qda[x]; offset = qd2offset(qd); - error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, - (struct gfs2_quota_data *)qd); + error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, qd, NULL); if (error) goto out_end_trans; @@ -789,20 +817,44 @@ out_gunlock: out: while (qx--) gfs2_glock_dq_uninit(&ghs[qx]); + mutex_unlock(&ip->i_inode.i_mutex); kfree(ghs); gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl); return error; } +static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + struct gfs2_quota q; + struct gfs2_quota_lvb *qlvb; + loff_t pos; + int error; + + memset(&q, 0, sizeof(struct gfs2_quota)); + pos = qd2offset(qd); + error = gfs2_internal_read(ip, NULL, (char *)&q, &pos, sizeof(q)); + if (error < 0) + return error; + + qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; + qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC); + qlvb->__pad = 0; + qlvb->qb_limit = q.qu_limit; + qlvb->qb_warn = q.qu_warn; + qlvb->qb_value = q.qu_value; + qd->qd_qb = *qlvb; + + return 0; +} + static int do_glock(struct gfs2_quota_data *qd, int force_refresh, struct gfs2_holder *q_gh) { struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct gfs2_holder i_gh; - struct gfs2_quota q; int error; - struct gfs2_quota_lvb *qlvb; restart: error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh); @@ -812,7 +864,6 @@ restart: qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) { - loff_t pos; gfs2_glock_dq_uninit(q_gh); error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, q_gh); @@ -823,25 +874,11 @@ restart: if (error) goto fail; - memset(&q, 0, sizeof(struct gfs2_quota)); - pos = qd2offset(qd); - error = gfs2_internal_read(ip, NULL, (char *)&q, &pos, sizeof(q)); - if (error < 0) - goto fail_gunlock; - if ((error < sizeof(q)) && force_refresh) { - error = -ENOENT; + error = update_qd(sdp, qd); + if (error) goto fail_gunlock; - } - gfs2_glock_dq_uninit(&i_gh); - - qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; - qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC); - qlvb->__pad = 0; - qlvb->qb_limit = q.qu_limit; - qlvb->qb_warn = q.qu_warn; - qlvb->qb_value = q.qu_value; - qd->qd_qb = *qlvb; + gfs2_glock_dq_uninit(&i_gh); gfs2_glock_dq_uninit(q_gh); force_refresh = 0; goto restart; @@ -1409,9 +1446,118 @@ out: return error; } +/* GFS2 only supports a subset of the XFS fields */ +#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD) + +static int gfs2_xquota_set(struct super_block *sb, int type, qid_t id, + struct fs_disk_quota *fdq) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + struct gfs2_quota_data *qd; + struct gfs2_holder q_gh, i_gh; + unsigned int data_blocks, ind_blocks; + unsigned int blocks = 0; + int alloc_required; + struct gfs2_alloc *al; + loff_t offset; + int error; + + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return -ESRCH; /* Crazy XFS error code */ + + switch(type) { + case USRQUOTA: + type = QUOTA_USER; + if (fdq->d_flags != XFS_USER_QUOTA) + return -EINVAL; + break; + case GRPQUOTA: + type = QUOTA_GROUP; + if (fdq->d_flags != XFS_GROUP_QUOTA) + return -EINVAL; + break; + default: + return -EINVAL; + } + + if (fdq->d_fieldmask & ~GFS2_FIELDMASK) + return -EINVAL; + if (fdq->d_id != id) + return -EINVAL; + + error = qd_get(sdp, type, id, &qd); + if (error) + return error; + + mutex_lock(&ip->i_inode.i_mutex); + error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh); + if (error) + goto out_put; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (error) + goto out_q; + + /* Check for existing entry, if none then alloc new blocks */ + error = update_qd(sdp, qd); + if (error) + goto out_i; + + /* If nothing has changed, this is a no-op */ + if ((fdq->d_fieldmask & FS_DQ_BSOFT) && + (fdq->d_blk_softlimit == be64_to_cpu(qd->qd_qb.qb_warn))) + fdq->d_fieldmask ^= FS_DQ_BSOFT; + if ((fdq->d_fieldmask & FS_DQ_BHARD) && + (fdq->d_blk_hardlimit == be64_to_cpu(qd->qd_qb.qb_limit))) + fdq->d_fieldmask ^= FS_DQ_BHARD; + if (fdq->d_fieldmask == 0) + goto out_i; + + offset = qd2offset(qd); + error = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota), + &alloc_required); + if (error) + goto out_i; + if (alloc_required) { + al = gfs2_alloc_get(ip); + if (al == NULL) + goto out_i; + gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), + &data_blocks, &ind_blocks); + blocks = al->al_requested = 1 + data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip); + if (error) + goto out_alloc; + } + + error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0); + if (error) + goto out_release; + + /* Apply changes */ + error = gfs2_adjust_quota(ip, offset, 0, qd, fdq); + + gfs2_trans_end(sdp); +out_release: + if (alloc_required) { + gfs2_inplace_release(ip); +out_alloc: + gfs2_alloc_put(ip); + } +out_i: + gfs2_glock_dq_uninit(&i_gh); +out_q: + gfs2_glock_dq_uninit(&q_gh); +out_put: + mutex_unlock(&ip->i_inode.i_mutex); + qd_put(qd); + return error; +} + const struct quotactl_ops gfs2_quotactl_ops = { .quota_sync = gfs2_quota_sync, .get_xstate = gfs2_quota_get_xstate, .get_xquota = gfs2_xquota_get, + .set_xquota = gfs2_xquota_set, }; -- cgit v1.2.3 From 86e931a35e93d94e6e91b57cc76456e16d188ea9 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 28 Sep 2009 12:35:17 +0100 Subject: VFS: Export dquot_send_warning Sending a message to userspace in a generic format to warn of events (e.g. quota exceeded) in the quota subsystem is a generically useful feature. This patch makes some minor changes to the send_message function from dquot.c renaming it quota_send_message, moving it to quota.c and exporting it for use by filesystems which do not use the dquot code. Signed-off-by: Steven Whitehouse --- fs/quota/Kconfig | 2 +- fs/quota/dquot.c | 93 ++++++-------------------------------------------------- fs/quota/quota.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 85 deletions(-) (limited to 'fs') diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig index 8047e01ef46b..353e78a9ebee 100644 --- a/fs/quota/Kconfig +++ b/fs/quota/Kconfig @@ -17,7 +17,7 @@ config QUOTA config QUOTA_NETLINK_INTERFACE bool "Report quota messages through netlink interface" - depends on QUOTA && NET + depends on QUOTACTL && NET help If you say Y here, quota warnings (about exceeding softlimit, reaching hardlimit, etc.) will be reported through netlink interface. If unsure, diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 39b49c42a7ed..9b6ad908dcb2 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -77,10 +77,6 @@ #include #include #include /* for inode_lock, oddly enough.. */ -#ifdef CONFIG_QUOTA_NETLINK_INTERFACE -#include -#include -#endif #include @@ -1071,73 +1067,6 @@ static void print_warning(struct dquot *dquot, const int warntype) } #endif -#ifdef CONFIG_QUOTA_NETLINK_INTERFACE - -/* Netlink family structure for quota */ -static struct genl_family quota_genl_family = { - .id = GENL_ID_GENERATE, - .hdrsize = 0, - .name = "VFS_DQUOT", - .version = 1, - .maxattr = QUOTA_NL_A_MAX, -}; - -/* Send warning to userspace about user which exceeded quota */ -static void send_warning(const struct dquot *dquot, const char warntype) -{ - static atomic_t seq; - struct sk_buff *skb; - void *msg_head; - int ret; - int msg_size = 4 * nla_total_size(sizeof(u32)) + - 2 * nla_total_size(sizeof(u64)); - - /* We have to allocate using GFP_NOFS as we are called from a - * filesystem performing write and thus further recursion into - * the fs to free some data could cause deadlocks. */ - skb = genlmsg_new(msg_size, GFP_NOFS); - if (!skb) { - printk(KERN_ERR - "VFS: Not enough memory to send quota warning.\n"); - return; - } - msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq), - "a_genl_family, 0, QUOTA_NL_C_WARNING); - if (!msg_head) { - printk(KERN_ERR - "VFS: Cannot store netlink header in quota warning.\n"); - goto err_out; - } - ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, dquot->dq_type); - if (ret) - goto attr_err_out; - ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, dquot->dq_id); - if (ret) - goto attr_err_out; - ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype); - if (ret) - goto attr_err_out; - ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, - MAJOR(dquot->dq_sb->s_dev)); - if (ret) - goto attr_err_out; - ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, - MINOR(dquot->dq_sb->s_dev)); - if (ret) - goto attr_err_out; - ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid()); - if (ret) - goto attr_err_out; - genlmsg_end(skb, msg_head); - - genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS); - return; -attr_err_out: - printk(KERN_ERR "VFS: Not enough space to compose quota message!\n"); -err_out: - kfree_skb(skb); -} -#endif /* * Write warnings to the console and send warning messages over netlink. * @@ -1145,18 +1074,20 @@ err_out: */ static void flush_warnings(struct dquot *const *dquots, char *warntype) { + struct dquot *dq; int i; - for (i = 0; i < MAXQUOTAS; i++) - if (dquots[i] && warntype[i] != QUOTA_NL_NOWARN && - !warning_issued(dquots[i], warntype[i])) { + for (i = 0; i < MAXQUOTAS; i++) { + dq = dquots[i]; + if (dq && warntype[i] != QUOTA_NL_NOWARN && + !warning_issued(dq, warntype[i])) { #ifdef CONFIG_PRINT_QUOTA_WARNING - print_warning(dquots[i], warntype[i]); -#endif -#ifdef CONFIG_QUOTA_NETLINK_INTERFACE - send_warning(dquots[i], warntype[i]); + print_warning(dq, warntype[i]); #endif + quota_send_warning(dq->dq_type, dq->dq_id, + dq->dq_sb->s_dev, warntype[i]); } + } } static int ignore_hardlimit(struct dquot *dquot) @@ -2607,12 +2538,6 @@ static int __init dquot_init(void) register_shrinker(&dqcache_shrinker); -#ifdef CONFIG_QUOTA_NETLINK_INTERFACE - if (genl_register_family("a_genl_family) != 0) - printk(KERN_ERR - "VFS: Failed to create quota netlink interface.\n"); -#endif - return 0; } module_init(dquot_init); diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 95c5b42384b2..ee91e2756950 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include /* Check validity of generic quotactl commands */ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, @@ -525,3 +527,94 @@ asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, return ret; } #endif + + +#ifdef CONFIG_QUOTA_NETLINK_INTERFACE + +/* Netlink family structure for quota */ +static struct genl_family quota_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = "VFS_DQUOT", + .version = 1, + .maxattr = QUOTA_NL_A_MAX, +}; + +/** + * quota_send_warning - Send warning to userspace about exceeded quota + * @type: The quota type: USRQQUOTA, GRPQUOTA,... + * @id: The user or group id of the quota that was exceeded + * @dev: The device on which the fs is mounted (sb->s_dev) + * @warntype: The type of the warning: QUOTA_NL_... + * + * This can be used by filesystems (including those which don't use + * dquot) to send a message to userspace relating to quota limits. + * + */ + +void quota_send_warning(short type, unsigned int id, dev_t dev, + const char warntype) +{ + static atomic_t seq; + struct sk_buff *skb; + void *msg_head; + int ret; + int msg_size = 4 * nla_total_size(sizeof(u32)) + + 2 * nla_total_size(sizeof(u64)); + + /* We have to allocate using GFP_NOFS as we are called from a + * filesystem performing write and thus further recursion into + * the fs to free some data could cause deadlocks. */ + skb = genlmsg_new(msg_size, GFP_NOFS); + if (!skb) { + printk(KERN_ERR + "VFS: Not enough memory to send quota warning.\n"); + return; + } + msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq), + "a_genl_family, 0, QUOTA_NL_C_WARNING); + if (!msg_head) { + printk(KERN_ERR + "VFS: Cannot store netlink header in quota warning.\n"); + goto err_out; + } + ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type); + if (ret) + goto attr_err_out; + ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id); + if (ret) + goto attr_err_out; + ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype); + if (ret) + goto attr_err_out; + ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev)); + if (ret) + goto attr_err_out; + ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev)); + if (ret) + goto attr_err_out; + ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid()); + if (ret) + goto attr_err_out; + genlmsg_end(skb, msg_head); + + genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS); + return; +attr_err_out: + printk(KERN_ERR "VFS: Not enough space to compose quota message!\n"); +err_out: + kfree_skb(skb); +} +EXPORT_SYMBOL(quota_send_warning); + +static int __init quota_init(void) +{ + if (genl_register_family("a_genl_family) != 0) + printk(KERN_ERR + "VFS: Failed to create quota netlink interface.\n"); + return 0; +}; + +module_init(quota_init); +#endif + -- cgit v1.2.3 From 2ec4650526c5a94d96bb760001fe0685b15988de Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 28 Sep 2009 12:49:15 +0100 Subject: GFS2: Use dquot_send_warning() This adds support to GFS2 to send quota warnings via netlink. Also it removes a stray \r which was left over from when the code used to print warnings on the console. Signed-off-by: Steven Whitehouse --- fs/gfs2/quota.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index e8db5346a942..1d4fc0413a3f 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include "gfs2.h" @@ -1001,7 +1002,7 @@ static int print_message(struct gfs2_quota_data *qd, char *type) { struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; - printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\r\n", + printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n", sdp->sd_fsname, type, (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group", qd->qd_id); @@ -1038,6 +1039,10 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) { print_message(qd, "exceeded"); + quota_send_warning(test_bit(QDF_USER, &qd->qd_flags) ? + USRQUOTA : GRPQUOTA, qd->qd_id, + sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN); + error = -EDQUOT; break; } else if (be64_to_cpu(qd->qd_qb.qb_warn) && @@ -1045,6 +1050,9 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) time_after_eq(jiffies, qd->qd_last_warn + gfs2_tune_get(sdp, gt_quota_warn_period) * HZ)) { + quota_send_warning(test_bit(QDF_USER, &qd->qd_flags) ? + USRQUOTA : GRPQUOTA, qd->qd_id, + sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN); error = print_message(qd, "warning"); qd->qd_last_warn = jiffies; } -- cgit v1.2.3 From 3d3c10f2ce80d2a19e5e02023c2b7ab7086c36d5 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Tue, 20 Oct 2009 02:39:44 -0500 Subject: GFS2: Improve statfs and quota usability GFS2 now has three new mount options, statfs_quantum, quota_quantum and statfs_percent. statfs_quantum and quota_quantum simply allow you to set the tunables of the same name. Setting setting statfs_quantum to 0 will also turn on the statfs_slow tunable. statfs_percent accepts an integer between 0 and 100. Numbers between 1 and 100 will cause GFS2 to do any early sync when the local number of blocks free changes by at least statfs_percent from the totoal number of blocks free. Setting statfs_percent to 0 disables this. Signed-off-by: Benjamin Marzinski Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 4 +++ fs/gfs2/ops_fstype.c | 14 ++++++++--- fs/gfs2/quota.c | 21 +++++++++++++--- fs/gfs2/quota.h | 2 ++ fs/gfs2/super.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++--- 5 files changed, 100 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 6edb423f90b3..c239b0f969c8 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -430,6 +430,9 @@ struct gfs2_args { unsigned int ar_discard:1; /* discard requests */ unsigned int ar_errors:2; /* errors=withdraw | panic */ int ar_commit; /* Commit interval */ + int ar_statfs_quantum; /* The fast statfs interval */ + int ar_quota_quantum; /* The quota interval */ + int ar_statfs_percent; /* The % change to force sync */ }; struct gfs2_tune { @@ -558,6 +561,7 @@ struct gfs2_sbd { spinlock_t sd_statfs_spin; struct gfs2_statfs_change_host sd_statfs_master; struct gfs2_statfs_change_host sd_statfs_local; + int sd_statfs_force_sync; /* Resource group stuff */ diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 36b11cba57e3..9744ee920473 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -63,13 +63,10 @@ static void gfs2_tune_init(struct gfs2_tune *gt) gt->gt_quota_warn_period = 10; gt->gt_quota_scale_num = 1; gt->gt_quota_scale_den = 1; - gt->gt_quota_quantum = 60; gt->gt_new_files_jdata = 0; gt->gt_max_readahead = 1 << 18; gt->gt_stall_secs = 600; gt->gt_complain_secs = 10; - gt->gt_statfs_quantum = 30; - gt->gt_statfs_slow = 0; } static struct gfs2_sbd *init_sbd(struct super_block *sb) @@ -1153,6 +1150,15 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit; + sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum; + if (sdp->sd_args.ar_statfs_quantum) { + sdp->sd_tune.gt_statfs_slow = 0; + sdp->sd_tune.gt_statfs_quantum = sdp->sd_args.ar_statfs_quantum; + } + else { + sdp->sd_tune.gt_statfs_slow = 1; + sdp->sd_tune.gt_statfs_quantum = 30; + } error = init_names(sdp, silent); if (error) @@ -1308,6 +1314,8 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags, args.ar_quota = GFS2_QUOTA_DEFAULT; args.ar_data = GFS2_DATA_DEFAULT; args.ar_commit = 60; + args.ar_statfs_quantum = 30; + args.ar_quota_quantum = 60; args.ar_errors = GFS2_ERRORS_DEFAULT; error = gfs2_mount_args(&args, data); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 1d4fc0413a3f..e3bf6eab8750 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1344,6 +1344,14 @@ static void quotad_check_trunc_list(struct gfs2_sbd *sdp) } } +void gfs2_wake_up_statfs(struct gfs2_sbd *sdp) { + if (!sdp->sd_statfs_force_sync) { + sdp->sd_statfs_force_sync = 1; + wake_up(&sdp->sd_quota_wait); + } +} + + /** * gfs2_quotad - Write cached quota changes into the quota file * @sdp: Pointer to GFS2 superblock @@ -1363,8 +1371,15 @@ int gfs2_quotad(void *data) while (!kthread_should_stop()) { /* Update the master statfs file */ - quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t, - &statfs_timeo, &tune->gt_statfs_quantum); + if (sdp->sd_statfs_force_sync) { + int error = gfs2_statfs_sync(sdp->sd_vfs, 0); + quotad_error(sdp, "statfs", error); + statfs_timeo = gfs2_tune_get(sdp, gt_statfs_quantum) * HZ; + } + else + quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t, + &statfs_timeo, + &tune->gt_statfs_quantum); /* Update quota file */ quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t, @@ -1381,7 +1396,7 @@ int gfs2_quotad(void *data) spin_lock(&sdp->sd_trunc_lock); empty = list_empty(&sdp->sd_trunc_list); spin_unlock(&sdp->sd_trunc_lock); - if (empty) + if (empty && !sdp->sd_statfs_force_sync) t -= schedule_timeout(t); else t = 0; diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 025d15ba24c0..e271fa07ad02 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -32,6 +32,8 @@ extern int gfs2_quota_init(struct gfs2_sbd *sdp); extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp); extern int gfs2_quotad(void *data); +extern void gfs2_wake_up_statfs(struct gfs2_sbd *sdp); + static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index e7b24d59d4ff..3fee2fd3ae43 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -70,6 +70,9 @@ enum { Opt_commit, Opt_err_withdraw, Opt_err_panic, + Opt_statfs_quantum, + Opt_statfs_percent, + Opt_quota_quantum, Opt_error, }; @@ -101,6 +104,9 @@ static const match_table_t tokens = { {Opt_commit, "commit=%d"}, {Opt_err_withdraw, "errors=withdraw"}, {Opt_err_panic, "errors=panic"}, + {Opt_statfs_quantum, "statfs_quantum=%d"}, + {Opt_statfs_percent, "statfs_percent=%d"}, + {Opt_quota_quantum, "quota_quantum=%d"}, {Opt_error, NULL} }; @@ -214,6 +220,28 @@ int gfs2_mount_args(struct gfs2_args *args, char *options) return rv ? rv : -EINVAL; } break; + case Opt_statfs_quantum: + rv = match_int(&tmp[0], &args->ar_statfs_quantum); + if (rv || args->ar_statfs_quantum < 0) { + printk(KERN_WARNING "GFS2: statfs_quantum mount option requires a non-negative numeric argument\n"); + return rv ? rv : -EINVAL; + } + break; + case Opt_quota_quantum: + rv = match_int(&tmp[0], &args->ar_quota_quantum); + if (rv || args->ar_quota_quantum <= 0) { + printk(KERN_WARNING "GFS2: quota_quantum mount option requires a positive numeric argument\n"); + return rv ? rv : -EINVAL; + } + break; + case Opt_statfs_percent: + rv = match_int(&tmp[0], &args->ar_statfs_percent); + if (rv || args->ar_statfs_percent < 0 || + args->ar_statfs_percent > 100) { + printk(KERN_WARNING "statfs_percent mount option requires a numeric argument between 0 and 100\n"); + return rv ? rv : -EINVAL; + } + break; case Opt_err_withdraw: args->ar_errors = GFS2_ERRORS_WITHDRAW; break; @@ -442,7 +470,9 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, { struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; struct buffer_head *l_bh; + int percent, sync_percent; int error; error = gfs2_meta_inode_buffer(l_ip, &l_bh); @@ -456,9 +486,17 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, l_sc->sc_free += free; l_sc->sc_dinodes += dinodes; gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode)); + if (m_sc->sc_free) + percent = (100 * l_sc->sc_free) / m_sc->sc_free; + else + percent = 100; spin_unlock(&sdp->sd_statfs_spin); brelse(l_bh); + sync_percent = sdp->sd_args.ar_statfs_percent; + if (sync_percent && (percent >= sync_percent || + percent <= -sync_percent)) + gfs2_wake_up_statfs(sdp); } void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, @@ -522,6 +560,7 @@ int gfs2_statfs_sync(struct super_block *sb, int type) goto out_bh2; update_statfs(sdp, m_bh, l_bh); + sdp->sd_statfs_force_sync = 0; gfs2_trans_end(sdp); @@ -1062,6 +1101,11 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) spin_lock(>->gt_spin); args.ar_commit = gt->gt_log_flush_secs; + args.ar_quota_quantum = gt->gt_quota_quantum; + if (gt->gt_statfs_slow) + args.ar_statfs_quantum = 0; + else + args.ar_statfs_quantum = gt->gt_statfs_quantum; spin_unlock(>->gt_spin); error = gfs2_mount_args(&args, data); if (error) @@ -1100,6 +1144,15 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) sb->s_flags &= ~MS_POSIXACL; spin_lock(>->gt_spin); gt->gt_log_flush_secs = args.ar_commit; + gt->gt_quota_quantum = args.ar_quota_quantum; + if (args.ar_statfs_quantum) { + gt->gt_statfs_slow = 0; + gt->gt_statfs_quantum = args.ar_statfs_quantum; + } + else { + gt->gt_statfs_slow = 1; + gt->gt_statfs_quantum = 30; + } spin_unlock(>->gt_spin); gfs2_online_uevent(sdp); @@ -1180,7 +1233,7 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) { struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; struct gfs2_args *args = &sdp->sd_args; - int lfsecs; + int val; if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir)) seq_printf(s, ",meta"); @@ -1241,9 +1294,17 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) } if (args->ar_discard) seq_printf(s, ",discard"); - lfsecs = sdp->sd_tune.gt_log_flush_secs; - if (lfsecs != 60) - seq_printf(s, ",commit=%d", lfsecs); + val = sdp->sd_tune.gt_log_flush_secs; + if (val != 60) + seq_printf(s, ",commit=%d", val); + val = sdp->sd_tune.gt_statfs_quantum; + if (val != 30) + seq_printf(s, ",statfs_quantum=%d", val); + val = sdp->sd_tune.gt_quota_quantum; + if (val != 60) + seq_printf(s, ",quota_quantum=%d", val); + if (args->ar_statfs_percent) + seq_printf(s, ",statfs_percent=%d", args->ar_statfs_percent); if (args->ar_errors != GFS2_ERRORS_DEFAULT) { const char *state; -- cgit v1.2.3 From c14f5735e724cb5338ca8298d42b1658008a10d7 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Mon, 26 Oct 2009 13:29:47 -0500 Subject: GFS2: remove division from new statfs code It's not necessary to do any 64bit division for the statfs sync code, so remove it. Signed-off-by: Benjamin Marzinski Signed-off-by: Steven Whitehouse --- fs/gfs2/super.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 3fee2fd3ae43..b1dcfab36465 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -472,7 +472,8 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; struct buffer_head *l_bh; - int percent, sync_percent; + s64 x, y; + int need_sync = 0; int error; error = gfs2_meta_inode_buffer(l_ip, &l_bh); @@ -486,16 +487,16 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, l_sc->sc_free += free; l_sc->sc_dinodes += dinodes; gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode)); - if (m_sc->sc_free) - percent = (100 * l_sc->sc_free) / m_sc->sc_free; - else - percent = 100; + if (sdp->sd_args.ar_statfs_percent) { + x = 100 * l_sc->sc_free; + y = m_sc->sc_free * sdp->sd_args.ar_statfs_percent; + if (x >= y || x <= -y) + need_sync = 1; + } spin_unlock(&sdp->sd_statfs_spin); brelse(l_bh); - sync_percent = sdp->sd_args.ar_statfs_percent; - if (sync_percent && (percent >= sync_percent || - percent <= -sync_percent)) + if (need_sync) gfs2_wake_up_statfs(sdp); } -- cgit v1.2.3 From f25934c5f88655a8d5c3c40a540daed1f0e6dedc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 30 Oct 2009 08:03:27 +0100 Subject: GFS2: add barrier/nobarrier mount options Currently gfs2 issues barrier unconditionally. There are various reasons to disable them, be that just for testing or for stupid devices flushing large battert backed caches. Add a nobarrier option that matches xfs and btrfs for this. Also add a symmetric barrier option to turn it back on at remount time. Signed-off-by: Christoph Hellwig Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 1 + fs/gfs2/ops_fstype.c | 2 ++ fs/gfs2/super.c | 14 ++++++++++++++ 3 files changed, 17 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index c239b0f969c8..4792200978c8 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -429,6 +429,7 @@ struct gfs2_args { unsigned int ar_meta:1; /* mount metafs */ unsigned int ar_discard:1; /* discard requests */ unsigned int ar_errors:2; /* errors=withdraw | panic */ + unsigned int ar_nobarrier:1; /* do not send barriers */ int ar_commit; /* Commit interval */ int ar_statfs_quantum; /* The fast statfs interval */ int ar_quota_quantum; /* The quota interval */ diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 9744ee920473..edfee24f3636 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1131,6 +1131,8 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent } if (sdp->sd_args.ar_posix_acl) sb->s_flags |= MS_POSIXACL; + if (sdp->sd_args.ar_nobarrier) + set_bit(SDF_NOBARRIERS, &sdp->sd_flags); sb->s_magic = GFS2_MAGIC; sb->s_op = &gfs2_super_ops; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b1dcfab36465..5e4b31441a9e 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -73,6 +73,8 @@ enum { Opt_statfs_quantum, Opt_statfs_percent, Opt_quota_quantum, + Opt_barrier, + Opt_nobarrier, Opt_error, }; @@ -107,6 +109,8 @@ static const match_table_t tokens = { {Opt_statfs_quantum, "statfs_quantum=%d"}, {Opt_statfs_percent, "statfs_percent=%d"}, {Opt_quota_quantum, "quota_quantum=%d"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, {Opt_error, NULL} }; @@ -253,6 +257,12 @@ int gfs2_mount_args(struct gfs2_args *args, char *options) } args->ar_errors = GFS2_ERRORS_PANIC; break; + case Opt_barrier: + args->ar_nobarrier = 0; + break; + case Opt_nobarrier: + args->ar_nobarrier = 1; + break; case Opt_error: default: printk(KERN_WARNING "GFS2: invalid mount option: %s\n", o); @@ -1143,6 +1153,10 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) sb->s_flags |= MS_POSIXACL; else sb->s_flags &= ~MS_POSIXACL; + if (sdp->sd_args.ar_nobarrier) + set_bit(SDF_NOBARRIERS, &sdp->sd_flags); + else + clear_bit(SDF_NOBARRIERS, &sdp->sd_flags); spin_lock(>->gt_spin); gt->gt_log_flush_secs = args.ar_commit; gt->gt_quota_quantum = args.ar_quota_quantum; -- cgit v1.2.3 From cdcfde62dac64c86ff34e483c595d568a252c433 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 30 Oct 2009 10:48:53 +0000 Subject: GFS2: Display nobarrier option in /proc/mounts Since the default is barriers on, this only displays the nobarrier option when that is active. Signed-off-by: Steven Whitehouse --- fs/gfs2/super.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 5e4b31441a9e..c282ad41f3d1 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1336,6 +1336,9 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) } seq_printf(s, ",errors=%s", state); } + if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) + seq_printf(s, ",nobarrier"); + return 0; } -- cgit v1.2.3 From 1579343a73e32b5886e186e8f3e4db85e420ed3f Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 6 Nov 2009 11:06:37 +0000 Subject: GFS2: Remove dirent_first() function This function only had one caller left, and that caller only called it for leaf blocks, hence one branch of the "if" was never taken. In addition the call to get_left had already verified the metadata type, so the function can be reduced to a single line of code in its caller. Signed-off-by: Steven Whitehouse --- fs/gfs2/dir.c | 34 +--------------------------------- 1 file changed, 1 insertion(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 297d7e5cebad..25fddc100f18 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -525,38 +525,6 @@ consist_inode: return ERR_PTR(-EIO); } - -/** - * dirent_first - Return the first dirent - * @dip: the directory - * @bh: The buffer - * @dent: Pointer to list of dirents - * - * return first dirent whether bh points to leaf or stuffed dinode - * - * Returns: IS_LEAF, IS_DINODE, or -errno - */ - -static int dirent_first(struct gfs2_inode *dip, struct buffer_head *bh, - struct gfs2_dirent **dent) -{ - struct gfs2_meta_header *h = (struct gfs2_meta_header *)bh->b_data; - - if (be32_to_cpu(h->mh_type) == GFS2_METATYPE_LF) { - if (gfs2_meta_check(GFS2_SB(&dip->i_inode), bh)) - return -EIO; - *dent = (struct gfs2_dirent *)(bh->b_data + - sizeof(struct gfs2_leaf)); - return IS_LEAF; - } else { - if (gfs2_metatype_check(GFS2_SB(&dip->i_inode), bh, GFS2_METATYPE_DI)) - return -EIO; - *dent = (struct gfs2_dirent *)(bh->b_data + - sizeof(struct gfs2_dinode)); - return IS_DINODE; - } -} - static int dirent_check_reclen(struct gfs2_inode *dip, const struct gfs2_dirent *d, const void *end_p) { @@ -1006,7 +974,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) divider = (start + half_len) << (32 - dip->i_depth); /* Copy the entries */ - dirent_first(dip, obh, &dent); + dent = (struct gfs2_dirent *)(obh->b_data + sizeof(struct gfs2_leaf)); do { next = dent; -- cgit v1.2.3 From 2c77634965ee28c8b4790ffb5e83dd5ff7ac8988 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 6 Nov 2009 11:10:51 +0000 Subject: GFS2: Locking order fix in gfs2_check_blk_state In some cases we already have the rindex lock when we enter this function. Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 8f1cfb02a6cb..0608f490c295 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1710,11 +1710,16 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) { struct gfs2_rgrpd *rgd; struct gfs2_holder ri_gh, rgd_gh; + struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex); + int ri_locked = 0; int error; - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - goto fail; + if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + goto fail; + ri_locked = 1; + } error = -EINVAL; rgd = gfs2_blk2rgrpd(sdp, no_addr); @@ -1730,7 +1735,8 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) gfs2_glock_dq_uninit(&rgd_gh); fail_rindex: - gfs2_glock_dq_uninit(&ri_gh); + if (ri_locked) + gfs2_glock_dq_uninit(&ri_gh); fail: return error; } -- cgit v1.2.3 From 0ab7d13fcbd7ce1658c563e345990ba453719deb Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 6 Nov 2009 16:20:51 +0000 Subject: GFS2: Tag all metadata with jid There are two spare field in the header common to all GFS2 metadata. One is just the right size to fit a journal id in it, and this patch updates the journal code so that each time a metadata block is modified, we tag it with the journal id of the node which is performing the modification. The reason for this is that it should make it much easier to debug issues which arise if we can tell which node was the last to modify a particular metadata block. Since the field is updated before the block is written into the journal, each journal should only contain metadata which is tagged with its own journal id. The one exception to this is the journal header block, which might have a different node's id in it, if that journal was recovered by another node in the cluster. Thus each journal will contain a record of which nodes recovered it, via the journal header. The other field in the metadata header could potentially be used to hold information about what kind of operation was performed, but for the time being we just zero it on each transaction so that if we use it for that in future, we'll know that the information (where it exists) is reliable. I did consider using the other field to hold the journal sequence number, however since in GFS2's journaling we write the modified data into the journal and not the original data, this gives no information as to what action caused the modification, so I think we can probably come up with a better use for those 64 bits in the future. Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 2 -- fs/gfs2/log.c | 2 ++ fs/gfs2/lops.c | 4 ++++ fs/gfs2/recovery.c | 2 ++ 4 files changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 6380cd9314b0..26ba2a4c4a2d 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -947,9 +947,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC); str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI); - str->di_header.__pad0 = 0; str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI); - str->di_header.__pad1 = 0; str->di_num.no_addr = cpu_to_be64(ip->i_no_addr); str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); str->di_mode = cpu_to_be32(ip->i_inode.i_mode); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 13c6237c5f67..4511b08fc451 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -596,7 +596,9 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) memset(lh, 0, sizeof(struct gfs2_log_header)); lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); + lh->lh_header.__pad0 = cpu_to_be64(0); lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); + lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++); lh->lh_flags = cpu_to_be32(flags); lh->lh_tail = cpu_to_be32(tail); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 9969ff062c5b..de97632ba32f 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -132,6 +132,7 @@ static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type) static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) { struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); + struct gfs2_meta_header *mh; struct gfs2_trans *tr; lock_buffer(bd->bd_bh); @@ -148,6 +149,9 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); gfs2_meta_check(sdp, bd->bd_bh); gfs2_pin(sdp, bd->bd_bh); + mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; + mh->__pad0 = cpu_to_be64(0); + mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); sdp->sd_log_num_buf++; list_add(&le->le_list, &sdp->sd_log_le_buf); tr->tr_num_buf_new++; diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 09fa31965576..4b9bece3d437 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -410,7 +410,9 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea memset(lh, 0, sizeof(struct gfs2_log_header)); lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); + lh->lh_header.__pad0 = cpu_to_be64(0); lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); + lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1); lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT); lh->lh_blkno = cpu_to_be32(lblock); -- cgit v1.2.3 From 9ae3c6de6981a1e8765b5d029f94555fc0f0fea0 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Tue, 10 Nov 2009 12:54:56 -0600 Subject: GFS2: drop rindex glock to refresh rindex list When a gfs2 filesystem is grown, it needs to rebuild the rindex list to be able to use the new space. gfs2 does this when the rindex is marked not uptodate, which happens when the rindex glock is dropped. However, on a single node setup, there is never any reason to drop the rindex glock, so gfs2 never invalidates the the rindex. This patch makes gfs2 automatically drop the rindex glock after filesystem grows, so it can refresh the rindex list. Signed-off-by: Benjamin Marzinski Signed-off-by: Steven Whitehouse --- fs/gfs2/aops.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 694b5d48f036..dce062a0b02a 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -819,8 +819,10 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, mark_inode_dirty(inode); } - if (inode == sdp->sd_rindex) + if (inode == sdp->sd_rindex) { adjust_fs_space(inode); + ip->i_gh.gh_flags |= GL_NOCACHE; + } brelse(dibh); gfs2_trans_end(sdp); @@ -889,8 +891,10 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, mark_inode_dirty(inode); } - if (inode == sdp->sd_rindex) + if (inode == sdp->sd_rindex) { adjust_fs_space(inode); + ip->i_gh.gh_flags |= GL_NOCACHE; + } brelse(dibh); gfs2_trans_end(sdp); -- cgit v1.2.3 From c29cd9004e72acb5a6cb8caf08508f1c5edee686 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 18 Nov 2009 18:09:41 +0800 Subject: writeback: remove unused nonblocking and congestion checks (gfs2) No one is calling wb_writeback and write_cache_pages with wbc.nonblocking=1 any more. And lumpy pageout will want to do nonblocking writeback without the congestion wait. Signed-off-by: Wu Fengguang Signed-off-by: Steven Whitehouse --- fs/gfs2/aops.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index dce062a0b02a..7b8da9415267 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -269,7 +269,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping, pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; unsigned offset = i_size & (PAGE_CACHE_SIZE-1); unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize); - struct backing_dev_info *bdi = mapping->backing_dev_info; int i; int ret; @@ -313,11 +312,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping, if (ret || (--(wbc->nr_to_write) <= 0)) ret = 1; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - ret = 1; - } - } gfs2_trans_end(sdp); return ret; @@ -338,7 +332,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping, static int gfs2_write_cache_jdata(struct address_space *mapping, struct writeback_control *wbc) { - struct backing_dev_info *bdi = mapping->backing_dev_info; int ret = 0; int done = 0; struct pagevec pvec; @@ -348,11 +341,6 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, int scanned = 0; int range_whole = 0; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - return 0; - } - pagevec_init(&pvec, 0); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ -- cgit v1.2.3 From 26bb7505cf7db3560286be9f6384b6d3911f78b5 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 27 Nov 2009 10:31:11 +0000 Subject: GFS2: Fix glock refcount issues This patch fixes some ref counting issues. Firstly by moving the point at which we drop the ref count after a dlm lock operation has completed we ensure that we never call gfs2_glock_hold() on a lock with a zero ref count. Secondly, by using atomic_dec_and_lock() in gfs2_glock_put() we ensure that at no time will a glock with zero ref count appear on the lru_list. That means that we can remove the check for this in our shrinker (which was racy). Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index a3f90ad2af80..f455a03a09e2 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -241,15 +241,14 @@ int gfs2_glock_put(struct gfs2_glock *gl) int rv = 0; write_lock(gl_lock_addr(gl->gl_hash)); - if (atomic_dec_and_test(&gl->gl_ref)) { + if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) { hlist_del(&gl->gl_list); - write_unlock(gl_lock_addr(gl->gl_hash)); - spin_lock(&lru_lock); if (!list_empty(&gl->gl_lru)) { list_del_init(&gl->gl_lru); atomic_dec(&lru_count); } spin_unlock(&lru_lock); + write_unlock(gl_lock_addr(gl->gl_hash)); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); glock_free(gl); rv = 1; @@ -513,7 +512,6 @@ retry: GLOCK_BUG_ON(gl, 1); } spin_unlock(&gl->gl_spin); - gfs2_glock_put(gl); return; } @@ -524,8 +522,6 @@ retry: if (glops->go_xmote_bh) { spin_unlock(&gl->gl_spin); rv = glops->go_xmote_bh(gl, gh); - if (rv == -EAGAIN) - return; spin_lock(&gl->gl_spin); if (rv) { do_error(gl, rv); @@ -540,7 +536,6 @@ out: clear_bit(GLF_LOCK, &gl->gl_flags); out_locked: spin_unlock(&gl->gl_spin); - gfs2_glock_put(gl); } static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, @@ -600,7 +595,6 @@ __acquires(&gl->gl_spin) if (!(ret & LM_OUT_ASYNC)) { finish_xmote(gl, ret); - gfs2_glock_hold(gl); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gfs2_glock_put(gl); } else { @@ -712,9 +706,12 @@ static void glock_work_func(struct work_struct *work) { unsigned long delay = 0; struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work); + int drop_ref = 0; - if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) + if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) { finish_xmote(gl, gl->gl_reply); + drop_ref = 1; + } down_read(&gfs2_umount_flush_sem); spin_lock(&gl->gl_spin); if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && @@ -732,6 +729,8 @@ static void glock_work_func(struct work_struct *work) if (!delay || queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) gfs2_glock_put(gl); + if (drop_ref) + gfs2_glock_put(gl); } /** @@ -1366,10 +1365,6 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) list_del_init(&gl->gl_lru); atomic_dec(&lru_count); - /* Check if glock is about to be freed */ - if (atomic_read(&gl->gl_ref) == 0) - continue; - /* Test for being demotable */ if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { gfs2_glock_hold(gl); -- cgit v1.2.3