diff options
Diffstat (limited to 'fs')
582 files changed, 65598 insertions, 20376 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 047c791427aa..c061c3f18e7c 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -55,7 +55,7 @@ enum { Opt_err }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_debug, "debug=%x"}, {Opt_dfltuid, "dfltuid=%u"}, {Opt_dfltgid, "dfltgid=%u"}, diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index fd01d90cada5..57997fa14e69 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -51,4 +51,4 @@ int v9fs_dir_release(struct inode *inode, struct file *filp); int v9fs_file_open(struct inode *inode, struct file *file); void v9fs_inode2stat(struct inode *inode, struct p9_stat *stat); void v9fs_dentry_release(struct dentry *); -int v9fs_uflags2omode(int uflags); +int v9fs_uflags2omode(int uflags, int extended); diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 88e3787c6ea9..e298fe194093 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -119,6 +119,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) const struct file_operations v9fs_dir_operations = { .read = generic_read_dir, + .llseek = generic_file_llseek, .readdir = v9fs_dir_readdir, .open = v9fs_file_open, .release = v9fs_dir_release, diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 0d55affe37d4..52944d2249a4 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -59,7 +59,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file); v9ses = v9fs_inode2v9ses(inode); - omode = v9fs_uflags2omode(file->f_flags); + omode = v9fs_uflags2omode(file->f_flags, v9fs_extended(v9ses)); fid = file->private_data; if (!fid) { fid = v9fs_fid_clone(file->f_path.dentry); @@ -75,6 +75,8 @@ int v9fs_file_open(struct inode *inode, struct file *file) inode->i_size = 0; inode->i_blocks = 0; } + if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses))) + generic_file_llseek(file, 0, SEEK_END); } file->private_data = fid; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 40fa807bd929..e83aa5ebe861 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -132,10 +132,10 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) /** * v9fs_uflags2omode- convert posix open flags to plan 9 mode bits * @uflags: flags to convert - * + * @extended: if .u extensions are active */ -int v9fs_uflags2omode(int uflags) +int v9fs_uflags2omode(int uflags, int extended) { int ret; @@ -155,14 +155,16 @@ int v9fs_uflags2omode(int uflags) break; } - if (uflags & O_EXCL) - ret |= P9_OEXCL; - if (uflags & O_TRUNC) ret |= P9_OTRUNC; - if (uflags & O_APPEND) - ret |= P9_OAPPEND; + if (extended) { + if (uflags & O_EXCL) + ret |= P9_OEXCL; + + if (uflags & O_APPEND) + ret |= P9_OAPPEND; + } return ret; } @@ -506,7 +508,7 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, flags = O_RDWR; fid = v9fs_create(v9ses, dir, dentry, NULL, perm, - v9fs_uflags2omode(flags)); + v9fs_uflags2omode(flags, v9fs_extended(v9ses))); if (IS_ERR(fid)) { err = PTR_ERR(fid); fid = NULL; @@ -624,8 +626,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, return NULL; error: - if (fid) - p9_client_clunk(fid); + p9_client_clunk(fid); return ERR_PTR(result); } diff --git a/fs/Kconfig b/fs/Kconfig index cf12c403b8c7..9e9d70c02a07 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -136,37 +136,51 @@ config EXT3_FS_SECURITY If you are not using a security module that requires using extended attributes for file security labels, say N. -config EXT4DEV_FS - tristate "Ext4dev/ext4 extended fs support development (EXPERIMENTAL)" - depends on EXPERIMENTAL +config EXT4_FS + tristate "The Extended 4 (ext4) filesystem" select JBD2 select CRC16 help - Ext4dev is a predecessor filesystem of the next generation - extended fs ext4, based on ext3 filesystem code. It will be - renamed ext4 fs later, once ext4dev is mature and stabilized. + This is the next generation of the ext3 filesystem. Unlike the change from ext2 filesystem to ext3 filesystem, - the on-disk format of ext4dev is not the same as ext3 any more: - it is based on extent maps and it supports 48-bit physical block - numbers. These combined on-disk format changes will allow - ext4dev/ext4 to handle more than 16 TB filesystem volumes -- - a hard limit that ext3 cannot overcome without changing the - on-disk format. - - Other than extent maps and 48-bit block numbers, ext4dev also is - likely to have other new features such as persistent preallocation, - high resolution time stamps, and larger file support etc. These - features will be added to ext4dev gradually. + the on-disk format of ext4 is not forwards compatible with + ext3; it is based on extent maps and it supports 48-bit + physical block numbers. The ext4 filesystem also supports delayed + allocation, persistent preallocation, high resolution time stamps, + and a number of other features to improve performance and speed + up fsck time. For more information, please see the web pages at + http://ext4.wiki.kernel.org. + + The ext4 filesystem will support mounting an ext3 + filesystem; while there will be some performance gains from + the delayed allocation and inode table readahead, the best + performance gains will require enabling ext4 features in the + filesystem, or formating a new filesystem as an ext4 + filesystem initially. To compile this file system support as a module, choose M here. The module will be called ext4dev. If unsure, say N. -config EXT4DEV_FS_XATTR - bool "Ext4dev extended attributes" - depends on EXT4DEV_FS +config EXT4DEV_COMPAT + bool "Enable ext4dev compatibility" + depends on EXT4_FS + help + Starting with 2.6.28, the name of the ext4 filesystem was + renamed from ext4dev to ext4. Unfortunately there are some + legacy userspace programs (such as klibc's fstype) have + "ext4dev" hardcoded. + + To enable backwards compatibility so that systems that are + still expecting to mount ext4 filesystems using ext4dev, + chose Y here. This feature will go away by 2.6.31, so + please arrange to get your userspace programs fixed! + +config EXT4_FS_XATTR + bool "Ext4 extended attributes" + depends on EXT4_FS default y help Extended attributes are name:value pairs associated with inodes by @@ -175,11 +189,11 @@ config EXT4DEV_FS_XATTR If unsure, say N. - You need this for POSIX ACL support on ext4dev/ext4. + You need this for POSIX ACL support on ext4. -config EXT4DEV_FS_POSIX_ACL - bool "Ext4dev POSIX Access Control Lists" - depends on EXT4DEV_FS_XATTR +config EXT4_FS_POSIX_ACL + bool "Ext4 POSIX Access Control Lists" + depends on EXT4_FS_XATTR select FS_POSIX_ACL help POSIX Access Control Lists (ACLs) support permissions for users and @@ -190,14 +204,14 @@ config EXT4DEV_FS_POSIX_ACL If you don't know what Access Control Lists are, say N -config EXT4DEV_FS_SECURITY - bool "Ext4dev Security Labels" - depends on EXT4DEV_FS_XATTR +config EXT4_FS_SECURITY + bool "Ext4 Security Labels" + depends on EXT4_FS_XATTR help Security labels support alternative access control models implemented by security modules like SELinux. This option enables an extended attribute handler for file security - labels in the ext4dev/ext4 filesystem. + labels in the ext4 filesystem. If you are not using a security module that requires using extended attributes for file security labels, say N. @@ -206,17 +220,16 @@ config JBD tristate help This is a generic journalling layer for block devices. It is - currently used by the ext3 and OCFS2 file systems, but it could - also be used to add journal support to other file systems or block + currently used by the ext3 file system, but it could also be + used to add journal support to other file systems or block devices such as RAID or LVM. - If you are using the ext3 or OCFS2 file systems, you need to - say Y here. If you are not using ext3 OCFS2 then you will probably - want to say N. + If you are using the ext3 file system, you need to say Y here. + If you are not using ext3 then you will probably want to say N. To compile this device as a module, choose M here: the module will be - called jbd. If you are compiling ext3 or OCFS2 into the kernel, - you cannot compile this code as a module. + called jbd. If you are compiling ext3 into the kernel, you + cannot compile this code as a module. config JBD_DEBUG bool "JBD (ext3) debugging support" @@ -240,22 +253,23 @@ config JBD2 help This is a generic journaling layer for block devices that support both 32-bit and 64-bit block numbers. It is currently used by - the ext4dev/ext4 filesystem, but it could also be used to add + the ext4 and OCFS2 filesystems, but it could also be used to add journal support to other file systems or block devices such as RAID or LVM. - If you are using ext4dev/ext4, you need to say Y here. If you are not - using ext4dev/ext4 then you will probably want to say N. + If you are using ext4 or OCFS2, you need to say Y here. + If you are not using ext4 or OCFS2 then you will + probably want to say N. To compile this device as a module, choose M here. The module will be - called jbd2. If you are compiling ext4dev/ext4 into the kernel, + called jbd2. If you are compiling ext4 or OCFS2 into the kernel, you cannot compile this code as a module. config JBD2_DEBUG - bool "JBD2 (ext4dev/ext4) debugging support" + bool "JBD2 (ext4) debugging support" depends on JBD2 && DEBUG_FS help - If you are using the ext4dev/ext4 journaled file system (or + If you are using the ext4 journaled file system (or potentially any other filesystem/device using JBD2), this option allows you to enable debugging output while the system is running, in order to help track down any problems you are having. @@ -270,9 +284,9 @@ config JBD2_DEBUG config FS_MBCACHE # Meta block cache for Extended Attributes (ext2/ext3/ext4) tristate - depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4DEV_FS_XATTR - default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y - default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m + depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR + default y if EXT2_FS=y || EXT3_FS=y || EXT4_FS=y + default m if EXT2_FS=m || EXT3_FS=m || EXT4_FS=m config REISERFS_FS tristate "Reiserfs support" @@ -419,6 +433,14 @@ config FS_POSIX_ACL bool default n +config FILE_LOCKING + bool "Enable POSIX file locking API" if EMBEDDED + default y + help + This option enables standard file locking support, required + for filesystems like NFS and for the flock() system + call. Disabling this option saves about 11k. + source "fs/xfs/Kconfig" source "fs/gfs2/Kconfig" @@ -426,7 +448,7 @@ config OCFS2_FS tristate "OCFS2 file system support" depends on NET && SYSFS select CONFIGFS_FS - select JBD + select JBD2 select CRC32 help OCFS2 is a general purpose extent based shared disk cluster file @@ -470,6 +492,14 @@ config OCFS2_FS_USERSPACE_CLUSTER It is safe to say Y, as the clustering method is run-time selectable. +config OCFS2_FS_STATS + bool "OCFS2 statistics" + depends on OCFS2_FS + default y + help + This option allows some fs statistics to be captured. Enabling + this option may increase the memory consumption. + config OCFS2_DEBUG_MASKLOG bool "OCFS2 logging support" depends on OCFS2_FS @@ -489,6 +519,16 @@ config OCFS2_DEBUG_FS this option for debugging only as it is likely to decrease performance of the filesystem. +config OCFS2_COMPAT_JBD + bool "Use JBD for compatibility" + depends on OCFS2_FS + default n + select JBD + help + The ocfs2 filesystem now uses JBD2 for its journalling. JBD2 + is backwards compatible with JBD. It is safe to say N here. + However, if you really want to use the original JBD, say Y here. + endif # BLOCK config DNOTIFY @@ -830,7 +870,7 @@ config NTFS_FS from the project web site. For more information see <file:Documentation/filesystems/ntfs.txt> - and <http://linux-ntfs.sourceforge.net/>. + and <http://www.linux-ntfs.org/>. To compile this file system support as a module, choose M here: the module will be called ntfs. @@ -894,65 +934,7 @@ endif # BLOCK menu "Pseudo filesystems" -config PROC_FS - bool "/proc file system support" if EMBEDDED - default y - help - This is a virtual file system providing information about the status - of the system. "Virtual" means that it doesn't take up any space on - your hard disk: the files are created on the fly by the kernel when - you try to access them. Also, you cannot read the files with older - version of the program less: you need to use more or cat. - - It's totally cool; for example, "cat /proc/interrupts" gives - information about what the different IRQs are used for at the moment - (there is a small number of Interrupt ReQuest lines in your computer - that are used by the attached devices to gain the CPU's attention -- - often a source of trouble if two devices are mistakenly configured - to use the same IRQ). The program procinfo to display some - information about your system gathered from the /proc file system. - - Before you can use the /proc file system, it has to be mounted, - meaning it has to be given a location in the directory hierarchy. - That location should be /proc. A command such as "mount -t proc proc - /proc" or the equivalent line in /etc/fstab does the job. - - The /proc file system is explained in the file - <file:Documentation/filesystems/proc.txt> and on the proc(5) manpage - ("man 5 proc"). - - This option will enlarge your kernel by about 67 KB. Several - programs depend on this, so everyone should say Y here. - -config PROC_KCORE - bool "/proc/kcore support" if !ARM - depends on PROC_FS && MMU - -config PROC_VMCORE - bool "/proc/vmcore support (EXPERIMENTAL)" - depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP - default y - help - Exports the dump image of crashed kernel in ELF format. - -config PROC_SYSCTL - bool "Sysctl support (/proc/sys)" if EMBEDDED - depends on PROC_FS - select SYSCTL - default y - ---help--- - The sysctl interface provides a means of dynamically changing - certain kernel parameters and variables on the fly without requiring - a recompile of the kernel or reboot of the system. The primary - interface is through /proc/sys. If you say Y here a tree of - modifiable sysctl entries will be generated beneath the - /proc/sys directory. They are explained in the files - in <file:Documentation/sysctl/>. Note that enabling this - option will enlarge the kernel by at least 8 KB. - - As it is generally a good thing, you should say Y here unless - building a kernel for install/rescue disks or your system is very - limited in memory. +source "fs/proc/Kconfig" config SYSFS bool "sysfs file system support" if EMBEDDED @@ -1375,6 +1357,9 @@ config JFFS2_CMODE_FAVOURLZO endchoice +# UBIFS File system configuration +source "fs/ubifs/Kconfig" + config CRAMFS tristate "Compressed ROM file system support (cramfs)" depends on BLOCK @@ -1430,6 +1415,19 @@ config MINIX_FS partition (the one containing the directory /) cannot be compiled as a module. +config OMFS_FS + tristate "SonicBlue Optimized MPEG File System support" + depends on BLOCK + select CRC_ITU_T + help + This is the proprietary file system used by the Rio Karma music + player and ReplayTV DVR. Despite the name, this filesystem is not + more efficient than a standard FS for MPEG files, in fact likely + the opposite is true. Say Y if you have either of these devices + and wish to mount its disk. + + To compile this file system support as a module, choose M here: the + module will be called omfs. If unsure, say N. config HPFS_FS tristate "OS/2 HPFS file system support" @@ -1544,10 +1542,6 @@ config UFS_FS The recently released UFS2 variant (used in FreeBSD 5.x) is READ-ONLY supported. - If you only intend to mount files from some other Unix over the - network using NFS, you don't need the UFS file system support (but - you need NFS file system support obviously). - Note that this option is generally not needed for floppies, since a good portable way to transport files and directories between unixes (and even other operating systems) is given by the tar program ("man @@ -1587,6 +1581,7 @@ menuconfig NETWORK_FILESYSTEMS Say Y here to get to see options for network filesystems and filesystem-related networking code, such as NFS daemon and RPCSEC security modules. + This option alone does not add any kernel code. If you say N, all options in this submenu will be skipped and @@ -1595,76 +1590,92 @@ menuconfig NETWORK_FILESYSTEMS if NETWORK_FILESYSTEMS config NFS_FS - tristate "NFS file system support" + tristate "NFS client support" depends on INET select LOCKD select SUNRPC select NFS_ACL_SUPPORT if NFS_V3_ACL help - If you are connected to some other (usually local) Unix computer - (using SLIP, PLIP, PPP or Ethernet) and want to mount files residing - on that computer (the NFS server) using the Network File Sharing - protocol, say Y. "Mounting files" means that the client can access - the files with usual UNIX commands as if they were sitting on the - client's hard disk. For this to work, the server must run the - programs nfsd and mountd (but does not need to have NFS file system - support enabled in its kernel). NFS is explained in the Network - Administrator's Guide, available from - <http://www.tldp.org/docs.html#guide>, on its man page: "man - nfs", and in the NFS-HOWTO. - - A superior but less widely used alternative to NFS is provided by - the Coda file system; see "Coda file system support" below. + Choose Y here if you want to access files residing on other + computers using Sun's Network File System protocol. To compile + this file system support as a module, choose M here: the module + will be called nfs. - If you say Y here, you should have said Y to TCP/IP networking also. - This option would enlarge your kernel by about 27 KB. + To mount file systems exported by NFS servers, you also need to + install the user space mount.nfs command which can be found in + the Linux nfs-utils package, available from http://linux-nfs.org/. + Information about using the mount command is available in the + mount(8) man page. More detail about the Linux NFS client + implementation is available via the nfs(5) man page. - To compile this file system support as a module, choose M here: the - module will be called nfs. + Below you can choose which versions of the NFS protocol are + available in the kernel to mount NFS servers. Support for NFS + version 2 (RFC 1094) is always available when NFS_FS is selected. - If you are configuring a diskless machine which will mount its root - file system over NFS at boot time, say Y here and to "Kernel - level IP autoconfiguration" above and to "Root file system on NFS" - below. You cannot compile this driver as a module in this case. - There are two packages designed for booting diskless machines over - the net: netboot, available from - <http://ftp1.sourceforge.net/netboot/>, and Etherboot, - available from <http://ftp1.sourceforge.net/etherboot/>. + To configure a system which mounts its root file system via NFS + at boot time, say Y here, select "Kernel level IP + autoconfiguration" in the NETWORK menu, and select "Root file + system on NFS" below. You cannot compile this file system as a + module in this case. - If you don't know what all this is about, say N. + If unsure, say N. config NFS_V3 - bool "Provide NFSv3 client support" + bool "NFS client support for NFS version 3" depends on NFS_FS help - Say Y here if you want your NFS client to be able to speak version - 3 of the NFS protocol. + This option enables support for version 3 of the NFS protocol + (RFC 1813) in the kernel's NFS client. If unsure, say Y. config NFS_V3_ACL - bool "Provide client support for the NFSv3 ACL protocol extension" + bool "NFS client support for the NFSv3 ACL protocol extension" depends on NFS_V3 help - Implement the NFSv3 ACL protocol extension for manipulating POSIX - Access Control Lists. The server should also be compiled with - the NFSv3 ACL protocol extension; see the CONFIG_NFSD_V3_ACL option. + Some NFS servers support an auxiliary NFSv3 ACL protocol that + Sun added to Solaris but never became an official part of the + NFS version 3 protocol. This protocol extension allows + applications on NFS clients to manipulate POSIX Access Control + Lists on files residing on NFS servers. NFS servers enforce + ACLs on local files whether this protocol is available or not. + + Choose Y here if your NFS server supports the Solaris NFSv3 ACL + protocol extension and you want your NFS client to allow + applications to access and modify ACLs on files on the server. + + Most NFS servers don't support the Solaris NFSv3 ACL protocol + extension. You can choose N here or specify the "noacl" mount + option to prevent your NFS client from trying to use the NFSv3 + ACL protocol. If unsure, say N. config NFS_V4 - bool "Provide NFSv4 client support (EXPERIMENTAL)" + bool "NFS client support for NFS version 4 (EXPERIMENTAL)" depends on NFS_FS && EXPERIMENTAL select RPCSEC_GSS_KRB5 help - Say Y here if you want your NFS client to be able to speak the newer - version 4 of the NFS protocol. + This option enables support for version 4 of the NFS protocol + (RFC 3530) in the kernel's NFS client. - Note: Requires auxiliary userspace daemons which may be found on - http://www.citi.umich.edu/projects/nfsv4/ + To mount NFS servers using NFSv4, you also need to install user + space programs which can be found in the Linux nfs-utils package, + available from http://linux-nfs.org/. If unsure, say N. +config ROOT_NFS + bool "Root file system on NFS" + depends on NFS_FS=y && IP_PNP + help + If you want your system to mount its root file system via NFS, + choose Y here. This is common practice for managing systems + without local permanent storage. For details, read + <file:Documentation/filesystems/nfsroot.txt>. + + Most people say N here. + config NFSD tristate "NFS server support" depends on INET @@ -1746,20 +1757,6 @@ config NFSD_V4 If unsure, say N. -config ROOT_NFS - bool "Root file system on NFS" - depends on NFS_FS=y && IP_PNP - help - If you want your Linux box to mount its whole root file system (the - one containing the directory /) from some other computer over the - net via NFS (presumably because your box doesn't have a hard disk), - say Y. Read <file:Documentation/filesystems/nfsroot.txt> for - details. It is likely that in this case, you also want to say Y to - "Kernel level IP autoconfiguration" so that your box can discover - its network address at boot time. - - Most people say N here. - config LOCKD tristate @@ -1800,26 +1797,27 @@ config SUNRPC_XPRT_RDMA If unsure, say N. -config SUNRPC_BIND34 - bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)" +config SUNRPC_REGISTER_V4 + bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)" depends on SUNRPC && EXPERIMENTAL default n help - RPC requests over IPv6 networks require support for larger - addresses when performing an RPC bind. Sun added support for - IPv6 addressing by creating two new versions of the rpcbind - protocol (RFC 1833). + Sun added support for registering RPC services at an IPv6 + address by creating two new versions of the rpcbind protocol + (RFC 1833). + + This option enables support in the kernel RPC server for + registering kernel RPC services via version 4 of the rpcbind + protocol. If you enable this option, you must run a portmapper + daemon that supports rpcbind protocol version 4. - This option enables support in the kernel RPC client for - querying rpcbind servers via versions 3 and 4 of the rpcbind - protocol. The kernel automatically falls back to version 2 - if a remote rpcbind service does not support versions 3 or 4. - By themselves, these new versions do not provide support for - RPC over IPv6, but the new protocol versions are necessary to - support it. + Serving NFS over IPv6 from knfsd (the kernel's NFS server) + requires that you enable this option and use a portmapper that + supports rpcbind version 4. - If unsure, say N to get traditional behavior (version 2 rpcbind - requests only). + If unsure, say N to get traditional behavior (register kernel + RPC services using only rpcbind version 2). Distributions + using the legacy Linux portmapper daemon must say N here. config RPCSEC_GSS_KRB5 tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)" @@ -1986,6 +1984,16 @@ config CIFS_WEAK_PW_HASH If unsure, say N. +config CIFS_UPCALL + bool "Kerberos/SPNEGO advanced session setup" + depends on CIFS && KEYS + help + Enables an upcall mechanism for CIFS which accesses + userspace helper utilities to provide SPNEGO packaged (RFC 4178) + Kerberos tickets which are needed to mount to certain secure servers + (for which more secure Kerberos authentication is required). If + unsure, say N. + config CIFS_XATTR bool "CIFS extended attributes" depends on CIFS @@ -2038,17 +2046,6 @@ config CIFS_EXPERIMENTAL (which is disabled by default). See the file fs/cifs/README for more details. If unsure, say N. -config CIFS_UPCALL - bool "Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)" - depends on CIFS_EXPERIMENTAL - depends on KEYS - help - Enables an upcall mechanism for CIFS which accesses - userspace helper utilities to provide SPNEGO packaged (RFC 4178) - Kerberos tickets which are needed to mount to certain secure servers - (for which more secure Kerberos authentication is required). If - unsure, say N. - config CIFS_DFS_UPCALL bool "DFS feature support (EXPERIMENTAL)" depends on CIFS_EXPERIMENTAL @@ -2104,20 +2101,6 @@ config CODA_FS To compile the coda client support as a module, choose M here: the module will be called coda. -config CODA_FS_OLD_API - bool "Use 96-bit Coda file identifiers" - depends on CODA_FS - help - A new kernel-userspace API had to be introduced for Coda v6.0 - to support larger 128-bit file identifiers as needed by the - new realms implementation. - - However this new API is not backward compatible with older - clients. If you really need to run the old Coda userspace - cache manager then say Y. - - For most cases you probably want to say N. - config AFS_FS tristate "Andrew File System support (AFS) (EXPERIMENTAL)" depends on INET && EXPERIMENTAL diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 3263084eef9e..17c9c5ec14c5 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -30,7 +30,7 @@ config COMPAT_BINFMT_ELF config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y - depends on (FRV || BLACKFIN) + depends on (FRV || BLACKFIN || (SUPERH32 && !MMU)) help ELF FDPIC binaries are based on ELF, but allow the individual load segments of a binary to be located in memory independently of each @@ -59,10 +59,12 @@ config BINFMT_SHARED_FLAT help Support FLAT shared libraries +config HAVE_AOUT + def_bool n + config BINFMT_AOUT tristate "Kernel support for a.out and ECOFF binaries" - depends on ARCH_SUPPORTS_AOUT && \ - (X86_32 || ALPHA || ARM || M68K) + depends on HAVE_AOUT ---help--- A.out (Assembler.OUTput) is a set of formats for libraries and executables used in the earliest versions of UNIX. Linux used diff --git a/fs/Makefile b/fs/Makefile index 1e7a11bd4da1..b6f27dc26b72 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -7,7 +7,7 @@ obj-y := open.o read_write.o file_table.o super.o \ char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ - ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \ + ioctl.o readdir.o select.o fifo.o dcache.o inode.o \ attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o drop_caches.o splice.o sync.o utimes.o \ @@ -19,6 +19,7 @@ else obj-y += no-block.o endif +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o obj-$(CONFIG_INOTIFY) += inotify.o obj-$(CONFIG_INOTIFY_USER) += inotify_user.o obj-$(CONFIG_EPOLL) += eventpoll.o @@ -26,6 +27,7 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o +obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o nfsd-$(CONFIG_NFSD) := nfsctl.o @@ -68,7 +70,7 @@ obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line obj-$(CONFIG_REISERFS_FS) += reiserfs/ obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 -obj-$(CONFIG_EXT4DEV_FS) += ext4/ # Before ext2 so root fs can be ext4dev +obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4dev obj-$(CONFIG_JBD) += jbd/ obj-$(CONFIG_JBD2) += jbd2/ obj-$(CONFIG_EXT2_FS) += ext2/ @@ -100,6 +102,7 @@ obj-$(CONFIG_NTFS_FS) += ntfs/ obj-$(CONFIG_UFS_FS) += ufs/ obj-$(CONFIG_EFS_FS) += efs/ obj-$(CONFIG_JFFS2_FS) += jffs2/ +obj-$(CONFIG_UBIFS_FS) += ubifs/ obj-$(CONFIG_AFFS_FS) += affs/ obj-$(CONFIG_ROMFS_FS) += romfs/ obj-$(CONFIG_QNX4FS_FS) += qnx4/ @@ -109,6 +112,7 @@ obj-$(CONFIG_ADFS_FS) += adfs/ obj-$(CONFIG_FUSE_FS) += fuse/ obj-$(CONFIG_UDF_FS) += udf/ obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/ +obj-$(CONFIG_OMFS_FS) += omfs/ obj-$(CONFIG_JFS_FS) += jfs/ obj-$(CONFIG_XFS_FS) += xfs/ obj-$(CONFIG_9P_FS) += 9p/ diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index fc1a8dc64d78..85a30e929800 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -197,6 +197,7 @@ out: const struct file_operations adfs_dir_operations = { .read = generic_read_dir, + .llseek = generic_file_llseek, .readdir = adfs_readdir, .fsync = file_fsync, }; diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 9e421eeb672b..7f83a46f2b7e 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -157,7 +157,7 @@ static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt) enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_err}; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_ownmask, "ownmask=%o"}, @@ -249,7 +249,7 @@ static void adfs_destroy_inode(struct inode *inode) kmem_cache_free(adfs_inode_cachep, ADFS_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct adfs_inode_info *ei = (struct adfs_inode_info *) foo; diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 223b1917093e..e9ec915f7553 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -2,6 +2,7 @@ #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/amigaffs.h> +#include <linux/mutex.h> /* AmigaOS allows file names with up to 30 characters length. * Names longer than that will be silently truncated. If you @@ -98,7 +99,7 @@ struct affs_sb_info { gid_t s_gid; /* gid to override */ umode_t s_mode; /* mode to override */ struct buffer_head *s_root_bh; /* Cached root block. */ - struct semaphore s_bmlock; /* Protects bitmap access. */ + struct mutex s_bmlock; /* Protects bitmap access. */ struct affs_bm_info *s_bitmap; /* Bitmap infos. */ u32 s_bmap_count; /* # of bitmap blocks. */ u32 s_bmap_bits; /* # of bits in one bitmap blocks */ diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c index c4a5ad09ddf2..dc5ef14bdc1c 100644 --- a/fs/affs/bitmap.c +++ b/fs/affs/bitmap.c @@ -45,14 +45,14 @@ affs_count_free_blocks(struct super_block *sb) if (sb->s_flags & MS_RDONLY) return 0; - down(&AFFS_SB(sb)->s_bmlock); + mutex_lock(&AFFS_SB(sb)->s_bmlock); bm = AFFS_SB(sb)->s_bitmap; free = 0; for (i = AFFS_SB(sb)->s_bmap_count; i > 0; bm++, i--) free += bm->bm_free; - up(&AFFS_SB(sb)->s_bmlock); + mutex_unlock(&AFFS_SB(sb)->s_bmlock); return free; } @@ -76,7 +76,7 @@ affs_free_block(struct super_block *sb, u32 block) bit = blk % sbi->s_bmap_bits; bm = &sbi->s_bitmap[bmap]; - down(&sbi->s_bmlock); + mutex_lock(&sbi->s_bmlock); bh = sbi->s_bmap_bh; if (sbi->s_last_bmap != bmap) { @@ -105,19 +105,19 @@ affs_free_block(struct super_block *sb, u32 block) sb->s_dirt = 1; bm->bm_free++; - up(&sbi->s_bmlock); + mutex_unlock(&sbi->s_bmlock); return; err_free: affs_warning(sb,"affs_free_block","Trying to free block %u which is already free", block); - up(&sbi->s_bmlock); + mutex_unlock(&sbi->s_bmlock); return; err_bh_read: affs_error(sb,"affs_free_block","Cannot read bitmap block %u", bm->bm_key); sbi->s_bmap_bh = NULL; sbi->s_last_bmap = ~0; - up(&sbi->s_bmlock); + mutex_unlock(&sbi->s_bmlock); return; err_range: @@ -168,7 +168,7 @@ affs_alloc_block(struct inode *inode, u32 goal) bmap = blk / sbi->s_bmap_bits; bm = &sbi->s_bitmap[bmap]; - down(&sbi->s_bmlock); + mutex_lock(&sbi->s_bmlock); if (bm->bm_free) goto find_bmap_bit; @@ -249,7 +249,7 @@ find_bit: mark_buffer_dirty(bh); sb->s_dirt = 1; - up(&sbi->s_bmlock); + mutex_unlock(&sbi->s_bmlock); pr_debug("%d\n", blk); return blk; @@ -259,7 +259,7 @@ err_bh_read: sbi->s_bmap_bh = NULL; sbi->s_last_bmap = ~0; err_full: - up(&sbi->s_bmlock); + mutex_unlock(&sbi->s_bmlock); pr_debug("failed\n"); return 0; } diff --git a/fs/affs/dir.c b/fs/affs/dir.c index 6e3f282424b0..7b36904dbeac 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -19,6 +19,7 @@ static int affs_readdir(struct file *, void *, filldir_t); const struct file_operations affs_dir_operations = { .read = generic_read_dir, + .llseek = generic_file_llseek, .readdir = affs_readdir, .fsync = file_fsync, }; diff --git a/fs/affs/file.c b/fs/affs/file.c index 6eac7bdeec94..1377b1240b6e 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -46,8 +46,6 @@ const struct inode_operations affs_file_inode_operations = { static int affs_file_open(struct inode *inode, struct file *filp) { - if (atomic_read(&filp->f_count) != 1) - return 0; pr_debug("AFFS: open(%lu,%d)\n", inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); atomic_inc(&AFFS_I(inode)->i_opencnt); @@ -57,8 +55,6 @@ affs_file_open(struct inode *inode, struct file *filp) static int affs_file_release(struct inode *inode, struct file *filp) { - if (atomic_read(&filp->f_count) != 0) - return 0; pr_debug("AFFS: release(%lu, %d)\n", inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); diff --git a/fs/affs/super.c b/fs/affs/super.c index d214837d5e42..8989c93193ed 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -90,7 +90,7 @@ static void affs_destroy_inode(struct inode *inode) kmem_cache_free(affs_inode_cachep, AFFS_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct affs_inode_info *ei = (struct affs_inode_info *) foo; @@ -135,7 +135,7 @@ enum { Opt_verbose, Opt_volume, Opt_ignore, Opt_err, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_bs, "bs=%u"}, {Opt_mode, "mode=%o"}, {Opt_mufs, "mufs"}, @@ -290,7 +290,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; - init_MUTEX(&sbi->s_bmlock); + mutex_init(&sbi->s_bmlock); if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block, &blocksize,&sbi->s_prefix, diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 7102824ba847..3cb6920ff30b 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -469,8 +469,6 @@ extern bool afs_cm_incoming_call(struct afs_call *); extern const struct inode_operations afs_dir_inode_operations; extern const struct file_operations afs_dir_file_operations; -extern int afs_permission(struct inode *, int, struct nameidata *); - /* * file.c */ @@ -605,7 +603,7 @@ extern void afs_clear_permits(struct afs_vnode *); extern void afs_cache_permit(struct afs_vnode *, struct key *, long); extern void afs_zap_permits(struct rcu_head *); extern struct key *afs_request_key(struct afs_cell *); -extern int afs_permission(struct inode *, int, struct nameidata *); +extern int afs_permission(struct inode *, int); /* * server.c diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 2f5503902c37..78db4953a800 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -232,7 +232,7 @@ static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd) } mntget(newmnt); - err = do_add_mount(newmnt, nd, MNT_SHRINKABLE, &afs_vfsmounts); + err = do_add_mount(newmnt, &nd->path, MNT_SHRINKABLE, &afs_vfsmounts); switch (err) { case 0: path_put(&nd->path); diff --git a/fs/afs/security.c b/fs/afs/security.c index 3bcbeceba1bb..3ef504370034 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -284,7 +284,7 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key, * - AFS ACLs are attached to directories only, and a file is controlled by its * parent directory's ACL */ -int afs_permission(struct inode *inode, int mask, struct nameidata *nd) +int afs_permission(struct inode *inode, int mask) { struct afs_vnode *vnode = AFS_FS_I(inode); afs_access_t uninitialized_var(access); diff --git a/fs/afs/super.c b/fs/afs/super.c index 7e3faeef6818..aee239a048cb 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -27,7 +27,7 @@ #define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */ -static void afs_i_init_once(struct kmem_cache *cachep, void *foo); +static void afs_i_init_once(void *foo); static int afs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt); @@ -64,7 +64,7 @@ enum { afs_opt_vol, }; -static match_table_t afs_options_list = { +static const match_table_t afs_options_list = { { afs_opt_cell, "cell=%s" }, { afs_opt_rwpath, "rwpath" }, { afs_opt_vol, "vol=%s" }, @@ -449,7 +449,7 @@ static void afs_put_super(struct super_block *sb) /* * initialise an inode cache slab element prior to any use */ -static void afs_i_init_once(struct kmem_cache *cachep, void *_vnode) +static void afs_i_init_once(void *_vnode) { struct afs_vnode *vnode = _vnode; diff --git a/fs/afs/write.c b/fs/afs/write.c index 9a849ad3c489..065b4e10681a 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -404,7 +404,7 @@ static int afs_write_back_from_locked_page(struct afs_writeback *wb, page = pages[loop]; if (page->index > wb->last) break; - if (TestSetPageLocked(page)) + if (!trylock_page(page)) break; if (!PageDirty(page) || page_private(page) != (unsigned long) wb) { @@ -512,8 +512,8 @@ static void aio_fput_routine(struct work_struct *data) */ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) { - dprintk(KERN_DEBUG "aio_put(%p): f_count=%d\n", - req, atomic_read(&req->ki_filp->f_count)); + dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", + req, atomic_long_read(&req->ki_filp->f_count)); assert_spin_locked(&ctx->ctx_lock); @@ -528,7 +528,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) /* Must be done under the lock to serialise against cancellation. * Call this aio_fput as it duplicates fput via the fput_work. */ - if (unlikely(atomic_dec_and_test(&req->ki_filp->f_count))) { + if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) { get_ioctx(ctx); spin_lock(&fput_lock); list_add(&req->ki_list, &fput_head); @@ -586,7 +586,6 @@ static void use_mm(struct mm_struct *mm) struct task_struct *tsk = current; task_lock(tsk); - tsk->flags |= PF_BORROWED_MM; active_mm = tsk->active_mm; atomic_inc(&mm->mm_count); tsk->mm = mm; @@ -610,7 +609,6 @@ static void unuse_mm(struct mm_struct *mm) struct task_struct *tsk = current; task_lock(tsk); - tsk->flags &= ~PF_BORROWED_MM; tsk->mm = NULL; /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 977ef208c051..3662dd44896b 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -58,8 +58,9 @@ static struct dentry_operations anon_inodefs_dentry_operations = { * of the file * * @name: [in] name of the "class" of the new file - * @fops [in] file operations for the new file - * @priv [in] private data for the new file (will be file's private_data) + * @fops: [in] file operations for the new file + * @priv: [in] private data for the new file (will be file's private_data) + * @flags: [in] flags * * Creates a new file by hooking it on a single inode. This is useful for files * that do not need to have a full-fledged inode in order to operate correctly. @@ -68,7 +69,7 @@ static struct dentry_operations anon_inodefs_dentry_operations = { * setup. Returns new descriptor or -error. */ int anon_inode_getfd(const char *name, const struct file_operations *fops, - void *priv) + void *priv, int flags) { struct qstr this; struct dentry *dentry; @@ -78,7 +79,7 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops, if (IS_ERR(anon_inode_inode)) return -ENODEV; - error = get_unused_fd(); + error = get_unused_fd_flags(flags); if (error < 0) return error; fd = error; @@ -115,7 +116,7 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops, file->f_mapping = anon_inode_inode->i_mapping; file->f_pos = 0; - file->f_flags = O_RDWR; + file->f_flags = O_RDWR | (flags & O_NONBLOCK); file->f_version = 0; file->private_data = priv; diff --git a/fs/attr.c b/fs/attr.c index 966b73e25f82..26c71ba1eed4 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -51,7 +51,7 @@ int inode_change_ok(struct inode *inode, struct iattr *attr) } /* Check for setting the inode time. */ - if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) { + if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) { if (!is_owner_or_cap(inode)) goto error; } @@ -108,6 +108,11 @@ int notify_change(struct dentry * dentry, struct iattr * attr) struct timespec now; unsigned int ia_valid = attr->ia_valid; + if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + } + now = current_fs_time(inode->i_sb); attr->ia_ctime = now; diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index dda510d31f84..b70eea1e8c59 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -59,7 +59,7 @@ static const struct super_operations autofs_sops = { enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto}; -static match_table_t autofs_tokens = { +static const match_table_t autofs_tokens = { {Opt_fd, "fd=%u"}, {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index c3d352d7fa93..69a2f5c92319 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -52,7 +52,10 @@ struct autofs_info { int flags; - struct list_head rehash; + struct completion expire_complete; + + struct list_head active; + struct list_head expiring; struct autofs_sb_info *sbi; unsigned long last_used; @@ -68,15 +71,14 @@ struct autofs_info { }; #define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ +#define AUTOFS_INF_MOUNTPOINT (1<<1) /* mountpoint status for direct expire */ struct autofs_wait_queue { wait_queue_head_t queue; struct autofs_wait_queue *next; autofs_wqt_t wait_queue_token; /* We use the following to see what we are waiting for */ - unsigned int hash; - unsigned int len; - char *name; + struct qstr name; u32 dev; u64 ino; uid_t uid; @@ -85,7 +87,7 @@ struct autofs_wait_queue { pid_t tgid; /* This is for status reporting upon return */ int status; - atomic_t wait_ctr; + unsigned int wait_ctr; }; #define AUTOFS_SBI_MAGIC 0x6d4a556d @@ -112,8 +114,9 @@ struct autofs_sb_info { struct mutex wq_mutex; spinlock_t fs_lock; struct autofs_wait_queue *queues; /* Wait queue pointer */ - spinlock_t rehash_lock; - struct list_head rehash_list; + spinlock_t lookup_lock; + struct list_head active_list; + struct list_head expiring_list; }; static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb) @@ -138,18 +141,14 @@ static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) { static inline int autofs4_ispending(struct dentry *dentry) { struct autofs_info *inf = autofs4_dentry_ino(dentry); - int pending = 0; if (dentry->d_flags & DCACHE_AUTOFS_PENDING) return 1; - if (inf) { - spin_lock(&inf->sbi->fs_lock); - pending = inf->flags & AUTOFS_INF_EXPIRING; - spin_unlock(&inf->sbi->fs_lock); - } + if (inf->flags & AUTOFS_INF_EXPIRING) + return 1; - return pending; + return 0; } static inline void autofs4_copy_atime(struct file *src, struct file *dst) @@ -164,6 +163,7 @@ void autofs4_free_ino(struct autofs_info *); /* Expiration */ int is_autofs4_dentry(struct dentry *); +int autofs4_expire_wait(struct dentry *dentry); int autofs4_expire_run(struct super_block *, struct vfsmount *, struct autofs_sb_info *, struct autofs_packet_expire __user *); diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 894fee54d4d8..cdabb796ff01 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -259,13 +259,15 @@ static struct dentry *autofs4_expire_direct(struct super_block *sb, now = jiffies; timeout = sbi->exp_timeout; - /* Lock the tree as we must expire as a whole */ spin_lock(&sbi->fs_lock); if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { struct autofs_info *ino = autofs4_dentry_ino(root); - - /* Set this flag early to catch sys_chdir and the like */ + if (d_mountpoint(root)) { + ino->flags |= AUTOFS_INF_MOUNTPOINT; + root->d_mounted--; + } ino->flags |= AUTOFS_INF_EXPIRING; + init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); return root; } @@ -292,6 +294,8 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb, struct list_head *next; int do_now = how & AUTOFS_EXP_IMMEDIATE; int exp_leaves = how & AUTOFS_EXP_LEAVES; + struct autofs_info *ino; + unsigned int ino_count; if (!root) return NULL; @@ -316,6 +320,9 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb, dentry = dget(dentry); spin_unlock(&dcache_lock); + spin_lock(&sbi->fs_lock); + ino = autofs4_dentry_ino(dentry); + /* * Case 1: (i) indirect mount or top level pseudo direct mount * (autofs-4.1). @@ -326,6 +333,11 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb, DPRINTK("checking mountpoint %p %.*s", dentry, (int)dentry->d_name.len, dentry->d_name.name); + /* Path walk currently on this dentry? */ + ino_count = atomic_read(&ino->count) + 2; + if (atomic_read(&dentry->d_count) > ino_count) + goto next; + /* Can we umount this guy */ if (autofs4_mount_busy(mnt, dentry)) goto next; @@ -343,23 +355,25 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb, /* Case 2: tree mount, expire iff entire tree is not busy */ if (!exp_leaves) { - /* Lock the tree as we must expire as a whole */ - spin_lock(&sbi->fs_lock); - if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) { - struct autofs_info *inf = autofs4_dentry_ino(dentry); + /* Path walk currently on this dentry? */ + ino_count = atomic_read(&ino->count) + 1; + if (atomic_read(&dentry->d_count) > ino_count) + goto next; - /* Set this flag early to catch sys_chdir and the like */ - inf->flags |= AUTOFS_INF_EXPIRING; - spin_unlock(&sbi->fs_lock); + if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) { expired = dentry; goto found; } - spin_unlock(&sbi->fs_lock); /* * Case 3: pseudo direct mount, expire individual leaves * (autofs-4.1). */ } else { + /* Path walk currently on this dentry? */ + ino_count = atomic_read(&ino->count) + 1; + if (atomic_read(&dentry->d_count) > ino_count) + goto next; + expired = autofs4_check_leaves(mnt, dentry, timeout, do_now); if (expired) { dput(dentry); @@ -367,6 +381,7 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb, } } next: + spin_unlock(&sbi->fs_lock); dput(dentry); spin_lock(&dcache_lock); next = next->next; @@ -377,12 +392,45 @@ next: found: DPRINTK("returning %p %.*s", expired, (int)expired->d_name.len, expired->d_name.name); + ino = autofs4_dentry_ino(expired); + ino->flags |= AUTOFS_INF_EXPIRING; + init_completion(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); spin_lock(&dcache_lock); list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); spin_unlock(&dcache_lock); return expired; } +int autofs4_expire_wait(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + int status; + + /* Block on any pending expire */ + spin_lock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_EXPIRING) { + spin_unlock(&sbi->fs_lock); + + DPRINTK("waiting for expire %p name=%.*s", + dentry, dentry->d_name.len, dentry->d_name.name); + + status = autofs4_wait(sbi, dentry, NFY_NONE); + wait_for_completion(&ino->expire_complete); + + DPRINTK("expire done status=%d", status); + + if (d_unhashed(dentry)) + return -EAGAIN; + + return status; + } + spin_unlock(&sbi->fs_lock); + + return 0; +} + /* Perform an expiry operation */ int autofs4_expire_run(struct super_block *sb, struct vfsmount *mnt, @@ -390,7 +438,9 @@ int autofs4_expire_run(struct super_block *sb, struct autofs_packet_expire __user *pkt_p) { struct autofs_packet_expire pkt; + struct autofs_info *ino; struct dentry *dentry; + int ret = 0; memset(&pkt,0,sizeof pkt); @@ -406,9 +456,15 @@ int autofs4_expire_run(struct super_block *sb, dput(dentry); if ( copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)) ) - return -EFAULT; + ret = -EFAULT; - return 0; + spin_lock(&sbi->fs_lock); + ino = autofs4_dentry_ino(dentry); + ino->flags &= ~AUTOFS_INF_EXPIRING; + complete_all(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); + + return ret; } /* Call repeatedly until it returns -EAGAIN, meaning there's nothing @@ -433,9 +489,16 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, /* This is synchronous because it makes the daemon a little easier */ - ino->flags |= AUTOFS_INF_EXPIRING; ret = autofs4_wait(sbi, dentry, NFY_EXPIRE); + + spin_lock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_MOUNTPOINT) { + sb->s_root->d_mounted++; + ino->flags &= ~AUTOFS_INF_MOUNTPOINT; + } ino->flags &= ~AUTOFS_INF_EXPIRING; + complete_all(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); dput(dentry); } diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 2fdcf5e1d236..45d55819203d 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -24,8 +24,10 @@ static void ino_lnkfree(struct autofs_info *ino) { - kfree(ino->u.symlink); - ino->u.symlink = NULL; + if (ino->u.symlink) { + kfree(ino->u.symlink); + ino->u.symlink = NULL; + } } struct autofs_info *autofs4_init_ino(struct autofs_info *ino, @@ -41,16 +43,18 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino, if (ino == NULL) return NULL; - ino->flags = 0; - ino->mode = mode; - ino->inode = NULL; - ino->dentry = NULL; - ino->size = 0; - - INIT_LIST_HEAD(&ino->rehash); + if (!reinit) { + ino->flags = 0; + ino->inode = NULL; + ino->dentry = NULL; + ino->size = 0; + INIT_LIST_HEAD(&ino->active); + INIT_LIST_HEAD(&ino->expiring); + atomic_set(&ino->count, 0); + } + ino->mode = mode; ino->last_used = jiffies; - atomic_set(&ino->count, 0); ino->sbi = sbi; @@ -159,8 +163,8 @@ void autofs4_kill_sb(struct super_block *sb) if (!sbi) goto out_kill_sb; - if (!sbi->catatonic) - autofs4_catatonic_mode(sbi); /* Free wait queues, close pipe */ + /* Free wait queues, close pipe */ + autofs4_catatonic_mode(sbi); /* Clean up and release dangling references */ autofs4_force_release(sbi); @@ -209,7 +213,7 @@ static const struct super_operations autofs4_sops = { enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, Opt_indirect, Opt_direct, Opt_offset}; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_fd, "fd=%u"}, {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, @@ -338,8 +342,9 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) mutex_init(&sbi->wq_mutex); spin_lock_init(&sbi->fs_lock); sbi->queues = NULL; - spin_lock_init(&sbi->rehash_lock); - INIT_LIST_HEAD(&sbi->rehash_list); + spin_lock_init(&sbi->lookup_lock); + INIT_LIST_HEAD(&sbi->active_list); + INIT_LIST_HEAD(&sbi->expiring_list); s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = AUTOFS_SUPER_MAGIC; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index edf5b6bddb52..2a41c2a7fc52 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -25,25 +25,27 @@ static int autofs4_dir_rmdir(struct inode *,struct dentry *); static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); static int autofs4_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); static int autofs4_dir_open(struct inode *inode, struct file *file); -static int autofs4_dir_close(struct inode *inode, struct file *file); -static int autofs4_dir_readdir(struct file * filp, void * dirent, filldir_t filldir); -static int autofs4_root_readdir(struct file * filp, void * dirent, filldir_t filldir); static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); static void *autofs4_follow_link(struct dentry *, struct nameidata *); +#define TRIGGER_FLAGS (LOOKUP_CONTINUE | LOOKUP_DIRECTORY) +#define TRIGGER_INTENTS (LOOKUP_OPEN | LOOKUP_CREATE) + const struct file_operations autofs4_root_operations = { .open = dcache_dir_open, .release = dcache_dir_close, .read = generic_read_dir, - .readdir = autofs4_root_readdir, + .readdir = dcache_readdir, + .llseek = dcache_dir_lseek, .ioctl = autofs4_root_ioctl, }; const struct file_operations autofs4_dir_operations = { .open = autofs4_dir_open, - .release = autofs4_dir_close, + .release = dcache_dir_close, .read = generic_read_dir, - .readdir = autofs4_dir_readdir, + .readdir = dcache_readdir, + .llseek = dcache_dir_lseek, }; const struct inode_operations autofs4_indirect_root_inode_operations = { @@ -70,42 +72,10 @@ const struct inode_operations autofs4_dir_inode_operations = { .rmdir = autofs4_dir_rmdir, }; -static int autofs4_root_readdir(struct file *file, void *dirent, - filldir_t filldir) -{ - struct autofs_sb_info *sbi = autofs4_sbi(file->f_path.dentry->d_sb); - int oz_mode = autofs4_oz_mode(sbi); - - DPRINTK("called, filp->f_pos = %lld", file->f_pos); - - /* - * Don't set reghost flag if: - * 1) f_pos is larger than zero -- we've already been here. - * 2) we haven't even enabled reghosting in the 1st place. - * 3) this is the daemon doing a readdir - */ - if (oz_mode && file->f_pos == 0 && sbi->reghost_enabled) - sbi->needs_reghost = 1; - - DPRINTK("needs_reghost = %d", sbi->needs_reghost); - - return dcache_readdir(file, dirent, filldir); -} - static int autofs4_dir_open(struct inode *inode, struct file *file) { struct dentry *dentry = file->f_path.dentry; - struct vfsmount *mnt = file->f_path.mnt; struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct dentry *cursor; - int status; - - status = dcache_dir_open(inode, file); - if (status) - goto out; - - cursor = file->private_data; - cursor->d_fsdata = NULL; DPRINTK("file=%p dentry=%p %.*s", file, dentry, dentry->d_name.len, dentry->d_name.name); @@ -113,159 +83,32 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) if (autofs4_oz_mode(sbi)) goto out; - if (autofs4_ispending(dentry)) { - DPRINTK("dentry busy"); - dcache_dir_close(inode, file); - status = -EBUSY; - goto out; - } - - status = -ENOENT; - if (!d_mountpoint(dentry) && dentry->d_op && dentry->d_op->d_revalidate) { - struct nameidata nd; - int empty, ret; - - /* In case there are stale directory dentrys from a failed mount */ - spin_lock(&dcache_lock); - empty = list_empty(&dentry->d_subdirs); + /* + * An empty directory in an autofs file system is always a + * mount point. The daemon must have failed to mount this + * during lookup so it doesn't exist. This can happen, for + * example, if user space returns an incorrect status for a + * mount request. Otherwise we're doing a readdir on the + * autofs file system so just let the libfs routines handle + * it. + */ + spin_lock(&dcache_lock); + if (!d_mountpoint(dentry) && __simple_empty(dentry)) { spin_unlock(&dcache_lock); - - if (!empty) - d_invalidate(dentry); - - nd.flags = LOOKUP_DIRECTORY; - ret = (dentry->d_op->d_revalidate)(dentry, &nd); - - if (ret <= 0) { - if (ret < 0) - status = ret; - dcache_dir_close(inode, file); - goto out; - } + return -ENOENT; } + spin_unlock(&dcache_lock); - if (d_mountpoint(dentry)) { - struct file *fp = NULL; - struct path fp_path = { .dentry = dentry, .mnt = mnt }; - - path_get(&fp_path); - - if (!autofs4_follow_mount(&fp_path.mnt, &fp_path.dentry)) { - path_put(&fp_path); - dcache_dir_close(inode, file); - goto out; - } - - fp = dentry_open(fp_path.dentry, fp_path.mnt, file->f_flags); - status = PTR_ERR(fp); - if (IS_ERR(fp)) { - dcache_dir_close(inode, file); - goto out; - } - cursor->d_fsdata = fp; - } - return 0; -out: - return status; -} - -static int autofs4_dir_close(struct inode *inode, struct file *file) -{ - struct dentry *dentry = file->f_path.dentry; - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct dentry *cursor = file->private_data; - int status = 0; - - DPRINTK("file=%p dentry=%p %.*s", - file, dentry, dentry->d_name.len, dentry->d_name.name); - - if (autofs4_oz_mode(sbi)) - goto out; - - if (autofs4_ispending(dentry)) { - DPRINTK("dentry busy"); - status = -EBUSY; - goto out; - } - - if (d_mountpoint(dentry)) { - struct file *fp = cursor->d_fsdata; - if (!fp) { - status = -ENOENT; - goto out; - } - filp_close(fp, current->files); - } -out: - dcache_dir_close(inode, file); - return status; -} - -static int autofs4_dir_readdir(struct file *file, void *dirent, filldir_t filldir) -{ - struct dentry *dentry = file->f_path.dentry; - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct dentry *cursor = file->private_data; - int status; - - DPRINTK("file=%p dentry=%p %.*s", - file, dentry, dentry->d_name.len, dentry->d_name.name); - - if (autofs4_oz_mode(sbi)) - goto out; - - if (autofs4_ispending(dentry)) { - DPRINTK("dentry busy"); - return -EBUSY; - } - - if (d_mountpoint(dentry)) { - struct file *fp = cursor->d_fsdata; - - if (!fp) - return -ENOENT; - - if (!fp->f_op || !fp->f_op->readdir) - goto out; - - status = vfs_readdir(fp, filldir, dirent); - file->f_pos = fp->f_pos; - if (status) - autofs4_copy_atime(file, fp); - return status; - } out: - return dcache_readdir(file, dirent, filldir); + return dcache_dir_open(inode, file); } static int try_to_fill_dentry(struct dentry *dentry, int flags) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); - struct dentry *new; int status; - /* Block on any pending expiry here; invalidate the dentry - when expiration is done to trigger mount request with a new - dentry */ - if (ino && (ino->flags & AUTOFS_INF_EXPIRING)) { - DPRINTK("waiting for expire %p name=%.*s", - dentry, dentry->d_name.len, dentry->d_name.name); - - status = autofs4_wait(sbi, dentry, NFY_NONE); - - DPRINTK("expire done status=%d", status); - - /* - * If the directory still exists the mount request must - * continue otherwise it can't be followed at the right - * time during the walk. - */ - status = d_invalidate(dentry); - if (status != -EBUSY) - return -EAGAIN; - } - DPRINTK("dentry=%p %.*s ino=%p", dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); @@ -292,7 +135,8 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags) return status; } /* Trigger mount for path component or follow link */ - } else if (flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY) || + } else if (dentry->d_flags & DCACHE_AUTOFS_PENDING || + flags & (TRIGGER_FLAGS | TRIGGER_INTENTS) || current->link_count) { DPRINTK("waiting for mount name=%.*s", dentry->d_name.len, dentry->d_name.name); @@ -320,26 +164,6 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags) dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; spin_unlock(&dentry->d_lock); - /* - * The dentry that is passed in from lookup may not be the one - * we end up using, as mkdir can create a new one. If this - * happens, and another process tries the lookup at the same time, - * it will set the PENDING flag on this new dentry, but add itself - * to our waitq. Then, if after the lookup succeeds, the first - * process that requested the mount performs another lookup of the - * same directory, it will show up as still pending! So, we need - * to redo the lookup here and clear pending on that dentry. - */ - if (d_unhashed(dentry)) { - new = d_lookup(dentry->d_parent, &dentry->d_name); - if (new) { - spin_lock(&new->d_lock); - new->d_flags &= ~DCACHE_AUTOFS_PENDING; - spin_unlock(&new->d_lock); - dput(new); - } - } - return 0; } @@ -355,51 +179,63 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) DPRINTK("dentry=%p %.*s oz_mode=%d nd->flags=%d", dentry, dentry->d_name.len, dentry->d_name.name, oz_mode, nd->flags); - - /* If it's our master or we shouldn't trigger a mount we're done */ - lookup_type = nd->flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY); - if (oz_mode || !lookup_type) + /* + * For an expire of a covered direct or offset mount we need + * to beeak out of follow_down() at the autofs mount trigger + * (d_mounted--), so we can see the expiring flag, and manage + * the blocking and following here until the expire is completed. + */ + if (oz_mode) { + spin_lock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_EXPIRING) { + spin_unlock(&sbi->fs_lock); + /* Follow down to our covering mount. */ + if (!follow_down(&nd->path.mnt, &nd->path.dentry)) + goto done; + goto follow; + } + spin_unlock(&sbi->fs_lock); goto done; + } - /* If an expire request is pending wait for it. */ - if (ino && (ino->flags & AUTOFS_INF_EXPIRING)) { - DPRINTK("waiting for active request %p name=%.*s", - dentry, dentry->d_name.len, dentry->d_name.name); - - status = autofs4_wait(sbi, dentry, NFY_NONE); + /* If an expire request is pending everyone must wait. */ + autofs4_expire_wait(dentry); - DPRINTK("request done status=%d", status); - } + /* We trigger a mount for almost all flags */ + lookup_type = nd->flags & (TRIGGER_FLAGS | TRIGGER_INTENTS); + if (!(lookup_type || dentry->d_flags & DCACHE_AUTOFS_PENDING)) + goto follow; /* - * If the dentry contains directories then it is an - * autofs multi-mount with no root mount offset. So - * don't try to mount it again. + * If the dentry contains directories then it is an autofs + * multi-mount with no root mount offset. So don't try to + * mount it again. */ spin_lock(&dcache_lock); - if (!d_mountpoint(dentry) && __simple_empty(dentry)) { + if (dentry->d_flags & DCACHE_AUTOFS_PENDING || + (!d_mountpoint(dentry) && __simple_empty(dentry))) { spin_unlock(&dcache_lock); status = try_to_fill_dentry(dentry, 0); if (status) goto out_error; - /* - * The mount succeeded but if there is no root mount - * it must be an autofs multi-mount with no root offset - * so we don't need to follow the mount. - */ - if (d_mountpoint(dentry)) { - if (!autofs4_follow_mount(&nd->path.mnt, - &nd->path.dentry)) { - status = -ENOENT; - goto out_error; - } - } - - goto done; + goto follow; } spin_unlock(&dcache_lock); +follow: + /* + * If there is no root mount it must be an autofs + * multi-mount with no root offset so we don't need + * to follow it. + */ + if (d_mountpoint(dentry)) { + if (!autofs4_follow_mount(&nd->path.mnt, + &nd->path.dentry)) { + status = -ENOENT; + goto out_error; + } + } done: return NULL; @@ -424,12 +260,23 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) int status = 1; /* Pending dentry */ + spin_lock(&sbi->fs_lock); if (autofs4_ispending(dentry)) { /* The daemon never causes a mount to trigger */ + spin_unlock(&sbi->fs_lock); + if (oz_mode) return 1; /* + * If the directory has gone away due to an expire + * we have been called as ->d_revalidate() and so + * we need to return false and proceed to ->lookup(). + */ + if (autofs4_expire_wait(dentry) == -EAGAIN) + return 0; + + /* * A zero status is success otherwise we have a * negative error code. */ @@ -437,17 +284,9 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) if (status == 0) return 1; - /* - * A status of EAGAIN here means that the dentry has gone - * away while waiting for an expire to complete. If we are - * racing with expire lookup will wait for it so this must - * be a revalidate and we need to send it to lookup. - */ - if (status == -EAGAIN) - return 0; - return status; } + spin_unlock(&sbi->fs_lock); /* Negative dentry.. invalidate if "old" */ if (dentry->d_inode == NULL) @@ -461,6 +300,7 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) DPRINTK("dentry=%p %.*s, emptydir", dentry, dentry->d_name.len, dentry->d_name.name); spin_unlock(&dcache_lock); + /* The daemon never causes a mount to trigger */ if (oz_mode) return 1; @@ -493,10 +333,12 @@ void autofs4_dentry_release(struct dentry *de) struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb); if (sbi) { - spin_lock(&sbi->rehash_lock); - if (!list_empty(&inf->rehash)) - list_del(&inf->rehash); - spin_unlock(&sbi->rehash_lock); + spin_lock(&sbi->lookup_lock); + if (!list_empty(&inf->active)) + list_del(&inf->active); + if (!list_empty(&inf->expiring)) + list_del(&inf->expiring); + spin_unlock(&sbi->lookup_lock); } inf->dentry = NULL; @@ -518,7 +360,7 @@ static struct dentry_operations autofs4_dentry_operations = { .d_release = autofs4_dentry_release, }; -static struct dentry *autofs4_lookup_unhashed(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name) +static struct dentry *autofs4_lookup_active(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name) { unsigned int len = name->len; unsigned int hash = name->hash; @@ -526,14 +368,66 @@ static struct dentry *autofs4_lookup_unhashed(struct autofs_sb_info *sbi, struct struct list_head *p, *head; spin_lock(&dcache_lock); - spin_lock(&sbi->rehash_lock); - head = &sbi->rehash_list; + spin_lock(&sbi->lookup_lock); + head = &sbi->active_list; list_for_each(p, head) { struct autofs_info *ino; struct dentry *dentry; struct qstr *qstr; - ino = list_entry(p, struct autofs_info, rehash); + ino = list_entry(p, struct autofs_info, active); + dentry = ino->dentry; + + spin_lock(&dentry->d_lock); + + /* Already gone? */ + if (atomic_read(&dentry->d_count) == 0) + goto next; + + qstr = &dentry->d_name; + + if (dentry->d_name.hash != hash) + goto next; + if (dentry->d_parent != parent) + goto next; + + if (qstr->len != len) + goto next; + if (memcmp(qstr->name, str, len)) + goto next; + + if (d_unhashed(dentry)) { + dget(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&sbi->lookup_lock); + spin_unlock(&dcache_lock); + return dentry; + } +next: + spin_unlock(&dentry->d_lock); + } + spin_unlock(&sbi->lookup_lock); + spin_unlock(&dcache_lock); + + return NULL; +} + +static struct dentry *autofs4_lookup_expiring(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name) +{ + unsigned int len = name->len; + unsigned int hash = name->hash; + const unsigned char *str = name->name; + struct list_head *p, *head; + + spin_lock(&dcache_lock); + spin_lock(&sbi->lookup_lock); + head = &sbi->expiring_list; + list_for_each(p, head) { + struct autofs_info *ino; + struct dentry *dentry; + struct qstr *qstr; + + ino = list_entry(p, struct autofs_info, expiring); dentry = ino->dentry; spin_lock(&dentry->d_lock); @@ -555,33 +449,16 @@ static struct dentry *autofs4_lookup_unhashed(struct autofs_sb_info *sbi, struct goto next; if (d_unhashed(dentry)) { - struct inode *inode = dentry->d_inode; - - ino = autofs4_dentry_ino(dentry); - list_del_init(&ino->rehash); dget(dentry); - /* - * Make the rehashed dentry negative so the VFS - * behaves as it should. - */ - if (inode) { - dentry->d_inode = NULL; - list_del_init(&dentry->d_alias); - spin_unlock(&dentry->d_lock); - spin_unlock(&sbi->rehash_lock); - spin_unlock(&dcache_lock); - iput(inode); - return dentry; - } spin_unlock(&dentry->d_lock); - spin_unlock(&sbi->rehash_lock); + spin_unlock(&sbi->lookup_lock); spin_unlock(&dcache_lock); return dentry; } next: spin_unlock(&dentry->d_lock); } - spin_unlock(&sbi->rehash_lock); + spin_unlock(&sbi->lookup_lock); spin_unlock(&dcache_lock); return NULL; @@ -591,7 +468,8 @@ next: static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct autofs_sb_info *sbi; - struct dentry *unhashed; + struct autofs_info *ino; + struct dentry *expiring, *unhashed; int oz_mode; DPRINTK("name = %.*s", @@ -607,8 +485,26 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode); - unhashed = autofs4_lookup_unhashed(sbi, dentry->d_parent, &dentry->d_name); - if (!unhashed) { + expiring = autofs4_lookup_expiring(sbi, dentry->d_parent, &dentry->d_name); + if (expiring) { + /* + * If we are racing with expire the request might not + * be quite complete but the directory has been removed + * so it must have been successful, so just wait for it. + */ + ino = autofs4_dentry_ino(expiring); + autofs4_expire_wait(expiring); + spin_lock(&sbi->lookup_lock); + if (!list_empty(&ino->expiring)) + list_del_init(&ino->expiring); + spin_unlock(&sbi->lookup_lock); + dput(expiring); + } + + unhashed = autofs4_lookup_active(sbi, dentry->d_parent, &dentry->d_name); + if (unhashed) + dentry = unhashed; + else { /* * Mark the dentry incomplete but don't hash it. We do this * to serialize our inode creation operations (symlink and @@ -622,39 +518,34 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s */ dentry->d_op = &autofs4_root_dentry_operations; - dentry->d_fsdata = NULL; - d_instantiate(dentry, NULL); - } else { - struct autofs_info *ino = autofs4_dentry_ino(unhashed); - DPRINTK("rehash %p with %p", dentry, unhashed); /* - * If we are racing with expire the request might not - * be quite complete but the directory has been removed - * so it must have been successful, so just wait for it. - * We need to ensure the AUTOFS_INF_EXPIRING flag is clear - * before continuing as revalidate may fail when calling - * try_to_fill_dentry (returning EAGAIN) if we don't. + * And we need to ensure that the same dentry is used for + * all following lookup calls until it is hashed so that + * the dentry flags are persistent throughout the request. */ - while (ino && (ino->flags & AUTOFS_INF_EXPIRING)) { - DPRINTK("wait for incomplete expire %p name=%.*s", - unhashed, unhashed->d_name.len, - unhashed->d_name.name); - autofs4_wait(sbi, unhashed, NFY_NONE); - DPRINTK("request completed"); - } - dentry = unhashed; + ino = autofs4_init_ino(NULL, sbi, 0555); + if (!ino) + return ERR_PTR(-ENOMEM); + + dentry->d_fsdata = ino; + ino->dentry = dentry; + + spin_lock(&sbi->lookup_lock); + list_add(&ino->active, &sbi->active_list); + spin_unlock(&sbi->lookup_lock); + + d_instantiate(dentry, NULL); } if (!oz_mode) { spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_AUTOFS_PENDING; spin_unlock(&dentry->d_lock); - } - - if (dentry->d_op && dentry->d_op->d_revalidate) { - mutex_unlock(&dir->i_mutex); - (dentry->d_op->d_revalidate)(dentry, nd); - mutex_lock(&dir->i_mutex); + if (dentry->d_op && dentry->d_op->d_revalidate) { + mutex_unlock(&dir->i_mutex); + (dentry->d_op->d_revalidate)(dentry, nd); + mutex_lock(&dir->i_mutex); + } } /* @@ -673,9 +564,11 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s return ERR_PTR(-ERESTARTNOINTR); } } - spin_lock(&dentry->d_lock); - dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; - spin_unlock(&dentry->d_lock); + if (!oz_mode) { + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; + spin_unlock(&dentry->d_lock); + } } /* @@ -706,7 +599,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s } if (unhashed) - return dentry; + return unhashed; return NULL; } @@ -728,20 +621,31 @@ static int autofs4_dir_symlink(struct inode *dir, return -EACCES; ino = autofs4_init_ino(ino, sbi, S_IFLNK | 0555); - if (ino == NULL) - return -ENOSPC; + if (!ino) + return -ENOMEM; - ino->size = strlen(symname); - ino->u.symlink = cp = kmalloc(ino->size + 1, GFP_KERNEL); + spin_lock(&sbi->lookup_lock); + if (!list_empty(&ino->active)) + list_del_init(&ino->active); + spin_unlock(&sbi->lookup_lock); - if (cp == NULL) { - kfree(ino); - return -ENOSPC; + ino->size = strlen(symname); + cp = kmalloc(ino->size + 1, GFP_KERNEL); + if (!cp) { + if (!dentry->d_fsdata) + kfree(ino); + return -ENOMEM; } strcpy(cp, symname); inode = autofs4_get_inode(dir->i_sb, ino); + if (!inode) { + kfree(cp); + if (!dentry->d_fsdata) + kfree(ino); + return -ENOMEM; + } d_add(dentry, inode); if (dir == dir->i_sb->s_root->d_inode) @@ -757,6 +661,7 @@ static int autofs4_dir_symlink(struct inode *dir, atomic_inc(&p_ino->count); ino->inode = inode; + ino->u.symlink = cp; dir->i_mtime = CURRENT_TIME; return 0; @@ -769,9 +674,8 @@ static int autofs4_dir_symlink(struct inode *dir, * that the file no longer exists. However, doing that means that the * VFS layer can turn the dentry into a negative dentry. We don't want * this, because the unlink is probably the result of an expire. - * We simply d_drop it and add it to a rehash candidates list in the - * super block, which allows the dentry lookup to reuse it retaining - * the flags, such as expire in progress, in case we're racing with expire. + * We simply d_drop it and add it to a expiring list in the super block, + * which allows the dentry lookup to check for an incomplete expire. * * If a process is blocked on the dentry waiting for the expire to finish, * it will invalidate the dentry and try to mount with a new one. @@ -801,9 +705,10 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) dir->i_mtime = CURRENT_TIME; spin_lock(&dcache_lock); - spin_lock(&sbi->rehash_lock); - list_add(&ino->rehash, &sbi->rehash_list); - spin_unlock(&sbi->rehash_lock); + spin_lock(&sbi->lookup_lock); + if (list_empty(&ino->expiring)) + list_add(&ino->expiring, &sbi->expiring_list); + spin_unlock(&sbi->lookup_lock); spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); @@ -829,9 +734,10 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) spin_unlock(&dcache_lock); return -ENOTEMPTY; } - spin_lock(&sbi->rehash_lock); - list_add(&ino->rehash, &sbi->rehash_list); - spin_unlock(&sbi->rehash_lock); + spin_lock(&sbi->lookup_lock); + if (list_empty(&ino->expiring)) + list_add(&ino->expiring, &sbi->expiring_list); + spin_unlock(&sbi->lookup_lock); spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); @@ -866,10 +772,20 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode) dentry, dentry->d_name.len, dentry->d_name.name); ino = autofs4_init_ino(ino, sbi, S_IFDIR | 0555); - if (ino == NULL) - return -ENOSPC; + if (!ino) + return -ENOMEM; + + spin_lock(&sbi->lookup_lock); + if (!list_empty(&ino->active)) + list_del_init(&ino->active); + spin_unlock(&sbi->lookup_lock); inode = autofs4_get_inode(dir->i_sb, ino); + if (!inode) { + if (!dentry->d_fsdata) + kfree(ino); + return -ENOMEM; + } d_add(dentry, inode); if (dir == dir->i_sb->s_root->d_inode) @@ -922,44 +838,6 @@ static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, int __user } /* - * Tells the daemon whether we need to reghost or not. Also, clears - * the reghost_needed flag. - */ -static inline int autofs4_ask_reghost(struct autofs_sb_info *sbi, int __user *p) -{ - int status; - - DPRINTK("returning %d", sbi->needs_reghost); - - status = put_user(sbi->needs_reghost, p); - if (status) - return status; - - sbi->needs_reghost = 0; - return 0; -} - -/* - * Enable / Disable reghosting ioctl() operation - */ -static inline int autofs4_toggle_reghost(struct autofs_sb_info *sbi, int __user *p) -{ - int status; - int val; - - status = get_user(val, p); - - DPRINTK("reghost = %d", val); - - if (status) - return status; - - /* turn on/off reghosting, with the val */ - sbi->reghost_enabled = val; - return 0; -} - -/* * Tells the daemon whether it can umount the autofs mount. */ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) @@ -1023,11 +901,6 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp, case AUTOFS_IOC_SETTIMEOUT: return autofs4_get_set_timeout(sbi, p); - case AUTOFS_IOC_TOGGLEREGHOST: - return autofs4_toggle_reghost(sbi, p); - case AUTOFS_IOC_ASKREGHOST: - return autofs4_ask_reghost(sbi, p); - case AUTOFS_IOC_ASKUMOUNT: return autofs4_ask_umount(filp->f_path.mnt, p); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 75e5955c3f6d..35216d18d8b5 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -28,6 +28,12 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi) { struct autofs_wait_queue *wq, *nwq; + mutex_lock(&sbi->wq_mutex); + if (sbi->catatonic) { + mutex_unlock(&sbi->wq_mutex); + return; + } + DPRINTK("entering catatonic mode"); sbi->catatonic = 1; @@ -36,13 +42,18 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi) while (wq) { nwq = wq->next; wq->status = -ENOENT; /* Magic is gone - report failure */ - kfree(wq->name); - wq->name = NULL; + if (wq->name.name) { + kfree(wq->name.name); + wq->name.name = NULL; + } + wq->wait_ctr--; wake_up_interruptible(&wq->queue); wq = nwq; } fput(sbi->pipe); /* Close the pipe */ sbi->pipe = NULL; + sbi->pipefd = -1; + mutex_unlock(&sbi->wq_mutex); } static int autofs4_write(struct file *file, const void *addr, int bytes) @@ -89,10 +100,11 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, union autofs_packet_union v4_pkt; union autofs_v5_packet_union v5_pkt; } pkt; + struct file *pipe = NULL; size_t pktsz; DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d", - wq->wait_queue_token, wq->len, wq->name, type); + wq->wait_queue_token, wq->name.len, wq->name.name, type); memset(&pkt,0,sizeof pkt); /* For security reasons */ @@ -107,9 +119,9 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pktsz = sizeof(*mp); mp->wait_queue_token = wq->wait_queue_token; - mp->len = wq->len; - memcpy(mp->name, wq->name, wq->len); - mp->name[wq->len] = '\0'; + mp->len = wq->name.len; + memcpy(mp->name, wq->name.name, wq->name.len); + mp->name[wq->name.len] = '\0'; break; } case autofs_ptype_expire_multi: @@ -119,9 +131,9 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pktsz = sizeof(*ep); ep->wait_queue_token = wq->wait_queue_token; - ep->len = wq->len; - memcpy(ep->name, wq->name, wq->len); - ep->name[wq->len] = '\0'; + ep->len = wq->name.len; + memcpy(ep->name, wq->name.name, wq->name.len); + ep->name[wq->name.len] = '\0'; break; } /* @@ -138,9 +150,9 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pktsz = sizeof(*packet); packet->wait_queue_token = wq->wait_queue_token; - packet->len = wq->len; - memcpy(packet->name, wq->name, wq->len); - packet->name[wq->len] = '\0'; + packet->len = wq->name.len; + memcpy(packet->name, wq->name.name, wq->name.len); + packet->name[wq->name.len] = '\0'; packet->dev = wq->dev; packet->ino = wq->ino; packet->uid = wq->uid; @@ -154,8 +166,19 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, return; } - if (autofs4_write(sbi->pipe, &pkt, pktsz)) - autofs4_catatonic_mode(sbi); + /* Check if we have become catatonic */ + mutex_lock(&sbi->wq_mutex); + if (!sbi->catatonic) { + pipe = sbi->pipe; + get_file(pipe); + } + mutex_unlock(&sbi->wq_mutex); + + if (pipe) { + if (autofs4_write(pipe, &pkt, pktsz)) + autofs4_catatonic_mode(sbi); + fput(pipe); + } } static int autofs4_getpath(struct autofs_sb_info *sbi, @@ -191,58 +214,55 @@ static int autofs4_getpath(struct autofs_sb_info *sbi, } static struct autofs_wait_queue * -autofs4_find_wait(struct autofs_sb_info *sbi, - char *name, unsigned int hash, unsigned int len) +autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr) { struct autofs_wait_queue *wq; for (wq = sbi->queues; wq; wq = wq->next) { - if (wq->hash == hash && - wq->len == len && - wq->name && !memcmp(wq->name, name, len)) + if (wq->name.hash == qstr->hash && + wq->name.len == qstr->len && + wq->name.name && + !memcmp(wq->name.name, qstr->name, qstr->len)) break; } return wq; } -int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, - enum autofs_notify notify) +/* + * Check if we have a valid request. + * Returns + * 1 if the request should continue. + * In this case we can return an autofs_wait_queue entry if one is + * found or NULL to idicate a new wait needs to be created. + * 0 or a negative errno if the request shouldn't continue. + */ +static int validate_request(struct autofs_wait_queue **wait, + struct autofs_sb_info *sbi, + struct qstr *qstr, + struct dentry*dentry, enum autofs_notify notify) { - struct autofs_info *ino; struct autofs_wait_queue *wq; - char *name; - unsigned int len = 0; - unsigned int hash = 0; - int status, type; - - /* In catatonic mode, we don't wait for nobody */ - if (sbi->catatonic) - return -ENOENT; - - name = kmalloc(NAME_MAX + 1, GFP_KERNEL); - if (!name) - return -ENOMEM; + struct autofs_info *ino; - /* If this is a direct mount request create a dummy name */ - if (IS_ROOT(dentry) && (sbi->type & AUTOFS_TYPE_DIRECT)) - len = sprintf(name, "%p", dentry); - else { - len = autofs4_getpath(sbi, dentry, &name); - if (!len) { - kfree(name); - return -ENOENT; - } + /* Wait in progress, continue; */ + wq = autofs4_find_wait(sbi, qstr); + if (wq) { + *wait = wq; + return 1; } - hash = full_name_hash(name, len); - if (mutex_lock_interruptible(&sbi->wq_mutex)) { - kfree(name); - return -EINTR; - } + *wait = NULL; - wq = autofs4_find_wait(sbi, name, hash, len); + /* If we don't yet have any info this is a new request */ ino = autofs4_dentry_ino(dentry); - if (!wq && ino && notify == NFY_NONE) { + if (!ino) + return 1; + + /* + * If we've been asked to wait on an existing expire (NFY_NONE) + * but there is no wait in the queue ... + */ + if (notify == NFY_NONE) { /* * Either we've betean the pending expire to post it's * wait or it finished while we waited on the mutex. @@ -253,13 +273,14 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, while (ino->flags & AUTOFS_INF_EXPIRING) { mutex_unlock(&sbi->wq_mutex); schedule_timeout_interruptible(HZ/10); - if (mutex_lock_interruptible(&sbi->wq_mutex)) { - kfree(name); + if (mutex_lock_interruptible(&sbi->wq_mutex)) return -EINTR; + + wq = autofs4_find_wait(sbi, qstr); + if (wq) { + *wait = wq; + return 1; } - wq = autofs4_find_wait(sbi, name, hash, len); - if (wq) - break; } /* @@ -267,18 +288,96 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, * cases where we wait on NFY_NONE neither depend on the * return status of the wait. */ - if (!wq) { + return 0; + } + + /* + * If we've been asked to trigger a mount and the request + * completed while we waited on the mutex ... + */ + if (notify == NFY_MOUNT) { + /* + * If the dentry isn't hashed just go ahead and try the + * mount again with a new wait (not much else we can do). + */ + if (!d_unhashed(dentry)) { + /* + * But if the dentry is hashed, that means that we + * got here through the revalidate path. Thus, we + * need to check if the dentry has been mounted + * while we waited on the wq_mutex. If it has, + * simply return success. + */ + if (d_mountpoint(dentry)) + return 0; + } + } + + return 1; +} + +int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, + enum autofs_notify notify) +{ + struct autofs_wait_queue *wq; + struct qstr qstr; + char *name; + int status, ret, type; + + /* In catatonic mode, we don't wait for nobody */ + if (sbi->catatonic) + return -ENOENT; + + if (!dentry->d_inode) { + /* + * A wait for a negative dentry is invalid for certain + * cases. A direct or offset mount "always" has its mount + * point directory created and so the request dentry must + * be positive or the map key doesn't exist. The situation + * is very similar for indirect mounts except only dentrys + * in the root of the autofs file system may be negative. + */ + if (sbi->type & (AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET)) + return -ENOENT; + else if (!IS_ROOT(dentry->d_parent)) + return -ENOENT; + } + + name = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!name) + return -ENOMEM; + + /* If this is a direct mount request create a dummy name */ + if (IS_ROOT(dentry) && (sbi->type & AUTOFS_TYPE_DIRECT)) + qstr.len = sprintf(name, "%p", dentry); + else { + qstr.len = autofs4_getpath(sbi, dentry, &name); + if (!qstr.len) { kfree(name); - mutex_unlock(&sbi->wq_mutex); - return 0; + return -ENOENT; } } + qstr.name = name; + qstr.hash = full_name_hash(name, qstr.len); + + if (mutex_lock_interruptible(&sbi->wq_mutex)) { + kfree(qstr.name); + return -EINTR; + } + + ret = validate_request(&wq, sbi, &qstr, dentry, notify); + if (ret <= 0) { + if (ret == 0) + mutex_unlock(&sbi->wq_mutex); + kfree(qstr.name); + return ret; + } if (!wq) { /* Create a new wait queue */ wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL); if (!wq) { - kfree(name); + kfree(qstr.name); mutex_unlock(&sbi->wq_mutex); return -ENOMEM; } @@ -289,9 +388,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, wq->next = sbi->queues; sbi->queues = wq; init_waitqueue_head(&wq->queue); - wq->hash = hash; - wq->name = name; - wq->len = len; + memcpy(&wq->name, &qstr, sizeof(struct qstr)); wq->dev = autofs4_get_dev(sbi); wq->ino = autofs4_get_ino(sbi); wq->uid = current->uid; @@ -299,7 +396,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, wq->pid = current->pid; wq->tgid = current->tgid; wq->status = -EINTR; /* Status return if interrupted */ - atomic_set(&wq->wait_ctr, 2); + wq->wait_ctr = 2; mutex_unlock(&sbi->wq_mutex); if (sbi->version < 5) { @@ -319,28 +416,25 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, } DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", - (unsigned long) wq->wait_queue_token, wq->len, wq->name, notify); + (unsigned long) wq->wait_queue_token, wq->name.len, + wq->name.name, notify); /* autofs4_notify_daemon() may block */ autofs4_notify_daemon(sbi, wq, type); } else { - atomic_inc(&wq->wait_ctr); + wq->wait_ctr++; mutex_unlock(&sbi->wq_mutex); - kfree(name); + kfree(qstr.name); DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d", - (unsigned long) wq->wait_queue_token, wq->len, wq->name, notify); - } - - /* wq->name is NULL if and only if the lock is already released */ - - if (sbi->catatonic) { - /* We might have slept, so check again for catatonic mode */ - wq->status = -ENOENT; - kfree(wq->name); - wq->name = NULL; + (unsigned long) wq->wait_queue_token, wq->name.len, + wq->name.name, notify); } - if (wq->name) { + /* + * wq->name.name is NULL iff the lock is already released + * or the mount has been made catatonic. + */ + if (wq->name.name) { /* Block all but "shutdown" signals while waiting */ sigset_t oldset; unsigned long irqflags; @@ -351,7 +445,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, recalc_sigpending(); spin_unlock_irqrestore(¤t->sighand->siglock, irqflags); - wait_event_interruptible(wq->queue, wq->name == NULL); + wait_event_interruptible(wq->queue, wq->name.name == NULL); spin_lock_irqsave(¤t->sighand->siglock, irqflags); current->blocked = oldset; @@ -364,8 +458,10 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, status = wq->status; /* Are we the last process to need status? */ - if (atomic_dec_and_test(&wq->wait_ctr)) + mutex_lock(&sbi->wq_mutex); + if (!--wq->wait_ctr) kfree(wq); + mutex_unlock(&sbi->wq_mutex); return status; } @@ -387,16 +483,13 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok } *wql = wq->next; /* Unlink from chain */ - mutex_unlock(&sbi->wq_mutex); - kfree(wq->name); - wq->name = NULL; /* Do not wait on this queue */ - + kfree(wq->name.name); + wq->name.name = NULL; /* Do not wait on this queue */ wq->status = status; - - if (atomic_dec_and_test(&wq->wait_ctr)) /* Is anyone still waiting for this guy? */ + wake_up_interruptible(&wq->queue); + if (!--wq->wait_ctr) kfree(wq); - else - wake_up_interruptible(&wq->queue); + mutex_unlock(&sbi->wq_mutex); return 0; } diff --git a/fs/bad_inode.c b/fs/bad_inode.c index f1c2ea8342f5..5f1538c03b1b 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -243,8 +243,7 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, return -EIO; } -static int bad_inode_permission(struct inode *inode, int mask, - struct nameidata *nd) +static int bad_inode_permission(struct inode *inode, int mask) { return -EIO; } diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index e8717de3bab3..9286b2af893a 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -66,6 +66,7 @@ static struct kmem_cache *befs_inode_cachep; static const struct file_operations befs_dir_operations = { .read = generic_read_dir, .readdir = befs_readdir, + .llseek = generic_file_llseek, }; static const struct inode_operations befs_dir_inode_operations = { @@ -289,7 +290,7 @@ befs_destroy_inode(struct inode *inode) kmem_cache_free(befs_inode_cachep, BEFS_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct befs_inode_info *bi = (struct befs_inode_info *) foo; @@ -649,7 +650,7 @@ enum { Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err, }; -static match_table_t befs_tokens = { +static const match_table_t befs_tokens = { {Opt_uid, "uid=%d"}, {Opt_gid, "gid=%d"}, {Opt_charset, "iocharset=%s"}, diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h index 70f5d3a8eede..7109e451abf7 100644 --- a/fs/bfs/bfs.h +++ b/fs/bfs/bfs.h @@ -16,8 +16,9 @@ struct bfs_sb_info { unsigned long si_freei; unsigned long si_lf_eblk; unsigned long si_lasti; - unsigned long * si_imap; - struct buffer_head * si_sbh; /* buffer header w/superblock */ + unsigned long *si_imap; + struct buffer_head *si_sbh; /* buffer header w/superblock */ + struct mutex bfs_lock; }; /* diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 034950cb3cbe..ed8feb052df9 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -32,16 +32,17 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir) struct inode *dir = f->f_path.dentry->d_inode; struct buffer_head *bh; struct bfs_dirent *de; + struct bfs_sb_info *info = BFS_SB(dir->i_sb); unsigned int offset; int block; - lock_kernel(); + mutex_lock(&info->bfs_lock); if (f->f_pos & (BFS_DIRENT_SIZE - 1)) { printf("Bad f_pos=%08lx for %s:%08lx\n", (unsigned long)f->f_pos, dir->i_sb->s_id, dir->i_ino); - unlock_kernel(); + mutex_unlock(&info->bfs_lock); return -EBADF; } @@ -61,7 +62,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir) le16_to_cpu(de->ino), DT_UNKNOWN) < 0) { brelse(bh); - unlock_kernel(); + mutex_unlock(&info->bfs_lock); return 0; } } @@ -71,7 +72,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir) brelse(bh); } - unlock_kernel(); + mutex_unlock(&info->bfs_lock); return 0; } @@ -95,10 +96,10 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode, inode = new_inode(s); if (!inode) return -ENOSPC; - lock_kernel(); + mutex_lock(&info->bfs_lock); ino = find_first_zero_bit(info->si_imap, info->si_lasti); if (ino > info->si_lasti) { - unlock_kernel(); + mutex_unlock(&info->bfs_lock); iput(inode); return -ENOSPC; } @@ -124,11 +125,11 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode, inode->i_ino); if (err) { inode_dec_link_count(inode); + mutex_unlock(&info->bfs_lock); iput(inode); - unlock_kernel(); return err; } - unlock_kernel(); + mutex_unlock(&info->bfs_lock); d_instantiate(dentry, inode); return 0; } @@ -139,22 +140,23 @@ static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry, struct inode *inode = NULL; struct buffer_head *bh; struct bfs_dirent *de; + struct bfs_sb_info *info = BFS_SB(dir->i_sb); if (dentry->d_name.len > BFS_NAMELEN) return ERR_PTR(-ENAMETOOLONG); - lock_kernel(); + mutex_lock(&info->bfs_lock); bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de); if (bh) { unsigned long ino = (unsigned long)le16_to_cpu(de->ino); brelse(bh); inode = bfs_iget(dir->i_sb, ino); if (IS_ERR(inode)) { - unlock_kernel(); + mutex_unlock(&info->bfs_lock); return ERR_CAST(inode); } } - unlock_kernel(); + mutex_unlock(&info->bfs_lock); d_add(dentry, inode); return NULL; } @@ -163,13 +165,14 @@ static int bfs_link(struct dentry *old, struct inode *dir, struct dentry *new) { struct inode *inode = old->d_inode; + struct bfs_sb_info *info = BFS_SB(inode->i_sb); int err; - lock_kernel(); + mutex_lock(&info->bfs_lock); err = bfs_add_entry(dir, new->d_name.name, new->d_name.len, inode->i_ino); if (err) { - unlock_kernel(); + mutex_unlock(&info->bfs_lock); return err; } inc_nlink(inode); @@ -177,19 +180,19 @@ static int bfs_link(struct dentry *old, struct inode *dir, mark_inode_dirty(inode); atomic_inc(&inode->i_count); d_instantiate(new, inode); - unlock_kernel(); + mutex_unlock(&info->bfs_lock); return 0; } static int bfs_unlink(struct inode *dir, struct dentry *dentry) { int error = -ENOENT; - struct inode *inode; + struct inode *inode = dentry->d_inode; struct buffer_head *bh; struct bfs_dirent *de; + struct bfs_sb_info *info = BFS_SB(inode->i_sb); - inode = dentry->d_inode; - lock_kernel(); + mutex_lock(&info->bfs_lock); bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de); if (!bh || (le16_to_cpu(de->ino) != inode->i_ino)) goto out_brelse; @@ -210,7 +213,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry) out_brelse: brelse(bh); - unlock_kernel(); + mutex_unlock(&info->bfs_lock); return error; } @@ -220,6 +223,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode, *new_inode; struct buffer_head *old_bh, *new_bh; struct bfs_dirent *old_de, *new_de; + struct bfs_sb_info *info; int error = -ENOENT; old_bh = new_bh = NULL; @@ -227,7 +231,9 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (S_ISDIR(old_inode->i_mode)) return -EINVAL; - lock_kernel(); + info = BFS_SB(old_inode->i_sb); + + mutex_lock(&info->bfs_lock); old_bh = bfs_find_entry(old_dir, old_dentry->d_name.name, old_dentry->d_name.len, &old_de); @@ -264,7 +270,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, error = 0; end_rename: - unlock_kernel(); + mutex_unlock(&info->bfs_lock); brelse(old_bh); brelse(new_bh); return error; diff --git a/fs/bfs/file.c b/fs/bfs/file.c index b11e63e8fbcd..6a021265f018 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -99,7 +99,7 @@ static int bfs_get_block(struct inode *inode, sector_t block, return -ENOSPC; /* The rest has to be protected against itself. */ - lock_kernel(); + mutex_lock(&info->bfs_lock); /* * If the last data block for this file is the last allocated @@ -151,7 +151,7 @@ static int bfs_get_block(struct inode *inode, sector_t block, mark_buffer_dirty(sbh); map_bh(bh_result, sb, phys); out: - unlock_kernel(); + mutex_unlock(&info->bfs_lock); return err; } diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 8db623838b50..0ed57b5ee012 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -104,6 +104,7 @@ static int bfs_write_inode(struct inode *inode, int unused) struct bfs_inode *di; struct buffer_head *bh; int block, off; + struct bfs_sb_info *info = BFS_SB(inode->i_sb); dprintf("ino=%08x\n", ino); @@ -112,13 +113,13 @@ static int bfs_write_inode(struct inode *inode, int unused) return -EIO; } - lock_kernel(); + mutex_lock(&info->bfs_lock); block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1; bh = sb_bread(inode->i_sb, block); if (!bh) { printf("Unable to read inode %s:%08x\n", inode->i_sb->s_id, ino); - unlock_kernel(); + mutex_unlock(&info->bfs_lock); return -EIO; } @@ -145,7 +146,7 @@ static int bfs_write_inode(struct inode *inode, int unused) mark_buffer_dirty(bh); brelse(bh); - unlock_kernel(); + mutex_unlock(&info->bfs_lock); return 0; } @@ -170,7 +171,7 @@ static void bfs_delete_inode(struct inode *inode) inode->i_size = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; - lock_kernel(); + mutex_lock(&info->bfs_lock); mark_inode_dirty(inode); block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1; @@ -178,7 +179,7 @@ static void bfs_delete_inode(struct inode *inode) if (!bh) { printf("Unable to read inode %s:%08lx\n", inode->i_sb->s_id, ino); - unlock_kernel(); + mutex_unlock(&info->bfs_lock); return; } off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK; @@ -204,14 +205,16 @@ static void bfs_delete_inode(struct inode *inode) info->si_lf_eblk = bi->i_sblock - 1; mark_buffer_dirty(info->si_sbh); } - unlock_kernel(); + mutex_unlock(&info->bfs_lock); clear_inode(inode); } static void bfs_put_super(struct super_block *s) { struct bfs_sb_info *info = BFS_SB(s); + brelse(info->si_sbh); + mutex_destroy(&info->bfs_lock); kfree(info->si_imap); kfree(info); s->s_fs_info = NULL; @@ -236,11 +239,13 @@ static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) static void bfs_write_super(struct super_block *s) { - lock_kernel(); + struct bfs_sb_info *info = BFS_SB(s); + + mutex_lock(&info->bfs_lock); if (!(s->s_flags & MS_RDONLY)) - mark_buffer_dirty(BFS_SB(s)->si_sbh); + mark_buffer_dirty(info->si_sbh); s->s_dirt = 0; - unlock_kernel(); + mutex_unlock(&info->bfs_lock); } static struct kmem_cache *bfs_inode_cachep; @@ -259,7 +264,7 @@ static void bfs_destroy_inode(struct inode *inode) kmem_cache_free(bfs_inode_cachep, BFS_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct bfs_inode_info *bi = foo; @@ -380,7 +385,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) struct bfs_inode *di; int block = (i - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1; int off = (i - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK; - unsigned long sblock, eblock; + unsigned long eblock; if (!off) { brelse(bh); @@ -399,7 +404,6 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) set_bit(i, info->si_imap); info->si_freeb -= BFS_FILEBLOCKS(di); - sblock = le32_to_cpu(di->i_sblock); eblock = le32_to_cpu(di->i_eblock); if (eblock > info->si_lf_eblk) info->si_lf_eblk = eblock; @@ -410,6 +414,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) s->s_dirt = 1; } dump_imap("read_super", s); + mutex_init(&info->bfs_lock); return 0; out: diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index ba4cddb92f1d..204cfd1d7676 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -444,12 +444,6 @@ beyond_if: regs->gp = ex.a_gpvalue; #endif start_thread(regs, ex.a_entry, current->mm->start_stack); - if (unlikely(current->ptrace & PT_PTRACED)) { - if (current->ptrace & PT_TRACE_EXEC) - ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP); - else - send_sig(SIGTRAP, current, 0); - } return 0; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index d48ff5f370f4..655ed8d30a86 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -131,6 +131,15 @@ static int padzero(unsigned long elf_bss) #define STACK_ALLOC(sp, len) ({ sp -= len ; sp; }) #endif +#ifndef ELF_BASE_PLATFORM +/* + * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture. + * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value + * will be copied to the user stack in the same manner as AT_PLATFORM. + */ +#define ELF_BASE_PLATFORM NULL +#endif + static int create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, unsigned long load_addr, unsigned long interp_load_addr) @@ -142,7 +151,9 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, elf_addr_t __user *envp; elf_addr_t __user *sp; elf_addr_t __user *u_platform; + elf_addr_t __user *u_base_platform; const char *k_platform = ELF_PLATFORM; + const char *k_base_platform = ELF_BASE_PLATFORM; int items; elf_addr_t *elf_info; int ei_index = 0; @@ -172,6 +183,19 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, return -EFAULT; } + /* + * If this architecture has a "base" platform capability + * string, copy it to userspace. + */ + u_base_platform = NULL; + if (k_base_platform) { + size_t len = strlen(k_base_platform) + 1; + + u_base_platform = (elf_addr_t __user *)STACK_ALLOC(p, len); + if (__copy_to_user(u_base_platform, k_base_platform, len)) + return -EFAULT; + } + /* Create the ELF interpreter info */ elf_info = (elf_addr_t *)current->mm->saved_auxv; /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */ @@ -204,10 +228,15 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, NEW_AUX_ENT(AT_GID, tsk->gid); NEW_AUX_ENT(AT_EGID, tsk->egid); NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); + NEW_AUX_ENT(AT_EXECFN, bprm->exec); if (k_platform) { NEW_AUX_ENT(AT_PLATFORM, (elf_addr_t)(unsigned long)u_platform); } + if (k_base_platform) { + NEW_AUX_ENT(AT_BASE_PLATFORM, + (elf_addr_t)(unsigned long)u_base_platform); + } if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) { NEW_AUX_ENT(AT_EXECFD, bprm->interp_data); } @@ -974,12 +1003,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) #endif start_thread(regs, elf_entry, bprm->p); - if (unlikely(current->ptrace & PT_PTRACED)) { - if (current->ptrace & PT_TRACE_EXEC) - ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP); - else - send_sig(SIGTRAP, current, 0); - } retval = 0; out: kfree(loc); @@ -1477,7 +1500,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, const struct user_regset_view *view = task_user_regset_view(dump_task); struct elf_thread_core_info *t; struct elf_prpsinfo *psinfo; - struct task_struct *g, *p; + struct core_thread *ct; unsigned int i; info->size = 0; @@ -1516,31 +1539,26 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, /* * Allocate a structure for each thread. */ - rcu_read_lock(); - do_each_thread(g, p) - if (p->mm == dump_task->mm) { - t = kzalloc(offsetof(struct elf_thread_core_info, - notes[info->thread_notes]), - GFP_ATOMIC); - if (unlikely(!t)) { - rcu_read_unlock(); - return 0; - } - t->task = p; - if (p == dump_task || !info->thread) { - t->next = info->thread; - info->thread = t; - } else { - /* - * Make sure to keep the original task at - * the head of the list. - */ - t->next = info->thread->next; - info->thread->next = t; - } + for (ct = &dump_task->mm->core_state->dumper; ct; ct = ct->next) { + t = kzalloc(offsetof(struct elf_thread_core_info, + notes[info->thread_notes]), + GFP_KERNEL); + if (unlikely(!t)) + return 0; + + t->task = ct->task; + if (ct->task == dump_task || !info->thread) { + t->next = info->thread; + info->thread = t; + } else { + /* + * Make sure to keep the original task at + * the head of the list. + */ + t->next = info->thread->next; + info->thread->next = t; } - while_each_thread(g, p); - rcu_read_unlock(); + } /* * Now fill in each thread's information. @@ -1687,7 +1705,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, { #define NUM_NOTES 6 struct list_head *t; - struct task_struct *g, *p; info->notes = NULL; info->prstatus = NULL; @@ -1719,20 +1736,19 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, info->thread_status_size = 0; if (signr) { + struct core_thread *ct; struct elf_thread_status *ets; - rcu_read_lock(); - do_each_thread(g, p) - if (current->mm == p->mm && current != p) { - ets = kzalloc(sizeof(*ets), GFP_ATOMIC); - if (!ets) { - rcu_read_unlock(); - return 0; - } - ets->thread = p; - list_add(&ets->list, &info->thread_list); - } - while_each_thread(g, p); - rcu_read_unlock(); + + for (ct = current->mm->core_state->dumper.next; + ct; ct = ct->next) { + ets = kzalloc(sizeof(*ets), GFP_KERNEL); + if (!ets) + return 0; + + ets->thread = ct->task; + list_add(&ets->list, &info->thread_list); + } + list_for_each(t, &info->thread_list) { int sz; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index d051a32e6270..80c1f952ef78 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -433,13 +433,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, entryaddr = interp_params.entry_addr ?: exec_params.entry_addr; start_thread(regs, entryaddr, current->mm->start_stack); - if (unlikely(current->ptrace & PT_PTRACED)) { - if (current->ptrace & PT_TRACE_EXEC) - ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP); - else - send_sig(SIGTRAP, current, 0); - } - retval = 0; error: @@ -477,6 +470,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, char __user *u_platform, *p; long hwcap; int loop; + int nr; /* reset for each csp adjustment */ /* we're going to shovel a whole load of stuff onto the stack */ #ifdef CONFIG_MMU @@ -549,10 +543,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, /* force 16 byte _final_ alignment here for generality */ #define DLINFO_ITEMS 13 - nitems = 1 + DLINFO_ITEMS + (k_platform ? 1 : 0); -#ifdef DLINFO_ARCH_ITEMS - nitems += DLINFO_ARCH_ITEMS; -#endif + nitems = 1 + DLINFO_ITEMS + (k_platform ? 1 : 0) + AT_VECTOR_SIZE_ARCH; csp = sp; sp -= nitems * 2 * sizeof(unsigned long); @@ -564,39 +555,46 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, sp -= sp & 15UL; /* put the ELF interpreter info on the stack */ -#define NEW_AUX_ENT(nr, id, val) \ +#define NEW_AUX_ENT(id, val) \ do { \ struct { unsigned long _id, _val; } __user *ent; \ \ ent = (void __user *) csp; \ __put_user((id), &ent[nr]._id); \ __put_user((val), &ent[nr]._val); \ + nr++; \ } while (0) + nr = 0; csp -= 2 * sizeof(unsigned long); - NEW_AUX_ENT(0, AT_NULL, 0); + NEW_AUX_ENT(AT_NULL, 0); if (k_platform) { + nr = 0; csp -= 2 * sizeof(unsigned long); - NEW_AUX_ENT(0, AT_PLATFORM, + NEW_AUX_ENT(AT_PLATFORM, (elf_addr_t) (unsigned long) u_platform); } + nr = 0; csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long); - NEW_AUX_ENT( 0, AT_HWCAP, hwcap); - NEW_AUX_ENT( 1, AT_PAGESZ, PAGE_SIZE); - NEW_AUX_ENT( 2, AT_CLKTCK, CLOCKS_PER_SEC); - NEW_AUX_ENT( 3, AT_PHDR, exec_params->ph_addr); - NEW_AUX_ENT( 4, AT_PHENT, sizeof(struct elf_phdr)); - NEW_AUX_ENT( 5, AT_PHNUM, exec_params->hdr.e_phnum); - NEW_AUX_ENT( 6, AT_BASE, interp_params->elfhdr_addr); - NEW_AUX_ENT( 7, AT_FLAGS, 0); - NEW_AUX_ENT( 8, AT_ENTRY, exec_params->entry_addr); - NEW_AUX_ENT( 9, AT_UID, (elf_addr_t) current->uid); - NEW_AUX_ENT(10, AT_EUID, (elf_addr_t) current->euid); - NEW_AUX_ENT(11, AT_GID, (elf_addr_t) current->gid); - NEW_AUX_ENT(12, AT_EGID, (elf_addr_t) current->egid); + NEW_AUX_ENT(AT_HWCAP, hwcap); + NEW_AUX_ENT(AT_PAGESZ, PAGE_SIZE); + NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); + NEW_AUX_ENT(AT_PHDR, exec_params->ph_addr); + NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr)); + NEW_AUX_ENT(AT_PHNUM, exec_params->hdr.e_phnum); + NEW_AUX_ENT(AT_BASE, interp_params->elfhdr_addr); + NEW_AUX_ENT(AT_FLAGS, 0); + NEW_AUX_ENT(AT_ENTRY, exec_params->entry_addr); + NEW_AUX_ENT(AT_UID, (elf_addr_t) current->uid); + NEW_AUX_ENT(AT_EUID, (elf_addr_t) current->euid); + NEW_AUX_ENT(AT_GID, (elf_addr_t) current->gid); + NEW_AUX_ENT(AT_EGID, (elf_addr_t) current->egid); #ifdef ARCH_DLINFO + nr = 0; + csp -= AT_VECTOR_SIZE_ARCH * 2 * sizeof(unsigned long); + /* ARCH_DLINFO must come last so platform specific code can enforce * special alignment requirements on the AUXV if necessary (eg. PPC). */ @@ -1573,7 +1571,6 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, struct memelfnote *notes = NULL; struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */ struct elf_prpsinfo *psinfo = NULL; /* NT_PRPSINFO */ - struct task_struct *g, *p; LIST_HEAD(thread_list); struct list_head *t; elf_fpregset_t *fpu = NULL; @@ -1622,20 +1619,19 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, #endif if (signr) { + struct core_thread *ct; struct elf_thread_status *tmp; - rcu_read_lock(); - do_each_thread(g,p) - if (current->mm == p->mm && current != p) { - tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); - if (!tmp) { - rcu_read_unlock(); - goto cleanup; - } - tmp->thread = p; - list_add(&tmp->list, &thread_list); - } - while_each_thread(g,p); - rcu_read_unlock(); + + for (ct = current->mm->core_state->dumper.next; + ct; ct = ct->next) { + tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); + if (!tmp) + goto cleanup; + + tmp->thread = ct->task; + list_add(&tmp->list, &thread_list); + } + list_for_each(t, &thread_list) { struct elf_thread_status *tmp; int sz; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 2cb1acda3a82..dfc0197905ca 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -914,15 +914,14 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) /* Stash our initial stack pointer into the mm structure */ current->mm->start_stack = (unsigned long )sp; - +#ifdef FLAT_PLAT_INIT + FLAT_PLAT_INIT(regs); +#endif DBG_FLT("start_thread(regs=0x%x, entry=0x%x, start_stack=0x%x)\n", (int)regs, (int)start_addr, (int)current->mm->start_stack); start_thread(regs, start_addr, current->mm->start_stack); - if (current->ptrace & PT_PTRACED) - send_sig(SIGTRAP, current, 0); - return 0; } diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 7191306367c5..8d7e88e02e0f 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -27,6 +27,7 @@ #include <linux/namei.h> #include <linux/mount.h> #include <linux/syscalls.h> +#include <linux/fs.h> #include <asm/uaccess.h> @@ -119,8 +120,6 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (bprm->misc_bang) goto _ret; - bprm->misc_bang = 1; - /* to keep locking time low, we copy the interpreter string */ read_lock(&entries_lock); fmt = check_file(bprm); @@ -198,6 +197,8 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (retval < 0) goto _error; + bprm->misc_bang = 1; + retval = search_binary_handler (bprm, regs); if (retval < 0) goto _error; @@ -535,31 +536,16 @@ static ssize_t bm_entry_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos) { Node *e = file->f_path.dentry->d_inode->i_private; - loff_t pos = *ppos; ssize_t res; char *page; - int len; if (!(page = (char*) __get_free_page(GFP_KERNEL))) return -ENOMEM; entry_status(e, page); - len = strlen(page); - res = -EINVAL; - if (pos < 0) - goto out; - res = 0; - if (pos >= len) - goto out; - if (len < pos + nbytes) - nbytes = len - pos; - res = -EFAULT; - if (copy_to_user(buf, page + pos, nbytes)) - goto out; - *ppos = pos + nbytes; - res = nbytes; -out: + res = simple_read_from_buffer(buf, nbytes, ppos, page, strlen(page)); + free_page((unsigned long) page); return res; } diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index fdc36bfd6a7b..68be580ba289 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -274,8 +274,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) map_hpux_gateway_page(current,current->mm); start_thread_som(regs, som_entry, bprm->p); - if (current->ptrace & PT_PTRACED) - send_sig(SIGTRAP, current, 0); return 0; /* error cleanup */ diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c new file mode 100644 index 000000000000..19caf7c962ac --- /dev/null +++ b/fs/bio-integrity.c @@ -0,0 +1,745 @@ +/* + * bio-integrity.c - bio data integrity extensions + * + * Copyright (C) 2007, 2008 Oracle Corporation + * Written by: Martin K. Petersen <martin.petersen@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. + * + */ + +#include <linux/blkdev.h> +#include <linux/mempool.h> +#include <linux/bio.h> +#include <linux/workqueue.h> + +static struct kmem_cache *bio_integrity_slab __read_mostly; +static struct workqueue_struct *kintegrityd_wq; + +/** + * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio + * @bio: bio to attach integrity metadata to + * @gfp_mask: Memory allocation mask + * @nr_vecs: Number of integrity metadata scatter-gather elements + * @bs: bio_set to allocate from + * + * Description: This function prepares a bio for attaching integrity + * metadata. nr_vecs specifies the maximum number of pages containing + * integrity metadata that can be attached. + */ +struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, + gfp_t gfp_mask, + unsigned int nr_vecs, + struct bio_set *bs) +{ + struct bio_integrity_payload *bip; + struct bio_vec *iv; + unsigned long idx; + + BUG_ON(bio == NULL); + + bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); + if (unlikely(bip == NULL)) { + printk(KERN_ERR "%s: could not alloc bip\n", __func__); + return NULL; + } + + memset(bip, 0, sizeof(*bip)); + + iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, bs); + if (unlikely(iv == NULL)) { + printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__); + mempool_free(bip, bs->bio_integrity_pool); + return NULL; + } + + bip->bip_pool = idx; + bip->bip_vec = iv; + bip->bip_bio = bio; + bio->bi_integrity = bip; + + return bip; +} +EXPORT_SYMBOL(bio_integrity_alloc_bioset); + +/** + * bio_integrity_alloc - Allocate integrity payload and attach it to bio + * @bio: bio to attach integrity metadata to + * @gfp_mask: Memory allocation mask + * @nr_vecs: Number of integrity metadata scatter-gather elements + * + * Description: This function prepares a bio for attaching integrity + * metadata. nr_vecs specifies the maximum number of pages containing + * integrity metadata that can be attached. + */ +struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, + gfp_t gfp_mask, + unsigned int nr_vecs) +{ + return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set); +} +EXPORT_SYMBOL(bio_integrity_alloc); + +/** + * bio_integrity_free - Free bio integrity payload + * @bio: bio containing bip to be freed + * @bs: bio_set this bio was allocated from + * + * Description: Used to free the integrity portion of a bio. Usually + * called from bio_free(). + */ +void bio_integrity_free(struct bio *bio, struct bio_set *bs) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + + BUG_ON(bip == NULL); + + /* A cloned bio doesn't own the integrity metadata */ + if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY) + && bip->bip_buf != NULL) + kfree(bip->bip_buf); + + mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]); + mempool_free(bip, bs->bio_integrity_pool); + + bio->bi_integrity = NULL; +} +EXPORT_SYMBOL(bio_integrity_free); + +/** + * bio_integrity_add_page - Attach integrity metadata + * @bio: bio to update + * @page: page containing integrity metadata + * @len: number of bytes of integrity metadata in page + * @offset: start offset within page + * + * Description: Attach a page containing integrity metadata to bio. + */ +int bio_integrity_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct bio_vec *iv; + + if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_pool)) { + printk(KERN_ERR "%s: bip_vec full\n", __func__); + return 0; + } + + iv = bip_vec_idx(bip, bip->bip_vcnt); + BUG_ON(iv == NULL); + BUG_ON(iv->bv_page != NULL); + + iv->bv_page = page; + iv->bv_len = len; + iv->bv_offset = offset; + bip->bip_vcnt++; + + return len; +} +EXPORT_SYMBOL(bio_integrity_add_page); + +static int bdev_integrity_enabled(struct block_device *bdev, int rw) +{ + struct blk_integrity *bi = bdev_get_integrity(bdev); + + if (bi == NULL) + return 0; + + if (rw == READ && bi->verify_fn != NULL && + (bi->flags & INTEGRITY_FLAG_READ)) + return 1; + + if (rw == WRITE && bi->generate_fn != NULL && + (bi->flags & INTEGRITY_FLAG_WRITE)) + return 1; + + return 0; +} + +/** + * bio_integrity_enabled - Check whether integrity can be passed + * @bio: bio to check + * + * Description: Determines whether bio_integrity_prep() can be called + * on this bio or not. bio data direction and target device must be + * set prior to calling. The functions honors the write_generate and + * read_verify flags in sysfs. + */ +int bio_integrity_enabled(struct bio *bio) +{ + /* Already protected? */ + if (bio_integrity(bio)) + return 0; + + return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio)); +} +EXPORT_SYMBOL(bio_integrity_enabled); + +/** + * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto + * @bi: blk_integrity profile for device + * @sectors: Number of 512 sectors to convert + * + * Description: The block layer calculates everything in 512 byte + * sectors but integrity metadata is done in terms of the hardware + * sector size of the storage device. Convert the block layer sectors + * to physical sectors. + */ +static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, + unsigned int sectors) +{ + /* At this point there are only 512b or 4096b DIF/EPP devices */ + if (bi->sector_size == 4096) + return sectors >>= 3; + + return sectors; +} + +/** + * bio_integrity_tag_size - Retrieve integrity tag space + * @bio: bio to inspect + * + * Description: Returns the maximum number of tag bytes that can be + * attached to this bio. Filesystems can use this to determine how + * much metadata to attach to an I/O. + */ +unsigned int bio_integrity_tag_size(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + + BUG_ON(bio->bi_size == 0); + + return bi->tag_size * (bio->bi_size / bi->sector_size); +} +EXPORT_SYMBOL(bio_integrity_tag_size); + +int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip->bip_buf == NULL); + + if (bi->tag_size == 0) + return -1; + + nr_sectors = bio_integrity_hw_sectors(bi, + DIV_ROUND_UP(len, bi->tag_size)); + + if (nr_sectors * bi->tuple_size > bip->bip_size) { + printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", + __func__, nr_sectors * bi->tuple_size, bip->bip_size); + return -1; + } + + if (set) + bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors); + else + bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors); + + return 0; +} + +/** + * bio_integrity_set_tag - Attach a tag buffer to a bio + * @bio: bio to attach buffer to + * @tag_buf: Pointer to a buffer containing tag data + * @len: Length of the included buffer + * + * Description: Use this function to tag a bio by leveraging the extra + * space provided by devices formatted with integrity protection. The + * size of the integrity buffer must be <= to the size reported by + * bio_integrity_tag_size(). + */ +int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len) +{ + BUG_ON(bio_data_dir(bio) != WRITE); + + return bio_integrity_tag(bio, tag_buf, len, 1); +} +EXPORT_SYMBOL(bio_integrity_set_tag); + +/** + * bio_integrity_get_tag - Retrieve a tag buffer from a bio + * @bio: bio to retrieve buffer from + * @tag_buf: Pointer to a buffer for the tag data + * @len: Length of the target buffer + * + * Description: Use this function to retrieve the tag buffer from a + * completed I/O. The size of the integrity buffer must be <= to the + * size reported by bio_integrity_tag_size(). + */ +int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len) +{ + BUG_ON(bio_data_dir(bio) != READ); + + return bio_integrity_tag(bio, tag_buf, len, 0); +} +EXPORT_SYMBOL(bio_integrity_get_tag); + +/** + * bio_integrity_generate - Generate integrity metadata for a bio + * @bio: bio to generate integrity metadata for + * + * Description: Generates integrity metadata for a bio by calling the + * block device's generation callback function. The bio must have a + * bip attached with enough room to accommodate the generated + * integrity metadata. + */ +static void bio_integrity_generate(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity_exchg bix; + struct bio_vec *bv; + sector_t sector = bio->bi_sector; + unsigned int i, sectors, total; + void *prot_buf = bio->bi_integrity->bip_buf; + + total = 0; + bix.disk_name = bio->bi_bdev->bd_disk->disk_name; + bix.sector_size = bi->sector_size; + + bio_for_each_segment(bv, bio, i) { + void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); + bix.data_buf = kaddr + bv->bv_offset; + bix.data_size = bv->bv_len; + bix.prot_buf = prot_buf; + bix.sector = sector; + + bi->generate_fn(&bix); + + sectors = bv->bv_len / bi->sector_size; + sector += sectors; + prot_buf += sectors * bi->tuple_size; + total += sectors * bi->tuple_size; + BUG_ON(total > bio->bi_integrity->bip_size); + + kunmap_atomic(kaddr, KM_USER0); + } +} + +static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi) +{ + if (bi) + return bi->tuple_size; + + return 0; +} + +/** + * bio_integrity_prep - Prepare bio for integrity I/O + * @bio: bio to prepare + * + * Description: Allocates a buffer for integrity metadata, maps the + * pages and attaches them to a bio. The bio must have data + * direction, target device and start sector set priot to calling. In + * the WRITE case, integrity metadata will be generated using the + * block device's integrity function. In the READ case, the buffer + * will be prepared for DMA and a suitable end_io handler set up. + */ +int bio_integrity_prep(struct bio *bio) +{ + struct bio_integrity_payload *bip; + struct blk_integrity *bi; + struct request_queue *q; + void *buf; + unsigned long start, end; + unsigned int len, nr_pages; + unsigned int bytes, offset, i; + unsigned int sectors; + + bi = bdev_get_integrity(bio->bi_bdev); + q = bdev_get_queue(bio->bi_bdev); + BUG_ON(bi == NULL); + BUG_ON(bio_integrity(bio)); + + sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio)); + + /* Allocate kernel buffer for protection data */ + len = sectors * blk_integrity_tuple_size(bi); + buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp); + if (unlikely(buf == NULL)) { + printk(KERN_ERR "could not allocate integrity buffer\n"); + return -EIO; + } + + end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = ((unsigned long) buf) >> PAGE_SHIFT; + nr_pages = end - start; + + /* Allocate bio integrity payload and integrity vectors */ + bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); + if (unlikely(bip == NULL)) { + printk(KERN_ERR "could not allocate data integrity bioset\n"); + kfree(buf); + return -EIO; + } + + bip->bip_buf = buf; + bip->bip_size = len; + bip->bip_sector = bio->bi_sector; + + /* Map it */ + offset = offset_in_page(buf); + for (i = 0 ; i < nr_pages ; i++) { + int ret; + bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + ret = bio_integrity_add_page(bio, virt_to_page(buf), + bytes, offset); + + if (ret == 0) + return 0; + + if (ret < bytes) + break; + + buf += bytes; + len -= bytes; + offset = 0; + } + + /* Install custom I/O completion handler if read verify is enabled */ + if (bio_data_dir(bio) == READ) { + bip->bip_end_io = bio->bi_end_io; + bio->bi_end_io = bio_integrity_endio; + } + + /* Auto-generate integrity metadata if this is a write */ + if (bio_data_dir(bio) == WRITE) + bio_integrity_generate(bio); + + return 0; +} +EXPORT_SYMBOL(bio_integrity_prep); + +/** + * bio_integrity_verify - Verify integrity metadata for a bio + * @bio: bio to verify + * + * Description: This function is called to verify the integrity of a + * bio. The data in the bio io_vec is compared to the integrity + * metadata returned by the HBA. + */ +static int bio_integrity_verify(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity_exchg bix; + struct bio_vec *bv; + sector_t sector = bio->bi_integrity->bip_sector; + unsigned int i, sectors, total, ret; + void *prot_buf = bio->bi_integrity->bip_buf; + + ret = total = 0; + bix.disk_name = bio->bi_bdev->bd_disk->disk_name; + bix.sector_size = bi->sector_size; + + bio_for_each_segment(bv, bio, i) { + void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); + bix.data_buf = kaddr + bv->bv_offset; + bix.data_size = bv->bv_len; + bix.prot_buf = prot_buf; + bix.sector = sector; + + ret = bi->verify_fn(&bix); + + if (ret) { + kunmap_atomic(kaddr, KM_USER0); + break; + } + + sectors = bv->bv_len / bi->sector_size; + sector += sectors; + prot_buf += sectors * bi->tuple_size; + total += sectors * bi->tuple_size; + BUG_ON(total > bio->bi_integrity->bip_size); + + kunmap_atomic(kaddr, KM_USER0); + } + + return ret; +} + +/** + * bio_integrity_verify_fn - Integrity I/O completion worker + * @work: Work struct stored in bio to be verified + * + * Description: This workqueue function is called to complete a READ + * request. The function verifies the transferred integrity metadata + * and then calls the original bio end_io function. + */ +static void bio_integrity_verify_fn(struct work_struct *work) +{ + struct bio_integrity_payload *bip = + container_of(work, struct bio_integrity_payload, bip_work); + struct bio *bio = bip->bip_bio; + int error = bip->bip_error; + + if (bio_integrity_verify(bio)) { + clear_bit(BIO_UPTODATE, &bio->bi_flags); + error = -EIO; + } + + /* Restore original bio completion handler */ + bio->bi_end_io = bip->bip_end_io; + + if (bio->bi_end_io) + bio->bi_end_io(bio, error); +} + +/** + * bio_integrity_endio - Integrity I/O completion function + * @bio: Protected bio + * @error: Pointer to errno + * + * Description: Completion for integrity I/O + * + * Normally I/O completion is done in interrupt context. However, + * verifying I/O integrity is a time-consuming task which must be run + * in process context. This function postpones completion + * accordingly. + */ +void bio_integrity_endio(struct bio *bio, int error) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + + BUG_ON(bip->bip_bio != bio); + + bip->bip_error = error; + INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); + queue_work(kintegrityd_wq, &bip->bip_work); +} +EXPORT_SYMBOL(bio_integrity_endio); + +/** + * bio_integrity_mark_head - Advance bip_vec skip bytes + * @bip: Integrity vector to advance + * @skip: Number of bytes to advance it + */ +void bio_integrity_mark_head(struct bio_integrity_payload *bip, + unsigned int skip) +{ + struct bio_vec *iv; + unsigned int i; + + bip_for_each_vec(iv, bip, i) { + if (skip == 0) { + bip->bip_idx = i; + return; + } else if (skip >= iv->bv_len) { + skip -= iv->bv_len; + } else { /* skip < iv->bv_len) */ + iv->bv_offset += skip; + iv->bv_len -= skip; + bip->bip_idx = i; + return; + } + } +} + +/** + * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long + * @bip: Integrity vector to truncate + * @len: New length of integrity vector + */ +void bio_integrity_mark_tail(struct bio_integrity_payload *bip, + unsigned int len) +{ + struct bio_vec *iv; + unsigned int i; + + bip_for_each_vec(iv, bip, i) { + if (len == 0) { + bip->bip_vcnt = i; + return; + } else if (len >= iv->bv_len) { + len -= iv->bv_len; + } else { /* len < iv->bv_len) */ + iv->bv_len = len; + len = 0; + } + } +} + +/** + * bio_integrity_advance - Advance integrity vector + * @bio: bio whose integrity vector to update + * @bytes_done: number of data bytes that have been completed + * + * Description: This function calculates how many integrity bytes the + * number of completed data bytes correspond to and advances the + * integrity vector accordingly. + */ +void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip == NULL); + BUG_ON(bi == NULL); + + nr_sectors = bio_integrity_hw_sectors(bi, bytes_done >> 9); + bio_integrity_mark_head(bip, nr_sectors * bi->tuple_size); +} +EXPORT_SYMBOL(bio_integrity_advance); + +/** + * bio_integrity_trim - Trim integrity vector + * @bio: bio whose integrity vector to update + * @offset: offset to first data sector + * @sectors: number of data sectors + * + * Description: Used to trim the integrity vector in a cloned bio. + * The ivec will be advanced corresponding to 'offset' data sectors + * and the length will be truncated corresponding to 'len' data + * sectors. + */ +void bio_integrity_trim(struct bio *bio, unsigned int offset, + unsigned int sectors) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip == NULL); + BUG_ON(bi == NULL); + BUG_ON(!bio_flagged(bio, BIO_CLONED)); + + nr_sectors = bio_integrity_hw_sectors(bi, sectors); + bip->bip_sector = bip->bip_sector + offset; + bio_integrity_mark_head(bip, offset * bi->tuple_size); + bio_integrity_mark_tail(bip, sectors * bi->tuple_size); +} +EXPORT_SYMBOL(bio_integrity_trim); + +/** + * bio_integrity_split - Split integrity metadata + * @bio: Protected bio + * @bp: Resulting bio_pair + * @sectors: Offset + * + * Description: Splits an integrity page into a bio_pair. + */ +void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors) +{ + struct blk_integrity *bi; + struct bio_integrity_payload *bip = bio->bi_integrity; + unsigned int nr_sectors; + + if (bio_integrity(bio) == 0) + return; + + bi = bdev_get_integrity(bio->bi_bdev); + BUG_ON(bi == NULL); + BUG_ON(bip->bip_vcnt != 1); + + nr_sectors = bio_integrity_hw_sectors(bi, sectors); + + bp->bio1.bi_integrity = &bp->bip1; + bp->bio2.bi_integrity = &bp->bip2; + + bp->iv1 = bip->bip_vec[0]; + bp->iv2 = bip->bip_vec[0]; + + bp->bip1.bip_vec = &bp->iv1; + bp->bip2.bip_vec = &bp->iv2; + + bp->iv1.bv_len = sectors * bi->tuple_size; + bp->iv2.bv_offset += sectors * bi->tuple_size; + bp->iv2.bv_len -= sectors * bi->tuple_size; + + bp->bip1.bip_sector = bio->bi_integrity->bip_sector; + bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors; + + bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1; + bp->bip1.bip_idx = bp->bip2.bip_idx = 0; +} +EXPORT_SYMBOL(bio_integrity_split); + +/** + * bio_integrity_clone - Callback for cloning bios with integrity metadata + * @bio: New bio + * @bio_src: Original bio + * @bs: bio_set to allocate bip from + * + * Description: Called to allocate a bip when cloning a bio + */ +int bio_integrity_clone(struct bio *bio, struct bio *bio_src, + struct bio_set *bs) +{ + struct bio_integrity_payload *bip_src = bio_src->bi_integrity; + struct bio_integrity_payload *bip; + + BUG_ON(bip_src == NULL); + + bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs); + + if (bip == NULL) + return -EIO; + + memcpy(bip->bip_vec, bip_src->bip_vec, + bip_src->bip_vcnt * sizeof(struct bio_vec)); + + bip->bip_sector = bip_src->bip_sector; + bip->bip_vcnt = bip_src->bip_vcnt; + bip->bip_idx = bip_src->bip_idx; + + return 0; +} +EXPORT_SYMBOL(bio_integrity_clone); + +int bioset_integrity_create(struct bio_set *bs, int pool_size) +{ + bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, + bio_integrity_slab); + if (!bs->bio_integrity_pool) + return -1; + + return 0; +} +EXPORT_SYMBOL(bioset_integrity_create); + +void bioset_integrity_free(struct bio_set *bs) +{ + if (bs->bio_integrity_pool) + mempool_destroy(bs->bio_integrity_pool); +} +EXPORT_SYMBOL(bioset_integrity_free); + +void __init bio_integrity_init_slab(void) +{ + bio_integrity_slab = KMEM_CACHE(bio_integrity_payload, + SLAB_HWCACHE_ALIGN|SLAB_PANIC); +} + +static int __init integrity_init(void) +{ + kintegrityd_wq = create_workqueue("kintegrityd"); + + if (!kintegrityd_wq) + panic("Failed to create kintegrityd\n"); + + return 0; +} +subsys_initcall(integrity_init); @@ -28,24 +28,9 @@ #include <linux/blktrace_api.h> #include <scsi/sg.h> /* for struct sg_iovec */ -#define BIO_POOL_SIZE 2 - static struct kmem_cache *bio_slab __read_mostly; -#define BIOVEC_NR_POOLS 6 - -/* - * a small number of entries is fine, not going to be performance critical. - * basically we just need to survive - */ -#define BIO_SPLIT_ENTRIES 2 -mempool_t *bio_split_pool __read_mostly; - -struct biovec_slab { - int nr_vecs; - char *name; - struct kmem_cache *slab; -}; +static mempool_t *bio_split_pool __read_mostly; /* * if you change this list, also change bvec_alloc or things will @@ -60,49 +45,61 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { #undef BV /* - * bio_set is used to allow other portions of the IO system to - * allocate their own private memory pools for bio and iovec structures. - * These memory pools in turn all allocate from the bio_slab - * and the bvec_slabs[]. - */ -struct bio_set { - mempool_t *bio_pool; - mempool_t *bvec_pools[BIOVEC_NR_POOLS]; -}; - -/* * fs_bio_set is the bio_set containing bio and iovec memory pools used by * IO code that does not need private memory pools. */ -static struct bio_set *fs_bio_set; +struct bio_set *fs_bio_set; + +unsigned int bvec_nr_vecs(unsigned short idx) +{ + return bvec_slabs[idx].nr_vecs; +} -static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) +struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) { struct bio_vec *bvl; /* - * see comment near bvec_array define! + * If 'bs' is given, lookup the pool and do the mempool alloc. + * If not, this is a bio_kmalloc() allocation and just do a + * kzalloc() for the exact number of vecs right away. */ - switch (nr) { - case 1 : *idx = 0; break; - case 2 ... 4: *idx = 1; break; - case 5 ... 16: *idx = 2; break; - case 17 ... 64: *idx = 3; break; - case 65 ... 128: *idx = 4; break; - case 129 ... BIO_MAX_PAGES: *idx = 5; break; + if (bs) { + /* + * see comment near bvec_array define! + */ + switch (nr) { + case 1: + *idx = 0; + break; + case 2 ... 4: + *idx = 1; + break; + case 5 ... 16: + *idx = 2; + break; + case 17 ... 64: + *idx = 3; + break; + case 65 ... 128: + *idx = 4; + break; + case 129 ... BIO_MAX_PAGES: + *idx = 5; + break; default: return NULL; - } - /* - * idx now points to the pool we want to allocate from - */ - - bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask); - if (bvl) { - struct biovec_slab *bp = bvec_slabs + *idx; + } - memset(bvl, 0, bp->nr_vecs * sizeof(struct bio_vec)); - } + /* + * idx now points to the pool we want to allocate from + */ + bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask); + if (bvl) + memset(bvl, 0, + bvec_nr_vecs(*idx) * sizeof(struct bio_vec)); + } else + bvl = kzalloc(nr * sizeof(struct bio_vec), gfp_mask); return bvl; } @@ -117,6 +114,9 @@ void bio_free(struct bio *bio, struct bio_set *bio_set) mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]); } + if (bio_integrity(bio)) + bio_integrity_free(bio, bio_set); + mempool_free(bio, bio_set->bio_pool); } @@ -128,10 +128,17 @@ static void bio_fs_destructor(struct bio *bio) bio_free(bio, fs_bio_set); } +static void bio_kmalloc_destructor(struct bio *bio) +{ + kfree(bio->bi_io_vec); + kfree(bio); +} + void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); bio->bi_flags = 1 << BIO_UPTODATE; + bio->bi_comp_cpu = -1; atomic_set(&bio->bi_cnt, 1); } @@ -139,19 +146,25 @@ void bio_init(struct bio *bio) * bio_alloc_bioset - allocate a bio for I/O * @gfp_mask: the GFP_ mask given to the slab allocator * @nr_iovecs: number of iovecs to pre-allocate - * @bs: the bio_set to allocate from + * @bs: the bio_set to allocate from. If %NULL, just use kmalloc * * Description: - * bio_alloc_bioset will first try it's on mempool to satisfy the allocation. + * bio_alloc_bioset will first try its own mempool to satisfy the allocation. * If %__GFP_WAIT is set then we will block on the internal pool waiting - * for a &struct bio to become free. + * for a &struct bio to become free. If a %NULL @bs is passed in, we will + * fall back to just using @kmalloc to allocate the required memory. * * allocate bio and iovecs from the memory pools specified by the - * bio_set structure. + * bio_set structure, or @kmalloc if none given. **/ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { - struct bio *bio = mempool_alloc(bs->bio_pool, gfp_mask); + struct bio *bio; + + if (bs) + bio = mempool_alloc(bs->bio_pool, gfp_mask); + else + bio = kmalloc(sizeof(*bio), gfp_mask); if (likely(bio)) { struct bio_vec *bvl = NULL; @@ -162,12 +175,15 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); if (unlikely(!bvl)) { - mempool_free(bio, bs->bio_pool); + if (bs) + mempool_free(bio, bs->bio_pool); + else + kfree(bio); bio = NULL; goto out; } bio->bi_flags |= idx << BIO_POOL_OFFSET; - bio->bi_max_vecs = bvec_slabs[idx].nr_vecs; + bio->bi_max_vecs = bvec_nr_vecs(idx); } bio->bi_io_vec = bvl; } @@ -185,6 +201,23 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) return bio; } +/* + * Like bio_alloc(), but doesn't use a mempool backing. This means that + * it CAN fail, but while bio_alloc() can only be used for allocations + * that have a short (finite) life span, bio_kmalloc() should be used + * for more permanent bio allocations (like allocating some bio's for + * initalization or setup purposes). + */ +struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) +{ + struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); + + if (bio) + bio->bi_destructor = bio_kmalloc_destructor; + + return bio; +} + void zero_fill_bio(struct bio *bio) { unsigned long flags; @@ -229,14 +262,6 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio) return bio->bi_phys_segments; } -inline int bio_hw_segments(struct request_queue *q, struct bio *bio) -{ - if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) - blk_recount_segments(q, bio); - - return bio->bi_hw_segments; -} - /** * __bio_clone - clone a bio * @bio: destination bio @@ -275,9 +300,19 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) { struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set); - if (b) { - b->bi_destructor = bio_fs_destructor; - __bio_clone(b, bio); + if (!b) + return NULL; + + b->bi_destructor = bio_fs_destructor; + __bio_clone(b, bio); + + if (bio_integrity(bio)) { + int ret; + + ret = bio_integrity_clone(b, bio, fs_bio_set); + + if (ret < 0) + return NULL; } return b; @@ -333,10 +368,19 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (page == prev->bv_page && offset == prev->bv_offset + prev->bv_len) { prev->bv_len += len; - if (q->merge_bvec_fn && - q->merge_bvec_fn(q, bio, prev) < len) { - prev->bv_len -= len; - return 0; + + if (q->merge_bvec_fn) { + struct bvec_merge_data bvm = { + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_sector, + .bi_size = bio->bi_size, + .bi_rw = bio->bi_rw, + }; + + if (q->merge_bvec_fn(q, &bvm, prev) < len) { + prev->bv_len -= len; + return 0; + } } goto done; @@ -352,8 +396,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page */ while (bio->bi_phys_segments >= q->max_phys_segments - || bio->bi_hw_segments >= q->max_hw_segments - || BIOVEC_VIRT_OVERSIZE(bio->bi_size)) { + || bio->bi_phys_segments >= q->max_hw_segments) { if (retried_segments) return 0; @@ -377,11 +420,18 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * queue to get further control */ if (q->merge_bvec_fn) { + struct bvec_merge_data bvm = { + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_sector, + .bi_size = bio->bi_size, + .bi_rw = bio->bi_rw, + }; + /* * merge_bvec_fn() returns number of bytes it can accept * at this offset */ - if (q->merge_bvec_fn(q, bio, bvec) < len) { + if (q->merge_bvec_fn(q, &bvm, bvec) < len) { bvec->bv_page = NULL; bvec->bv_len = 0; bvec->bv_offset = 0; @@ -390,13 +440,11 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page } /* If we may be able to merge these biovecs, force a recount */ - if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec) || - BIOVEC_VIRT_MERGEABLE(bvec-1, bvec))) + if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) bio->bi_flags &= ~(1 << BIO_SEG_VALID); bio->bi_vcnt++; bio->bi_phys_segments++; - bio->bi_hw_segments++; done: bio->bi_size += len; return len; @@ -444,16 +492,19 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, struct bio_map_data { struct bio_vec *iovecs; - int nr_sgvecs; struct sg_iovec *sgvecs; + int nr_sgvecs; + int is_our_pages; }; static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, - struct sg_iovec *iov, int iov_count) + struct sg_iovec *iov, int iov_count, + int is_our_pages) { memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt); memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); bmd->nr_sgvecs = iov_count; + bmd->is_our_pages = is_our_pages; bio->bi_private = bmd; } @@ -464,20 +515,21 @@ static void bio_free_map_data(struct bio_map_data *bmd) kfree(bmd); } -static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count) +static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, + gfp_t gfp_mask) { - struct bio_map_data *bmd = kmalloc(sizeof(*bmd), GFP_KERNEL); + struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask); if (!bmd) return NULL; - bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, GFP_KERNEL); + bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask); if (!bmd->iovecs) { kfree(bmd); return NULL; } - bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, GFP_KERNEL); + bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask); if (bmd->sgvecs) return bmd; @@ -486,8 +538,9 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count) return NULL; } -static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, - int uncopy) +static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, + struct sg_iovec *iov, int iov_count, int uncopy, + int do_free_page) { int ret = 0, i; struct bio_vec *bvec; @@ -497,7 +550,7 @@ static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, __bio_for_each_segment(bvec, bio, i, 0) { char *bv_addr = page_address(bvec->bv_page); - unsigned int bv_len = bvec->bv_len; + unsigned int bv_len = iovecs[i].bv_len; while (bv_len && iov_idx < iov_count) { unsigned int bytes; @@ -530,7 +583,7 @@ static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, } } - if (uncopy) + if (do_free_page) __free_page(bvec->bv_page); } @@ -547,10 +600,11 @@ static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, int bio_uncopy_user(struct bio *bio) { struct bio_map_data *bmd = bio->bi_private; - int ret; - - ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, 1); + int ret = 0; + if (!bio_flagged(bio, BIO_NULL_MAPPED)) + ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, + bmd->nr_sgvecs, 1, bmd->is_our_pages); bio_free_map_data(bmd); bio_put(bio); return ret; @@ -559,16 +613,20 @@ int bio_uncopy_user(struct bio *bio) /** * bio_copy_user_iov - copy user data to bio * @q: destination block queue + * @map_data: pointer to the rq_map_data holding pages (if necessary) * @iov: the iovec. * @iov_count: number of elements in the iovec * @write_to_vm: bool indicating writing to pages or not + * @gfp_mask: memory allocation flags * * Prepares and returns a bio for indirect user io, bouncing data * to/from kernel pages as necessary. Must be paired with * call bio_uncopy_user() on io completion. */ -struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov, - int iov_count, int write_to_vm) +struct bio *bio_copy_user_iov(struct request_queue *q, + struct rq_map_data *map_data, + struct sg_iovec *iov, int iov_count, + int write_to_vm, gfp_t gfp_mask) { struct bio_map_data *bmd; struct bio_vec *bvec; @@ -591,25 +649,38 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov, len += iov[i].iov_len; } - bmd = bio_alloc_map_data(nr_pages, iov_count); + bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask); if (!bmd) return ERR_PTR(-ENOMEM); ret = -ENOMEM; - bio = bio_alloc(GFP_KERNEL, nr_pages); + bio = bio_alloc(gfp_mask, nr_pages); if (!bio) goto out_bmd; bio->bi_rw |= (!write_to_vm << BIO_RW); ret = 0; + i = 0; while (len) { - unsigned int bytes = PAGE_SIZE; + unsigned int bytes; + + if (map_data) + bytes = 1U << (PAGE_SHIFT + map_data->page_order); + else + bytes = PAGE_SIZE; if (bytes > len) bytes = len; - page = alloc_page(q->bounce_gfp | GFP_KERNEL); + if (map_data) { + if (i == map_data->nr_entries) { + ret = -ENOMEM; + break; + } + page = map_data->pages[i++]; + } else + page = alloc_page(q->bounce_gfp | gfp_mask); if (!page) { ret = -ENOMEM; break; @@ -628,16 +699,17 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov, * success */ if (!write_to_vm) { - ret = __bio_copy_iov(bio, iov, iov_count, 0); + ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0); if (ret) goto cleanup; } - bio_set_map_data(bmd, bio, iov, iov_count); + bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1); return bio; cleanup: - bio_for_each_segment(bvec, bio, i) - __free_page(bvec->bv_page); + if (!map_data) + bio_for_each_segment(bvec, bio, i) + __free_page(bvec->bv_page); bio_put(bio); out_bmd: @@ -648,29 +720,32 @@ out_bmd: /** * bio_copy_user - copy user data to bio * @q: destination block queue + * @map_data: pointer to the rq_map_data holding pages (if necessary) * @uaddr: start of user address * @len: length in bytes * @write_to_vm: bool indicating writing to pages or not + * @gfp_mask: memory allocation flags * * Prepares and returns a bio for indirect user io, bouncing data * to/from kernel pages as necessary. Must be paired with * call bio_uncopy_user() on io completion. */ -struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr, - unsigned int len, int write_to_vm) +struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data, + unsigned long uaddr, unsigned int len, + int write_to_vm, gfp_t gfp_mask) { struct sg_iovec iov; iov.iov_base = (void __user *)uaddr; iov.iov_len = len; - return bio_copy_user_iov(q, &iov, 1, write_to_vm); + return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask); } static struct bio *__bio_map_user_iov(struct request_queue *q, struct block_device *bdev, struct sg_iovec *iov, int iov_count, - int write_to_vm) + int write_to_vm, gfp_t gfp_mask) { int i, j; int nr_pages = 0; @@ -696,12 +771,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, if (!nr_pages) return ERR_PTR(-EINVAL); - bio = bio_alloc(GFP_KERNEL, nr_pages); + bio = bio_alloc(gfp_mask, nr_pages); if (!bio) return ERR_PTR(-ENOMEM); ret = -ENOMEM; - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask); if (!pages) goto out; @@ -713,12 +788,8 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, const int local_nr_pages = end - start; const int page_limit = cur_page + local_nr_pages; - down_read(¤t->mm->mmap_sem); - ret = get_user_pages(current, current->mm, uaddr, - local_nr_pages, - write_to_vm, 0, &pages[cur_page], NULL); - up_read(¤t->mm->mmap_sem); - + ret = get_user_pages_fast(uaddr, local_nr_pages, + write_to_vm, &pages[cur_page]); if (ret < local_nr_pages) { ret = -EFAULT; goto out_unmap; @@ -784,19 +855,21 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, * @uaddr: start of user address * @len: length in bytes * @write_to_vm: bool indicating writing to pages or not + * @gfp_mask: memory allocation flags * * Map the user space address into a bio suitable for io to a block * device. Returns an error pointer in case of error. */ struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev, - unsigned long uaddr, unsigned int len, int write_to_vm) + unsigned long uaddr, unsigned int len, int write_to_vm, + gfp_t gfp_mask) { struct sg_iovec iov; iov.iov_base = (void __user *)uaddr; iov.iov_len = len; - return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm); + return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask); } /** @@ -806,18 +879,19 @@ struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev, * @iov: the iovec. * @iov_count: number of elements in the iovec * @write_to_vm: bool indicating writing to pages or not + * @gfp_mask: memory allocation flags * * Map the user space address into a bio suitable for io to a block * device. Returns an error pointer in case of error. */ struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, struct sg_iovec *iov, int iov_count, - int write_to_vm) + int write_to_vm, gfp_t gfp_mask) { struct bio *bio; - bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm); - + bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm, + gfp_mask); if (IS_ERR(bio)) return bio; @@ -941,19 +1015,22 @@ static void bio_copy_kern_endio(struct bio *bio, int err) { struct bio_vec *bvec; const int read = bio_data_dir(bio) == READ; - char *p = bio->bi_private; + struct bio_map_data *bmd = bio->bi_private; int i; + char *p = bmd->sgvecs[0].iov_base; __bio_for_each_segment(bvec, bio, i, 0) { char *addr = page_address(bvec->bv_page); + int len = bmd->iovecs[i].bv_len; if (read && !err) - memcpy(p, addr, bvec->bv_len); + memcpy(p, addr, len); __free_page(bvec->bv_page); - p += bvec->bv_len; + p += len; } + bio_free_map_data(bmd); bio_put(bio); } @@ -971,38 +1048,13 @@ static void bio_copy_kern_endio(struct bio *bio, int err) struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask, int reading) { - unsigned long kaddr = (unsigned long)data; - unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = kaddr >> PAGE_SHIFT; - const int nr_pages = end - start; struct bio *bio; struct bio_vec *bvec; - int i, ret; - - bio = bio_alloc(gfp_mask, nr_pages); - if (!bio) - return ERR_PTR(-ENOMEM); - - while (len) { - struct page *page; - unsigned int bytes = PAGE_SIZE; - - if (bytes > len) - bytes = len; - - page = alloc_page(q->bounce_gfp | gfp_mask); - if (!page) { - ret = -ENOMEM; - goto cleanup; - } - - if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) { - ret = -EINVAL; - goto cleanup; - } + int i; - len -= bytes; - } + bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask); + if (IS_ERR(bio)) + return bio; if (!reading) { void *p = data; @@ -1015,16 +1067,9 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, } } - bio->bi_private = data; bio->bi_end_io = bio_copy_kern_endio; - return bio; -cleanup: - bio_for_each_segment(bvec, bio, i) - __free_page(bvec->bv_page); - - bio_put(bio); - return ERR_PTR(ret); + return bio; } /* @@ -1211,9 +1256,9 @@ static void bio_pair_end_2(struct bio *bi, int err) * split a bio - only worry about a bio with a single page * in it's iovec */ -struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors) +struct bio_pair *bio_split(struct bio *bi, int first_sectors) { - struct bio_pair *bp = mempool_alloc(pool, GFP_NOIO); + struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO); if (!bp) return bp; @@ -1247,11 +1292,50 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors) bp->bio2.bi_end_io = bio_pair_end_2; bp->bio1.bi_private = bi; - bp->bio2.bi_private = pool; + bp->bio2.bi_private = bio_split_pool; + + if (bio_integrity(bi)) + bio_integrity_split(bi, bp, first_sectors); return bp; } +/** + * bio_sector_offset - Find hardware sector offset in bio + * @bio: bio to inspect + * @index: bio_vec index + * @offset: offset in bv_page + * + * Return the number of hardware sectors between beginning of bio + * and an end point indicated by a bio_vec index and an offset + * within that vector's page. + */ +sector_t bio_sector_offset(struct bio *bio, unsigned short index, + unsigned int offset) +{ + unsigned int sector_sz = queue_hardsect_size(bio->bi_bdev->bd_disk->queue); + struct bio_vec *bv; + sector_t sectors; + int i; + + sectors = 0; + + if (index >= bio->bi_idx) + index = bio->bi_vcnt - 1; + + __bio_for_each_segment(bv, bio, i, 0) { + if (i == index) { + if (offset > bv->bv_offset) + sectors += (offset - bv->bv_offset) / sector_sz; + break; + } + + sectors += bv->bv_len / sector_sz; + } + + return sectors; +} +EXPORT_SYMBOL(bio_sector_offset); /* * create memory pools for biovec's in a bio_set. @@ -1290,6 +1374,7 @@ void bioset_free(struct bio_set *bs) if (bs->bio_pool) mempool_destroy(bs->bio_pool); + bioset_integrity_free(bs); biovec_free_pools(bs); kfree(bs); @@ -1306,6 +1391,9 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size) if (!bs->bio_pool) goto bad; + if (bioset_integrity_create(bs, bio_pool_size)) + goto bad; + if (!biovec_create_pools(bs, bvec_pool_size)) return bs; @@ -1332,6 +1420,7 @@ static int __init init_bio(void) { bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC); + bio_integrity_init_slab(); biovec_init_slabs(); fs_bio_set = bioset_create(BIO_POOL_SIZE, 2); @@ -1349,6 +1438,7 @@ static int __init init_bio(void) subsys_initcall(init_bio); EXPORT_SYMBOL(bio_alloc); +EXPORT_SYMBOL(bio_kmalloc); EXPORT_SYMBOL(bio_put); EXPORT_SYMBOL(bio_free); EXPORT_SYMBOL(bio_endio); @@ -1356,7 +1446,6 @@ EXPORT_SYMBOL(bio_init); EXPORT_SYMBOL(__bio_clone); EXPORT_SYMBOL(bio_clone); EXPORT_SYMBOL(bio_phys_segments); -EXPORT_SYMBOL(bio_hw_segments); EXPORT_SYMBOL(bio_add_page); EXPORT_SYMBOL(bio_add_pc_page); EXPORT_SYMBOL(bio_get_nr_vecs); @@ -1366,7 +1455,6 @@ EXPORT_SYMBOL(bio_map_kern); EXPORT_SYMBOL(bio_copy_kern); EXPORT_SYMBOL(bio_pair_release); EXPORT_SYMBOL(bio_split); -EXPORT_SYMBOL(bio_split_pool); EXPORT_SYMBOL(bio_copy_user); EXPORT_SYMBOL(bio_uncopy_user); EXPORT_SYMBOL(bioset_create); diff --git a/fs/block_dev.c b/fs/block_dev.c index 470c10ceb0fb..d84f0469a016 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -271,7 +271,7 @@ static void bdev_destroy_inode(struct inode *inode) kmem_cache_free(bdev_cachep, bdi); } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct bdev_inode *ei = (struct bdev_inode *) foo; struct block_device *bdev = &ei->bdev; @@ -540,22 +540,6 @@ EXPORT_SYMBOL(bd_release); * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 */ -static struct kobject *bdev_get_kobj(struct block_device *bdev) -{ - if (bdev->bd_contains != bdev) - return kobject_get(&bdev->bd_part->dev.kobj); - else - return kobject_get(&bdev->bd_disk->dev.kobj); -} - -static struct kobject *bdev_get_holder(struct block_device *bdev) -{ - if (bdev->bd_contains != bdev) - return kobject_get(bdev->bd_part->holder_dir); - else - return kobject_get(bdev->bd_disk->holder_dir); -} - static int add_symlink(struct kobject *from, struct kobject *to) { if (!from || !to) @@ -604,11 +588,11 @@ static int bd_holder_grab_dirs(struct block_device *bdev, if (!bo->hdev) goto fail_put_sdir; - bo->sdev = bdev_get_kobj(bdev); + bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj); if (!bo->sdev) goto fail_put_hdev; - bo->hdir = bdev_get_holder(bdev); + bo->hdir = kobject_get(bdev->bd_part->holder_dir); if (!bo->hdir) goto fail_put_sdev; @@ -868,6 +852,87 @@ struct block_device *open_by_devnum(dev_t dev, unsigned mode) EXPORT_SYMBOL(open_by_devnum); +/** + * flush_disk - invalidates all buffer-cache entries on a disk + * + * @bdev: struct block device to be flushed + * + * Invalidates all buffer-cache entries on a disk. It should be called + * when a disk has been changed -- either by a media change or online + * resize. + */ +static void flush_disk(struct block_device *bdev) +{ + if (__invalidate_device(bdev)) { + char name[BDEVNAME_SIZE] = ""; + + if (bdev->bd_disk) + disk_name(bdev->bd_disk, 0, name); + printk(KERN_WARNING "VFS: busy inodes on changed media or " + "resized disk %s\n", name); + } + + if (!bdev->bd_disk) + return; + if (disk_partitionable(bdev->bd_disk)) + bdev->bd_invalidated = 1; +} + +/** + * check_disk_size_change - checks for disk size change and adjusts bdev size. + * @disk: struct gendisk to check + * @bdev: struct bdev to adjust. + * + * This routine checks to see if the bdev size does not match the disk size + * and adjusts it if it differs. + */ +void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) +{ + loff_t disk_size, bdev_size; + + disk_size = (loff_t)get_capacity(disk) << 9; + bdev_size = i_size_read(bdev->bd_inode); + if (disk_size != bdev_size) { + char name[BDEVNAME_SIZE]; + + disk_name(disk, 0, name); + printk(KERN_INFO + "%s: detected capacity change from %lld to %lld\n", + name, bdev_size, disk_size); + i_size_write(bdev->bd_inode, disk_size); + flush_disk(bdev); + } +} +EXPORT_SYMBOL(check_disk_size_change); + +/** + * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back + * @disk: struct gendisk to be revalidated + * + * This routine is a wrapper for lower-level driver's revalidate_disk + * call-backs. It is used to do common pre and post operations needed + * for all revalidate_disk operations. + */ +int revalidate_disk(struct gendisk *disk) +{ + struct block_device *bdev; + int ret = 0; + + if (disk->fops->revalidate_disk) + ret = disk->fops->revalidate_disk(disk); + + bdev = bdget_disk(disk, 0); + if (!bdev) + return ret; + + mutex_lock(&bdev->bd_mutex); + check_disk_size_change(disk, bdev); + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + return ret; +} +EXPORT_SYMBOL(revalidate_disk); + /* * This routine checks whether a removable media has been changed, * and invalidates all buffer-cache-entries in that case. This @@ -887,13 +952,9 @@ int check_disk_change(struct block_device *bdev) if (!bdops->media_changed(bdev->bd_disk)) return 0; - if (__invalidate_device(bdev)) - printk("VFS: busy inodes on changed media.\n"); - + flush_disk(bdev); if (bdops->revalidate_disk) bdops->revalidate_disk(bdev->bd_disk); - if (bdev->bd_disk->minors > 1) - bdev->bd_invalidated = 1; return 1; } @@ -927,36 +988,48 @@ static int __blkdev_put(struct block_device *bdev, int for_part); static int do_open(struct block_device *bdev, struct file *file, int for_part) { - struct module *owner = NULL; struct gendisk *disk; + struct hd_struct *part = NULL; int ret; - int part; + int partno; + int perm = 0; - ret = devcgroup_inode_permission(bdev->bd_inode, file->f_mode); - if (ret != 0) + if (file->f_mode & FMODE_READ) + perm |= MAY_READ; + if (file->f_mode & FMODE_WRITE) + perm |= MAY_WRITE; + /* + * hooks: /n/, see "layering violations". + */ + ret = devcgroup_inode_permission(bdev->bd_inode, perm); + if (ret != 0) { + bdput(bdev); return ret; + } ret = -ENXIO; file->f_mapping = bdev->bd_inode->i_mapping; + lock_kernel(); - disk = get_gendisk(bdev->bd_dev, &part); - if (!disk) { - unlock_kernel(); - bdput(bdev); - return ret; - } - owner = disk->fops->owner; + + disk = get_gendisk(bdev->bd_dev, &partno); + if (!disk) + goto out_unlock_kernel; + part = disk_get_part(disk, partno); + if (!part) + goto out_unlock_kernel; mutex_lock_nested(&bdev->bd_mutex, for_part); if (!bdev->bd_openers) { bdev->bd_disk = disk; + bdev->bd_part = part; bdev->bd_contains = bdev; - if (!part) { + if (!partno) { struct backing_dev_info *bdi; if (disk->fops->open) { ret = disk->fops->open(bdev->bd_inode, file); if (ret) - goto out_first; + goto out_clear; } if (!bdev->bd_openers) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); @@ -968,36 +1041,36 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) if (bdev->bd_invalidated) rescan_partitions(disk, bdev); } else { - struct hd_struct *p; struct block_device *whole; whole = bdget_disk(disk, 0); ret = -ENOMEM; if (!whole) - goto out_first; + goto out_clear; BUG_ON(for_part); ret = __blkdev_get(whole, file->f_mode, file->f_flags, 1); if (ret) - goto out_first; + goto out_clear; bdev->bd_contains = whole; - p = disk->part[part - 1]; bdev->bd_inode->i_data.backing_dev_info = whole->bd_inode->i_data.backing_dev_info; - if (!(disk->flags & GENHD_FL_UP) || !p || !p->nr_sects) { + if (!(disk->flags & GENHD_FL_UP) || + !part || !part->nr_sects) { ret = -ENXIO; - goto out_first; + goto out_clear; } - kobject_get(&p->dev.kobj); - bdev->bd_part = p; - bd_set_size(bdev, (loff_t) p->nr_sects << 9); + bd_set_size(bdev, (loff_t)part->nr_sects << 9); } } else { + disk_put_part(part); put_disk(disk); - module_put(owner); + module_put(disk->fops->owner); + part = NULL; + disk = NULL; if (bdev->bd_contains == bdev) { if (bdev->bd_disk->fops->open) { ret = bdev->bd_disk->fops->open(bdev->bd_inode, file); if (ret) - goto out; + goto out_unlock_bdev; } if (bdev->bd_invalidated) rescan_partitions(bdev->bd_disk, bdev); @@ -1010,19 +1083,24 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) unlock_kernel(); return 0; -out_first: + out_clear: bdev->bd_disk = NULL; + bdev->bd_part = NULL; bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, 1); bdev->bd_contains = NULL; - put_disk(disk); - module_put(owner); -out: + out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); + out_unlock_kernel: unlock_kernel(); - if (ret) - bdput(bdev); + + disk_put_part(part); + if (disk) + module_put(disk->fops->owner); + put_disk(disk); + bdput(bdev); + return ret; } @@ -1107,11 +1185,8 @@ static int __blkdev_put(struct block_device *bdev, int for_part) put_disk(disk); module_put(owner); - - if (bdev->bd_contains != bdev) { - kobject_put(&bdev->bd_part->dev.kobj); - bdev->bd_part = NULL; - } + disk_put_part(bdev->bd_part); + bdev->bd_part = NULL; bdev->bd_disk = NULL; bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; if (bdev != bdev->bd_contains) @@ -1187,10 +1262,9 @@ EXPORT_SYMBOL(ioctl_by_bdev); /** * lookup_bdev - lookup a struct block_device by name + * @pathname: special file representing the block device * - * @path: special file representing the block device - * - * Get a reference to the blockdevice at @path in the current + * Get a reference to the blockdevice at @pathname in the current * namespace if possible and return it. Return ERR_PTR(error) * otherwise. */ @@ -1226,6 +1300,7 @@ fail: bdev = ERR_PTR(error); goto out; } +EXPORT_SYMBOL(lookup_bdev); /** * open_bdev_excl - open a block device by name and set it up for use diff --git a/fs/buffer.c b/fs/buffer.c index a073f3f4f013..ac78d4c19b3b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -580,7 +580,7 @@ EXPORT_SYMBOL(mark_buffer_async_write); /* * The buffer's backing address_space's private_lock must be held */ -static inline void __remove_assoc_queue(struct buffer_head *bh) +static void __remove_assoc_queue(struct buffer_head *bh) { list_del_init(&bh->b_assoc_buffers); WARN_ON(!bh->b_assoc_map); @@ -706,7 +706,7 @@ static int __set_page_dirty(struct page *page, if (TestSetPageDirty(page)) return 0; - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); @@ -719,7 +719,7 @@ static int __set_page_dirty(struct page *page, radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); return 1; @@ -821,7 +821,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) * contents - it is a noop if I/O is still in * flight on potentially older contents. */ - ll_rw_block(SWRITE, 1, &bh); + ll_rw_block(SWRITE_SYNC, 1, &bh); brelse(bh); spin_lock(lock); } @@ -1214,8 +1214,7 @@ void __brelse(struct buffer_head * buf) put_bh(buf); return; } - printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n"); - WARN_ON(1); + WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n"); } /* @@ -1464,7 +1463,7 @@ static void invalidate_bh_lru(void *arg) void invalidate_bh_lrus(void) { - on_each_cpu(invalidate_bh_lru, NULL, 1, 1); + on_each_cpu(invalidate_bh_lru, NULL, 1); } EXPORT_SYMBOL_GPL(invalidate_bh_lrus); @@ -1691,11 +1690,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page, */ clear_buffer_dirty(bh); set_buffer_uptodate(bh); - } else if (!buffer_mapped(bh) && buffer_dirty(bh)) { + } else if ((!buffer_mapped(bh) || buffer_delay(bh)) && + buffer_dirty(bh)) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, block, bh, 1); if (err) goto recover; + clear_buffer_delay(bh); if (buffer_new(bh)) { /* blockdev mappings never come here */ clear_buffer_new(bh); @@ -1719,7 +1720,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, */ if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { lock_buffer(bh); - } else if (test_set_buffer_locked(bh)) { + } else if (!trylock_buffer(bh)) { redirty_page_for_writepage(wbc, page); continue; } @@ -1774,7 +1775,8 @@ recover: bh = head; /* Recovery: lock and submit the mapped buffers */ do { - if (buffer_mapped(bh) && buffer_dirty(bh)) { + if (buffer_mapped(bh) && buffer_dirty(bh) && + !buffer_delay(bh)) { lock_buffer(bh); mark_buffer_async_write(bh); } else { @@ -2061,6 +2063,7 @@ int generic_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = mapping->host; + int i_size_changed = 0; copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); @@ -2073,17 +2076,72 @@ int generic_write_end(struct file *file, struct address_space *mapping, */ if (pos+copied > inode->i_size) { i_size_write(inode, pos+copied); - mark_inode_dirty(inode); + i_size_changed = 1; } unlock_page(page); page_cache_release(page); + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. + */ + if (i_size_changed) + mark_inode_dirty(inode); + return copied; } EXPORT_SYMBOL(generic_write_end); /* + * block_is_partially_uptodate checks whether buffers within a page are + * uptodate or not. + * + * Returns true if all buffers which correspond to a file portion + * we want to read are uptodate. + */ +int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, + unsigned long from) +{ + struct inode *inode = page->mapping->host; + unsigned block_start, block_end, blocksize; + unsigned to; + struct buffer_head *bh, *head; + int ret = 1; + + if (!page_has_buffers(page)) + return 0; + + blocksize = 1 << inode->i_blkbits; + to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); + to = from + to; + if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) + return 0; + + head = page_buffers(page); + bh = head; + block_start = 0; + do { + block_end = block_start + blocksize; + if (block_end > from && block_start < to) { + if (!buffer_uptodate(bh)) { + ret = 0; + break; + } + if (block_end >= to) + break; + } + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); + + return ret; +} +EXPORT_SYMBOL(block_is_partially_uptodate); + +/* * Generic "read page" function for block devices that have the normal * get_block functionality. This is most of the block device filesystems. * Reads the page asynchronously --- the unlock_buffer() and @@ -2868,14 +2926,17 @@ int submit_bh(int rw, struct buffer_head * bh) BUG_ON(!buffer_mapped(bh)); BUG_ON(!bh->b_end_io); - if (buffer_ordered(bh) && (rw == WRITE)) - rw = WRITE_BARRIER; + /* + * Mask in barrier bit for a write (could be either a WRITE or a + * WRITE_SYNC + */ + if (buffer_ordered(bh) && (rw & WRITE)) + rw |= WRITE_BARRIER; /* - * Only clear out a write error when rewriting, should this - * include WRITE_SYNC as well? + * Only clear out a write error when rewriting */ - if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER)) + if (test_set_buffer_req(bh) && (rw & WRITE)) clear_buffer_write_io_error(bh); /* @@ -2940,16 +3001,19 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) for (i = 0; i < nr; i++) { struct buffer_head *bh = bhs[i]; - if (rw == SWRITE) + if (rw == SWRITE || rw == SWRITE_SYNC) lock_buffer(bh); - else if (test_set_buffer_locked(bh)) + else if (!trylock_buffer(bh)) continue; - if (rw == WRITE || rw == SWRITE) { + if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { if (test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(WRITE, bh); + if (rw == SWRITE_SYNC) + submit_bh(WRITE_SYNC, bh); + else + submit_bh(WRITE, bh); continue; } } else { @@ -2978,7 +3042,7 @@ int sync_dirty_buffer(struct buffer_head *bh) if (test_clear_buffer_dirty(bh)) { get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); wait_on_buffer(bh); if (buffer_eopnotsupp(bh)) { clear_buffer_eopnotsupp(bh); @@ -3256,7 +3320,7 @@ int bh_submit_read(struct buffer_head *bh) EXPORT_SYMBOL(bh_submit_read); static void -init_buffer_head(struct kmem_cache *cachep, void *data) +init_buffer_head(void *data) { struct buffer_head *bh = data; diff --git a/fs/char_dev.c b/fs/char_dev.c index 68e510b88457..3cb7cda3d780 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -373,6 +373,8 @@ static int chrdev_open(struct inode *inode, struct file *filp) return -ENXIO; new = container_of(kobj, struct cdev, kobj); spin_lock(&cdev_lock); + /* Check i_cdev again in case somebody beat us to it while + we dropped the lock. */ p = inode->i_cdev; if (!p) { inode->i_cdev = p = new; @@ -392,11 +394,8 @@ static int chrdev_open(struct inode *inode, struct file *filp) cdev_put(p); return -ENXIO; } - if (filp->f_op->open) { - lock_kernel(); + if (filp->f_op->open) ret = filp->f_op->open(inode,filp); - unlock_kernel(); - } if (ret) cdev_put(p); return ret; diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 1f3465201fdf..06e521a945c3 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,3 +1,19 @@ +Version 1.54 +------------ +Fix premature write failure on congested networks (we would give up +on EAGAIN from the socket too quickly on large writes). +Cifs_mkdir and cifs_create now respect the setgid bit on parent dir. +Fix endian problems in acl (mode from/to cifs acl) on bigendian +architectures. Fix problems with preserving timestamps on copying open +files (e.g. "cp -a") to Windows servers. For mkdir and create honor setgid bit +on parent directory when server supports Unix Extensions but not POSIX +create. Update cifs.upcall version to handle new Kerberos sec flags +(this requires update of cifs.upcall program from Samba). Fix memory leak +on dns_upcall (resolving DFS referralls). Fix plain text password +authentication (requires setting SecurityFlags to 0x30030 to enable +lanman and plain text though). Fix writes to be at correct offset when +file is open with O_APPEND and file is on a directio (forcediretio) mount. + Version 1.53 ------------ DFS support added (Microsoft Distributed File System client support needed diff --git a/fs/cifs/README b/fs/cifs/README index 2bd6fe556f88..bd2343d4c6a6 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -542,10 +542,20 @@ SecurityFlags Flags which control security negotiation and hashing mechanisms (as "must use") on the other hand does not make much sense. Default flags are 0x07007 - (NTLM, NTLMv2 and packet signing allowed). Maximum + (NTLM, NTLMv2 and packet signing allowed). The maximum allowable flags if you want to allow mounts to servers using weaker password hashes is 0x37037 (lanman, - plaintext, ntlm, ntlmv2, signing allowed): + plaintext, ntlm, ntlmv2, signing allowed). Some + SecurityFlags require the corresponding menuconfig + options to be enabled (lanman and plaintext require + CONFIG_CIFS_WEAK_PW_HASH for example). Enabling + plaintext authentication currently requires also + enabling lanman authentication in the security flags + because the cifs module only supports sending + laintext passwords using the older lanman dialect + form of the session setup SMB. (e.g. for authentication + using plain text passwords, set the SecurityFlags + to 0x30030): may use packet signing 0x00001 must use packet signing 0x01001 @@ -642,8 +652,30 @@ The statistics for the number of total SMBs and oplock breaks are different in that they represent all for that share, not just those for which the server returned success. -Also note that "cat /proc/fs/cifs/DebugData" will display information about +Also note that "cat /proc/fs/cifs/DebugData" will display information about the active sessions and the shares that are mounted. -Enabling Kerberos (extended security) works when CONFIG_CIFS_EXPERIMENTAL is -on but requires a user space helper (from the Samba project). NTLM and NTLMv2 and -LANMAN support do not require this helper. + +Enabling Kerberos (extended security) works but requires version 1.2 or later +of the helper program cifs.upcall to be present and to be configured in the +/etc/request-key.conf file. The cifs.upcall helper program is from the Samba +project(http://www.samba.org). NTLM and NTLMv2 and LANMAN support do not +require this helper. Note that NTLMv2 security (which does not require the +cifs.upcall helper program), instead of using Kerberos, is sufficient for +some use cases. + +Enabling DFS support (used to access shares transparently in an MS-DFS +global name space) requires that CONFIG_CIFS_EXPERIMENTAL be enabled. In +addition, DFS support for target shares which are specified as UNC +names which begin with host names (rather than IP addresses) requires +a user space helper (such as cifs.upcall) to be present in order to +translate host names to ip address, and the user space helper must also +be configured in the file /etc/request-key.conf + +To use cifs Kerberos and DFS support, the Linux keyutils package should be +installed and something like the following lines should be added to the +/etc/request-key.conf file: + +create cifs.spnego * * /usr/local/sbin/cifs.upcall %k +create dns_resolver * * /usr/local/sbin/cifs.upcall %k + + diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index f58e41d3ba48..1b09f1670061 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -400,7 +400,7 @@ asn1_oid_decode(struct asn1_ctx *ctx, size = eoc - ctx->pointer + 1; /* first subid actually encodes first two subids */ - if (size < 2 || size > ULONG_MAX/sizeof(unsigned long)) + if (size < 2 || size > UINT_MAX/sizeof(unsigned long)) return 0; *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); @@ -476,6 +476,7 @@ decode_negTokenInit(unsigned char *security_blob, int length, unsigned int cls, con, tag, oidlen, rc; bool use_ntlmssp = false; bool use_kerberos = false; + bool use_mskerberos = false; *secType = NTLM; /* BB eventually make Kerberos or NLTMSSP the default*/ @@ -483,6 +484,7 @@ decode_negTokenInit(unsigned char *security_blob, int length, asn1_open(&ctx, security_blob, length); + /* GSSAPI header */ if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { cFYI(1, ("Error decoding negTokenInit header")); return 0; @@ -490,156 +492,149 @@ decode_negTokenInit(unsigned char *security_blob, int length, || (tag != ASN1_EOC)) { cFYI(1, ("cls = %d con = %d tag = %d", cls, con, tag)); return 0; - } else { - /* remember to free obj->oid */ - rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); - if (rc) { - if ((tag == ASN1_OJI) && (cls == ASN1_PRI)) { - rc = asn1_oid_decode(&ctx, end, &oid, &oidlen); - if (rc) { - rc = compare_oid(oid, oidlen, - SPNEGO_OID, - SPNEGO_OID_LEN); - kfree(oid); - } - } else - rc = 0; - } + } - if (!rc) { - cFYI(1, ("Error decoding negTokenInit header")); - return 0; - } + /* Check for SPNEGO OID -- remember to free obj->oid */ + rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); + if (rc) { + if ((tag == ASN1_OJI) && (con == ASN1_PRI) && + (cls == ASN1_UNI)) { + rc = asn1_oid_decode(&ctx, end, &oid, &oidlen); + if (rc) { + rc = compare_oid(oid, oidlen, SPNEGO_OID, + SPNEGO_OID_LEN); + kfree(oid); + } + } else + rc = 0; + } - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, ("Error decoding negTokenInit")); - return 0; - } else if ((cls != ASN1_CTX) || (con != ASN1_CON) - || (tag != ASN1_EOC)) { - cFYI(1, - ("cls = %d con = %d tag = %d end = %p (%d) exit 0", - cls, con, tag, end, *end)); - return 0; - } + /* SPNEGO OID not present or garbled -- bail out */ + if (!rc) { + cFYI(1, ("Error decoding negTokenInit header")); + return 0; + } - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, ("Error decoding negTokenInit")); - return 0; - } else if ((cls != ASN1_UNI) || (con != ASN1_CON) - || (tag != ASN1_SEQ)) { - cFYI(1, - ("cls = %d con = %d tag = %d end = %p (%d) exit 1", - cls, con, tag, end, *end)); - return 0; - } + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + cFYI(1, ("Error decoding negTokenInit")); + return 0; + } else if ((cls != ASN1_CTX) || (con != ASN1_CON) + || (tag != ASN1_EOC)) { + cFYI(1, + ("cls = %d con = %d tag = %d end = %p (%d) exit 0", + cls, con, tag, end, *end)); + return 0; + } - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, ("Error decoding 2nd part of negTokenInit")); - return 0; - } else if ((cls != ASN1_CTX) || (con != ASN1_CON) - || (tag != ASN1_EOC)) { - cFYI(1, - ("cls = %d con = %d tag = %d end = %p (%d) exit 0", - cls, con, tag, end, *end)); - return 0; - } + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + cFYI(1, ("Error decoding negTokenInit")); + return 0; + } else if ((cls != ASN1_UNI) || (con != ASN1_CON) + || (tag != ASN1_SEQ)) { + cFYI(1, + ("cls = %d con = %d tag = %d end = %p (%d) exit 1", + cls, con, tag, end, *end)); + return 0; + } - if (asn1_header_decode - (&ctx, &sequence_end, &cls, &con, &tag) == 0) { - cFYI(1, ("Error decoding 2nd part of negTokenInit")); - return 0; - } else if ((cls != ASN1_UNI) || (con != ASN1_CON) - || (tag != ASN1_SEQ)) { - cFYI(1, - ("cls = %d con = %d tag = %d end = %p (%d) exit 1", - cls, con, tag, end, *end)); - return 0; - } + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + cFYI(1, ("Error decoding 2nd part of negTokenInit")); + return 0; + } else if ((cls != ASN1_CTX) || (con != ASN1_CON) + || (tag != ASN1_EOC)) { + cFYI(1, + ("cls = %d con = %d tag = %d end = %p (%d) exit 0", + cls, con, tag, end, *end)); + return 0; + } - while (!asn1_eoc_decode(&ctx, sequence_end)) { - rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); - if (!rc) { - cFYI(1, - ("Error decoding negTokenInit hdr exit2")); - return 0; - } - if ((tag == ASN1_OJI) && (con == ASN1_PRI)) { - if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) { - - cFYI(1, - ("OID len = %d oid = 0x%lx 0x%lx " - "0x%lx 0x%lx", - oidlen, *oid, *(oid + 1), - *(oid + 2), *(oid + 3))); - - if (compare_oid(oid, oidlen, - MSKRB5_OID, - MSKRB5_OID_LEN)) - use_kerberos = true; - else if (compare_oid(oid, oidlen, - KRB5_OID, - KRB5_OID_LEN)) - use_kerberos = true; - else if (compare_oid(oid, oidlen, - NTLMSSP_OID, - NTLMSSP_OID_LEN)) - use_ntlmssp = true; - - kfree(oid); - } - } else { - cFYI(1, ("Should be an oid what is going on?")); - } - } + if (asn1_header_decode + (&ctx, &sequence_end, &cls, &con, &tag) == 0) { + cFYI(1, ("Error decoding 2nd part of negTokenInit")); + return 0; + } else if ((cls != ASN1_UNI) || (con != ASN1_CON) + || (tag != ASN1_SEQ)) { + cFYI(1, + ("cls = %d con = %d tag = %d end = %p (%d) exit 1", + cls, con, tag, end, *end)); + return 0; + } - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, - ("Error decoding last part negTokenInit exit3")); - return 0; - } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { - /* tag = 3 indicating mechListMIC */ + while (!asn1_eoc_decode(&ctx, sequence_end)) { + rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); + if (!rc) { cFYI(1, - ("Exit 4 cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end)); + ("Error decoding negTokenInit hdr exit2")); return 0; } - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, - ("Error decoding last part negTokenInit exit5")); - return 0; - } else if ((cls != ASN1_UNI) || (con != ASN1_CON) - || (tag != ASN1_SEQ)) { - cFYI(1, ("cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end)); + if ((tag == ASN1_OJI) && (con == ASN1_PRI)) { + if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) { + + cFYI(1, ("OID len = %d oid = 0x%lx 0x%lx " + "0x%lx 0x%lx", oidlen, *oid, + *(oid + 1), *(oid + 2), *(oid + 3))); + + if (compare_oid(oid, oidlen, MSKRB5_OID, + MSKRB5_OID_LEN) && + !use_kerberos) + use_mskerberos = true; + else if (compare_oid(oid, oidlen, KRB5_OID, + KRB5_OID_LEN) && + !use_mskerberos) + use_kerberos = true; + else if (compare_oid(oid, oidlen, NTLMSSP_OID, + NTLMSSP_OID_LEN)) + use_ntlmssp = true; + + kfree(oid); + } + } else { + cFYI(1, ("Should be an oid what is going on?")); } + } - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, - ("Error decoding last part negTokenInit exit 7")); - return 0; - } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { - cFYI(1, - ("Exit 8 cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end)); - return 0; - } - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, - ("Error decoding last part negTokenInit exit9")); - return 0; - } else if ((cls != ASN1_UNI) || (con != ASN1_PRI) - || (tag != ASN1_GENSTR)) { - cFYI(1, - ("Exit10 cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end)); - return 0; - } - cFYI(1, ("Need to call asn1_octets_decode() function for %s", - ctx.pointer)); /* is this UTF-8 or ASCII? */ + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + cFYI(1, ("Error decoding last part negTokenInit exit3")); + return 0; + } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { + /* tag = 3 indicating mechListMIC */ + cFYI(1, ("Exit 4 cls = %d con = %d tag = %d end = %p (%d)", + cls, con, tag, end, *end)); + return 0; + } + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + cFYI(1, ("Error decoding last part negTokenInit exit5")); + return 0; + } else if ((cls != ASN1_UNI) || (con != ASN1_CON) + || (tag != ASN1_SEQ)) { + cFYI(1, ("cls = %d con = %d tag = %d end = %p (%d)", + cls, con, tag, end, *end)); + } + + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + cFYI(1, ("Error decoding last part negTokenInit exit 7")); + return 0; + } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { + cFYI(1, ("Exit 8 cls = %d con = %d tag = %d end = %p (%d)", + cls, con, tag, end, *end)); + return 0; + } + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + cFYI(1, ("Error decoding last part negTokenInit exit9")); + return 0; + } else if ((cls != ASN1_UNI) || (con != ASN1_PRI) + || (tag != ASN1_GENSTR)) { + cFYI(1, ("Exit10 cls = %d con = %d tag = %d end = %p (%d)", + cls, con, tag, end, *end)); + return 0; } + cFYI(1, ("Need to call asn1_octets_decode() function for %s", + ctx.pointer)); /* is this UTF-8 or ASCII? */ if (use_kerberos) *secType = Kerberos; + else if (use_mskerberos) + *secType = MSKerberos; else if (use_ntlmssp) *secType = NTLMSSP; diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index cc950f69e51e..69a12aae91d3 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -79,27 +79,25 @@ void cifs_dump_mids(struct TCP_Server_Info *server) spin_lock(&GlobalMid_Lock); list_for_each(tmp, &server->pending_mid_q) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); - if (mid_entry) { - cERROR(1, ("State: %d Cmd: %d Pid: %d Tsk: %p Mid %d", - mid_entry->midState, - (int)mid_entry->command, - mid_entry->pid, - mid_entry->tsk, - mid_entry->mid)); + cERROR(1, ("State: %d Cmd: %d Pid: %d Tsk: %p Mid %d", + mid_entry->midState, + (int)mid_entry->command, + mid_entry->pid, + mid_entry->tsk, + mid_entry->mid)); #ifdef CONFIG_CIFS_STATS2 - cERROR(1, ("IsLarge: %d buf: %p time rcv: %ld now: %ld", - mid_entry->largeBuf, - mid_entry->resp_buf, - mid_entry->when_received, - jiffies)); + cERROR(1, ("IsLarge: %d buf: %p time rcv: %ld now: %ld", + mid_entry->largeBuf, + mid_entry->resp_buf, + mid_entry->when_received, + jiffies)); #endif /* STATS2 */ - cERROR(1, ("IsMult: %d IsEnd: %d", mid_entry->multiRsp, - mid_entry->multiEnd)); - if (mid_entry->resp_buf) { - cifs_dump_detail(mid_entry->resp_buf); - cifs_dump_mem("existing buf: ", - mid_entry->resp_buf, 62); - } + cERROR(1, ("IsMult: %d IsEnd: %d", mid_entry->multiRsp, + mid_entry->multiEnd)); + if (mid_entry->resp_buf) { + cifs_dump_detail(mid_entry->resp_buf); + cifs_dump_mem("existing buf: ", + mid_entry->resp_buf, 62); } } spin_unlock(&GlobalMid_Lock); @@ -107,9 +105,7 @@ void cifs_dump_mids(struct TCP_Server_Info *server) #endif /* CONFIG_CIFS_DEBUG2 */ #ifdef CONFIG_PROC_FS -static int -cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset, - int count, int *eof, void *data) +static int cifs_debug_data_proc_show(struct seq_file *m, void *v) { struct list_head *tmp; struct list_head *tmp1; @@ -117,23 +113,13 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset, struct cifsSesInfo *ses; struct cifsTconInfo *tcon; int i; - int length = 0; - char *original_buf = buf; - - *beginBuffer = buf + offset; - length = - sprintf(buf, + seq_puts(m, "Display Internal CIFS Data Structures for Debugging\n" "---------------------------------------------------\n"); - buf += length; - length = sprintf(buf, "CIFS Version %s\n", CIFS_VERSION); - buf += length; - length = sprintf(buf, - "Active VFS Requests: %d\n", GlobalTotalActiveXid); - buf += length; - length = sprintf(buf, "Servers:"); - buf += length; + seq_printf(m, "CIFS Version %s\n", CIFS_VERSION); + seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); + seq_printf(m, "Servers:"); i = 0; read_lock(&GlobalSMBSeslock); @@ -142,11 +128,10 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset, ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList); if ((ses->serverDomain == NULL) || (ses->serverOS == NULL) || (ses->serverNOS == NULL)) { - buf += sprintf(buf, "\nentry for %s not fully " + seq_printf(m, "\nentry for %s not fully " "displayed\n\t", ses->serverName); } else { - length = - sprintf(buf, + seq_printf(m, "\n%d) Name: %s Domain: %s Mounts: %d OS:" " %s \n\tNOS: %s\tCapability: 0x%x\n\tSMB" " session status: %d\t", @@ -154,10 +139,9 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset, atomic_read(&ses->inUse), ses->serverOS, ses->serverNOS, ses->capabilities, ses->status); - buf += length; } if (ses->server) { - buf += sprintf(buf, "TCP status: %d\n\tLocal Users To " + seq_printf(m, "TCP status: %d\n\tLocal Users To " "Server: %d SecMode: 0x%x Req On Wire: %d", ses->server->tcpStatus, atomic_read(&ses->server->socketUseCount), @@ -165,41 +149,34 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset, atomic_read(&ses->server->inFlight)); #ifdef CONFIG_CIFS_STATS2 - buf += sprintf(buf, " In Send: %d In MaxReq Wait: %d", + seq_printf(m, " In Send: %d In MaxReq Wait: %d", atomic_read(&ses->server->inSend), atomic_read(&ses->server->num_waiters)); #endif - length = sprintf(buf, "\nMIDs:\n"); - buf += length; + seq_puts(m, "\nMIDs:\n"); spin_lock(&GlobalMid_Lock); list_for_each(tmp1, &ses->server->pending_mid_q) { mid_entry = list_entry(tmp1, struct mid_q_entry, qhead); - if (mid_entry) { - length = sprintf(buf, - "State: %d com: %d pid:" - " %d tsk: %p mid %d\n", - mid_entry->midState, - (int)mid_entry->command, - mid_entry->pid, - mid_entry->tsk, - mid_entry->mid); - buf += length; - } + seq_printf(m, "State: %d com: %d pid:" + " %d tsk: %p mid %d\n", + mid_entry->midState, + (int)mid_entry->command, + mid_entry->pid, + mid_entry->tsk, + mid_entry->mid); } spin_unlock(&GlobalMid_Lock); } } read_unlock(&GlobalSMBSeslock); - sprintf(buf, "\n"); - buf++; + seq_putc(m, '\n'); - length = sprintf(buf, "Shares:"); - buf += length; + seq_puts(m, "Shares:"); i = 0; read_lock(&GlobalSMBSeslock); @@ -208,62 +185,52 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset, i++; tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList); dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType); - length = sprintf(buf, "\n%d) %s Uses: %d ", i, + seq_printf(m, "\n%d) %s Uses: %d ", i, tcon->treeName, atomic_read(&tcon->useCount)); - buf += length; if (tcon->nativeFileSystem) { - length = sprintf(buf, "Type: %s ", + seq_printf(m, "Type: %s ", tcon->nativeFileSystem); - buf += length; } - length = sprintf(buf, "DevInfo: 0x%x Attributes: 0x%x" + seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x" "\nPathComponentMax: %d Status: %d", le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics), le32_to_cpu(tcon->fsAttrInfo.Attributes), le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength), tcon->tidStatus); - buf += length; if (dev_type == FILE_DEVICE_DISK) - length = sprintf(buf, " type: DISK "); + seq_puts(m, " type: DISK "); else if (dev_type == FILE_DEVICE_CD_ROM) - length = sprintf(buf, " type: CDROM "); + seq_puts(m, " type: CDROM "); else - length = - sprintf(buf, " type: %d ", dev_type); - buf += length; - if (tcon->tidStatus == CifsNeedReconnect) { - buf += sprintf(buf, "\tDISCONNECTED "); - length += 14; - } + seq_printf(m, " type: %d ", dev_type); + + if (tcon->tidStatus == CifsNeedReconnect) + seq_puts(m, "\tDISCONNECTED "); } read_unlock(&GlobalSMBSeslock); - length = sprintf(buf, "\n"); - buf += length; + seq_putc(m, '\n'); /* BB add code to dump additional info such as TCP session info now */ - /* Now calculate total size of returned data */ - length = buf - original_buf; - - if (offset + count >= length) - *eof = 1; - if (length < offset) { - *eof = 1; - return 0; - } else { - length = length - offset; - } - if (length > count) - length = count; + return 0; +} - return length; +static int cifs_debug_data_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_debug_data_proc_show, NULL); } -#ifdef CONFIG_CIFS_STATS +static const struct file_operations cifs_debug_data_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_debug_data_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; -static int -cifs_stats_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) +#ifdef CONFIG_CIFS_STATS +static ssize_t cifs_stats_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) { char c; int rc; @@ -307,236 +274,132 @@ cifs_stats_write(struct file *file, const char __user *buffer, return count; } -static int -cifs_stats_read(char *buf, char **beginBuffer, off_t offset, - int count, int *eof, void *data) +static int cifs_stats_proc_show(struct seq_file *m, void *v) { - int item_length, i, length; + int i; struct list_head *tmp; struct cifsTconInfo *tcon; - *beginBuffer = buf + offset; - - length = sprintf(buf, + seq_printf(m, "Resources in use\nCIFS Session: %d\n", sesInfoAllocCount.counter); - buf += length; - item_length = - sprintf(buf, "Share (unique mount targets): %d\n", + seq_printf(m, "Share (unique mount targets): %d\n", tconInfoAllocCount.counter); - length += item_length; - buf += item_length; - item_length = - sprintf(buf, "SMB Request/Response Buffer: %d Pool size: %d\n", + seq_printf(m, "SMB Request/Response Buffer: %d Pool size: %d\n", bufAllocCount.counter, cifs_min_rcv + tcpSesAllocCount.counter); - length += item_length; - buf += item_length; - item_length = - sprintf(buf, "SMB Small Req/Resp Buffer: %d Pool size: %d\n", + seq_printf(m, "SMB Small Req/Resp Buffer: %d Pool size: %d\n", smBufAllocCount.counter, cifs_min_small); - length += item_length; - buf += item_length; #ifdef CONFIG_CIFS_STATS2 - item_length = sprintf(buf, "Total Large %d Small %d Allocations\n", + seq_printf(m, "Total Large %d Small %d Allocations\n", atomic_read(&totBufAllocCount), atomic_read(&totSmBufAllocCount)); - length += item_length; - buf += item_length; #endif /* CONFIG_CIFS_STATS2 */ - item_length = - sprintf(buf, "Operations (MIDs): %d\n", - midCount.counter); - length += item_length; - buf += item_length; - item_length = sprintf(buf, + seq_printf(m, "Operations (MIDs): %d\n", midCount.counter); + seq_printf(m, "\n%d session %d share reconnects\n", tcpSesReconnectCount.counter, tconInfoReconnectCount.counter); - length += item_length; - buf += item_length; - item_length = sprintf(buf, + seq_printf(m, "Total vfs operations: %d maximum at one time: %d\n", GlobalCurrentXid, GlobalMaxActiveXid); - length += item_length; - buf += item_length; i = 0; read_lock(&GlobalSMBSeslock); list_for_each(tmp, &GlobalTreeConnectionList) { i++; tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList); - item_length = sprintf(buf, "\n%d) %s", i, tcon->treeName); - buf += item_length; - length += item_length; - if (tcon->tidStatus == CifsNeedReconnect) { - buf += sprintf(buf, "\tDISCONNECTED "); - length += 14; - } - item_length = sprintf(buf, "\nSMBs: %d Oplock Breaks: %d", + seq_printf(m, "\n%d) %s", i, tcon->treeName); + if (tcon->tidStatus == CifsNeedReconnect) + seq_puts(m, "\tDISCONNECTED "); + seq_printf(m, "\nSMBs: %d Oplock Breaks: %d", atomic_read(&tcon->num_smbs_sent), atomic_read(&tcon->num_oplock_brks)); - buf += item_length; - length += item_length; - item_length = sprintf(buf, "\nReads: %d Bytes: %lld", + seq_printf(m, "\nReads: %d Bytes: %lld", atomic_read(&tcon->num_reads), (long long)(tcon->bytes_read)); - buf += item_length; - length += item_length; - item_length = sprintf(buf, "\nWrites: %d Bytes: %lld", + seq_printf(m, "\nWrites: %d Bytes: %lld", atomic_read(&tcon->num_writes), (long long)(tcon->bytes_written)); - buf += item_length; - length += item_length; - item_length = sprintf(buf, + seq_printf(m, "\nLocks: %d HardLinks: %d Symlinks: %d", atomic_read(&tcon->num_locks), atomic_read(&tcon->num_hardlinks), atomic_read(&tcon->num_symlinks)); - buf += item_length; - length += item_length; - item_length = sprintf(buf, "\nOpens: %d Closes: %d Deletes: %d", + seq_printf(m, "\nOpens: %d Closes: %d Deletes: %d", atomic_read(&tcon->num_opens), atomic_read(&tcon->num_closes), atomic_read(&tcon->num_deletes)); - buf += item_length; - length += item_length; - item_length = sprintf(buf, "\nMkdirs: %d Rmdirs: %d", + seq_printf(m, "\nMkdirs: %d Rmdirs: %d", atomic_read(&tcon->num_mkdirs), atomic_read(&tcon->num_rmdirs)); - buf += item_length; - length += item_length; - item_length = sprintf(buf, "\nRenames: %d T2 Renames %d", + seq_printf(m, "\nRenames: %d T2 Renames %d", atomic_read(&tcon->num_renames), atomic_read(&tcon->num_t2renames)); - buf += item_length; - length += item_length; - item_length = sprintf(buf, "\nFindFirst: %d FNext %d FClose %d", + seq_printf(m, "\nFindFirst: %d FNext %d FClose %d", atomic_read(&tcon->num_ffirst), atomic_read(&tcon->num_fnext), atomic_read(&tcon->num_fclose)); - buf += item_length; - length += item_length; } read_unlock(&GlobalSMBSeslock); - buf += sprintf(buf, "\n"); - length++; - - if (offset + count >= length) - *eof = 1; - if (length < offset) { - *eof = 1; - return 0; - } else { - length = length - offset; - } - if (length > count) - length = count; + seq_putc(m, '\n'); + return 0; +} - return length; +static int cifs_stats_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_stats_proc_show, NULL); } + +static const struct file_operations cifs_stats_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_stats_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_stats_proc_write, +}; #endif /* STATS */ static struct proc_dir_entry *proc_fs_cifs; -read_proc_t cifs_txanchor_read; -static read_proc_t cifsFYI_read; -static write_proc_t cifsFYI_write; -static read_proc_t oplockEnabled_read; -static write_proc_t oplockEnabled_write; -static read_proc_t lookupFlag_read; -static write_proc_t lookupFlag_write; -static read_proc_t traceSMB_read; -static write_proc_t traceSMB_write; -static read_proc_t multiuser_mount_read; -static write_proc_t multiuser_mount_write; -static read_proc_t security_flags_read; -static write_proc_t security_flags_write; -/* static read_proc_t ntlmv2_enabled_read; -static write_proc_t ntlmv2_enabled_write; -static read_proc_t packet_signing_enabled_read; -static write_proc_t packet_signing_enabled_write;*/ -static read_proc_t experimEnabled_read; -static write_proc_t experimEnabled_write; -static read_proc_t linuxExtensionsEnabled_read; -static write_proc_t linuxExtensionsEnabled_write; +static const struct file_operations cifsFYI_proc_fops; +static const struct file_operations cifs_oplock_proc_fops; +static const struct file_operations cifs_lookup_cache_proc_fops; +static const struct file_operations traceSMB_proc_fops; +static const struct file_operations cifs_multiuser_mount_proc_fops; +static const struct file_operations cifs_security_flags_proc_fops; +static const struct file_operations cifs_experimental_proc_fops; +static const struct file_operations cifs_linux_ext_proc_fops; void cifs_proc_init(void) { - struct proc_dir_entry *pde; - proc_fs_cifs = proc_mkdir("fs/cifs", NULL); if (proc_fs_cifs == NULL) return; proc_fs_cifs->owner = THIS_MODULE; - create_proc_read_entry("DebugData", 0, proc_fs_cifs, - cifs_debug_data_read, NULL); + proc_create("DebugData", 0, proc_fs_cifs, &cifs_debug_data_proc_fops); #ifdef CONFIG_CIFS_STATS - pde = create_proc_read_entry("Stats", 0, proc_fs_cifs, - cifs_stats_read, NULL); - if (pde) - pde->write_proc = cifs_stats_write; + proc_create("Stats", 0, proc_fs_cifs, &cifs_stats_proc_fops); #endif /* STATS */ - pde = create_proc_read_entry("cifsFYI", 0, proc_fs_cifs, - cifsFYI_read, NULL); - if (pde) - pde->write_proc = cifsFYI_write; - - pde = - create_proc_read_entry("traceSMB", 0, proc_fs_cifs, - traceSMB_read, NULL); - if (pde) - pde->write_proc = traceSMB_write; - - pde = create_proc_read_entry("OplockEnabled", 0, proc_fs_cifs, - oplockEnabled_read, NULL); - if (pde) - pde->write_proc = oplockEnabled_write; - - pde = create_proc_read_entry("Experimental", 0, proc_fs_cifs, - experimEnabled_read, NULL); - if (pde) - pde->write_proc = experimEnabled_write; - - pde = create_proc_read_entry("LinuxExtensionsEnabled", 0, proc_fs_cifs, - linuxExtensionsEnabled_read, NULL); - if (pde) - pde->write_proc = linuxExtensionsEnabled_write; - - pde = - create_proc_read_entry("MultiuserMount", 0, proc_fs_cifs, - multiuser_mount_read, NULL); - if (pde) - pde->write_proc = multiuser_mount_write; - - pde = - create_proc_read_entry("SecurityFlags", 0, proc_fs_cifs, - security_flags_read, NULL); - if (pde) - pde->write_proc = security_flags_write; - - pde = - create_proc_read_entry("LookupCacheEnabled", 0, proc_fs_cifs, - lookupFlag_read, NULL); - if (pde) - pde->write_proc = lookupFlag_write; - -/* pde = - create_proc_read_entry("NTLMV2Enabled", 0, proc_fs_cifs, - ntlmv2_enabled_read, NULL); - if (pde) - pde->write_proc = ntlmv2_enabled_write; - - pde = - create_proc_read_entry("PacketSigningEnabled", 0, proc_fs_cifs, - packet_signing_enabled_read, NULL); - if (pde) - pde->write_proc = packet_signing_enabled_write;*/ + proc_create("cifsFYI", 0, proc_fs_cifs, &cifsFYI_proc_fops); + proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops); + proc_create("OplockEnabled", 0, proc_fs_cifs, &cifs_oplock_proc_fops); + proc_create("Experimental", 0, proc_fs_cifs, + &cifs_experimental_proc_fops); + proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs, + &cifs_linux_ext_proc_fops); + proc_create("MultiuserMount", 0, proc_fs_cifs, + &cifs_multiuser_mount_proc_fops); + proc_create("SecurityFlags", 0, proc_fs_cifs, + &cifs_security_flags_proc_fops); + proc_create("LookupCacheEnabled", 0, proc_fs_cifs, + &cifs_lookup_cache_proc_fops); } void @@ -553,39 +416,26 @@ cifs_proc_clean(void) #endif remove_proc_entry("MultiuserMount", proc_fs_cifs); remove_proc_entry("OplockEnabled", proc_fs_cifs); -/* remove_proc_entry("NTLMV2Enabled",proc_fs_cifs); */ remove_proc_entry("SecurityFlags", proc_fs_cifs); -/* remove_proc_entry("PacketSigningEnabled", proc_fs_cifs); */ remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs); remove_proc_entry("Experimental", proc_fs_cifs); remove_proc_entry("LookupCacheEnabled", proc_fs_cifs); remove_proc_entry("fs/cifs", NULL); } -static int -cifsFYI_read(char *page, char **start, off_t off, int count, - int *eof, void *data) +static int cifsFYI_proc_show(struct seq_file *m, void *v) { - int len; - - len = sprintf(page, "%d\n", cifsFYI); - - len -= off; - *start = page + off; - - if (len > count) - len = count; - else - *eof = 1; - - if (len < 0) - len = 0; + seq_printf(m, "%d\n", cifsFYI); + return 0; +} - return len; +static int cifsFYI_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifsFYI_proc_show, NULL); } -static int -cifsFYI_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) + +static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) { char c; int rc; @@ -603,30 +453,28 @@ cifsFYI_write(struct file *file, const char __user *buffer, return count; } -static int -oplockEnabled_read(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len; - - len = sprintf(page, "%d\n", oplockEnabled); - - len -= off; - *start = page + off; +static const struct file_operations cifsFYI_proc_fops = { + .owner = THIS_MODULE, + .open = cifsFYI_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifsFYI_proc_write, +}; - if (len > count) - len = count; - else - *eof = 1; - - if (len < 0) - len = 0; +static int cifs_oplock_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", oplockEnabled); + return 0; +} - return len; +static int cifs_oplock_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_oplock_proc_show, NULL); } -static int -oplockEnabled_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) + +static ssize_t cifs_oplock_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) { char c; int rc; @@ -642,30 +490,28 @@ oplockEnabled_write(struct file *file, const char __user *buffer, return count; } -static int -experimEnabled_read(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len; - - len = sprintf(page, "%d\n", experimEnabled); +static const struct file_operations cifs_oplock_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_oplock_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_oplock_proc_write, +}; - len -= off; - *start = page + off; - - if (len > count) - len = count; - else - *eof = 1; - - if (len < 0) - len = 0; +static int cifs_experimental_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", experimEnabled); + return 0; +} - return len; +static int cifs_experimental_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_experimental_proc_show, NULL); } -static int -experimEnabled_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) + +static ssize_t cifs_experimental_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) { char c; int rc; @@ -683,29 +529,28 @@ experimEnabled_write(struct file *file, const char __user *buffer, return count; } -static int -linuxExtensionsEnabled_read(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len; - - len = sprintf(page, "%d\n", linuxExtEnabled); - len -= off; - *start = page + off; +static const struct file_operations cifs_experimental_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_experimental_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_experimental_proc_write, +}; - if (len > count) - len = count; - else - *eof = 1; - - if (len < 0) - len = 0; +static int cifs_linux_ext_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", linuxExtEnabled); + return 0; +} - return len; +static int cifs_linux_ext_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_linux_ext_proc_show, NULL); } -static int -linuxExtensionsEnabled_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) + +static ssize_t cifs_linux_ext_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) { char c; int rc; @@ -721,31 +566,28 @@ linuxExtensionsEnabled_write(struct file *file, const char __user *buffer, return count; } +static const struct file_operations cifs_linux_ext_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_linux_ext_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_linux_ext_proc_write, +}; -static int -lookupFlag_read(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int cifs_lookup_cache_proc_show(struct seq_file *m, void *v) { - int len; - - len = sprintf(page, "%d\n", lookupCacheEnabled); - - len -= off; - *start = page + off; - - if (len > count) - len = count; - else - *eof = 1; - - if (len < 0) - len = 0; + seq_printf(m, "%d\n", lookupCacheEnabled); + return 0; +} - return len; +static int cifs_lookup_cache_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_lookup_cache_proc_show, NULL); } -static int -lookupFlag_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) + +static ssize_t cifs_lookup_cache_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) { char c; int rc; @@ -760,30 +602,29 @@ lookupFlag_write(struct file *file, const char __user *buffer, return count; } -static int -traceSMB_read(char *page, char **start, off_t off, int count, - int *eof, void *data) -{ - int len; - - len = sprintf(page, "%d\n", traceSMB); - - len -= off; - *start = page + off; - if (len > count) - len = count; - else - *eof = 1; +static const struct file_operations cifs_lookup_cache_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_lookup_cache_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_lookup_cache_proc_write, +}; - if (len < 0) - len = 0; +static int traceSMB_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", traceSMB); + return 0; +} - return len; +static int traceSMB_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, traceSMB_proc_show, NULL); } -static int -traceSMB_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) + +static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) { char c; int rc; @@ -799,30 +640,28 @@ traceSMB_write(struct file *file, const char __user *buffer, return count; } -static int -multiuser_mount_read(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len; - - len = sprintf(page, "%d\n", multiuser_mount); - - len -= off; - *start = page + off; +static const struct file_operations traceSMB_proc_fops = { + .owner = THIS_MODULE, + .open = traceSMB_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = traceSMB_proc_write, +}; - if (len > count) - len = count; - else - *eof = 1; - - if (len < 0) - len = 0; +static int cifs_multiuser_mount_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", multiuser_mount); + return 0; +} - return len; +static int cifs_multiuser_mount_proc_open(struct inode *inode, struct file *fh) +{ + return single_open(fh, cifs_multiuser_mount_proc_show, NULL); } -static int -multiuser_mount_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) + +static ssize_t cifs_multiuser_mount_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) { char c; int rc; @@ -838,30 +677,28 @@ multiuser_mount_write(struct file *file, const char __user *buffer, return count; } -static int -security_flags_read(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len; - - len = sprintf(page, "0x%x\n", extended_security); - - len -= off; - *start = page + off; +static const struct file_operations cifs_multiuser_mount_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_multiuser_mount_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_multiuser_mount_proc_write, +}; - if (len > count) - len = count; - else - *eof = 1; - - if (len < 0) - len = 0; +static int cifs_security_flags_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "0x%x\n", extended_security); + return 0; +} - return len; +static int cifs_security_flags_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_security_flags_proc_show, NULL); } -static int -security_flags_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) + +static ssize_t cifs_security_flags_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) { unsigned int flags; char flags_string[12]; @@ -917,6 +754,15 @@ security_flags_write(struct file *file, const char __user *buffer, /* BB should we turn on MAY flags for other MUST options? */ return count; } + +static const struct file_operations cifs_security_flags_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_security_flags_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_security_flags_proc_write, +}; #else inline void cifs_proc_init(void) { diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index d82374c9e329..d2c8eef84f3c 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -226,7 +226,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd, int err; mntget(newmnt); - err = do_add_mount(newmnt, nd, nd->path.mnt->mnt_flags, mntlist); + err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags, mntlist); switch (err) { case 0: path_put(&nd->path); diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 7013aaff6aed..fcee9298b620 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -66,11 +66,28 @@ struct key_type cifs_spnego_key_type = { .describe = user_describe, }; -#define MAX_VER_STR_LEN 9 /* length of longest version string e.g. - strlen(";ver=0xFF") */ -#define MAX_MECH_STR_LEN 13 /* length of longest security mechanism name, eg - in future could have strlen(";sec=ntlmsspi") */ -#define MAX_IPV6_ADDR_LEN 42 /* eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */ +/* length of longest version string e.g. strlen("ver=0xFF") */ +#define MAX_VER_STR_LEN 8 + +/* length of longest security mechanism name, eg in future could have + * strlen(";sec=ntlmsspi") */ +#define MAX_MECH_STR_LEN 13 + +/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */ +#define MAX_IPV6_ADDR_LEN 42 + +/* strlen of "host=" */ +#define HOST_KEY_LEN 5 + +/* strlen of ";ip4=" or ";ip6=" */ +#define IP_KEY_LEN 5 + +/* strlen of ";uid=0x" */ +#define UID_KEY_LEN 7 + +/* strlen of ";user=" */ +#define USER_KEY_LEN 6 + /* get a key struct with a SPNEGO security blob, suitable for session setup */ struct key * cifs_get_spnego_key(struct cifsSesInfo *sesInfo) @@ -81,11 +98,15 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) struct key *spnego_key; const char *hostname = server->hostname; - /* BB: come up with better scheme for determining length */ - /* length of fields (with semicolons): ver=0xyz ipv4= ipaddress host= - hostname sec=mechanism uid=0x uid */ - desc_len = MAX_VER_STR_LEN + 5 + MAX_IPV6_ADDR_LEN + 1 + 6 + - strlen(hostname) + MAX_MECH_STR_LEN + 8 + (sizeof(uid_t) * 2); + /* length of fields (with semicolons): ver=0xyz ip4=ipaddress + host=hostname sec=mechanism uid=0xFF user=username */ + desc_len = MAX_VER_STR_LEN + + HOST_KEY_LEN + strlen(hostname) + + IP_KEY_LEN + MAX_IPV6_ADDR_LEN + + MAX_MECH_STR_LEN + + UID_KEY_LEN + (sizeof(uid_t) * 2) + + USER_KEY_LEN + strlen(sesInfo->userName) + 1; + spnego_key = ERR_PTR(-ENOMEM); description = kzalloc(desc_len, GFP_KERNEL); if (description == NULL) @@ -110,9 +131,11 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) dp = description + strlen(description); - /* for now, only sec=krb5 is valid */ + /* for now, only sec=krb5 and sec=mskrb5 are valid */ if (server->secType == Kerberos) sprintf(dp, ";sec=krb5"); + else if (server->secType == MSKerberos) + sprintf(dp, ";sec=mskrb5"); else goto out; diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h index 05a34b17a1ab..e4041ec4d712 100644 --- a/fs/cifs/cifs_spnego.h +++ b/fs/cifs/cifs_spnego.h @@ -23,7 +23,7 @@ #ifndef _CIFS_SPNEGO_H #define _CIFS_SPNEGO_H -#define CIFS_SPNEGO_UPCALL_VERSION 1 +#define CIFS_SPNEGO_UPCALL_VERSION 2 /* * The version field should always be set to CIFS_SPNEGO_UPCALL_VERSION. diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 34902cff5400..57ecdc83c26f 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -34,11 +34,11 @@ static struct cifs_wksid wksidarr[NUM_WK_SIDS] = { {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"}, {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"}, - {{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"}, - {{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(18), 0, 0, 0, 0} }, "sys"}, - {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(544), 0, 0, 0} }, "root"}, - {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(545), 0, 0, 0} }, "users"}, - {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(546), 0, 0, 0} }, "guest"} } + {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"}, + {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(18), 0, 0, 0, 0} }, "sys"}, + {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(544), 0, 0, 0} }, "root"}, + {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(545), 0, 0, 0} }, "users"}, + {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(546), 0, 0, 0} }, "guest"} } ; @@ -56,7 +56,7 @@ int match_sid(struct cifs_sid *ctsid) struct cifs_sid *cwsid; if (!ctsid) - return (-1); + return -1; for (i = 0; i < NUM_WK_SIDS; ++i) { cwsid = &(wksidarr[i].cifssid); @@ -87,11 +87,11 @@ int match_sid(struct cifs_sid *ctsid) } cFYI(1, ("matching sid: %s\n", wksidarr[i].sidname)); - return (0); /* sids compare/match */ + return 0; /* sids compare/match */ } cFYI(1, ("No matching sid")); - return (-1); + return -1; } /* if the two SIDs (roughly equivalent to a UUID for a user or group) are @@ -102,16 +102,16 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) int num_subauth, num_sat, num_saw; if ((!ctsid) || (!cwsid)) - return (0); + return 0; /* compare the revision */ if (ctsid->revision != cwsid->revision) - return (0); + return 0; /* compare all of the six auth values */ for (i = 0; i < 6; ++i) { if (ctsid->authority[i] != cwsid->authority[i]) - return (0); + return 0; } /* compare all of the subauth values if any */ @@ -121,11 +121,11 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) if (num_subauth) { for (i = 0; i < num_subauth; ++i) { if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) - return (0); + return 0; } } - return (1); /* sids compare/match */ + return 1; /* sids compare/match */ } @@ -169,8 +169,7 @@ static void copy_sec_desc(const struct cifs_ntsd *pntsd, for (i = 0; i < 6; i++) ngroup_sid_ptr->authority[i] = group_sid_ptr->authority[i]; for (i = 0; i < 5; i++) - ngroup_sid_ptr->sub_auth[i] = - cpu_to_le32(group_sid_ptr->sub_auth[i]); + ngroup_sid_ptr->sub_auth[i] = group_sid_ptr->sub_auth[i]; return; } @@ -285,7 +284,7 @@ static __u16 fill_ace_for_sid(struct cifs_ace *pntace, size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth * 4); pntace->size = cpu_to_le16(size); - return (size); + return size; } @@ -426,7 +425,7 @@ static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid, pndacl->size = cpu_to_le16(size + sizeof(struct cifs_acl)); pndacl->num_aces = cpu_to_le32(3); - return (0); + return 0; } @@ -510,7 +509,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len, sizeof(struct cifs_sid)); */ - return (0); + return 0; } @@ -527,7 +526,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */ if ((inode == NULL) || (pntsd == NULL) || (pnntsd == NULL)) - return (-EIO); + return -EIO; owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->osidoffset)); @@ -550,7 +549,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, /* copy security descriptor control portion and owner and group sid */ copy_sec_desc(pntsd, pnntsd, sidsoffset); - return (rc); + return rc; } @@ -629,11 +628,11 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode)); if (!inode) - return (rc); + return rc; sb = inode->i_sb; if (sb == NULL) - return (rc); + return rc; cifs_sb = CIFS_SB(sb); xid = GetXid(); @@ -652,7 +651,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, if (rc != 0) { cERROR(1, ("Unable to open file to set ACL")); FreeXid(xid); - return (rc); + return rc; } } @@ -665,7 +664,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, FreeXid(xid); - return (rc); + return rc; } /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ @@ -715,7 +714,7 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode) if (!pnntsd) { cERROR(1, ("Unable to allocate security descriptor")); kfree(pntsd); - return (-ENOMEM); + return -ENOMEM; } rc = build_sec_desc(pntsd, pnntsd, inode, nmode); @@ -732,6 +731,6 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode) kfree(pntsd); } - return (rc); + return rc; } #endif /* CONFIG_CIFS_EXPERIMENTAL */ diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 4ff8939c6cc7..bd5f13d38450 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -294,6 +294,7 @@ void calc_lanman_hash(struct cifsSesInfo *ses, char *lnm_session_key) if ((ses->server->secMode & SECMODE_PW_ENCRYPT) == 0) if (extended_security & CIFSSEC_MAY_PLNTXT) { + memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE); memcpy(lnm_session_key, password_with_pad, CIFS_ENCPWD_SIZE); return; @@ -310,9 +311,8 @@ void calc_lanman_hash(struct cifsSesInfo *ses, char *lnm_session_key) utf8 and other multibyte codepages each need their own strupper function since a byte at a time will ont work. */ - for (i = 0; i < CIFS_ENCPWD_SIZE; i++) { + for (i = 0; i < CIFS_ENCPWD_SIZE; i++) password_with_pad[i] = toupper(password_with_pad[i]); - } SMBencrypt(password_with_pad, ses->server->cryptKey, lnm_session_key); /* clear password before we return/free memory */ diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 86b4d5f405ae..25ecbd5b0404 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -175,6 +175,8 @@ out_no_root: if (inode) iput(inode); + cifs_umount(sb, cifs_sb); + out_mount_failed: if (cifs_sb) { #ifdef CONFIG_CIFS_DFS_UPCALL @@ -267,7 +269,7 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static int cifs_permission(struct inode *inode, int mask, struct nameidata *nd) +static int cifs_permission(struct inode *inode, int mask) { struct cifs_sb_info *cifs_sb; @@ -612,7 +614,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) if (retval < 0) return (loff_t)retval; } - return remote_llseek(file, offset, origin); + return generic_file_llseek_unlocked(file, offset, origin); } struct file_system_type cifs_fs_type = { @@ -766,7 +768,7 @@ const struct file_operations cifs_dir_ops = { }; static void -cifs_init_once(struct kmem_cache *cachep, void *inode) +cifs_init_once(void *inode) { struct cifsInodeInfo *cifsi = inode; @@ -930,36 +932,34 @@ static int cifs_oplock_thread(void *dummyarg) schedule_timeout(39*HZ); } else { oplock_item = list_entry(GlobalOplock_Q.next, - struct oplock_q_entry, qhead); - if (oplock_item) { - cFYI(1, ("found oplock item to write out")); - pTcon = oplock_item->tcon; - inode = oplock_item->pinode; - netfid = oplock_item->netfid; - spin_unlock(&GlobalMid_Lock); - DeleteOplockQEntry(oplock_item); - /* can not grab inode sem here since it would + struct oplock_q_entry, qhead); + cFYI(1, ("found oplock item to write out")); + pTcon = oplock_item->tcon; + inode = oplock_item->pinode; + netfid = oplock_item->netfid; + spin_unlock(&GlobalMid_Lock); + DeleteOplockQEntry(oplock_item); + /* can not grab inode sem here since it would deadlock when oplock received on delete since vfs_unlink holds the i_mutex across the call */ - /* mutex_lock(&inode->i_mutex);*/ - if (S_ISREG(inode->i_mode)) { - rc = - filemap_fdatawrite(inode->i_mapping); - if (CIFS_I(inode)->clientCanCacheRead - == 0) { - waitrc = filemap_fdatawait(inode->i_mapping); - invalidate_remote_inode(inode); - } - if (rc == 0) - rc = waitrc; - } else - rc = 0; - /* mutex_unlock(&inode->i_mutex);*/ - if (rc) - CIFS_I(inode)->write_behind_rc = rc; - cFYI(1, ("Oplock flush inode %p rc %d", - inode, rc)); + /* mutex_lock(&inode->i_mutex);*/ + if (S_ISREG(inode->i_mode)) { + rc = filemap_fdatawrite(inode->i_mapping); + if (CIFS_I(inode)->clientCanCacheRead == 0) { + waitrc = filemap_fdatawait( + inode->i_mapping); + invalidate_remote_inode(inode); + } + if (rc == 0) + rc = waitrc; + } else + rc = 0; + /* mutex_unlock(&inode->i_mutex);*/ + if (rc) + CIFS_I(inode)->write_behind_rc = rc; + cFYI(1, ("Oplock flush inode %p rc %d", + inode, rc)); /* releasing stale oplock after recent reconnect of smb session using a now incorrect file @@ -967,15 +967,13 @@ static int cifs_oplock_thread(void *dummyarg) not bother sending an oplock release if session to server still is disconnected since oplock already released by the server in that case */ - if (pTcon->tidStatus != CifsNeedReconnect) { - rc = CIFSSMBLock(0, pTcon, netfid, - 0 /* len */ , 0 /* offset */, 0, - 0, LOCKING_ANDX_OPLOCK_RELEASE, - false /* wait flag */); - cFYI(1, ("Oplock release rc = %d", rc)); - } - } else - spin_unlock(&GlobalMid_Lock); + if (pTcon->tidStatus != CifsNeedReconnect) { + rc = CIFSSMBLock(0, pTcon, netfid, + 0 /* len */ , 0 /* offset */, 0, + 0, LOCKING_ANDX_OPLOCK_RELEASE, + false /* wait flag */); + cFYI(1, ("Oplock release rc = %d", rc)); + } set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(1); /* yield in case q were corrupt */ } @@ -1001,8 +999,7 @@ static int cifs_dnotify_thread(void *dummyarg) list_for_each(tmp, &GlobalSMBSessionList) { ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList); - if (ses && ses->server && - atomic_read(&ses->server->inFlight)) + if (ses->server && atomic_read(&ses->server->inFlight)) wake_up_all(&ses->server->response_q); } read_unlock(&GlobalSMBSeslock); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 25a6cbd15529..f7b4a5cd837b 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -41,7 +41,7 @@ extern int cifs_create(struct inode *, struct dentry *, int, struct nameidata *); extern struct dentry *cifs_lookup(struct inode *, struct dentry *, struct nameidata *); -extern int cifs_unlink(struct inode *, struct dentry *); +extern int cifs_unlink(struct inode *dir, struct dentry *dentry); extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *); extern int cifs_mknod(struct inode *, struct dentry *, int, dev_t); extern int cifs_mkdir(struct inode *, struct dentry *, int); @@ -101,5 +101,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* EXPERIMENTAL */ -#define CIFS_VERSION "1.53" +#define CIFS_VERSION "1.54" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 9cfcf326ead3..0d22479d99b7 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -27,7 +27,7 @@ #define MAX_SES_INFO 2 #define MAX_TCON_INFO 4 -#define MAX_TREE_SIZE 2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1 +#define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1) #define MAX_SERVER_SIZE 15 #define MAX_SHARE_SIZE 64 /* used to be 20, this should still be enough */ #define MAX_USERNAME_SIZE 32 /* 32 is to allow for 15 char names + null @@ -80,7 +80,8 @@ enum securityEnum { NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ RawNTLMSSP, /* NTLMSSP without SPNEGO */ NTLMSSP, /* NTLMSSP via SPNEGO */ - Kerberos /* Kerberos via SPNEGO */ + Kerberos, /* Kerberos via SPNEGO */ + MSKerberos, /* MS Kerberos via SPNEGO */ }; enum protocolEnum { @@ -308,6 +309,7 @@ struct cifs_search_info { __u32 resume_key; char *ntwrk_buf_start; char *srch_entries_start; + char *last_entry; char *presume_name; unsigned int resume_name_len; bool endOfSearch:1; @@ -537,8 +539,8 @@ require use of the stronger protocol */ #endif /* WEAK_PW_HASH */ #define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ -#define CIFSSEC_DEF CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 -#define CIFSSEC_MAX CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2 +#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2) +#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2) #define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5) /* ***************************************************************** diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 0f327c224da3..d2a073edd1b8 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -31,7 +31,7 @@ #else #define CIFS_PROT 0 #endif -#define POSIX_PROT CIFS_PROT+1 +#define POSIX_PROT (CIFS_PROT+1) #define BAD_PROT 0xFFFF /* SMB command codes */ @@ -262,7 +262,7 @@ */ #define CIFS_NO_HANDLE 0xFFFF -#define NO_CHANGE_64 cpu_to_le64(0xFFFFFFFFFFFFFFFFULL) +#define NO_CHANGE_64 0xFFFFFFFFFFFFFFFFULL #define NO_CHANGE_32 0xFFFFFFFFUL /* IPC$ in ASCII */ @@ -341,7 +341,7 @@ #define CREATE_COMPLETE_IF_OPLK 0x00000100 /* should be zero */ #define CREATE_NO_EA_KNOWLEDGE 0x00000200 #define CREATE_EIGHT_DOT_THREE 0x00000400 /* doc says this is obsolete - "open for recovery" flag - should + "open for recovery" flag should be zero in any case */ #define CREATE_OPEN_FOR_RECOVERY 0x00000400 #define CREATE_RANDOM_ACCESS 0x00000800 @@ -414,8 +414,8 @@ struct smb_hdr { __u8 WordCount; } __attribute__((packed)); /* given a pointer to an smb_hdr retrieve the value of byte count */ -#define BCC(smb_var) ( *(__u16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount))) -#define BCC_LE(smb_var) ( *(__le16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount))) +#define BCC(smb_var) (*(__u16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount))) +#define BCC_LE(smb_var) (*(__le16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount))) /* given a pointer to an smb_hdr retrieve the pointer to the byte area */ #define pByteArea(smb_var) ((unsigned char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount) + 2) diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index b9f5e935f821..0cff7fe986e8 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -172,12 +172,15 @@ extern int CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon); extern int CIFSSMBQFSPosixInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData); -extern int CIFSSMBSetTimes(const int xid, struct cifsTconInfo *tcon, +extern int CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon, const char *fileName, const FILE_BASIC_INFO *data, const struct nls_table *nls_codepage, int remap_special_chars); -extern int CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon, - const FILE_BASIC_INFO *data, __u16 fid); +extern int CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon, + const FILE_BASIC_INFO *data, __u16 fid, + __u32 pid_of_opener); +extern int CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon, + bool delete_file, __u16 fid, __u32 pid_of_opener); #if 0 extern int CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon, char *fileName, __u16 dos_attributes, @@ -191,9 +194,20 @@ extern int CIFSSMBSetEOF(const int xid, struct cifsTconInfo *tcon, extern int CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size, __u16 fileHandle, __u32 opener_pid, bool AllocSizeFlag); -extern int CIFSSMBUnixSetPerms(const int xid, struct cifsTconInfo *pTcon, - char *full_path, __u64 mode, __u64 uid, - __u64 gid, dev_t dev, + +struct cifs_unix_set_info_args { + __u64 ctime; + __u64 atime; + __u64 mtime; + __u64 mode; + __u64 uid; + __u64 gid; + dev_t device; +}; + +extern int CIFSSMBUnixSetInfo(const int xid, struct cifsTconInfo *pTcon, + char *fileName, + const struct cifs_unix_set_info_args *args, const struct nls_table *nls_codepage, int remap_special_chars); @@ -217,7 +231,7 @@ extern int CIFSSMBRename(const int xid, struct cifsTconInfo *tcon, const struct nls_table *nls_codepage, int remap_special_chars); extern int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon, - int netfid, char *target_name, + int netfid, const char *target_name, const struct nls_table *nls_codepage, int remap_special_chars); extern int CIFSCreateHardLink(const int xid, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 4511b708f0f3..6f4ffe15d68d 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -128,8 +128,7 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon) write_lock(&GlobalSMBSeslock); list_for_each_safe(tmp, tmp1, &pTcon->openFileList) { open_file = list_entry(tmp, struct cifsFileInfo, tlist); - if (open_file) - open_file->invalidHandle = true; + open_file->invalidHandle = true; } write_unlock(&GlobalSMBSeslock); /* BB Add call to invalidate_inodes(sb) for all superblocks mounted @@ -686,11 +685,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) SecurityBlob, count - 16, &server->secType); - if (rc == 1) { + if (rc == 1) rc = 0; - } else { + else rc = -EINVAL; - } } } else server->capabilities &= ~CAP_EXTENDED_SECURITY; @@ -2019,7 +2017,7 @@ renameRetry: } int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon, - int netfid, char *target_name, + int netfid, const char *target_name, const struct nls_table *nls_codepage, int remap) { struct smb_com_transaction2_sfi_req *pSMB = NULL; @@ -2073,7 +2071,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon, remap); } rename_info->target_name_len = cpu_to_le32(2 * len_of_str); - count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str) + 2; + count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str); byte_count += count; pSMB->DataCount = cpu_to_le16(count); pSMB->TotalDataCount = pSMB->DataCount; @@ -3616,6 +3614,8 @@ findFirstRetry: /* BB remember to free buffer if error BB */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); if (rc == 0) { + unsigned int lnoff; + if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) psrch_inf->unicode = true; else @@ -3638,6 +3638,17 @@ findFirstRetry: le16_to_cpu(parms->SearchCount); psrch_inf->index_of_last_entry = 2 /* skip . and .. */ + psrch_inf->entries_in_buffer; + lnoff = le16_to_cpu(parms->LastNameOffset); + if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < + lnoff) { + cERROR(1, ("ignoring corrupt resume name")); + psrch_inf->last_entry = NULL; + return rc; + } + + psrch_inf->last_entry = psrch_inf->srch_entries_start + + lnoff; + *pnetfid = parms->SearchHandle; } else { cifs_buf_release(pSMB); @@ -3727,6 +3738,8 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon, rc = validate_t2((struct smb_t2_rsp *)pSMBr); if (rc == 0) { + unsigned int lnoff; + /* BB fixme add lock for file (srch_info) struct here */ if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) psrch_inf->unicode = true; @@ -3753,6 +3766,16 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon, le16_to_cpu(parms->SearchCount); psrch_inf->index_of_last_entry += psrch_inf->entries_in_buffer; + lnoff = le16_to_cpu(parms->LastNameOffset); + if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < + lnoff) { + cERROR(1, ("ignoring corrupt resume name")); + psrch_inf->last_entry = NULL; + return rc; + } else + psrch_inf->last_entry = + psrch_inf->srch_entries_start + lnoff; + /* cFYI(1,("fnxt2 entries in buf %d index_of_last %d", psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry)); */ @@ -3914,7 +3937,10 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, bool is_unicode; struct dfs_referral_level_3 *ref; - is_unicode = pSMBr->hdr.Flags2 & SMBFLG2_UNICODE; + if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) + is_unicode = true; + else + is_unicode = false; *num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals); if (*num_of_nodes < 1) { @@ -4814,8 +4840,8 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size, time and resort to the original setpathinfo level which takes the ancient DOS time format with 2 second granularity */ int -CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon, - const FILE_BASIC_INFO *data, __u16 fid) +CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon, + const FILE_BASIC_INFO *data, __u16 fid, __u32 pid_of_opener) { struct smb_com_transaction2_sfi_req *pSMB = NULL; char *data_offset; @@ -4828,11 +4854,8 @@ CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon, if (rc) return rc; - /* At this point there is no need to override the current pid - with the pid of the opener, but that could change if we someday - use an existing handle (rather than opening one on the fly) */ - /* pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener); - pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16));*/ + pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener); + pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16)); params = 6; pSMB->MaxSetupCount = 0; @@ -4878,11 +4901,66 @@ CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon, return rc; } +int +CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon, + bool delete_file, __u16 fid, __u32 pid_of_opener) +{ + struct smb_com_transaction2_sfi_req *pSMB = NULL; + char *data_offset; + int rc = 0; + __u16 params, param_offset, offset, byte_count, count; + + cFYI(1, ("Set File Disposition (via SetFileInfo)")); + rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); + + if (rc) + return rc; + + pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener); + pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16)); + + params = 6; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; + offset = param_offset + params; + + data_offset = (char *) (&pSMB->hdr.Protocol) + offset; + + count = 1; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB PDU from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); + byte_count = 3 /* pad */ + params + count; + pSMB->DataCount = cpu_to_le16(count); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->Fid = fid; + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_DISPOSITION_INFO); + pSMB->Reserved4 = 0; + pSMB->hdr.smb_buf_length += byte_count; + pSMB->ByteCount = cpu_to_le16(byte_count); + *data_offset = delete_file ? 1 : 0; + rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); + if (rc) + cFYI(1, ("Send error in SetFileDisposition = %d", rc)); + + return rc; +} int -CIFSSMBSetTimes(const int xid, struct cifsTconInfo *tcon, const char *fileName, - const FILE_BASIC_INFO *data, - const struct nls_table *nls_codepage, int remap) +CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon, + const char *fileName, const FILE_BASIC_INFO *data, + const struct nls_table *nls_codepage, int remap) { TRANSACTION2_SPI_REQ *pSMB = NULL; TRANSACTION2_SPI_RSP *pSMBr = NULL; @@ -5011,10 +5089,9 @@ SetAttrLgcyRetry: #endif /* temporarily unneeded SetAttr legacy function */ int -CIFSSMBUnixSetPerms(const int xid, struct cifsTconInfo *tcon, - char *fileName, __u64 mode, __u64 uid, __u64 gid, - dev_t device, const struct nls_table *nls_codepage, - int remap) +CIFSSMBUnixSetInfo(const int xid, struct cifsTconInfo *tcon, char *fileName, + const struct cifs_unix_set_info_args *args, + const struct nls_table *nls_codepage, int remap) { TRANSACTION2_SPI_REQ *pSMB = NULL; TRANSACTION2_SPI_RSP *pSMBr = NULL; @@ -5023,6 +5100,7 @@ CIFSSMBUnixSetPerms(const int xid, struct cifsTconInfo *tcon, int bytes_returned = 0; FILE_UNIX_BASIC_INFO *data_offset; __u16 params, param_offset, offset, count, byte_count; + __u64 mode = args->mode; cFYI(1, ("In SetUID/GID/Mode")); setPermsRetry: @@ -5078,16 +5156,16 @@ setPermsRetry: set file size and do not want to truncate file size to zero accidently as happened on one Samba server beta by putting zero instead of -1 here */ - data_offset->EndOfFile = NO_CHANGE_64; - data_offset->NumOfBytes = NO_CHANGE_64; - data_offset->LastStatusChange = NO_CHANGE_64; - data_offset->LastAccessTime = NO_CHANGE_64; - data_offset->LastModificationTime = NO_CHANGE_64; - data_offset->Uid = cpu_to_le64(uid); - data_offset->Gid = cpu_to_le64(gid); + data_offset->EndOfFile = cpu_to_le64(NO_CHANGE_64); + data_offset->NumOfBytes = cpu_to_le64(NO_CHANGE_64); + data_offset->LastStatusChange = cpu_to_le64(args->ctime); + data_offset->LastAccessTime = cpu_to_le64(args->atime); + data_offset->LastModificationTime = cpu_to_le64(args->mtime); + data_offset->Uid = cpu_to_le64(args->uid); + data_offset->Gid = cpu_to_le64(args->gid); /* better to leave device as zero when it is */ - data_offset->DevMajor = cpu_to_le64(MAJOR(device)); - data_offset->DevMinor = cpu_to_le64(MINOR(device)); + data_offset->DevMajor = cpu_to_le64(MAJOR(args->device)); + data_offset->DevMinor = cpu_to_le64(MINOR(args->device)); data_offset->Permissions = cpu_to_le64(mode); if (S_ISREG(mode)) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index e8fa46c7cff2..4c13bcdb92a5 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -151,7 +151,7 @@ cifs_reconnect(struct TCP_Server_Info *server) } list_for_each(tmp, &GlobalTreeConnectionList) { tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList); - if ((tcon) && (tcon->ses) && (tcon->ses->server == server)) + if ((tcon->ses) && (tcon->ses->server == server)) tcon->tidStatus = CifsNeedReconnect; } read_unlock(&GlobalSMBSeslock); @@ -173,14 +173,12 @@ cifs_reconnect(struct TCP_Server_Info *server) mid_entry = list_entry(tmp, struct mid_q_entry, qhead); - if (mid_entry) { - if (mid_entry->midState == MID_REQUEST_SUBMITTED) { + if (mid_entry->midState == MID_REQUEST_SUBMITTED) { /* Mark other intransit requests as needing retry so we do not immediately mark the session bad again (ie after we reconnect below) as they timeout too */ - mid_entry->midState = MID_RETRY_NEEDED; - } + mid_entry->midState = MID_RETRY_NEEDED; } } spin_unlock(&GlobalMid_Lock); @@ -351,11 +349,9 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server) current->flags |= PF_MEMALLOC; cFYI(1, ("Demultiplex PID: %d", task_pid_nr(current))); - write_lock(&GlobalSMBSeslock); - atomic_inc(&tcpSesAllocCount); - length = tcpSesAllocCount.counter; - write_unlock(&GlobalSMBSeslock); - if (length > 1) + + length = atomic_inc_return(&tcpSesAllocCount); + if (length > 1) mempool_resize(cifs_req_poolp, length + cifs_min_rcv, GFP_KERNEL); @@ -455,7 +451,7 @@ incomplete_rcv: /* Note that FC 1001 length is big endian on the wire, but we convert it here so it is always manipulated as host byte order */ - pdu_length = ntohl(smb_buffer->smb_buf_length); + pdu_length = be32_to_cpu((__force __be32)smb_buffer->smb_buf_length); smb_buffer->smb_buf_length = pdu_length; cFYI(1, ("rfc1002 length 0x%x", pdu_length+4)); @@ -745,14 +741,11 @@ multi_t2_fnd: coming home not much else we can do but free the memory */ } - write_lock(&GlobalSMBSeslock); - atomic_dec(&tcpSesAllocCount); - length = tcpSesAllocCount.counter; - /* last chance to mark ses pointers invalid if there are any pointing to this (e.g if a crazy root user tried to kill cifsd kernel thread explicitly this might happen) */ + write_lock(&GlobalSMBSeslock); list_for_each(tmp, &GlobalSMBSessionList) { ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList); @@ -763,6 +756,8 @@ multi_t2_fnd: kfree(server->hostname); kfree(server); + + length = atomic_dec_return(&tcpSesAllocCount); if (length > 0) mempool_resize(cifs_req_poolp, length + cifs_min_rcv, GFP_KERNEL); @@ -1461,6 +1456,39 @@ get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path, return rc; } +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key cifs_key[2]; +static struct lock_class_key cifs_slock_key[2]; + +static inline void +cifs_reclassify_socket4(struct socket *sock) +{ + struct sock *sk = sock->sk; + BUG_ON(sock_owned_by_user(sk)); + sock_lock_init_class_and_name(sk, "slock-AF_INET-CIFS", + &cifs_slock_key[0], "sk_lock-AF_INET-CIFS", &cifs_key[0]); +} + +static inline void +cifs_reclassify_socket6(struct socket *sock) +{ + struct sock *sk = sock->sk; + BUG_ON(sock_owned_by_user(sk)); + sock_lock_init_class_and_name(sk, "slock-AF_INET6-CIFS", + &cifs_slock_key[1], "sk_lock-AF_INET6-CIFS", &cifs_key[1]); +} +#else +static inline void +cifs_reclassify_socket4(struct socket *sock) +{ +} + +static inline void +cifs_reclassify_socket6(struct socket *sock) +{ +} +#endif + /* See RFC1001 section 14 on representation of Netbios names */ static void rfc1002mangle(char *target, char *source, unsigned int length) { @@ -1495,6 +1523,7 @@ ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket, /* BB other socket options to set KEEPALIVE, NODELAY? */ cFYI(1, ("Socket created")); (*csocket)->sk->sk_allocation = GFP_NOFS; + cifs_reclassify_socket4(*csocket); } } @@ -1627,6 +1656,7 @@ ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket) /* BB other socket options to set KEEPALIVE, NODELAY? */ cFYI(1, ("ipv6 Socket created")); (*csocket)->sk->sk_allocation = GFP_NOFS; + cifs_reclassify_socket6(*csocket); } } @@ -3568,19 +3598,21 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, char ntlm_session_key[CIFS_SESS_KEY_SIZE]; bool ntlmv2_flag = false; int first_time = 0; + struct TCP_Server_Info *server = pSesInfo->server; /* what if server changes its buffer size after dropping the session? */ - if (pSesInfo->server->maxBuf == 0) /* no need to send on reconnect */ { + if (server->maxBuf == 0) /* no need to send on reconnect */ { rc = CIFSSMBNegotiate(xid, pSesInfo); - if (rc == -EAGAIN) /* retry only once on 1st time connection */ { + if (rc == -EAGAIN) { + /* retry only once on 1st time connection */ rc = CIFSSMBNegotiate(xid, pSesInfo); if (rc == -EAGAIN) rc = -EHOSTDOWN; } if (rc == 0) { spin_lock(&GlobalMid_Lock); - if (pSesInfo->server->tcpStatus != CifsExiting) - pSesInfo->server->tcpStatus = CifsGood; + if (server->tcpStatus != CifsExiting) + server->tcpStatus = CifsGood; else rc = -EHOSTDOWN; spin_unlock(&GlobalMid_Lock); @@ -3588,97 +3620,90 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, } first_time = 1; } - if (!rc) { - pSesInfo->flags = 0; - pSesInfo->capabilities = pSesInfo->server->capabilities; - if (linuxExtEnabled == 0) - pSesInfo->capabilities &= (~CAP_UNIX); + + if (rc) + goto ss_err_exit; + + pSesInfo->flags = 0; + pSesInfo->capabilities = server->capabilities; + if (linuxExtEnabled == 0) + pSesInfo->capabilities &= (~CAP_UNIX); /* pSesInfo->sequence_number = 0;*/ - cFYI(1, - ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d", - pSesInfo->server->secMode, - pSesInfo->server->capabilities, - pSesInfo->server->timeAdj)); - if (experimEnabled < 2) - rc = CIFS_SessSetup(xid, pSesInfo, - first_time, nls_info); - else if (extended_security - && (pSesInfo->capabilities - & CAP_EXTENDED_SECURITY) - && (pSesInfo->server->secType == NTLMSSP)) { - rc = -EOPNOTSUPP; - } else if (extended_security - && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) - && (pSesInfo->server->secType == RawNTLMSSP)) { - cFYI(1, ("NTLMSSP sesssetup")); - rc = CIFSNTLMSSPNegotiateSessSetup(xid, - pSesInfo, - &ntlmv2_flag, - nls_info); - if (!rc) { - if (ntlmv2_flag) { - char *v2_response; - cFYI(1, ("more secure NTLM ver2 hash")); - if (CalcNTLMv2_partial_mac_key(pSesInfo, - nls_info)) { - rc = -ENOMEM; - goto ss_err_exit; - } else - v2_response = kmalloc(16 + 64 /* blob */, GFP_KERNEL); - if (v2_response) { - CalcNTLMv2_response(pSesInfo, - v2_response); - /* if (first_time) - cifs_calculate_ntlmv2_mac_key( - pSesInfo->server->mac_signing_key, - response, ntlm_session_key,*/ - kfree(v2_response); + cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d", + server->secMode, server->capabilities, server->timeAdj)); + + if (experimEnabled < 2) + rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info); + else if (extended_security + && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) + && (server->secType == NTLMSSP)) { + rc = -EOPNOTSUPP; + } else if (extended_security + && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) + && (server->secType == RawNTLMSSP)) { + cFYI(1, ("NTLMSSP sesssetup")); + rc = CIFSNTLMSSPNegotiateSessSetup(xid, pSesInfo, &ntlmv2_flag, + nls_info); + if (!rc) { + if (ntlmv2_flag) { + char *v2_response; + cFYI(1, ("more secure NTLM ver2 hash")); + if (CalcNTLMv2_partial_mac_key(pSesInfo, + nls_info)) { + rc = -ENOMEM; + goto ss_err_exit; + } else + v2_response = kmalloc(16 + 64 /* blob*/, + GFP_KERNEL); + if (v2_response) { + CalcNTLMv2_response(pSesInfo, + v2_response); + /* if (first_time) + cifs_calculate_ntlmv2_mac_key */ + kfree(v2_response); /* BB Put dummy sig in SessSetup PDU? */ - } else { - rc = -ENOMEM; - goto ss_err_exit; - } - } else { - SMBNTencrypt(pSesInfo->password, - pSesInfo->server->cryptKey, - ntlm_session_key); - - if (first_time) - cifs_calculate_mac_key( - &pSesInfo->server->mac_signing_key, - ntlm_session_key, - pSesInfo->password); + rc = -ENOMEM; + goto ss_err_exit; } + + } else { + SMBNTencrypt(pSesInfo->password, + server->cryptKey, + ntlm_session_key); + + if (first_time) + cifs_calculate_mac_key( + &server->mac_signing_key, + ntlm_session_key, + pSesInfo->password); + } /* for better security the weaker lanman hash not sent in AuthSessSetup so we no longer calculate it */ - rc = CIFSNTLMSSPAuthSessSetup(xid, - pSesInfo, - ntlm_session_key, - ntlmv2_flag, - nls_info); - } - } else { /* old style NTLM 0.12 session setup */ - SMBNTencrypt(pSesInfo->password, - pSesInfo->server->cryptKey, - ntlm_session_key); + rc = CIFSNTLMSSPAuthSessSetup(xid, pSesInfo, + ntlm_session_key, + ntlmv2_flag, + nls_info); + } + } else { /* old style NTLM 0.12 session setup */ + SMBNTencrypt(pSesInfo->password, server->cryptKey, + ntlm_session_key); - if (first_time) - cifs_calculate_mac_key( - &pSesInfo->server->mac_signing_key, - ntlm_session_key, pSesInfo->password); + if (first_time) + cifs_calculate_mac_key(&server->mac_signing_key, + ntlm_session_key, + pSesInfo->password); - rc = CIFSSessSetup(xid, pSesInfo, - ntlm_session_key, nls_info); - } - if (rc) { - cERROR(1, ("Send error in SessSetup = %d", rc)); - } else { - cFYI(1, ("CIFS Session Established successfully")); + rc = CIFSSessSetup(xid, pSesInfo, ntlm_session_key, nls_info); + } + if (rc) { + cERROR(1, ("Send error in SessSetup = %d", rc)); + } else { + cFYI(1, ("CIFS Session Established successfully")); pSesInfo->status = CifsGood; - } } + ss_err_exit: return rc; } diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index fb69c1fa85c9..e962e75e6f7b 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -226,23 +226,28 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, /* If Open reported that we actually created a file then we now have to set the mode if possible */ if ((pTcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) { + struct cifs_unix_set_info_args args = { + .mode = mode, + .ctime = NO_CHANGE_64, + .atime = NO_CHANGE_64, + .mtime = NO_CHANGE_64, + .device = 0, + }; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { - CIFSSMBUnixSetPerms(xid, pTcon, full_path, mode, - (__u64)current->fsuid, - (__u64)current->fsgid, - 0 /* dev */, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + args.uid = (__u64) current->fsuid; + if (inode->i_mode & S_ISGID) + args.gid = (__u64) inode->i_gid; + else + args.gid = (__u64) current->fsgid; } else { - CIFSSMBUnixSetPerms(xid, pTcon, full_path, mode, - (__u64)-1, - (__u64)-1, - 0 /* dev */, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + args.uid = NO_CHANGE_64; + args.gid = NO_CHANGE_64; } + CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); } else { /* BB implement mode setting via Windows security descriptors e.g. */ @@ -267,7 +272,12 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) { newinode->i_uid = current->fsuid; - newinode->i_gid = current->fsgid; + if (inode->i_mode & S_ISGID) + newinode->i_gid = + inode->i_gid; + else + newinode->i_gid = + current->fsgid; } } } @@ -357,21 +367,24 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, if (full_path == NULL) rc = -ENOMEM; else if (pTcon->unix_ext) { - mode &= ~current->fs->umask; + struct cifs_unix_set_info_args args = { + .mode = mode & ~current->fs->umask, + .ctime = NO_CHANGE_64, + .atime = NO_CHANGE_64, + .mtime = NO_CHANGE_64, + .device = device_number, + }; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { - rc = CIFSSMBUnixSetPerms(xid, pTcon, full_path, - mode, (__u64)current->fsuid, - (__u64)current->fsgid, - device_number, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + args.uid = (__u64) current->fsuid; + args.gid = (__u64) current->fsgid; } else { - rc = CIFSSMBUnixSetPerms(xid, pTcon, - full_path, mode, (__u64)-1, (__u64)-1, - device_number, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + args.uid = NO_CHANGE_64; + args.gid = NO_CHANGE_64; } + rc = CIFSSMBUnixSetInfo(xid, pTcon, full_path, + &args, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); if (!rc) { rc = cifs_get_inode_info_unix(&newinode, full_path, diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index f730ef35499e..1e0c1bd8f2e4 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -29,38 +29,13 @@ #include "cifsproto.h" #include "cifs_debug.h" -static int dns_resolver_instantiate(struct key *key, const void *data, - size_t datalen) -{ - int rc = 0; - char *ip; - - ip = kmalloc(datalen+1, GFP_KERNEL); - if (!ip) - return -ENOMEM; - - memcpy(ip, data, datalen); - ip[datalen] = '\0'; - - rcu_assign_pointer(key->payload.data, ip); - - return rc; -} - -struct key_type key_type_dns_resolver = { - .name = "dns_resolver", - .def_datalen = sizeof(struct in_addr), - .describe = user_describe, - .instantiate = dns_resolver_instantiate, - .match = user_match, -}; - /* Checks if supplied name is IP address * returns: * 1 - name is IP * 0 - name is not IP */ -static int is_ip(const char *name) +static int +is_ip(const char *name) { int rc; struct sockaddr_in sin_server; @@ -82,6 +57,47 @@ static int is_ip(const char *name) return 0; } +static int +dns_resolver_instantiate(struct key *key, const void *data, + size_t datalen) +{ + int rc = 0; + char *ip; + + ip = kmalloc(datalen + 1, GFP_KERNEL); + if (!ip) + return -ENOMEM; + + memcpy(ip, data, datalen); + ip[datalen] = '\0'; + + /* make sure this looks like an address */ + if (!is_ip((const char *) ip)) { + kfree(ip); + return -EINVAL; + } + + key->type_data.x[0] = datalen; + rcu_assign_pointer(key->payload.data, ip); + + return rc; +} + +static void +dns_resolver_destroy(struct key *key) +{ + kfree(key->payload.data); +} + +struct key_type key_type_dns_resolver = { + .name = "dns_resolver", + .def_datalen = sizeof(struct in_addr), + .describe = user_describe, + .instantiate = dns_resolver_instantiate, + .destroy = dns_resolver_destroy, + .match = user_match, +}; + /* Resolves server name to ip address. * input: * unc - server UNC @@ -133,6 +149,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) rkey = request_key(&key_type_dns_resolver, name, ""); if (!IS_ERR(rkey)) { + len = rkey->type_data.x[0]; data = rkey->payload.data; } else { cERROR(1, ("%s: unable to resolve: %s", __func__, name)); @@ -141,11 +158,9 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) skip_upcall: if (data) { - len = strlen(data); - *ip_addr = kmalloc(len+1, GFP_KERNEL); + *ip_addr = kmalloc(len + 1, GFP_KERNEL); if (*ip_addr) { - memcpy(*ip_addr, data, len); - (*ip_addr)[len] = '\0'; + memcpy(*ip_addr, data, len + 1); if (!IS_ERR(rkey)) cFYI(1, ("%s: resolved: %s to %s", __func__, name, diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0aac824371a5..c4a8a0605125 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -107,7 +107,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file, /* want handles we can use to read with first in the list so we do not have to walk the - list to search for one in prepare_write */ + list to search for one in write_begin */ if ((file->f_flags & O_ACCMODE) == O_WRONLY) { list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList); @@ -310,18 +310,19 @@ int cifs_open(struct inode *inode, struct file *file) /* time to set mode which we can not set earlier due to problems creating new read-only files */ if (pTcon->unix_ext) { - CIFSSMBUnixSetPerms(xid, pTcon, full_path, - inode->i_mode, - (__u64)-1, (__u64)-1, 0 /* dev */, + struct cifs_unix_set_info_args args = { + .mode = inode->i_mode, + .uid = NO_CHANGE_64, + .gid = NO_CHANGE_64, + .ctime = NO_CHANGE_64, + .atime = NO_CHANGE_64, + .mtime = NO_CHANGE_64, + .device = 0, + }; + CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - } else { - /* BB implement via Windows security descriptors eg - CIFSSMBWinSetPerms(xid, pTcon, full_path, mode, - -1, -1, local_nls); - in the meantime could set r/o dos attribute when - perms are eg: mode & 0222 == 0 */ } } @@ -832,6 +833,10 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, return -EBADF; open_file = (struct cifsFileInfo *) file->private_data; + rc = generic_write_checks(file, poffset, &write_size, 0); + if (rc) + return rc; + xid = GetXid(); if (*poffset > file->f_path.dentry->d_inode->i_size) @@ -910,7 +915,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, } static ssize_t cifs_write(struct file *file, const char *write_data, - size_t write_size, loff_t *poffset) + size_t write_size, loff_t *poffset) { int rc = 0; unsigned int bytes_written = 0; @@ -1060,6 +1065,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode) struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode) { struct cifsFileInfo *open_file; + bool any_available = false; int rc; /* Having a null inode here (because mapping->host was set to zero by @@ -1075,8 +1081,10 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode) read_lock(&GlobalSMBSeslock); refind_writable: list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { - if (open_file->closePend) + if (open_file->closePend || + (!any_available && open_file->pid != current->tgid)) continue; + if (open_file->pfile && ((open_file->pfile->f_flags & O_RDWR) || (open_file->pfile->f_flags & O_WRONLY))) { @@ -1126,6 +1134,11 @@ refind_writable: of the loop here. */ } } + /* couldn't find useable FH with same pid, try any available */ + if (!any_available) { + any_available = true; + goto refind_writable; + } read_unlock(&GlobalSMBSeslock); return NULL; } @@ -1280,7 +1293,7 @@ retry: if (first < 0) lock_page(page); - else if (TestSetPageLocked(page)) + else if (!trylock_page(page)) break; if (unlikely(page->mapping != mapping)) { @@ -1442,49 +1455,52 @@ static int cifs_writepage(struct page *page, struct writeback_control *wbc) return rc; } -static int cifs_commit_write(struct file *file, struct page *page, - unsigned offset, unsigned to) +static int cifs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { - int xid; - int rc = 0; - struct inode *inode = page->mapping->host; - loff_t position = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; - char *page_data; + int rc; + struct inode *inode = mapping->host; - xid = GetXid(); - cFYI(1, ("commit write for page %p up to position %lld for %d", - page, position, to)); - spin_lock(&inode->i_lock); - if (position > inode->i_size) - i_size_write(inode, position); + cFYI(1, ("write_end for page %p from pos %lld with %d bytes", + page, pos, copied)); + + if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE) + SetPageUptodate(page); - spin_unlock(&inode->i_lock); if (!PageUptodate(page)) { - position = ((loff_t)page->index << PAGE_CACHE_SHIFT) + offset; - /* can not rely on (or let) writepage write this data */ - if (to < offset) { - cFYI(1, ("Illegal offsets, can not copy from %d to %d", - offset, to)); - FreeXid(xid); - return rc; - } + char *page_data; + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + int xid; + + xid = GetXid(); /* this is probably better than directly calling partialpage_write since in this function the file handle is known which we might as well leverage */ /* BB check if anything else missing out of ppw such as updating last write time */ page_data = kmap(page); - rc = cifs_write(file, page_data + offset, to-offset, - &position); - if (rc > 0) - rc = 0; - /* else if (rc < 0) should we set writebehind rc? */ + rc = cifs_write(file, page_data + offset, copied, &pos); + /* if (rc < 0) should we set writebehind rc? */ kunmap(page); + + FreeXid(xid); } else { + rc = copied; + pos += copied; set_page_dirty(page); } - FreeXid(xid); + if (rc > 0) { + spin_lock(&inode->i_lock); + if (pos > inode->i_size) + i_size_write(inode, pos); + spin_unlock(&inode->i_lock); + } + + unlock_page(page); + page_cache_release(page); + return rc; } @@ -2030,49 +2046,44 @@ bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file) return true; } -static int cifs_prepare_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int cifs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) { - int rc = 0; - loff_t i_size; - loff_t offset; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + loff_t offset = pos & (PAGE_CACHE_SIZE - 1); + + cFYI(1, ("write_begin from %lld len %d", (long long)pos, len)); - cFYI(1, ("prepare write for page %p from %d to %d", page, from, to)); - if (PageUptodate(page)) + *pagep = __grab_cache_page(mapping, index); + if (!*pagep) + return -ENOMEM; + + if (PageUptodate(*pagep)) return 0; /* If we are writing a full page it will be up to date, no need to read from the server */ - if ((to == PAGE_CACHE_SIZE) && (from == 0)) { - SetPageUptodate(page); + if (len == PAGE_CACHE_SIZE && flags & AOP_FLAG_UNINTERRUPTIBLE) return 0; - } - offset = (loff_t)page->index << PAGE_CACHE_SHIFT; - i_size = i_size_read(page->mapping->host); + if ((file->f_flags & O_ACCMODE) != O_WRONLY) { + int rc; - if ((offset >= i_size) || - ((from == 0) && (offset + to) >= i_size)) { - /* - * We don't need to read data beyond the end of the file. - * zero it, and set the page uptodate - */ - simple_prepare_write(file, page, from, to); - SetPageUptodate(page); - } else if ((file->f_flags & O_ACCMODE) != O_WRONLY) { /* might as well read a page, it is fast enough */ - rc = cifs_readpage_worker(file, page, &offset); + rc = cifs_readpage_worker(file, *pagep, &offset); + + /* we do not need to pass errors back + e.g. if we do not have read access to the file + because cifs_write_end will attempt synchronous writes + -- shaggy */ } else { /* we could try using another file handle if there is one - but how would we lock it to prevent close of that handle racing with this read? In any case - this will be written out by commit_write so is fine */ + this will be written out by write_end so is fine */ } - /* we do not need to pass errors back - e.g. if we do not have read access to the file - because cifs_commit_write will do the right thing. -- shaggy */ - return 0; } @@ -2081,8 +2092,8 @@ const struct address_space_operations cifs_addr_ops = { .readpages = cifs_readpages, .writepage = cifs_writepage, .writepages = cifs_writepages, - .prepare_write = cifs_prepare_write, - .commit_write = cifs_commit_write, + .write_begin = cifs_write_begin, + .write_end = cifs_write_end, .set_page_dirty = __set_page_dirty_nobuffers, /* .sync_page = cifs_sync_page, */ /* .direct_IO = */ @@ -2097,8 +2108,8 @@ const struct address_space_operations cifs_addr_ops_smallbuf = { .readpage = cifs_readpage, .writepage = cifs_writepage, .writepages = cifs_writepages, - .prepare_write = cifs_prepare_write, - .commit_write = cifs_commit_write, + .write_begin = cifs_write_begin, + .write_end = cifs_write_end, .set_page_dirty = __set_page_dirty_nobuffers, /* .sync_page = cifs_sync_page, */ /* .direct_IO = */ diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 722be543ceec..a8c833345fc9 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -219,15 +219,15 @@ int cifs_get_inode_info_unix(struct inode **pinode, rc = CIFSSMBUnixQPathInfo(xid, pTcon, full_path, &find_data, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc) { - if (rc == -EREMOTE && !is_dfs_referral) { - is_dfs_referral = true; - cFYI(DBG2, ("DFS ref")); - /* for DFS, server does not give us real inode data */ - fill_fake_finddataunix(&find_data, sb); - rc = 0; - } - } + if (rc == -EREMOTE && !is_dfs_referral) { + is_dfs_referral = true; + cFYI(DBG2, ("DFS ref")); + /* for DFS, server does not give us real inode data */ + fill_fake_finddataunix(&find_data, sb); + rc = 0; + } else if (rc) + goto cgiiu_exit; + num_of_bytes = le64_to_cpu(find_data.NumOfBytes); end_of_file = le64_to_cpu(find_data.EndOfFile); @@ -236,7 +236,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, *pinode = new_inode(sb); if (*pinode == NULL) { rc = -ENOMEM; - goto cgiiu_exit; + goto cgiiu_exit; } /* Is an i_ino of zero legal? */ /* note ino incremented to unique num in new_inode */ @@ -546,7 +546,8 @@ int cifs_get_inode_info(struct inode **pinode, if ((inode->i_mode & S_IWUGO) == 0 && (attr & ATTR_READONLY) == 0) inode->i_mode |= (S_IWUGO & default_mode); - inode->i_mode &= ~S_IFMT; + + inode->i_mode &= ~S_IFMT; } /* clear write bits if ATTR_READONLY is set */ if (attr & ATTR_READONLY) @@ -649,6 +650,7 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino) inode->i_fop = &simple_dir_operations; inode->i_uid = cifs_sb->mnt_uid; inode->i_gid = cifs_sb->mnt_gid; + } else if (rc) { _FreeXid(xid); iget_failed(inode); return ERR_PTR(rc); @@ -663,40 +665,201 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino) return inode; } -int cifs_unlink(struct inode *inode, struct dentry *direntry) +static int +cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid, + char *full_path, __u32 dosattr) +{ + int rc; + int oplock = 0; + __u16 netfid; + __u32 netpid; + bool set_time = false; + struct cifsFileInfo *open_file; + struct cifsInodeInfo *cifsInode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsTconInfo *pTcon = cifs_sb->tcon; + FILE_BASIC_INFO info_buf; + + if (attrs->ia_valid & ATTR_ATIME) { + set_time = true; + info_buf.LastAccessTime = + cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime)); + } else + info_buf.LastAccessTime = 0; + + if (attrs->ia_valid & ATTR_MTIME) { + set_time = true; + info_buf.LastWriteTime = + cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime)); + } else + info_buf.LastWriteTime = 0; + + /* + * Samba throws this field away, but windows may actually use it. + * Do not set ctime unless other time stamps are changed explicitly + * (i.e. by utimes()) since we would then have a mix of client and + * server times. + */ + if (set_time && (attrs->ia_valid & ATTR_CTIME)) { + cFYI(1, ("CIFS - CTIME changed")); + info_buf.ChangeTime = + cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime)); + } else + info_buf.ChangeTime = 0; + + info_buf.CreationTime = 0; /* don't change */ + info_buf.Attributes = cpu_to_le32(dosattr); + + /* + * If the file is already open for write, just use that fileid + */ + open_file = find_writable_file(cifsInode); + if (open_file) { + netfid = open_file->netfid; + netpid = open_file->pid; + goto set_via_filehandle; + } + + /* + * NT4 apparently returns success on this call, but it doesn't + * really work. + */ + if (!(pTcon->ses->flags & CIFS_SES_NT4)) { + rc = CIFSSMBSetPathInfo(xid, pTcon, full_path, + &info_buf, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc == 0) { + cifsInode->cifsAttrs = dosattr; + goto out; + } else if (rc != -EOPNOTSUPP && rc != -EINVAL) + goto out; + } + + cFYI(1, ("calling SetFileInfo since SetPathInfo for " + "times not supported by this server")); + rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, + SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, + CREATE_NOT_DIR, &netfid, &oplock, + NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + if (rc != 0) { + if (rc == -EIO) + rc = -EINVAL; + goto out; + } + + netpid = current->tgid; + +set_via_filehandle: + rc = CIFSSMBSetFileInfo(xid, pTcon, &info_buf, netfid, netpid); + if (!rc) + cifsInode->cifsAttrs = dosattr; + + if (open_file == NULL) + CIFSSMBClose(xid, pTcon, netfid); + else + atomic_dec(&open_file->wrtPending); +out: + return rc; +} + +/* + * open the given file (if it isn't already), set the DELETE_ON_CLOSE bit + * and rename it to a random name that hopefully won't conflict with + * anything else. + */ +static int +cifs_rename_pending_delete(char *full_path, struct inode *inode, int xid) +{ + int oplock = 0; + int rc; + __u16 netfid; + struct cifsInodeInfo *cifsInode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsTconInfo *tcon = cifs_sb->tcon; + __u32 dosattr; + FILE_BASIC_INFO *info_buf; + + rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, + DELETE|FILE_WRITE_ATTRIBUTES, + CREATE_NOT_DIR|CREATE_DELETE_ON_CLOSE, + &netfid, &oplock, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc != 0) + goto out; + + /* set ATTR_HIDDEN and clear ATTR_READONLY */ + cifsInode = CIFS_I(inode); + dosattr = cifsInode->cifsAttrs & ~ATTR_READONLY; + if (dosattr == 0) + dosattr |= ATTR_NORMAL; + dosattr |= ATTR_HIDDEN; + + info_buf = kzalloc(sizeof(*info_buf), GFP_KERNEL); + if (info_buf == NULL) { + rc = -ENOMEM; + goto out_close; + } + info_buf->Attributes = cpu_to_le32(dosattr); + rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, current->tgid); + kfree(info_buf); + if (rc != 0) + goto out_close; + cifsInode->cifsAttrs = dosattr; + + /* silly-rename the file */ + CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + /* set DELETE_ON_CLOSE */ + rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid, current->tgid); + + /* + * some samba versions return -ENOENT when we try to set the file + * disposition here. Likely a samba bug, but work around it for now + */ + if (rc == -ENOENT) + rc = 0; + +out_close: + CIFSSMBClose(xid, tcon, netfid); +out: + return rc; +} + +int cifs_unlink(struct inode *dir, struct dentry *dentry) { int rc = 0; int xid; - struct cifs_sb_info *cifs_sb; - struct cifsTconInfo *pTcon; char *full_path = NULL; - struct cifsInodeInfo *cifsInode; - FILE_BASIC_INFO *pinfo_buf; + struct inode *inode = dentry->d_inode; + struct cifsInodeInfo *cifsInode = CIFS_I(inode); + struct super_block *sb = dir->i_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifsTconInfo *tcon = cifs_sb->tcon; + struct iattr *attrs = NULL; + __u32 dosattr = 0, origattr = 0; - cFYI(1, ("cifs_unlink, inode = 0x%p", inode)); + cFYI(1, ("cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry)); xid = GetXid(); - if (inode) - cifs_sb = CIFS_SB(inode->i_sb); - else - cifs_sb = CIFS_SB(direntry->d_sb); - pTcon = cifs_sb->tcon; - - /* Unlink can be called from rename so we can not grab the sem here - since we deadlock otherwise */ -/* mutex_lock(&direntry->d_sb->s_vfs_rename_mutex);*/ - full_path = build_path_from_dentry(direntry); -/* mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex);*/ + /* Unlink can be called from rename so we can not take the + * sb->s_vfs_rename_mutex here */ + full_path = build_path_from_dentry(dentry); if (full_path == NULL) { FreeXid(xid); return -ENOMEM; } - if ((pTcon->ses->capabilities & CAP_UNIX) && + if ((tcon->ses->capabilities & CAP_UNIX) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & - le64_to_cpu(pTcon->fsUnixInfo.Capability))) { - rc = CIFSPOSIXDelFile(xid, pTcon, full_path, + le64_to_cpu(tcon->fsUnixInfo.Capability))) { + rc = CIFSPOSIXDelFile(xid, tcon, full_path, SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); cFYI(1, ("posix del rc %d", rc)); @@ -704,124 +867,60 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry) goto psx_del_no_retry; } - rc = CIFSSMBDelFile(xid, pTcon, full_path, cifs_sb->local_nls, +retry_std_delete: + rc = CIFSSMBDelFile(xid, tcon, full_path, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + psx_del_no_retry: if (!rc) { - if (direntry->d_inode) - drop_nlink(direntry->d_inode); + if (inode) + drop_nlink(inode); } else if (rc == -ENOENT) { - d_drop(direntry); + d_drop(dentry); } else if (rc == -ETXTBSY) { - int oplock = 0; - __u16 netfid; - - rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, DELETE, - CREATE_NOT_DIR | CREATE_DELETE_ON_CLOSE, - &netfid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc == 0) { - CIFSSMBRenameOpenFile(xid, pTcon, netfid, NULL, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - CIFSSMBClose(xid, pTcon, netfid); - if (direntry->d_inode) - drop_nlink(direntry->d_inode); + rc = cifs_rename_pending_delete(full_path, inode, xid); + if (rc == 0) + drop_nlink(inode); + } else if (rc == -EACCES && dosattr == 0) { + attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); + if (attrs == NULL) { + rc = -ENOMEM; + goto out_reval; } - } else if (rc == -EACCES) { - /* try only if r/o attribute set in local lookup data? */ - pinfo_buf = kzalloc(sizeof(FILE_BASIC_INFO), GFP_KERNEL); - if (pinfo_buf) { - /* ATTRS set to normal clears r/o bit */ - pinfo_buf->Attributes = cpu_to_le32(ATTR_NORMAL); - if (!(pTcon->ses->flags & CIFS_SES_NT4)) - rc = CIFSSMBSetTimes(xid, pTcon, full_path, - pinfo_buf, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - else - rc = -EOPNOTSUPP; - if (rc == -EOPNOTSUPP) { - int oplock = 0; - __u16 netfid; - /* rc = CIFSSMBSetAttrLegacy(xid, pTcon, - full_path, - (__u16)ATTR_NORMAL, - cifs_sb->local_nls); - For some strange reason it seems that NT4 eats the - old setattr call without actually setting the - attributes so on to the third attempted workaround - */ - - /* BB could scan to see if we already have it open - and pass in pid of opener to function */ - rc = CIFSSMBOpen(xid, pTcon, full_path, - FILE_OPEN, SYNCHRONIZE | - FILE_WRITE_ATTRIBUTES, 0, - &netfid, &oplock, NULL, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc == 0) { - rc = CIFSSMBSetFileTimes(xid, pTcon, - pinfo_buf, - netfid); - CIFSSMBClose(xid, pTcon, netfid); - } - } - kfree(pinfo_buf); - } - if (rc == 0) { - rc = CIFSSMBDelFile(xid, pTcon, full_path, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (!rc) { - if (direntry->d_inode) - drop_nlink(direntry->d_inode); - } else if (rc == -ETXTBSY) { - int oplock = 0; - __u16 netfid; - - rc = CIFSSMBOpen(xid, pTcon, full_path, - FILE_OPEN, DELETE, - CREATE_NOT_DIR | - CREATE_DELETE_ON_CLOSE, - &netfid, &oplock, NULL, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc == 0) { - CIFSSMBRenameOpenFile(xid, pTcon, - netfid, NULL, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - CIFSSMBClose(xid, pTcon, netfid); - if (direntry->d_inode) - drop_nlink(direntry->d_inode); - } - /* BB if rc = -ETXTBUSY goto the rename logic BB */ - } - } - } - if (direntry->d_inode) { - cifsInode = CIFS_I(direntry->d_inode); - cifsInode->time = 0; /* will force revalidate to get info - when needed */ - direntry->d_inode->i_ctime = current_fs_time(inode->i_sb); + /* try to reset dos attributes */ + origattr = cifsInode->cifsAttrs; + if (origattr == 0) + origattr |= ATTR_NORMAL; + dosattr = origattr & ~ATTR_READONLY; + if (dosattr == 0) + dosattr |= ATTR_NORMAL; + dosattr |= ATTR_HIDDEN; + + rc = cifs_set_file_info(inode, attrs, xid, full_path, dosattr); + if (rc != 0) + goto out_reval; + + goto retry_std_delete; } + + /* undo the setattr if we errored out and it's needed */ + if (rc != 0 && dosattr != 0) + cifs_set_file_info(inode, attrs, xid, full_path, origattr); + +out_reval: if (inode) { - inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); cifsInode = CIFS_I(inode); - cifsInode->time = 0; /* force revalidate of dir as well */ + cifsInode->time = 0; /* will force revalidate to get info + when needed */ + inode->i_ctime = current_fs_time(sb); } + dir->i_ctime = dir->i_mtime = current_fs_time(sb); + cifsInode = CIFS_I(dir); + CIFS_I(dir)->time = 0; /* force revalidate of dir as well */ kfree(full_path); + kfree(attrs); FreeXid(xid); return rc; } @@ -866,7 +965,7 @@ static void posix_fill_in_inode(struct inode *tmp_inode, int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) { - int rc = 0; + int rc = 0, tmprc; int xid; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; @@ -928,6 +1027,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) kfree(pInfo); goto mkdir_get_info; } + /* Is an i_ino of zero legal? */ /* Are there sanity checks we can use to ensure that the server is really filling in that field? */ @@ -984,35 +1084,52 @@ mkdir_get_info: * failed to get it from the server or was set bogus */ if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2)) direntry->d_inode->i_nlink = 2; + mode &= ~current->fs->umask; + /* must turn on setgid bit if parent dir has it */ + if (inode->i_mode & S_ISGID) + mode |= S_ISGID; + if (pTcon->unix_ext) { + struct cifs_unix_set_info_args args = { + .mode = mode, + .ctime = NO_CHANGE_64, + .atime = NO_CHANGE_64, + .mtime = NO_CHANGE_64, + .device = 0, + }; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { - CIFSSMBUnixSetPerms(xid, pTcon, full_path, - mode, - (__u64)current->fsuid, - (__u64)current->fsgid, - 0 /* dev_t */, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + args.uid = (__u64)current->fsuid; + if (inode->i_mode & S_ISGID) + args.gid = (__u64)inode->i_gid; + else + args.gid = (__u64)current->fsgid; } else { - CIFSSMBUnixSetPerms(xid, pTcon, full_path, - mode, (__u64)-1, - (__u64)-1, 0 /* dev_t */, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + args.uid = NO_CHANGE_64; + args.gid = NO_CHANGE_64; } + CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); } else { if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && (mode & S_IWUGO) == 0) { FILE_BASIC_INFO pInfo; + struct cifsInodeInfo *cifsInode; + u32 dosattrs; + memset(&pInfo, 0, sizeof(pInfo)); - pInfo.Attributes = cpu_to_le32(ATTR_READONLY); - CIFSSMBSetTimes(xid, pTcon, full_path, - &pInfo, cifs_sb->local_nls, + cifsInode = CIFS_I(newinode); + dosattrs = cifsInode->cifsAttrs|ATTR_READONLY; + pInfo.Attributes = cpu_to_le32(dosattrs); + tmprc = CIFSSMBSetPathInfo(xid, pTcon, + full_path, &pInfo, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (tmprc == 0) + cifsInode->cifsAttrs = dosattrs; } if (direntry->d_inode) { if (cifs_sb->mnt_cifs_flags & @@ -1024,8 +1141,12 @@ mkdir_get_info: CIFS_MOUNT_SET_UID) { direntry->d_inode->i_uid = current->fsuid; - direntry->d_inode->i_gid = - current->fsgid; + if (inode->i_mode & S_ISGID) + direntry->d_inode->i_gid = + inode->i_gid; + else + direntry->d_inode->i_gid = + current->fsgid; } } } @@ -1080,117 +1201,141 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) return rc; } +static int +cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath, + struct dentry *to_dentry, const char *toPath) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb); + struct cifsTconInfo *pTcon = cifs_sb->tcon; + __u16 srcfid; + int oplock, rc; + + /* try path-based rename first */ + rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + /* + * don't bother with rename by filehandle unless file is busy and + * source Note that cross directory moves do not work with + * rename by filehandle to various Windows servers. + */ + if (rc == 0 || rc != -ETXTBSY) + return rc; + + /* open the file to be renamed -- we need DELETE perms */ + rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE, + CREATE_NOT_DIR, &srcfid, &oplock, NULL, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + if (rc == 0) { + rc = CIFSSMBRenameOpenFile(xid, pTcon, srcfid, + (const char *) to_dentry->d_name.name, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + CIFSSMBClose(xid, pTcon, srcfid); + } + + return rc; +} + int cifs_rename(struct inode *source_inode, struct dentry *source_direntry, struct inode *target_inode, struct dentry *target_direntry) { - char *fromName; - char *toName; + char *fromName = NULL; + char *toName = NULL; struct cifs_sb_info *cifs_sb_source; struct cifs_sb_info *cifs_sb_target; struct cifsTconInfo *pTcon; + FILE_UNIX_BASIC_INFO *info_buf_source = NULL; + FILE_UNIX_BASIC_INFO *info_buf_target; int xid; - int rc = 0; - - xid = GetXid(); + int rc; cifs_sb_target = CIFS_SB(target_inode->i_sb); cifs_sb_source = CIFS_SB(source_inode->i_sb); pTcon = cifs_sb_source->tcon; + xid = GetXid(); + + /* + * BB: this might be allowed if same server, but different share. + * Consider adding support for this + */ if (pTcon != cifs_sb_target->tcon) { - FreeXid(xid); - return -EXDEV; /* BB actually could be allowed if same server, - but different share. - Might eventually add support for this */ + rc = -EXDEV; + goto cifs_rename_exit; } - /* we already have the rename sem so we do not need to grab it again - here to protect the path integrity */ + /* + * we already have the rename sem so we do not need to + * grab it again here to protect the path integrity + */ fromName = build_path_from_dentry(source_direntry); + if (fromName == NULL) { + rc = -ENOMEM; + goto cifs_rename_exit; + } + toName = build_path_from_dentry(target_direntry); - if ((fromName == NULL) || (toName == NULL)) { + if (toName == NULL) { rc = -ENOMEM; goto cifs_rename_exit; } - rc = CIFSSMBRename(xid, pTcon, fromName, toName, - cifs_sb_source->local_nls, - cifs_sb_source->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + rc = cifs_do_rename(xid, source_direntry, fromName, + target_direntry, toName); + if (rc == -EEXIST) { - /* check if they are the same file because rename of hardlinked - files is a noop */ - FILE_UNIX_BASIC_INFO *info_buf_source; - FILE_UNIX_BASIC_INFO *info_buf_target; - - info_buf_source = - kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); - if (info_buf_source != NULL) { + if (pTcon->unix_ext) { + /* + * Are src and dst hardlinks of same inode? We can + * only tell with unix extensions enabled + */ + info_buf_source = + kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO), + GFP_KERNEL); + if (info_buf_source == NULL) + goto unlink_target; + info_buf_target = info_buf_source + 1; - if (pTcon->unix_ext) - rc = CIFSSMBUnixQPathInfo(xid, pTcon, fromName, - info_buf_source, - cifs_sb_source->local_nls, - cifs_sb_source->mnt_cifs_flags & + rc = CIFSSMBUnixQPathInfo(xid, pTcon, fromName, + info_buf_source, + cifs_sb_source->local_nls, + cifs_sb_source->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - /* else rc is still EEXIST so will fall through to - unlink the target and retry rename */ - if (rc == 0) { - rc = CIFSSMBUnixQPathInfo(xid, pTcon, toName, - info_buf_target, + if (rc != 0) + goto unlink_target; + + rc = CIFSSMBUnixQPathInfo(xid, pTcon, + toName, info_buf_target, cifs_sb_target->local_nls, /* remap based on source sb */ cifs_sb_source->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - } - if ((rc == 0) && - (info_buf_source->UniqueId == - info_buf_target->UniqueId)) { - /* do not rename since the files are hardlinked which - is a noop */ - } else { - /* we either can not tell the files are hardlinked - (as with Windows servers) or files are not - hardlinked so delete the target manually before - renaming to follow POSIX rather than Windows - semantics */ - cifs_unlink(target_inode, target_direntry); - rc = CIFSSMBRename(xid, pTcon, fromName, - toName, - cifs_sb_source->local_nls, - cifs_sb_source->mnt_cifs_flags - & CIFS_MOUNT_MAP_SPECIAL_CHR); - } - kfree(info_buf_source); - } /* if we can not get memory just leave rc as EEXIST */ - } - - if (rc) - cFYI(1, ("rename rc %d", rc)); - - if ((rc == -EIO) || (rc == -EEXIST)) { - int oplock = 0; - __u16 netfid; - - /* BB FIXME Is Generic Read correct for rename? */ - /* if renaming directory - we should not say CREATE_NOT_DIR, - need to test renaming open directory, also GENERIC_READ - might not right be right access to request */ - rc = CIFSSMBOpen(xid, pTcon, fromName, FILE_OPEN, GENERIC_READ, - CREATE_NOT_DIR, &netfid, &oplock, NULL, - cifs_sb_source->local_nls, - cifs_sb_source->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc == 0) { - rc = CIFSSMBRenameOpenFile(xid, pTcon, netfid, toName, - cifs_sb_source->local_nls, - cifs_sb_source->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - CIFSSMBClose(xid, pTcon, netfid); - } + + if (rc == 0 && (info_buf_source->UniqueId == + info_buf_target->UniqueId)) + /* same file, POSIX says that this is a noop */ + goto cifs_rename_exit; + } /* else ... BB we could add the same check for Windows by + checking the UniqueId via FILE_INTERNAL_INFO */ +unlink_target: + /* + * we either can not tell the files are hardlinked (as with + * Windows servers) or files are not hardlinked. Delete the + * target manually before renaming to follow POSIX rather than + * Windows semantics + */ + cifs_unlink(target_inode, target_direntry); + rc = cifs_do_rename(xid, source_direntry, fromName, + target_direntry, toName); } cifs_rename_exit: + kfree(info_buf_source); kfree(fromName); kfree(toName); FreeXid(xid); @@ -1310,10 +1455,11 @@ int cifs_revalidate(struct dentry *direntry) /* if (S_ISDIR(direntry->d_inode->i_mode)) shrink_dcache_parent(direntry); */ if (S_ISREG(direntry->d_inode->i_mode)) { - if (direntry->d_inode->i_mapping) + if (direntry->d_inode->i_mapping) { wbrc = filemap_fdatawait(direntry->d_inode->i_mapping); if (wbrc) CIFS_I(direntry->d_inode)->write_behind_rc = wbrc; + } /* may eventually have to do this for open files too */ if (list_empty(&(cifsInode->openFileList))) { /* changed on server - flush read ahead pages */ @@ -1413,31 +1559,209 @@ out_busy: return -ETXTBSY; } -int cifs_setattr(struct dentry *direntry, struct iattr *attrs) +static int +cifs_set_file_size(struct inode *inode, struct iattr *attrs, + int xid, char *full_path) { + int rc; + struct cifsFileInfo *open_file; + struct cifsInodeInfo *cifsInode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsTconInfo *pTcon = cifs_sb->tcon; + + /* + * To avoid spurious oplock breaks from server, in the case of + * inodes that we already have open, avoid doing path based + * setting of file size if we can do it by handle. + * This keeps our caching token (oplock) and avoids timeouts + * when the local oplock break takes longer to flush + * writebehind data than the SMB timeout for the SetPathInfo + * request would allow + */ + open_file = find_writable_file(cifsInode); + if (open_file) { + __u16 nfid = open_file->netfid; + __u32 npid = open_file->pid; + rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, + npid, false); + atomic_dec(&open_file->wrtPending); + cFYI(1, ("SetFSize for attrs rc = %d", rc)); + if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { + unsigned int bytes_written; + rc = CIFSSMBWrite(xid, pTcon, nfid, 0, attrs->ia_size, + &bytes_written, NULL, NULL, 1); + cFYI(1, ("Wrt seteof rc %d", rc)); + } + } else + rc = -EINVAL; + + if (rc != 0) { + /* Set file size by pathname rather than by handle + either because no valid, writeable file handle for + it was found or because there was an error setting + it by handle */ + rc = CIFSSMBSetEOF(xid, pTcon, full_path, attrs->ia_size, + false, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + cFYI(1, ("SetEOF by path (setattrs) rc = %d", rc)); + if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { + __u16 netfid; + int oplock = 0; + + rc = SMBLegacyOpen(xid, pTcon, full_path, + FILE_OPEN, GENERIC_WRITE, + CREATE_NOT_DIR, &netfid, &oplock, NULL, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc == 0) { + unsigned int bytes_written; + rc = CIFSSMBWrite(xid, pTcon, netfid, 0, + attrs->ia_size, + &bytes_written, NULL, + NULL, 1); + cFYI(1, ("wrt seteof rc %d", rc)); + CIFSSMBClose(xid, pTcon, netfid); + } + } + } + + if (rc == 0) { + rc = cifs_vmtruncate(inode, attrs->ia_size); + cifs_truncate_page(inode->i_mapping, inode->i_size); + } + + return rc; +} + +static int +cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) +{ + int rc; int xid; - struct cifs_sb_info *cifs_sb; - struct cifsTconInfo *pTcon; char *full_path = NULL; - int rc = -EACCES; - struct cifsFileInfo *open_file = NULL; - FILE_BASIC_INFO time_buf; - bool set_time = false; - bool set_dosattr = false; - __u64 mode = 0xFFFFFFFFFFFFFFFFULL; - __u64 uid = 0xFFFFFFFFFFFFFFFFULL; - __u64 gid = 0xFFFFFFFFFFFFFFFFULL; - struct cifsInodeInfo *cifsInode; struct inode *inode = direntry->d_inode; + struct cifsInodeInfo *cifsInode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsTconInfo *pTcon = cifs_sb->tcon; + struct cifs_unix_set_info_args *args = NULL; + + cFYI(1, ("setattr_unix on file %s attrs->ia_valid=0x%x", + direntry->d_name.name, attrs->ia_valid)); + + xid = GetXid(); + + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) { + /* check if we have permission to change attrs */ + rc = inode_change_ok(inode, attrs); + if (rc < 0) + goto out; + else + rc = 0; + } + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto out; + } + + if ((attrs->ia_valid & ATTR_MTIME) || (attrs->ia_valid & ATTR_SIZE)) { + /* + Flush data before changing file size or changing the last + write time of the file on the server. If the + flush returns error, store it to report later and continue. + BB: This should be smarter. Why bother flushing pages that + will be truncated anyway? Also, should we error out here if + the flush returns error? + */ + rc = filemap_write_and_wait(inode->i_mapping); + if (rc != 0) { + cifsInode->write_behind_rc = rc; + rc = 0; + } + } + + if (attrs->ia_valid & ATTR_SIZE) { + rc = cifs_set_file_size(inode, attrs, xid, full_path); + if (rc != 0) + goto out; + } + + /* skip mode change if it's just for clearing setuid/setgid */ + if (attrs->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) + attrs->ia_valid &= ~ATTR_MODE; + + args = kmalloc(sizeof(*args), GFP_KERNEL); + if (args == NULL) { + rc = -ENOMEM; + goto out; + } + + /* set up the struct */ + if (attrs->ia_valid & ATTR_MODE) + args->mode = attrs->ia_mode; + else + args->mode = NO_CHANGE_64; + + if (attrs->ia_valid & ATTR_UID) + args->uid = attrs->ia_uid; + else + args->uid = NO_CHANGE_64; + + if (attrs->ia_valid & ATTR_GID) + args->gid = attrs->ia_gid; + else + args->gid = NO_CHANGE_64; + + if (attrs->ia_valid & ATTR_ATIME) + args->atime = cifs_UnixTimeToNT(attrs->ia_atime); + else + args->atime = NO_CHANGE_64; + + if (attrs->ia_valid & ATTR_MTIME) + args->mtime = cifs_UnixTimeToNT(attrs->ia_mtime); + else + args->mtime = NO_CHANGE_64; + + if (attrs->ia_valid & ATTR_CTIME) + args->ctime = cifs_UnixTimeToNT(attrs->ia_ctime); + else + args->ctime = NO_CHANGE_64; + + args->device = 0; + rc = CIFSSMBUnixSetInfo(xid, pTcon, full_path, args, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + if (!rc) + rc = inode_setattr(inode, attrs); +out: + kfree(args); + kfree(full_path); + FreeXid(xid); + return rc; +} + +static int +cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) +{ + int xid; + struct inode *inode = direntry->d_inode; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsInodeInfo *cifsInode = CIFS_I(inode); + char *full_path = NULL; + int rc = -EACCES; + __u32 dosattr = 0; + __u64 mode = NO_CHANGE_64; xid = GetXid(); cFYI(1, ("setattr on file %s attrs->iavalid 0x%x", direntry->d_name.name, attrs->ia_valid)); - cifs_sb = CIFS_SB(inode->i_sb); - pTcon = cifs_sb->tcon; - if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) { /* check if we have permission to change attrs */ rc = inode_change_ok(inode, attrs); @@ -1453,7 +1777,6 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) FreeXid(xid); return -ENOMEM; } - cifsInode = CIFS_I(inode); if ((attrs->ia_valid & ATTR_MTIME) || (attrs->ia_valid & ATTR_SIZE)) { /* @@ -1472,78 +1795,8 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) } if (attrs->ia_valid & ATTR_SIZE) { - /* To avoid spurious oplock breaks from server, in the case of - inodes that we already have open, avoid doing path based - setting of file size if we can do it by handle. - This keeps our caching token (oplock) and avoids timeouts - when the local oplock break takes longer to flush - writebehind data than the SMB timeout for the SetPathInfo - request would allow */ - - open_file = find_writable_file(cifsInode); - if (open_file) { - __u16 nfid = open_file->netfid; - __u32 npid = open_file->pid; - rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, - nfid, npid, false); - atomic_dec(&open_file->wrtPending); - cFYI(1, ("SetFSize for attrs rc = %d", rc)); - if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { - unsigned int bytes_written; - rc = CIFSSMBWrite(xid, pTcon, - nfid, 0, attrs->ia_size, - &bytes_written, NULL, NULL, - 1 /* 45 seconds */); - cFYI(1, ("Wrt seteof rc %d", rc)); - } - } else - rc = -EINVAL; - - if (rc != 0) { - /* Set file size by pathname rather than by handle - either because no valid, writeable file handle for - it was found or because there was an error setting - it by handle */ - rc = CIFSSMBSetEOF(xid, pTcon, full_path, - attrs->ia_size, false, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - cFYI(1, ("SetEOF by path (setattrs) rc = %d", rc)); - if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { - __u16 netfid; - int oplock = 0; - - rc = SMBLegacyOpen(xid, pTcon, full_path, - FILE_OPEN, GENERIC_WRITE, - CREATE_NOT_DIR, &netfid, &oplock, - NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc == 0) { - unsigned int bytes_written; - rc = CIFSSMBWrite(xid, pTcon, - netfid, 0, - attrs->ia_size, - &bytes_written, NULL, - NULL, 1 /* 45 sec */); - cFYI(1, ("wrt seteof rc %d", rc)); - CIFSSMBClose(xid, pTcon, netfid); - } - - } - } - - /* Server is ok setting allocation size implicitly - no need - to call: - CIFSSMBSetEOF(xid, pTcon, full_path, attrs->ia_size, true, - cifs_sb->local_nls); - */ - - if (rc == 0) { - rc = cifs_vmtruncate(inode, attrs->ia_size); - cifs_truncate_page(inode->i_mapping, inode->i_size); - } else + rc = cifs_set_file_size(inode, attrs, xid, full_path); + if (rc != 0) goto cifs_setattr_exit; } @@ -1554,21 +1807,8 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) * CIFSACL support + proper Windows to Unix idmapping, we may be * able to support this in the future. */ - if (!pTcon->unix_ext && - !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) { + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) attrs->ia_valid &= ~(ATTR_UID | ATTR_GID); - } else { - if (attrs->ia_valid & ATTR_UID) { - cFYI(1, ("UID changed to %d", attrs->ia_uid)); - uid = attrs->ia_uid; - } - if (attrs->ia_valid & ATTR_GID) { - cFYI(1, ("GID changed to %d", attrs->ia_gid)); - gid = attrs->ia_gid; - } - } - - time_buf.Attributes = 0; /* skip mode change if it's just for clearing setuid/setgid */ if (attrs->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) @@ -1579,13 +1819,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) mode = attrs->ia_mode; } - if ((pTcon->unix_ext) - && (attrs->ia_valid & (ATTR_MODE | ATTR_GID | ATTR_UID))) - rc = CIFSSMBUnixSetPerms(xid, pTcon, full_path, mode, uid, gid, - 0 /* dev_t */, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - else if (attrs->ia_valid & ATTR_MODE) { + if (attrs->ia_valid & ATTR_MODE) { rc = 0; #ifdef CONFIG_CIFS_EXPERIMENTAL if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) @@ -1594,24 +1828,19 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) #endif if (((mode & S_IWUGO) == 0) && (cifsInode->cifsAttrs & ATTR_READONLY) == 0) { - set_dosattr = true; - time_buf.Attributes = cpu_to_le32(cifsInode->cifsAttrs | - ATTR_READONLY); + + dosattr = cifsInode->cifsAttrs | ATTR_READONLY; + /* fix up mode if we're not using dynperm */ if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0) attrs->ia_mode = inode->i_mode & ~S_IWUGO; } else if ((mode & S_IWUGO) && (cifsInode->cifsAttrs & ATTR_READONLY)) { - /* If file is readonly on server, we would - not be able to write to it - so if any write - bit is enabled for user or group or other we - need to at least try to remove r/o dos attr */ - set_dosattr = true; - time_buf.Attributes = cpu_to_le32(cifsInode->cifsAttrs & - (~ATTR_READONLY)); - /* Windows ignores set to zero */ - if (time_buf.Attributes == 0) - time_buf.Attributes |= cpu_to_le32(ATTR_NORMAL); + + dosattr = cifsInode->cifsAttrs & ~ATTR_READONLY; + /* Attributes of 0 are ignored */ + if (dosattr == 0) + dosattr |= ATTR_NORMAL; /* reset local inode permissions to normal */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) { @@ -1629,82 +1858,18 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) } } - if (attrs->ia_valid & ATTR_ATIME) { - set_time = true; - time_buf.LastAccessTime = - cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime)); - } else - time_buf.LastAccessTime = 0; - - if (attrs->ia_valid & ATTR_MTIME) { - set_time = true; - time_buf.LastWriteTime = - cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime)); - } else - time_buf.LastWriteTime = 0; - /* Do not set ctime explicitly unless other time - stamps are changed explicitly (i.e. by utime() - since we would then have a mix of client and - server times */ - - if (set_time && (attrs->ia_valid & ATTR_CTIME)) { - set_time = true; - /* Although Samba throws this field away - it may be useful to Windows - but we do - not want to set ctime unless some other - timestamp is changing */ - cFYI(1, ("CIFS - CTIME changed")); - time_buf.ChangeTime = - cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime)); - } else - time_buf.ChangeTime = 0; - - if (set_time || set_dosattr) { - time_buf.CreationTime = 0; /* do not change */ - /* In the future we should experiment - try setting timestamps - via Handle (SetFileInfo) instead of by path */ - if (!(pTcon->ses->flags & CIFS_SES_NT4)) - rc = CIFSSMBSetTimes(xid, pTcon, full_path, &time_buf, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - else - rc = -EOPNOTSUPP; - - if (rc == -EOPNOTSUPP) { - int oplock = 0; - __u16 netfid; - - cFYI(1, ("calling SetFileInfo since SetPathInfo for " - "times not supported by this server")); - /* BB we could scan to see if we already have it open - and pass in pid of opener to function */ - rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, - SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, - CREATE_NOT_DIR, &netfid, &oplock, - NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc == 0) { - rc = CIFSSMBSetFileTimes(xid, pTcon, &time_buf, - netfid); - CIFSSMBClose(xid, pTcon, netfid); - } else { - /* BB For even older servers we could convert time_buf - into old DOS style which uses two second - granularity */ + if (attrs->ia_valid & (ATTR_MTIME|ATTR_ATIME|ATTR_CTIME) || + ((attrs->ia_valid & ATTR_MODE) && dosattr)) { + rc = cifs_set_file_info(inode, attrs, xid, full_path, dosattr); + /* BB: check for rc = -EOPNOTSUPP and switch to legacy mode */ - /* rc = CIFSSMBSetTimesLegacy(xid, pTcon, full_path, - &time_buf, cifs_sb->local_nls); */ - } - } /* Even if error on time set, no sense failing the call if the server would set the time to a reasonable value anyway, and this check ensures that we are not being called from sys_utimes in which case we ought to fail the call back to the user when the server rejects the call */ if ((rc) && (attrs->ia_valid & - (ATTR_MODE | ATTR_GID | ATTR_UID | ATTR_SIZE))) + (ATTR_MODE | ATTR_GID | ATTR_UID | ATTR_SIZE))) rc = 0; } @@ -1718,6 +1883,21 @@ cifs_setattr_exit: return rc; } +int +cifs_setattr(struct dentry *direntry, struct iattr *attrs) +{ + struct inode *inode = direntry->d_inode; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsTconInfo *pTcon = cifs_sb->tcon; + + if (pTcon->unix_ext) + return cifs_setattr_unix(direntry, attrs); + + return cifs_setattr_nounix(direntry, attrs); + + /* BB: add cifs_setattr_legacy for really old servers */ +} + #if 0 void cifs_delete_inode(struct inode *inode) { diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 4b17f8fe3157..88786ba02d27 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -150,8 +150,7 @@ cifs_buf_get(void) but it may be more efficient to always alloc same size albeit slightly larger than necessary and maxbuffersize defaults to this and can not be bigger */ - ret_buf = (struct smb_hdr *) mempool_alloc(cifs_req_poolp, - GFP_KERNEL | GFP_NOFS); + ret_buf = mempool_alloc(cifs_req_poolp, GFP_NOFS); /* clear the first few header bytes */ /* for most paths, more is cleared in header_assemble */ @@ -188,8 +187,7 @@ cifs_small_buf_get(void) but it may be more efficient to always alloc same size albeit slightly larger than necessary and maxbuffersize defaults to this and can not be bigger */ - ret_buf = (struct smb_hdr *) mempool_alloc(cifs_sm_req_poolp, - GFP_KERNEL | GFP_NOFS); + ret_buf = mempool_alloc(cifs_sm_req_poolp, GFP_NOFS); if (ret_buf) { /* No need to clear memory here, cleared in header assemble */ /* memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/ @@ -313,8 +311,6 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ , buffer->Flags2 = SMBFLG2_KNOWS_LONG_NAMES; buffer->Pid = cpu_to_le16((__u16)current->tgid); buffer->PidHigh = cpu_to_le16((__u16)(current->tgid >> 16)); - spin_lock(&GlobalMid_Lock); - spin_unlock(&GlobalMid_Lock); if (treeCon) { buffer->Tid = treeCon->tid; if (treeCon->ses) { diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 83f306954883..765adf12d54f 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -640,6 +640,70 @@ static int is_dir_changed(struct file *file) } +static int cifs_save_resume_key(const char *current_entry, + struct cifsFileInfo *cifsFile) +{ + int rc = 0; + unsigned int len = 0; + __u16 level; + char *filename; + + if ((cifsFile == NULL) || (current_entry == NULL)) + return -EINVAL; + + level = cifsFile->srch_inf.info_level; + + if (level == SMB_FIND_FILE_UNIX) { + FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry; + + filename = &pFindData->FileName[0]; + if (cifsFile->srch_inf.unicode) { + len = cifs_unicode_bytelen(filename); + } else { + /* BB should we make this strnlen of PATH_MAX? */ + len = strnlen(filename, PATH_MAX); + } + cifsFile->srch_inf.resume_key = pFindData->ResumeKey; + } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { + FILE_DIRECTORY_INFO *pFindData = + (FILE_DIRECTORY_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + cifsFile->srch_inf.resume_key = pFindData->FileIndex; + } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) { + FILE_FULL_DIRECTORY_INFO *pFindData = + (FILE_FULL_DIRECTORY_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + cifsFile->srch_inf.resume_key = pFindData->FileIndex; + } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) { + SEARCH_ID_FULL_DIR_INFO *pFindData = + (SEARCH_ID_FULL_DIR_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + cifsFile->srch_inf.resume_key = pFindData->FileIndex; + } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { + FILE_BOTH_DIRECTORY_INFO *pFindData = + (FILE_BOTH_DIRECTORY_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + cifsFile->srch_inf.resume_key = pFindData->FileIndex; + } else if (level == SMB_FIND_FILE_INFO_STANDARD) { + FIND_FILE_STANDARD_INFO *pFindData = + (FIND_FILE_STANDARD_INFO *)current_entry; + filename = &pFindData->FileName[0]; + /* one byte length, no name conversion */ + len = (unsigned int)pFindData->FileNameLength; + cifsFile->srch_inf.resume_key = pFindData->ResumeKey; + } else { + cFYI(1, ("Unknown findfirst level %d", level)); + return -EINVAL; + } + cifsFile->srch_inf.resume_name_len = len; + cifsFile->srch_inf.presume_name = filename; + return rc; +} + /* find the corresponding entry in the search */ /* Note that the SMB server returns search entries for . and .. which complicates logic here if we choose to parse for them and we do not @@ -690,6 +754,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, else cifs_buf_release(cifsFile->srch_inf. ntwrk_buf_start); + cifsFile->srch_inf.ntwrk_buf_start = NULL; } rc = initiate_cifs_search(xid, file); if (rc) { @@ -702,6 +767,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && (rc == 0) && !cifsFile->srch_inf.endOfSearch) { cFYI(1, ("calling findnext2")); + cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, &cifsFile->srch_inf); if (rc) @@ -918,69 +984,6 @@ static int cifs_filldir(char *pfindEntry, struct file *file, return rc; } -static int cifs_save_resume_key(const char *current_entry, - struct cifsFileInfo *cifsFile) -{ - int rc = 0; - unsigned int len = 0; - __u16 level; - char *filename; - - if ((cifsFile == NULL) || (current_entry == NULL)) - return -EINVAL; - - level = cifsFile->srch_inf.info_level; - - if (level == SMB_FIND_FILE_UNIX) { - FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry; - - filename = &pFindData->FileName[0]; - if (cifsFile->srch_inf.unicode) { - len = cifs_unicode_bytelen(filename); - } else { - /* BB should we make this strnlen of PATH_MAX? */ - len = strnlen(filename, PATH_MAX); - } - cifsFile->srch_inf.resume_key = pFindData->ResumeKey; - } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { - FILE_DIRECTORY_INFO *pFindData = - (FILE_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - cifsFile->srch_inf.resume_key = pFindData->FileIndex; - } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) { - FILE_FULL_DIRECTORY_INFO *pFindData = - (FILE_FULL_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - cifsFile->srch_inf.resume_key = pFindData->FileIndex; - } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) { - SEARCH_ID_FULL_DIR_INFO *pFindData = - (SEARCH_ID_FULL_DIR_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - cifsFile->srch_inf.resume_key = pFindData->FileIndex; - } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { - FILE_BOTH_DIRECTORY_INFO *pFindData = - (FILE_BOTH_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - cifsFile->srch_inf.resume_key = pFindData->FileIndex; - } else if (level == SMB_FIND_FILE_INFO_STANDARD) { - FIND_FILE_STANDARD_INFO *pFindData = - (FIND_FILE_STANDARD_INFO *)current_entry; - filename = &pFindData->FileName[0]; - /* one byte length, no name conversion */ - len = (unsigned int)pFindData->FileNameLength; - cifsFile->srch_inf.resume_key = pFindData->ResumeKey; - } else { - cFYI(1, ("Unknown findfirst level %d", level)); - return -EINVAL; - } - cifsFile->srch_inf.resume_name_len = len; - cifsFile->srch_inf.presume_name = filename; - return rc; -} int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) { diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index ed150efbe27c..2851d5da0c8c 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -409,6 +409,8 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, #ifdef CONFIG_CIFS_WEAK_PW_HASH char lnm_session_key[CIFS_SESS_KEY_SIZE]; + pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; + /* no capabilities flags in old lanman negotiation */ pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); @@ -505,7 +507,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); } else ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); - } else if (type == Kerberos) { + } else if (type == Kerberos || type == MSKerberos) { #ifdef CONFIG_CIFS_UPCALL struct cifs_spnego_msg *msg; spnego_key = cifs_get_spnego_key(ses); @@ -516,6 +518,15 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, } msg = spnego_key->payload.data; + /* check version field to make sure that cifs.upcall is + sending us a response in an expected form */ + if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { + cERROR(1, ("incorrect version of cifs.upcall (expected" + " %d but got %d)", + CIFS_SPNEGO_UPCALL_VERSION, msg->version)); + rc = -EKEYREJECTED; + goto ssetup_exit; + } /* bail out if key is too long */ if (msg->sesskey_len > sizeof(ses->server->mac_signing_key.data.krb5)) { @@ -613,8 +624,10 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, ses, nls_cp); ssetup_exit: - if (spnego_key) + if (spnego_key) { + key_revoke(spnego_key); key_put(spnego_key); + } kfree(str_area); if (resp_buf_type == CIFS_SMALL_BUFFER) { cFYI(1, ("ssetup freeing small buf %p", iov[0].iov_base)); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 000ac509c98a..bf0e6d8e382a 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -50,8 +50,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct cifsSesInfo *ses) return NULL; } - temp = (struct mid_q_entry *) mempool_alloc(cifs_mid_poolp, - GFP_KERNEL | GFP_NOFS); + temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS); if (temp == NULL) return temp; else { @@ -265,6 +264,7 @@ smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec, cFYI(1, ("Sending smb: total_len %d", total_len)); dump_smb(smb_buffer, len); + i = 0; while (total_len) { rc = kernel_sendmsg(ssocket, &smb_msg, &iov[first_vec], n_vec - first_vec, total_len); diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index e1c854890f94..bf4a3fd3c8e3 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -28,11 +28,9 @@ int coda_fake_statfs; char * coda_f2s(struct CodaFid *f) { static char s[60]; -#ifdef CONFIG_CODA_FS_OLD_API - sprintf(s, "(%08x.%08x.%08x)", f->opaque[0], f->opaque[1], f->opaque[2]); -#else + sprintf(s, "(%08x.%08x.%08x.%08x)", f->opaque[0], f->opaque[1], f->opaque[2], f->opaque[3]); -#endif + return s; } diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 3d2580e00a3e..c5916228243c 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -137,9 +137,11 @@ exit: } -int coda_permission(struct inode *inode, int mask, struct nameidata *nd) +int coda_permission(struct inode *inode, int mask) { int error = 0; + + mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (!mask) return 0; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 2f58dfc70083..830f51abb971 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -58,7 +58,7 @@ static void coda_destroy_inode(struct inode *inode) kmem_cache_free(coda_inode_cachep, ITOC(inode)); } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct coda_inode_info *ei = (struct coda_inode_info *) foo; diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index c21a1f552a63..c51365422aa8 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -24,8 +24,7 @@ #include <linux/coda_psdev.h> /* pioctl ops */ -static int coda_ioctl_permission(struct inode *inode, int mask, - struct nameidata *nd); +static int coda_ioctl_permission(struct inode *inode, int mask); static int coda_pioctl(struct inode * inode, struct file * filp, unsigned int cmd, unsigned long user_data); @@ -42,8 +41,7 @@ const struct file_operations coda_ioctl_operations = { }; /* the coda pioctl inode ops */ -static int coda_ioctl_permission(struct inode *inode, int mask, - struct nameidata *nd) +static int coda_ioctl_permission(struct inode *inode, int mask) { return 0; } @@ -51,7 +49,7 @@ static int coda_ioctl_permission(struct inode *inode, int mask, static int coda_pioctl(struct inode * inode, struct file * filp, unsigned int cmd, unsigned long user_data) { - struct nameidata nd; + struct path path; int error; struct PioctlData data; struct inode *target_inode = NULL; @@ -66,21 +64,21 @@ static int coda_pioctl(struct inode * inode, struct file * filp, * Look up the pathname. Note that the pathname is in * user memory, and namei takes care of this */ - if ( data.follow ) { - error = user_path_walk(data.path, &nd); + if (data.follow) { + error = user_path(data.path, &path); } else { - error = user_path_walk_link(data.path, &nd); + error = user_lpath(data.path, &path); } if ( error ) { return error; } else { - target_inode = nd.path.dentry->d_inode; + target_inode = path.dentry->d_inode; } /* return if it is not a Coda inode */ if ( target_inode->i_sb != inode->i_sb ) { - path_put(&nd.path); + path_put(&path); return -EINVAL; } @@ -89,7 +87,7 @@ static int coda_pioctl(struct inode * inode, struct file * filp, error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data); - path_put(&nd.path); + path_put(&path); return error; } diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index e3eb3556622b..0d9b80ec689c 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -362,8 +362,9 @@ static int init_coda_psdev(void) goto out_chrdev; } for (i = 0; i < MAX_CODADEVS; i++) - device_create(coda_psdev_class, NULL, - MKDEV(CODA_PSDEV_MAJOR,i), "cfs%d", i); + device_create_drvdata(coda_psdev_class, NULL, + MKDEV(CODA_PSDEV_MAJOR, i), + NULL, "cfs%d", i); coda_sysctl_init(); goto out; @@ -377,11 +378,7 @@ MODULE_AUTHOR("Jan Harkes, Peter J. Braam"); MODULE_DESCRIPTION("Coda Distributed File System VFS interface"); MODULE_ALIAS_CHARDEV_MAJOR(CODA_PSDEV_MAJOR); MODULE_LICENSE("GPL"); -#ifdef CONFIG_CODA_FS_OLD_API -MODULE_VERSION("5.3.21"); -#else MODULE_VERSION("6.6"); -#endif static int __init init_coda(void) { diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 359e531094dd..ce432bca95d1 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -52,12 +52,8 @@ static void *alloc_upcall(int opcode, int size) inp->ih.opcode = opcode; inp->ih.pid = current->pid; inp->ih.pgid = task_pgrp_nr(current); -#ifdef CONFIG_CODA_FS_OLD_API - memset(&inp->ih.cred, 0, sizeof(struct coda_cred)); - inp->ih.cred.cr_fsuid = current->fsuid; -#else inp->ih.uid = current->fsuid; -#endif + return (void*)inp; } @@ -166,20 +162,11 @@ int venus_close(struct super_block *sb, struct CodaFid *fid, int flags, union inputArgs *inp; union outputArgs *outp; int insize, outsize, error; -#ifdef CONFIG_CODA_FS_OLD_API - struct coda_cred cred = { 0, }; - cred.cr_fsuid = uid; -#endif insize = SIZE(release); UPARG(CODA_CLOSE); -#ifdef CONFIG_CODA_FS_OLD_API - memcpy(&(inp->ih.cred), &cred, sizeof(cred)); -#else inp->ih.uid = uid; -#endif - inp->coda_close.VFid = *fid; inp->coda_close.flags = flags; diff --git a/fs/compat.c b/fs/compat.c index ed43e17a5dc6..075d0509970d 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -197,8 +197,8 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * { if (sizeof ubuf->f_blocks == 4) { - if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail) & - 0xffffffff00000000ULL) + if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | + kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) return -EOVERFLOW; /* f_files and f_ffree may be -1; it's okay * to stuff that into 32 bits */ @@ -234,18 +234,18 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * * The following statfs calls are copies of code from fs/open.c and * should be checked against those from time to time */ -asmlinkage long compat_sys_statfs(const char __user *path, struct compat_statfs __user *buf) +asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) { - struct nameidata nd; + struct path path; int error; - error = user_path_walk(path, &nd); + error = user_path(pathname, &path); if (!error) { struct kstatfs tmp; - error = vfs_statfs(nd.path.dentry, &tmp); + error = vfs_statfs(path.dentry, &tmp); if (!error) error = put_compat_statfs(buf, &tmp); - path_put(&nd.path); + path_put(&path); } return error; } @@ -271,8 +271,8 @@ out: static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf) { if (sizeof ubuf->f_blocks == 4) { - if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail) & - 0xffffffff00000000ULL) + if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | + kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) return -EOVERFLOW; /* f_files and f_ffree may be -1; it's okay * to stuff that into 32 bits */ @@ -299,21 +299,21 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat return 0; } -asmlinkage long compat_sys_statfs64(const char __user *path, compat_size_t sz, struct compat_statfs64 __user *buf) +asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf) { - struct nameidata nd; + struct path path; int error; if (sz != sizeof(*buf)) return -EINVAL; - error = user_path_walk(path, &nd); + error = user_path(pathname, &path); if (!error) { struct kstatfs tmp; - error = vfs_statfs(nd.path.dentry, &tmp); + error = vfs_statfs(path.dentry, &tmp); if (!error) error = put_compat_statfs64(buf, &tmp); - path_put(&nd.path); + path_put(&path); } return error; } @@ -792,8 +792,10 @@ static int compat_fillonedir(void *__buf, const char *name, int namlen, if (buf->result) return -EINVAL; d_ino = ino; - if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->result = -EOVERFLOW; return -EOVERFLOW; + } buf->result++; dirent = buf->dirent; if (!access_ok(VERIFY_WRITE, dirent, @@ -862,8 +864,10 @@ static int compat_filldir(void *__buf, const char *name, int namlen, if (reclen > buf->count) return -EINVAL; d_ino = ino; - if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->error = -EOVERFLOW; return -EOVERFLOW; + } dirent = buf->previous; if (dirent) { if (__put_user(offset, &dirent->d_off)) @@ -2131,9 +2135,9 @@ asmlinkage long compat_sys_epoll_pwait(int epfd, #ifdef CONFIG_SIGNALFD -asmlinkage long compat_sys_signalfd(int ufd, - const compat_sigset_t __user *sigmask, - compat_size_t sigsetsize) +asmlinkage long compat_sys_signalfd4(int ufd, + const compat_sigset_t __user *sigmask, + compat_size_t sigsetsize, int flags) { compat_sigset_t ss32; sigset_t tmp; @@ -2148,9 +2152,15 @@ asmlinkage long compat_sys_signalfd(int ufd, if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t))) return -EFAULT; - return sys_signalfd(ufd, ksigmask, sizeof(sigset_t)); + return sys_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags); } +asmlinkage long compat_sys_signalfd(int ufd, + const compat_sigset_t __user *sigmask, + compat_size_t sigsetsize) +{ + return compat_sys_signalfd4(ufd, sigmask, sigsetsize, 0); +} #endif /* CONFIG_SIGNALFD */ #ifdef CONFIG_TIMERFD diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 97dba0d92348..5235c67e7594 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -25,7 +25,6 @@ #include <linux/slab.h> #include <linux/raid/md.h> #include <linux/kd.h> -#include <linux/dirent.h> #include <linux/route.h> #include <linux/in6.h> #include <linux/ipv6_route.h> @@ -58,7 +57,6 @@ #include <linux/syscalls.h> #include <linux/i2c.h> #include <linux/i2c-dev.h> -#include <linux/wireless.h> #include <linux/atalk.h> #include <linux/loop.h> @@ -69,9 +67,11 @@ #include <linux/capi.h> #include <linux/gigaset_dev.h> +#ifdef CONFIG_BLOCK #include <scsi/scsi.h> #include <scsi/scsi_ioctl.h> #include <scsi/sg.h> +#endif #include <asm/uaccess.h> #include <linux/ethtool.h> @@ -1757,64 +1757,6 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long a return sys_ioctl(fd, cmd, (unsigned long)tdata); } -struct compat_iw_point { - compat_caddr_t pointer; - __u16 length; - __u16 flags; -}; - -static int do_wireless_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - struct iwreq __user *iwr; - struct iwreq __user *iwr_u; - struct iw_point __user *iwp; - struct compat_iw_point __user *iwp_u; - compat_caddr_t pointer_u; - void __user *pointer; - __u16 length, flags; - int ret; - - iwr_u = compat_ptr(arg); - iwp_u = (struct compat_iw_point __user *) &iwr_u->u.data; - iwr = compat_alloc_user_space(sizeof(*iwr)); - if (iwr == NULL) - return -ENOMEM; - - iwp = &iwr->u.data; - - if (!access_ok(VERIFY_WRITE, iwr, sizeof(*iwr))) - return -EFAULT; - - if (__copy_in_user(&iwr->ifr_ifrn.ifrn_name[0], - &iwr_u->ifr_ifrn.ifrn_name[0], - sizeof(iwr->ifr_ifrn.ifrn_name))) - return -EFAULT; - - if (__get_user(pointer_u, &iwp_u->pointer) || - __get_user(length, &iwp_u->length) || - __get_user(flags, &iwp_u->flags)) - return -EFAULT; - - if (__put_user(compat_ptr(pointer_u), &iwp->pointer) || - __put_user(length, &iwp->length) || - __put_user(flags, &iwp->flags)) - return -EFAULT; - - ret = sys_ioctl(fd, cmd, (unsigned long) iwr); - - if (__get_user(pointer, &iwp->pointer) || - __get_user(length, &iwp->length) || - __get_user(flags, &iwp->flags)) - return -EFAULT; - - if (__put_user(ptr_to_compat(pointer), &iwp_u->pointer) || - __put_user(length, &iwp_u->length) || - __put_user(flags, &iwp_u->flags)) - return -EFAULT; - - return ret; -} - /* Since old style bridge ioctl's endup using SIOCDEVPRIVATE * for some operations; this forces use of the newer bridge-utils that * use compatiable ioctls @@ -2024,6 +1966,7 @@ COMPATIBLE_IOCTL(GIO_UNISCRNMAP) COMPATIBLE_IOCTL(PIO_UNISCRNMAP) COMPATIBLE_IOCTL(PIO_FONTRESET) COMPATIBLE_IOCTL(PIO_UNIMAPCLR) +#ifdef CONFIG_BLOCK /* Big S */ COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN) COMPATIBLE_IOCTL(SCSI_IOCTL_DOORLOCK) @@ -2033,6 +1976,7 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_GET_BUS_NUMBER) COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND) COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST) COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI) +#endif /* Big T */ COMPATIBLE_IOCTL(TUNSETNOCSUM) COMPATIBLE_IOCTL(TUNSETDEBUG) @@ -2103,6 +2047,7 @@ COMPATIBLE_IOCTL(SIOCGIFVLAN) COMPATIBLE_IOCTL(SIOCSIFVLAN) COMPATIBLE_IOCTL(SIOCBRADDBR) COMPATIBLE_IOCTL(SIOCBRDELBR) +#ifdef CONFIG_BLOCK /* SG stuff */ COMPATIBLE_IOCTL(SG_SET_TIMEOUT) COMPATIBLE_IOCTL(SG_GET_TIMEOUT) @@ -2127,6 +2072,7 @@ COMPATIBLE_IOCTL(SG_SCSI_RESET) COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE) COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN) COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN) +#endif /* PPP stuff */ COMPATIBLE_IOCTL(PPPIOCGFLAGS) COMPATIBLE_IOCTL(PPPIOCSFLAGS) @@ -2350,8 +2296,6 @@ COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOVER) COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE) COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE_MULTI) COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOSUBVER) -COMPATIBLE_IOCTL(AUTOFS_IOC_ASKREGHOST) -COMPATIBLE_IOCTL(AUTOFS_IOC_TOGGLEREGHOST) COMPATIBLE_IOCTL(AUTOFS_IOC_ASKUMOUNT) /* Raw devices */ COMPATIBLE_IOCTL(RAW_SETBIND) @@ -2399,6 +2343,7 @@ COMPATIBLE_IOCTL(HCIGETDEVLIST) COMPATIBLE_IOCTL(HCIGETDEVINFO) COMPATIBLE_IOCTL(HCIGETCONNLIST) COMPATIBLE_IOCTL(HCIGETCONNINFO) +COMPATIBLE_IOCTL(HCIGETAUTHINFO) COMPATIBLE_IOCTL(HCISETRAW) COMPATIBLE_IOCTL(HCISETSCAN) COMPATIBLE_IOCTL(HCISETAUTH) @@ -2495,36 +2440,6 @@ COMPATIBLE_IOCTL(I2C_TENBIT) COMPATIBLE_IOCTL(I2C_PEC) COMPATIBLE_IOCTL(I2C_RETRIES) COMPATIBLE_IOCTL(I2C_TIMEOUT) -/* wireless */ -COMPATIBLE_IOCTL(SIOCSIWCOMMIT) -COMPATIBLE_IOCTL(SIOCGIWNAME) -COMPATIBLE_IOCTL(SIOCSIWNWID) -COMPATIBLE_IOCTL(SIOCGIWNWID) -COMPATIBLE_IOCTL(SIOCSIWFREQ) -COMPATIBLE_IOCTL(SIOCGIWFREQ) -COMPATIBLE_IOCTL(SIOCSIWMODE) -COMPATIBLE_IOCTL(SIOCGIWMODE) -COMPATIBLE_IOCTL(SIOCSIWSENS) -COMPATIBLE_IOCTL(SIOCGIWSENS) -COMPATIBLE_IOCTL(SIOCSIWRANGE) -COMPATIBLE_IOCTL(SIOCSIWPRIV) -COMPATIBLE_IOCTL(SIOCSIWSTATS) -COMPATIBLE_IOCTL(SIOCSIWAP) -COMPATIBLE_IOCTL(SIOCGIWAP) -COMPATIBLE_IOCTL(SIOCSIWRATE) -COMPATIBLE_IOCTL(SIOCGIWRATE) -COMPATIBLE_IOCTL(SIOCSIWRTS) -COMPATIBLE_IOCTL(SIOCGIWRTS) -COMPATIBLE_IOCTL(SIOCSIWFRAG) -COMPATIBLE_IOCTL(SIOCGIWFRAG) -COMPATIBLE_IOCTL(SIOCSIWTXPOW) -COMPATIBLE_IOCTL(SIOCGIWTXPOW) -COMPATIBLE_IOCTL(SIOCSIWRETRY) -COMPATIBLE_IOCTL(SIOCGIWRETRY) -COMPATIBLE_IOCTL(SIOCSIWPOWER) -COMPATIBLE_IOCTL(SIOCGIWPOWER) -COMPATIBLE_IOCTL(SIOCSIWAUTH) -COMPATIBLE_IOCTL(SIOCGIWAUTH) /* hiddev */ COMPATIBLE_IOCTL(HIDIOCGVERSION) COMPATIBLE_IOCTL(HIDIOCAPPLICATION) @@ -2755,29 +2670,7 @@ COMPATIBLE_IOCTL(USBDEVFS_IOCTL32) HANDLE_IOCTL(I2C_FUNCS, w_long) HANDLE_IOCTL(I2C_RDWR, do_i2c_rdwr_ioctl) HANDLE_IOCTL(I2C_SMBUS, do_i2c_smbus_ioctl) -/* wireless */ -HANDLE_IOCTL(SIOCGIWRANGE, do_wireless_ioctl) -HANDLE_IOCTL(SIOCGIWPRIV, do_wireless_ioctl) -HANDLE_IOCTL(SIOCGIWSTATS, do_wireless_ioctl) -HANDLE_IOCTL(SIOCSIWSPY, do_wireless_ioctl) -HANDLE_IOCTL(SIOCGIWSPY, do_wireless_ioctl) -HANDLE_IOCTL(SIOCSIWTHRSPY, do_wireless_ioctl) -HANDLE_IOCTL(SIOCGIWTHRSPY, do_wireless_ioctl) -HANDLE_IOCTL(SIOCSIWMLME, do_wireless_ioctl) -HANDLE_IOCTL(SIOCGIWAPLIST, do_wireless_ioctl) -HANDLE_IOCTL(SIOCSIWSCAN, do_wireless_ioctl) -HANDLE_IOCTL(SIOCGIWSCAN, do_wireless_ioctl) -HANDLE_IOCTL(SIOCSIWESSID, do_wireless_ioctl) -HANDLE_IOCTL(SIOCGIWESSID, do_wireless_ioctl) -HANDLE_IOCTL(SIOCSIWNICKN, do_wireless_ioctl) -HANDLE_IOCTL(SIOCGIWNICKN, do_wireless_ioctl) -HANDLE_IOCTL(SIOCSIWENCODE, do_wireless_ioctl) -HANDLE_IOCTL(SIOCGIWENCODE, do_wireless_ioctl) -HANDLE_IOCTL(SIOCSIWGENIE, do_wireless_ioctl) -HANDLE_IOCTL(SIOCGIWGENIE, do_wireless_ioctl) -HANDLE_IOCTL(SIOCSIWENCODEEXT, do_wireless_ioctl) -HANDLE_IOCTL(SIOCGIWENCODEEXT, do_wireless_ioctl) -HANDLE_IOCTL(SIOCSIWPMKSA, do_wireless_ioctl) +/* bridge */ HANDLE_IOCTL(SIOCSIFBR, old_bridge_ioctl) HANDLE_IOCTL(SIOCGIFBR, old_bridge_ioctl) /* Not implemented in the native kernel */ diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index cca98609aa7f..762d287123ca 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -26,6 +26,7 @@ #include <linux/slab.h> #include <linux/list.h> +#include <linux/spinlock.h> struct configfs_dirent { atomic_t s_count; @@ -47,8 +48,13 @@ struct configfs_dirent { #define CONFIGFS_USET_DIR 0x0040 #define CONFIGFS_USET_DEFAULT 0x0080 #define CONFIGFS_USET_DROPPING 0x0100 +#define CONFIGFS_USET_IN_MKDIR 0x0200 +#define CONFIGFS_USET_CREATING 0x0400 #define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR) +extern struct mutex configfs_symlink_mutex; +extern spinlock_t configfs_dirent_lock; + extern struct vfsmount * configfs_mount; extern struct kmem_cache *configfs_dir_cachep; @@ -62,6 +68,7 @@ extern void configfs_inode_exit(void); extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); extern int configfs_make_dirent(struct configfs_dirent *, struct dentry *, void *, umode_t, int); +extern int configfs_dirent_is_ready(struct configfs_dirent *); extern int configfs_add_file(struct dentry *, const struct configfs_attribute *, int); extern void configfs_hash_and_remove(struct dentry * dir, const char * name); diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index a48dc7dd8765..8e93341f3e82 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -30,11 +30,25 @@ #include <linux/mount.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/err.h> #include <linux/configfs.h> #include "configfs_internal.h" DECLARE_RWSEM(configfs_rename_sem); +/* + * Protects mutations of configfs_dirent linkage together with proper i_mutex + * Also protects mutations of symlinks linkage to target configfs_dirent + * Mutators of configfs_dirent linkage must *both* have the proper inode locked + * and configfs_dirent_lock locked, in that order. + * This allows one to safely traverse configfs_dirent trees and symlinks without + * having to lock inodes. + * + * Protects setting of CONFIGFS_USET_DROPPING: checking the flag + * unlocked is not reliable unless in detach_groups() called from + * rmdir()/unregister() and from configfs_attach_group() + */ +DEFINE_SPINLOCK(configfs_dirent_lock); static void configfs_d_iput(struct dentry * dentry, struct inode * inode) @@ -74,13 +88,20 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare sd = kmem_cache_zalloc(configfs_dir_cachep, GFP_KERNEL); if (!sd) - return NULL; + return ERR_PTR(-ENOMEM); atomic_set(&sd->s_count, 1); INIT_LIST_HEAD(&sd->s_links); INIT_LIST_HEAD(&sd->s_children); - list_add(&sd->s_sibling, &parent_sd->s_children); sd->s_element = element; + spin_lock(&configfs_dirent_lock); + if (parent_sd->s_type & CONFIGFS_USET_DROPPING) { + spin_unlock(&configfs_dirent_lock); + kmem_cache_free(configfs_dir_cachep, sd); + return ERR_PTR(-ENOENT); + } + list_add(&sd->s_sibling, &parent_sd->s_children); + spin_unlock(&configfs_dirent_lock); return sd; } @@ -118,8 +139,8 @@ int configfs_make_dirent(struct configfs_dirent * parent_sd, struct configfs_dirent * sd; sd = configfs_new_dirent(parent_sd, element); - if (!sd) - return -ENOMEM; + if (IS_ERR(sd)) + return PTR_ERR(sd); sd->s_mode = mode; sd->s_type = type; @@ -164,7 +185,7 @@ static int create_dir(struct config_item * k, struct dentry * p, error = configfs_dirent_exists(p->d_fsdata, d->d_name.name); if (!error) error = configfs_make_dirent(p->d_fsdata, d, k, mode, - CONFIGFS_DIR); + CONFIGFS_DIR | CONFIGFS_USET_CREATING); if (!error) { error = configfs_create(d, mode, init_dir); if (!error) { @@ -173,7 +194,9 @@ static int create_dir(struct config_item * k, struct dentry * p, } else { struct configfs_dirent *sd = d->d_fsdata; if (sd) { + spin_lock(&configfs_dirent_lock); list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); configfs_put(sd); } } @@ -186,6 +209,9 @@ static int create_dir(struct config_item * k, struct dentry * p, * configfs_create_dir - create a directory for an config_item. * @item: config_itemwe're creating directory for. * @dentry: config_item's dentry. + * + * Note: user-created entries won't be allowed under this new directory + * until it is validated by configfs_dir_set_ready() */ static int configfs_create_dir(struct config_item * item, struct dentry *dentry) @@ -208,6 +234,44 @@ static int configfs_create_dir(struct config_item * item, struct dentry *dentry) return error; } +/* + * Allow userspace to create new entries under a new directory created with + * configfs_create_dir(), and under all of its chidlren directories recursively. + * @sd configfs_dirent of the new directory to validate + * + * Caller must hold configfs_dirent_lock. + */ +static void configfs_dir_set_ready(struct configfs_dirent *sd) +{ + struct configfs_dirent *child_sd; + + sd->s_type &= ~CONFIGFS_USET_CREATING; + list_for_each_entry(child_sd, &sd->s_children, s_sibling) + if (child_sd->s_type & CONFIGFS_USET_CREATING) + configfs_dir_set_ready(child_sd); +} + +/* + * Check that a directory does not belong to a directory hierarchy being + * attached and not validated yet. + * @sd configfs_dirent of the directory to check + * + * @return non-zero iff the directory was validated + * + * Note: takes configfs_dirent_lock, so the result may change from false to true + * in two consecutive calls, but never from true to false. + */ +int configfs_dirent_is_ready(struct configfs_dirent *sd) +{ + int ret; + + spin_lock(&configfs_dirent_lock); + ret = !(sd->s_type & CONFIGFS_USET_CREATING); + spin_unlock(&configfs_dirent_lock); + + return ret; +} + int configfs_create_link(struct configfs_symlink *sl, struct dentry *parent, struct dentry *dentry) @@ -224,7 +288,9 @@ int configfs_create_link(struct configfs_symlink *sl, else { struct configfs_dirent *sd = dentry->d_fsdata; if (sd) { + spin_lock(&configfs_dirent_lock); list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); configfs_put(sd); } } @@ -238,7 +304,9 @@ static void remove_dir(struct dentry * d) struct configfs_dirent * sd; sd = d->d_fsdata; + spin_lock(&configfs_dirent_lock); list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); configfs_put(sd); if (d->d_inode) simple_rmdir(parent->d_inode,d); @@ -256,6 +324,8 @@ static void remove_dir(struct dentry * d) * The only thing special about this is that we remove any files in * the directory before we remove the directory, and we've inlined * what used to be configfs_rmdir() below, instead of calling separately. + * + * Caller holds the mutex of the item's inode */ static void configfs_remove_dir(struct config_item * item) @@ -303,7 +373,19 @@ static struct dentry * configfs_lookup(struct inode *dir, struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata; struct configfs_dirent * sd; int found = 0; - int err = 0; + int err; + + /* + * Fake invisibility if dir belongs to a group/default groups hierarchy + * being attached + * + * This forbids userspace to read/write attributes of items which may + * not complete their initialization, since the dentries of the + * attributes won't be instantiated. + */ + err = -ENOENT; + if (!configfs_dirent_is_ready(parent_sd)) + goto out; list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (sd->s_type & CONFIGFS_NOT_PINNED) { @@ -326,41 +408,49 @@ static struct dentry * configfs_lookup(struct inode *dir, return simple_lookup(dir, dentry, nd); } +out: return ERR_PTR(err); } /* * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are - * attributes and are removed by rmdir(). We recurse, taking i_mutex - * on all children that are candidates for default detach. If the - * result is clean, then configfs_detach_group() will handle dropping - * i_mutex. If there is an error, the caller will clean up the i_mutex - * holders via configfs_detach_rollback(). + * attributes and are removed by rmdir(). We recurse, setting + * CONFIGFS_USET_DROPPING on all children that are candidates for + * default detach. + * If there is an error, the caller will reset the flags via + * configfs_detach_rollback(). */ -static int configfs_detach_prep(struct dentry *dentry) +static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex) { struct configfs_dirent *parent_sd = dentry->d_fsdata; struct configfs_dirent *sd; int ret; + /* Mark that we're trying to drop the group */ + parent_sd->s_type |= CONFIGFS_USET_DROPPING; + ret = -EBUSY; if (!list_empty(&parent_sd->s_links)) goto out; ret = 0; list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { - if (sd->s_type & CONFIGFS_NOT_PINNED) + if (!sd->s_element || + (sd->s_type & CONFIGFS_NOT_PINNED)) continue; if (sd->s_type & CONFIGFS_USET_DEFAULT) { - mutex_lock(&sd->s_dentry->d_inode->i_mutex); - /* Mark that we've taken i_mutex */ - sd->s_type |= CONFIGFS_USET_DROPPING; + /* Abort if racing with mkdir() */ + if (sd->s_type & CONFIGFS_USET_IN_MKDIR) { + if (wait_mutex) + *wait_mutex = &sd->s_dentry->d_inode->i_mutex; + return -EAGAIN; + } /* * Yup, recursive. If there's a problem, blame * deep nesting of default_groups */ - ret = configfs_detach_prep(sd->s_dentry); + ret = configfs_detach_prep(sd->s_dentry, wait_mutex); if (!ret) continue; } else @@ -374,7 +464,7 @@ out: } /* - * Walk the tree, dropping i_mutex wherever CONFIGFS_USET_DROPPING is + * Walk the tree, resetting CONFIGFS_USET_DROPPING wherever it was * set. */ static void configfs_detach_rollback(struct dentry *dentry) @@ -382,16 +472,11 @@ static void configfs_detach_rollback(struct dentry *dentry) struct configfs_dirent *parent_sd = dentry->d_fsdata; struct configfs_dirent *sd; - list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { - if (sd->s_type & CONFIGFS_USET_DEFAULT) { - configfs_detach_rollback(sd->s_dentry); + parent_sd->s_type &= ~CONFIGFS_USET_DROPPING; - if (sd->s_type & CONFIGFS_USET_DROPPING) { - sd->s_type &= ~CONFIGFS_USET_DROPPING; - mutex_unlock(&sd->s_dentry->d_inode->i_mutex); - } - } - } + list_for_each_entry(sd, &parent_sd->s_children, s_sibling) + if (sd->s_type & CONFIGFS_USET_DEFAULT) + configfs_detach_rollback(sd->s_dentry); } static void detach_attrs(struct config_item * item) @@ -410,7 +495,9 @@ static void detach_attrs(struct config_item * item) list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED)) continue; + spin_lock(&configfs_dirent_lock); list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); configfs_drop_dentry(sd, dentry); configfs_put(sd); } @@ -466,16 +553,12 @@ static void detach_groups(struct config_group *group) child = sd->s_dentry; + mutex_lock(&child->d_inode->i_mutex); + configfs_detach_group(sd->s_element); child->d_inode->i_flags |= S_DEAD; - /* - * From rmdir/unregister, a configfs_detach_prep() pass - * has taken our i_mutex for us. Drop it. - * From mkdir/register cleanup, there is no sem held. - */ - if (sd->s_type & CONFIGFS_USET_DROPPING) - mutex_unlock(&child->d_inode->i_mutex); + mutex_unlock(&child->d_inode->i_mutex); d_delete(child); dput(child); @@ -532,36 +615,21 @@ static int create_default_group(struct config_group *parent_group, static int populate_groups(struct config_group *group) { struct config_group *new_group; - struct dentry *dentry = group->cg_item.ci_dentry; int ret = 0; int i; if (group->default_groups) { - /* - * FYI, we're faking mkdir here - * I'm not sure we need this semaphore, as we're called - * from our parent's mkdir. That holds our parent's - * i_mutex, so afaik lookup cannot continue through our - * parent to find us, let alone mess with our tree. - * That said, taking our i_mutex is closer to mkdir - * emulation, and shouldn't hurt. - */ - mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); - for (i = 0; group->default_groups[i]; i++) { new_group = group->default_groups[i]; ret = create_default_group(group, new_group); - if (ret) + if (ret) { + detach_groups(group); break; + } } - - mutex_unlock(&dentry->d_inode->i_mutex); } - if (ret) - detach_groups(group); - return ret; } @@ -676,7 +744,15 @@ static int configfs_attach_item(struct config_item *parent_item, if (!ret) { ret = populate_attrs(item); if (ret) { + /* + * We are going to remove an inode and its dentry but + * the VFS may already have hit and used them. Thus, + * we must lock them as rmdir() would. + */ + mutex_lock(&dentry->d_inode->i_mutex); configfs_remove_dir(item); + dentry->d_inode->i_flags |= S_DEAD; + mutex_unlock(&dentry->d_inode->i_mutex); d_delete(dentry); } } @@ -684,6 +760,7 @@ static int configfs_attach_item(struct config_item *parent_item, return ret; } +/* Caller holds the mutex of the item's inode */ static void configfs_detach_item(struct config_item *item) { detach_attrs(item); @@ -702,16 +779,30 @@ static int configfs_attach_group(struct config_item *parent_item, sd = dentry->d_fsdata; sd->s_type |= CONFIGFS_USET_DIR; + /* + * FYI, we're faking mkdir in populate_groups() + * We must lock the group's inode to avoid races with the VFS + * which can already hit the inode and try to add/remove entries + * under it. + * + * We must also lock the inode to remove it safely in case of + * error, as rmdir() would. + */ + mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); ret = populate_groups(to_config_group(item)); if (ret) { configfs_detach_item(item); - d_delete(dentry); + dentry->d_inode->i_flags |= S_DEAD; } + mutex_unlock(&dentry->d_inode->i_mutex); + if (ret) + d_delete(dentry); } return ret; } +/* Caller holds the mutex of the group's inode */ static void configfs_detach_group(struct config_item *item) { detach_groups(to_config_group(item)); @@ -1001,14 +1092,15 @@ EXPORT_SYMBOL(configfs_undepend_item); static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) { - int ret, module_got = 0; - struct config_group *group; - struct config_item *item; + int ret = 0; + int module_got = 0; + struct config_group *group = NULL; + struct config_item *item = NULL; struct config_item *parent_item; struct configfs_subsystem *subsys; struct configfs_dirent *sd; struct config_item_type *type; - struct module *owner = NULL; + struct module *subsys_owner = NULL, *new_item_owner = NULL; char *name; if (dentry->d_parent == configfs_sb->s_root) { @@ -1017,6 +1109,16 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) } sd = dentry->d_parent->d_fsdata; + + /* + * Fake invisibility if dir belongs to a group/default groups hierarchy + * being attached + */ + if (!configfs_dirent_is_ready(sd)) { + ret = -ENOENT; + goto out; + } + if (!(sd->s_type & CONFIGFS_USET_DIR)) { ret = -EPERM; goto out; @@ -1035,38 +1137,57 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) goto out_put; } + /* + * The subsystem may belong to a different module than the item + * being created. We don't want to safely pin the new item but + * fail to pin the subsystem it sits under. + */ + if (!subsys->su_group.cg_item.ci_type) { + ret = -EINVAL; + goto out_put; + } + subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner; + if (!try_module_get(subsys_owner)) { + ret = -EINVAL; + goto out_put; + } + name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL); if (!name) { ret = -ENOMEM; - goto out_put; + goto out_subsys_put; } snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name); mutex_lock(&subsys->su_mutex); - group = NULL; - item = NULL; if (type->ct_group_ops->make_group) { group = type->ct_group_ops->make_group(to_config_group(parent_item), name); - if (group) { + if (!group) + group = ERR_PTR(-ENOMEM); + if (!IS_ERR(group)) { link_group(to_config_group(parent_item), group); item = &group->cg_item; - } + } else + ret = PTR_ERR(group); } else { item = type->ct_group_ops->make_item(to_config_group(parent_item), name); - if (item) + if (!item) + item = ERR_PTR(-ENOMEM); + if (!IS_ERR(item)) link_obj(parent_item, item); + else + ret = PTR_ERR(item); } mutex_unlock(&subsys->su_mutex); kfree(name); - if (!item) { + if (ret) { /* - * If item == NULL, then link_obj() was never called. + * If ret != 0, then link_obj() was never called. * There are no extra references to clean up. */ - ret = -ENOMEM; - goto out_put; + goto out_subsys_put; } /* @@ -1080,8 +1201,8 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) goto out_unlink; } - owner = type->ct_owner; - if (!try_module_get(owner)) { + new_item_owner = type->ct_owner; + if (!try_module_get(new_item_owner)) { ret = -EINVAL; goto out_unlink; } @@ -1093,11 +1214,28 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) */ module_got = 1; + /* + * Make racing rmdir() fail if it did not tag parent with + * CONFIGFS_USET_DROPPING + * Note: if CONFIGFS_USET_DROPPING is already set, attach_group() will + * fail and let rmdir() terminate correctly + */ + spin_lock(&configfs_dirent_lock); + /* This will make configfs_detach_prep() fail */ + sd->s_type |= CONFIGFS_USET_IN_MKDIR; + spin_unlock(&configfs_dirent_lock); + if (group) ret = configfs_attach_group(parent_item, item, dentry); else ret = configfs_attach_item(parent_item, item, dentry); + spin_lock(&configfs_dirent_lock); + sd->s_type &= ~CONFIGFS_USET_IN_MKDIR; + if (!ret) + configfs_dir_set_ready(dentry->d_fsdata); + spin_unlock(&configfs_dirent_lock); + out_unlink: if (ret) { /* Tear down everything we built up */ @@ -1113,9 +1251,13 @@ out_unlink: mutex_unlock(&subsys->su_mutex); if (module_got) - module_put(owner); + module_put(new_item_owner); } +out_subsys_put: + if (ret) + module_put(subsys_owner); + out_put: /* * link_obj()/link_group() took a reference from child->parent, @@ -1134,7 +1276,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) struct config_item *item; struct configfs_subsystem *subsys; struct configfs_dirent *sd; - struct module *owner = NULL; + struct module *subsys_owner = NULL, *dead_item_owner = NULL; int ret; if (dentry->d_parent == configfs_sb->s_root) @@ -1161,12 +1303,36 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) return -EINVAL; } - ret = configfs_detach_prep(dentry); - if (ret) { - configfs_detach_rollback(dentry); - config_item_put(parent_item); - return ret; - } + /* configfs_mkdir() shouldn't have allowed this */ + BUG_ON(!subsys->su_group.cg_item.ci_type); + subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner; + + /* + * Ensure that no racing symlink() will make detach_prep() fail while + * the new link is temporarily attached + */ + do { + struct mutex *wait_mutex; + + mutex_lock(&configfs_symlink_mutex); + spin_lock(&configfs_dirent_lock); + ret = configfs_detach_prep(dentry, &wait_mutex); + if (ret) + configfs_detach_rollback(dentry); + spin_unlock(&configfs_dirent_lock); + mutex_unlock(&configfs_symlink_mutex); + + if (ret) { + if (ret != -EAGAIN) { + config_item_put(parent_item); + return ret; + } + + /* Wait until the racing operation terminates */ + mutex_lock(wait_mutex); + mutex_unlock(wait_mutex); + } + } while (ret == -EAGAIN); /* Get a working ref for the duration of this function */ item = configfs_get_config_item(dentry); @@ -1175,7 +1341,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) config_item_put(parent_item); if (item->ci_type) - owner = item->ci_type->ct_owner; + dead_item_owner = item->ci_type->ct_owner; if (sd->s_type & CONFIGFS_USET_DIR) { configfs_detach_group(item); @@ -1197,7 +1363,8 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) /* Drop our reference from above */ config_item_put(item); - module_put(owner); + module_put(dead_item_owner); + module_put(subsys_owner); return 0; } @@ -1253,13 +1420,24 @@ static int configfs_dir_open(struct inode *inode, struct file *file) { struct dentry * dentry = file->f_path.dentry; struct configfs_dirent * parent_sd = dentry->d_fsdata; + int err; mutex_lock(&dentry->d_inode->i_mutex); - file->private_data = configfs_new_dirent(parent_sd, NULL); + /* + * Fake invisibility if dir belongs to a group/default groups hierarchy + * being attached + */ + err = -ENOENT; + if (configfs_dirent_is_ready(parent_sd)) { + file->private_data = configfs_new_dirent(parent_sd, NULL); + if (IS_ERR(file->private_data)) + err = PTR_ERR(file->private_data); + else + err = 0; + } mutex_unlock(&dentry->d_inode->i_mutex); - return file->private_data ? 0 : -ENOMEM; - + return err; } static int configfs_dir_close(struct inode *inode, struct file *file) @@ -1268,7 +1446,9 @@ static int configfs_dir_close(struct inode *inode, struct file *file) struct configfs_dirent * cursor = file->private_data; mutex_lock(&dentry->d_inode->i_mutex); + spin_lock(&configfs_dirent_lock); list_del_init(&cursor->s_sibling); + spin_unlock(&configfs_dirent_lock); mutex_unlock(&dentry->d_inode->i_mutex); release_configfs_dirent(cursor); @@ -1308,7 +1488,9 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir /* fallthrough */ default: if (filp->f_pos == 2) { + spin_lock(&configfs_dirent_lock); list_move(q, &parent_sd->s_children); + spin_unlock(&configfs_dirent_lock); } for (p=q->next; p!= &parent_sd->s_children; p=p->next) { struct configfs_dirent *next; @@ -1331,7 +1513,9 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir dt_type(next)) < 0) return 0; + spin_lock(&configfs_dirent_lock); list_move(q, p); + spin_unlock(&configfs_dirent_lock); p = q; filp->f_pos++; } @@ -1362,6 +1546,7 @@ static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin) struct list_head *p; loff_t n = file->f_pos - 2; + spin_lock(&configfs_dirent_lock); list_del(&cursor->s_sibling); p = sd->s_children.next; while (n && p != &sd->s_children) { @@ -1373,6 +1558,7 @@ static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin) p = p->next; } list_add_tail(&cursor->s_sibling, p); + spin_unlock(&configfs_dirent_lock); } } mutex_unlock(&dentry->d_inode->i_mutex); @@ -1422,6 +1608,10 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) if (err) { d_delete(dentry); dput(dentry); + } else { + spin_lock(&configfs_dirent_lock); + configfs_dir_set_ready(dentry->d_fsdata); + spin_unlock(&configfs_dirent_lock); } } @@ -1448,9 +1638,13 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex, I_MUTEX_PARENT); mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); - if (configfs_detach_prep(dentry)) { + mutex_lock(&configfs_symlink_mutex); + spin_lock(&configfs_dirent_lock); + if (configfs_detach_prep(dentry, NULL)) { printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n"); } + spin_unlock(&configfs_dirent_lock); + mutex_unlock(&configfs_symlink_mutex); configfs_detach_group(&group->cg_item); dentry->d_inode->i_flags |= S_DEAD; mutex_unlock(&dentry->d_inode->i_mutex); diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index b9a1d810346d..4803ccc94480 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -247,7 +247,9 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) if (!sd->s_element) continue; if (!strcmp(configfs_get_name(sd), name)) { + spin_lock(&configfs_dirent_lock); list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); configfs_drop_dentry(sd, dir); configfs_put(sd); break; diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 2a731ef5f305..bf74973b0492 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -31,6 +31,9 @@ #include <linux/configfs.h> #include "configfs_internal.h" +/* Protects attachments of new symlinks */ +DEFINE_MUTEX(configfs_symlink_mutex); + static int item_depth(struct config_item * item) { struct config_item * p = item; @@ -73,21 +76,34 @@ static int create_link(struct config_item *parent_item, struct configfs_symlink *sl; int ret; + ret = -ENOENT; + if (!configfs_dirent_is_ready(target_sd)) + goto out; ret = -ENOMEM; sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL); if (sl) { sl->sl_target = config_item_get(item); - /* FIXME: needs a lock, I'd bet */ + spin_lock(&configfs_dirent_lock); + if (target_sd->s_type & CONFIGFS_USET_DROPPING) { + spin_unlock(&configfs_dirent_lock); + config_item_put(item); + kfree(sl); + return -ENOENT; + } list_add(&sl->sl_list, &target_sd->s_links); + spin_unlock(&configfs_dirent_lock); ret = configfs_create_link(sl, parent_item->ci_dentry, dentry); if (ret) { + spin_lock(&configfs_dirent_lock); list_del_init(&sl->sl_list); + spin_unlock(&configfs_dirent_lock); config_item_put(item); kfree(sl); } } +out: return ret; } @@ -117,6 +133,7 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna { int ret; struct nameidata nd; + struct configfs_dirent *sd; struct config_item *parent_item; struct config_item *target_item; struct config_item_type *type; @@ -125,9 +142,19 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna if (dentry->d_parent == configfs_sb->s_root) goto out; + sd = dentry->d_parent->d_fsdata; + /* + * Fake invisibility if dir belongs to a group/default groups hierarchy + * being attached + */ + ret = -ENOENT; + if (!configfs_dirent_is_ready(sd)) + goto out; + parent_item = configfs_get_config_item(dentry->d_parent); type = parent_item->ci_type; + ret = -EPERM; if (!type || !type->ct_item_ops || !type->ct_item_ops->allow_link) goto out_put; @@ -137,8 +164,14 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna goto out_put; ret = type->ct_item_ops->allow_link(parent_item, target_item); - if (!ret) + if (!ret) { + mutex_lock(&configfs_symlink_mutex); ret = create_link(parent_item, target_item, dentry); + mutex_unlock(&configfs_symlink_mutex); + if (ret && type->ct_item_ops->drop_link) + type->ct_item_ops->drop_link(parent_item, + target_item); + } config_item_put(target_item); path_put(&nd.path); @@ -169,7 +202,9 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry) parent_item = configfs_get_config_item(dentry->d_parent); type = parent_item->ci_type; + spin_lock(&configfs_dirent_lock); list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); configfs_drop_dentry(sd, dentry->d_parent); dput(dentry); configfs_put(sd); @@ -184,8 +219,9 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry) type->ct_item_ops->drop_link(parent_item, sl->sl_target); - /* FIXME: Needs lock */ + spin_lock(&configfs_dirent_lock); list_del_init(&sl->sl_list); + spin_unlock(&configfs_dirent_lock); /* Put reference from create_link() */ config_item_put(sl->sl_target); diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 0c3b618c15b3..f40423eb1a14 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -43,58 +43,13 @@ static DEFINE_MUTEX(read_mutex); static int cramfs_iget5_test(struct inode *inode, void *opaque) { struct cramfs_inode *cramfs_inode = opaque; - - if (inode->i_ino != CRAMINO(cramfs_inode)) - return 0; /* does not match */ - - if (inode->i_ino != 1) - return 1; - - /* all empty directories, char, block, pipe, and sock, share inode #1 */ - - if ((inode->i_mode != cramfs_inode->mode) || - (inode->i_gid != cramfs_inode->gid) || - (inode->i_uid != cramfs_inode->uid)) - return 0; /* does not match */ - - if ((S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) && - (inode->i_rdev != old_decode_dev(cramfs_inode->size))) - return 0; /* does not match */ - - return 1; /* matches */ + return inode->i_ino == CRAMINO(cramfs_inode) && inode->i_ino != 1; } static int cramfs_iget5_set(struct inode *inode, void *opaque) { - static struct timespec zerotime; struct cramfs_inode *cramfs_inode = opaque; - inode->i_mode = cramfs_inode->mode; - inode->i_uid = cramfs_inode->uid; - inode->i_size = cramfs_inode->size; - inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; - inode->i_gid = cramfs_inode->gid; - /* Struct copy intentional */ - inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; inode->i_ino = CRAMINO(cramfs_inode); - /* inode->i_nlink is left 1 - arguably wrong for directories, - but it's the best we can do without reading the directory - contents. 1 yields the right result in GNU find, even - without -noleaf option. */ - if (S_ISREG(inode->i_mode)) { - inode->i_fop = &generic_ro_fops; - inode->i_data.a_ops = &cramfs_aops; - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &cramfs_dir_inode_operations; - inode->i_fop = &cramfs_directory_operations; - } else if (S_ISLNK(inode->i_mode)) { - inode->i_op = &page_symlink_inode_operations; - inode->i_data.a_ops = &cramfs_aops; - } else { - inode->i_size = 0; - inode->i_blocks = 0; - init_special_inode(inode, inode->i_mode, - old_decode_dev(cramfs_inode->size)); - } return 0; } @@ -104,12 +59,48 @@ static struct inode *get_cramfs_inode(struct super_block *sb, struct inode *inode = iget5_locked(sb, CRAMINO(cramfs_inode), cramfs_iget5_test, cramfs_iget5_set, cramfs_inode); + static struct timespec zerotime; + if (inode && (inode->i_state & I_NEW)) { + inode->i_mode = cramfs_inode->mode; + inode->i_uid = cramfs_inode->uid; + inode->i_size = cramfs_inode->size; + inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; + inode->i_gid = cramfs_inode->gid; + /* Struct copy intentional */ + inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; + /* inode->i_nlink is left 1 - arguably wrong for directories, + but it's the best we can do without reading the directory + contents. 1 yields the right result in GNU find, even + without -noleaf option. */ + if (S_ISREG(inode->i_mode)) { + inode->i_fop = &generic_ro_fops; + inode->i_data.a_ops = &cramfs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &cramfs_dir_inode_operations; + inode->i_fop = &cramfs_directory_operations; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &page_symlink_inode_operations; + inode->i_data.a_ops = &cramfs_aops; + } else { + inode->i_size = 0; + inode->i_blocks = 0; + init_special_inode(inode, inode->i_mode, + old_decode_dev(cramfs_inode->size)); + } unlock_new_inode(inode); } return inode; } +static void cramfs_drop_inode(struct inode *inode) +{ + if (inode->i_ino == 1) + generic_delete_inode(inode); + else + generic_drop_inode(inode); +} + /* * We have our own block cache: don't fill up the buffer cache * with the rom-image, because the way the filesystem is set @@ -534,6 +525,7 @@ static const struct super_operations cramfs_ops = { .put_super = cramfs_put_super, .remount_fs = cramfs_remount, .statfs = cramfs_statfs, + .drop_inode = cramfs_drop_inode, }; static int cramfs_get_sb(struct file_system_type *fs_type, diff --git a/fs/dcache.c b/fs/dcache.c index 3ee588d5f585..e7a1a99b7464 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -17,6 +17,7 @@ #include <linux/syscalls.h> #include <linux/string.h> #include <linux/mm.h> +#include <linux/fdtable.h> #include <linux/fs.h> #include <linux/fsnotify.h> #include <linux/slab.h> @@ -60,7 +61,6 @@ static struct kmem_cache *dentry_cache __read_mostly; static unsigned int d_hash_mask __read_mostly; static unsigned int d_hash_shift __read_mostly; static struct hlist_head *dentry_hashtable __read_mostly; -static LIST_HEAD(dentry_unused); /* Statistics gathering. */ struct dentry_stat_t dentry_stat = { @@ -95,20 +95,13 @@ static void d_free(struct dentry *dentry) call_rcu(&dentry->d_u.d_rcu, d_callback); } -static void dentry_lru_remove(struct dentry *dentry) -{ - if (!list_empty(&dentry->d_lru)) { - list_del_init(&dentry->d_lru); - dentry_stat.nr_unused--; - } -} - /* * Release the dentry's inode, using the filesystem * d_iput() operation if defined. - * Called with dcache_lock and per dentry lock held, drops both. */ static void dentry_iput(struct dentry * dentry) + __releases(dentry->d_lock) + __releases(dcache_lock) { struct inode *inode = dentry->d_inode; if (inode) { @@ -128,16 +121,52 @@ static void dentry_iput(struct dentry * dentry) } } +/* + * dentry_lru_(add|add_tail|del|del_init) must be called with dcache_lock held. + */ +static void dentry_lru_add(struct dentry *dentry) +{ + list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); + dentry->d_sb->s_nr_dentry_unused++; + dentry_stat.nr_unused++; +} + +static void dentry_lru_add_tail(struct dentry *dentry) +{ + list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); + dentry->d_sb->s_nr_dentry_unused++; + dentry_stat.nr_unused++; +} + +static void dentry_lru_del(struct dentry *dentry) +{ + if (!list_empty(&dentry->d_lru)) { + list_del(&dentry->d_lru); + dentry->d_sb->s_nr_dentry_unused--; + dentry_stat.nr_unused--; + } +} + +static void dentry_lru_del_init(struct dentry *dentry) +{ + if (likely(!list_empty(&dentry->d_lru))) { + list_del_init(&dentry->d_lru); + dentry->d_sb->s_nr_dentry_unused--; + dentry_stat.nr_unused--; + } +} + /** * d_kill - kill dentry and return parent * @dentry: dentry to kill * - * Called with dcache_lock and d_lock, releases both. The dentry must - * already be unhashed and removed from the LRU. + * The dentry must already be unhashed and removed from the LRU. * * If this is the root of the dentry tree, return NULL. */ static struct dentry *d_kill(struct dentry *dentry) + __releases(dentry->d_lock) + __releases(dcache_lock) { struct dentry *parent; @@ -209,8 +238,7 @@ repeat: goto kill_it; if (list_empty(&dentry->d_lru)) { dentry->d_flags |= DCACHE_REFERENCED; - list_add(&dentry->d_lru, &dentry_unused); - dentry_stat.nr_unused++; + dentry_lru_add(dentry); } spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); @@ -219,7 +247,8 @@ repeat: unhash_it: __d_drop(dentry); kill_it: - dentry_lru_remove(dentry); + /* if dentry was on the d_lru list delete it from there */ + dentry_lru_del(dentry); dentry = d_kill(dentry); if (dentry) goto repeat; @@ -287,7 +316,7 @@ int d_invalidate(struct dentry * dentry) static inline struct dentry * __dget_locked(struct dentry *dentry) { atomic_inc(&dentry->d_count); - dentry_lru_remove(dentry); + dentry_lru_del_init(dentry); return dentry; } @@ -383,11 +412,11 @@ restart: * Try to prune ancestors as well. This is necessary to prevent * quadratic behavior of shrink_dcache_parent(), but is also expected * to be beneficial in reducing dentry cache fragmentation. - * - * Called with dcache_lock, drops it and then regains. - * Called with dentry->d_lock held, drops it. */ static void prune_one_dentry(struct dentry * dentry) + __releases(dentry->d_lock) + __releases(dcache_lock) + __acquires(dcache_lock) { __d_drop(dentry); dentry = d_kill(dentry); @@ -403,133 +432,168 @@ static void prune_one_dentry(struct dentry * dentry) if (dentry->d_op && dentry->d_op->d_delete) dentry->d_op->d_delete(dentry); - dentry_lru_remove(dentry); + dentry_lru_del_init(dentry); __d_drop(dentry); dentry = d_kill(dentry); spin_lock(&dcache_lock); } } -/** - * prune_dcache - shrink the dcache - * @count: number of entries to try and free - * @sb: if given, ignore dentries for other superblocks - * which are being unmounted. - * - * Shrink the dcache. This is done when we need - * more memory, or simply when we need to unmount - * something (at which point we need to unuse - * all dentries). - * - * This function may fail to free any resources if - * all the dentries are in use. +/* + * Shrink the dentry LRU on a given superblock. + * @sb : superblock to shrink dentry LRU. + * @count: If count is NULL, we prune all dentries on superblock. + * @flags: If flags is non-zero, we need to do special processing based on + * which flags are set. This means we don't need to maintain multiple + * similar copies of this loop. */ - -static void prune_dcache(int count, struct super_block *sb) +static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags) { - spin_lock(&dcache_lock); - for (; count ; count--) { - struct dentry *dentry; - struct list_head *tmp; - struct rw_semaphore *s_umount; - - cond_resched_lock(&dcache_lock); + LIST_HEAD(referenced); + LIST_HEAD(tmp); + struct dentry *dentry; + int cnt = 0; - tmp = dentry_unused.prev; - if (sb) { - /* Try to find a dentry for this sb, but don't try - * too hard, if they aren't near the tail they will - * be moved down again soon + BUG_ON(!sb); + BUG_ON((flags & DCACHE_REFERENCED) && count == NULL); + spin_lock(&dcache_lock); + if (count != NULL) + /* called from prune_dcache() and shrink_dcache_parent() */ + cnt = *count; +restart: + if (count == NULL) + list_splice_init(&sb->s_dentry_lru, &tmp); + else { + while (!list_empty(&sb->s_dentry_lru)) { + dentry = list_entry(sb->s_dentry_lru.prev, + struct dentry, d_lru); + BUG_ON(dentry->d_sb != sb); + + spin_lock(&dentry->d_lock); + /* + * If we are honouring the DCACHE_REFERENCED flag and + * the dentry has this flag set, don't free it. Clear + * the flag and put it back on the LRU. */ - int skip = count; - while (skip && tmp != &dentry_unused && - list_entry(tmp, struct dentry, d_lru)->d_sb != sb) { - skip--; - tmp = tmp->prev; + if ((flags & DCACHE_REFERENCED) + && (dentry->d_flags & DCACHE_REFERENCED)) { + dentry->d_flags &= ~DCACHE_REFERENCED; + list_move_tail(&dentry->d_lru, &referenced); + spin_unlock(&dentry->d_lock); + } else { + list_move_tail(&dentry->d_lru, &tmp); + spin_unlock(&dentry->d_lock); + cnt--; + if (!cnt) + break; } + cond_resched_lock(&dcache_lock); } - if (tmp == &dentry_unused) - break; - list_del_init(tmp); - prefetch(dentry_unused.prev); - dentry_stat.nr_unused--; - dentry = list_entry(tmp, struct dentry, d_lru); - - spin_lock(&dentry->d_lock); + } + while (!list_empty(&tmp)) { + dentry = list_entry(tmp.prev, struct dentry, d_lru); + dentry_lru_del_init(dentry); + spin_lock(&dentry->d_lock); /* * We found an inuse dentry which was not removed from - * dentry_unused because of laziness during lookup. Do not free - * it - just keep it off the dentry_unused list. + * the LRU because of laziness during lookup. Do not free + * it - just keep it off the LRU list. */ - if (atomic_read(&dentry->d_count)) { - spin_unlock(&dentry->d_lock); + if (atomic_read(&dentry->d_count)) { + spin_unlock(&dentry->d_lock); continue; } - /* If the dentry was recently referenced, don't free it. */ - if (dentry->d_flags & DCACHE_REFERENCED) { - dentry->d_flags &= ~DCACHE_REFERENCED; - list_add(&dentry->d_lru, &dentry_unused); - dentry_stat.nr_unused++; - spin_unlock(&dentry->d_lock); + prune_one_dentry(dentry); + /* dentry->d_lock was dropped in prune_one_dentry() */ + cond_resched_lock(&dcache_lock); + } + if (count == NULL && !list_empty(&sb->s_dentry_lru)) + goto restart; + if (count != NULL) + *count = cnt; + if (!list_empty(&referenced)) + list_splice(&referenced, &sb->s_dentry_lru); + spin_unlock(&dcache_lock); +} + +/** + * prune_dcache - shrink the dcache + * @count: number of entries to try to free + * + * Shrink the dcache. This is done when we need more memory, or simply when we + * need to unmount something (at which point we need to unuse all dentries). + * + * This function may fail to free any resources if all the dentries are in use. + */ +static void prune_dcache(int count) +{ + struct super_block *sb; + int w_count; + int unused = dentry_stat.nr_unused; + int prune_ratio; + int pruned; + + if (unused == 0 || count == 0) + return; + spin_lock(&dcache_lock); +restart: + if (count >= unused) + prune_ratio = 1; + else + prune_ratio = unused / count; + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (sb->s_nr_dentry_unused == 0) continue; - } - /* - * If the dentry is not DCACHED_REFERENCED, it is time - * to remove it from the dcache, provided the super block is - * NULL (which means we are trying to reclaim memory) - * or this dentry belongs to the same super block that - * we want to shrink. + sb->s_count++; + /* Now, we reclaim unused dentrins with fairness. + * We reclaim them same percentage from each superblock. + * We calculate number of dentries to scan on this sb + * as follows, but the implementation is arranged to avoid + * overflows: + * number of dentries to scan on this sb = + * count * (number of dentries on this sb / + * number of dentries in the machine) */ + spin_unlock(&sb_lock); + if (prune_ratio != 1) + w_count = (sb->s_nr_dentry_unused / prune_ratio) + 1; + else + w_count = sb->s_nr_dentry_unused; + pruned = w_count; /* - * If this dentry is for "my" filesystem, then I can prune it - * without taking the s_umount lock (I already hold it). - */ - if (sb && dentry->d_sb == sb) { - prune_one_dentry(dentry); - continue; - } - /* - * ...otherwise we need to be sure this filesystem isn't being - * unmounted, otherwise we could race with - * generic_shutdown_super(), and end up holding a reference to - * an inode while the filesystem is unmounted. - * So we try to get s_umount, and make sure s_root isn't NULL. - * (Take a local copy of s_umount to avoid a use-after-free of - * `dentry'). + * We need to be sure this filesystem isn't being unmounted, + * otherwise we could race with generic_shutdown_super(), and + * end up holding a reference to an inode while the filesystem + * is unmounted. So we try to get s_umount, and make sure + * s_root isn't NULL. */ - s_umount = &dentry->d_sb->s_umount; - if (down_read_trylock(s_umount)) { - if (dentry->d_sb->s_root != NULL) { - prune_one_dentry(dentry); - up_read(s_umount); - continue; + if (down_read_trylock(&sb->s_umount)) { + if ((sb->s_root != NULL) && + (!list_empty(&sb->s_dentry_lru))) { + spin_unlock(&dcache_lock); + __shrink_dcache_sb(sb, &w_count, + DCACHE_REFERENCED); + pruned -= w_count; + spin_lock(&dcache_lock); } - up_read(s_umount); + up_read(&sb->s_umount); } - spin_unlock(&dentry->d_lock); + spin_lock(&sb_lock); + count -= pruned; /* - * Insert dentry at the head of the list as inserting at the - * tail leads to a cycle. + * restart only when sb is no longer on the list and + * we have more work to do. */ - list_add(&dentry->d_lru, &dentry_unused); - dentry_stat.nr_unused++; + if (__put_super_and_need_restart(sb) && count > 0) { + spin_unlock(&sb_lock); + goto restart; + } } + spin_unlock(&sb_lock); spin_unlock(&dcache_lock); } -/* - * Shrink the dcache for the specified super block. - * This allows us to unmount a device without disturbing - * the dcache for the other devices. - * - * This implementation makes just two traversals of the - * unused list. On the first pass we move the selected - * dentries to the most recent end, and on the second - * pass we free them. The second pass must restart after - * each dput(), but since the target dentries are all at - * the end, it's really just a single traversal. - */ - /** * shrink_dcache_sb - shrink dcache for a superblock * @sb: superblock @@ -538,44 +602,9 @@ static void prune_dcache(int count, struct super_block *sb) * is used to free the dcache before unmounting a file * system */ - void shrink_dcache_sb(struct super_block * sb) { - struct list_head *tmp, *next; - struct dentry *dentry; - - /* - * Pass one ... move the dentries for the specified - * superblock to the most recent end of the unused list. - */ - spin_lock(&dcache_lock); - list_for_each_prev_safe(tmp, next, &dentry_unused) { - dentry = list_entry(tmp, struct dentry, d_lru); - if (dentry->d_sb != sb) - continue; - list_move_tail(tmp, &dentry_unused); - } - - /* - * Pass two ... free the dentries for this superblock. - */ -repeat: - list_for_each_prev_safe(tmp, next, &dentry_unused) { - dentry = list_entry(tmp, struct dentry, d_lru); - if (dentry->d_sb != sb) - continue; - dentry_stat.nr_unused--; - list_del_init(tmp); - spin_lock(&dentry->d_lock); - if (atomic_read(&dentry->d_count)) { - spin_unlock(&dentry->d_lock); - continue; - } - prune_one_dentry(dentry); - cond_resched_lock(&dcache_lock); - goto repeat; - } - spin_unlock(&dcache_lock); + __shrink_dcache_sb(sb, NULL, 0); } /* @@ -592,7 +621,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) /* detach this root from the system */ spin_lock(&dcache_lock); - dentry_lru_remove(dentry); + dentry_lru_del_init(dentry); __d_drop(dentry); spin_unlock(&dcache_lock); @@ -606,7 +635,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) spin_lock(&dcache_lock); list_for_each_entry(loop, &dentry->d_subdirs, d_u.d_child) { - dentry_lru_remove(loop); + dentry_lru_del_init(loop); __d_drop(loop); cond_resched_lock(&dcache_lock); } @@ -788,14 +817,13 @@ resume: struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; - dentry_lru_remove(dentry); + dentry_lru_del_init(dentry); /* * move only zero ref count dentries to the end * of the unused list for prune_dcache */ if (!atomic_read(&dentry->d_count)) { - list_add_tail(&dentry->d_lru, &dentry_unused); - dentry_stat.nr_unused++; + dentry_lru_add_tail(dentry); found++; } @@ -837,10 +865,11 @@ out: void shrink_dcache_parent(struct dentry * parent) { + struct super_block *sb = parent->d_sb; int found; while ((found = select_parent(parent)) != 0) - prune_dcache(found, parent->d_sb); + __shrink_dcache_sb(sb, &found, 0); } /* @@ -860,7 +889,7 @@ static int shrink_dcache_memory(int nr, gfp_t gfp_mask) if (nr) { if (!(gfp_mask & __GFP_FS)) return -1; - prune_dcache(nr, NULL); + prune_dcache(nr); } return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; } @@ -1191,6 +1220,107 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) return new; } +/** + * d_add_ci - lookup or allocate new dentry with case-exact name + * @inode: the inode case-insensitive lookup has found + * @dentry: the negative dentry that was passed to the parent's lookup func + * @name: the case-exact name to be associated with the returned dentry + * + * This is to avoid filling the dcache with case-insensitive names to the + * same inode, only the actual correct case is stored in the dcache for + * case-insensitive filesystems. + * + * For a case-insensitive lookup match and if the the case-exact dentry + * already exists in in the dcache, use it and return it. + * + * If no entry exists with the exact case name, allocate new dentry with + * the exact case, and return the spliced entry. + */ +struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, + struct qstr *name) +{ + int error; + struct dentry *found; + struct dentry *new; + + /* Does a dentry matching the name exist already? */ + found = d_hash_and_lookup(dentry->d_parent, name); + /* If not, create it now and return */ + if (!found) { + new = d_alloc(dentry->d_parent, name); + if (!new) { + error = -ENOMEM; + goto err_out; + } + found = d_splice_alias(inode, new); + if (found) { + dput(new); + return found; + } + return new; + } + /* Matching dentry exists, check if it is negative. */ + if (found->d_inode) { + if (unlikely(found->d_inode != inode)) { + /* This can't happen because bad inodes are unhashed. */ + BUG_ON(!is_bad_inode(inode)); + BUG_ON(!is_bad_inode(found->d_inode)); + } + /* + * Already have the inode and the dentry attached, decrement + * the reference count to balance the iget() done + * earlier on. We found the dentry using d_lookup() so it + * cannot be disconnected and thus we do not need to worry + * about any NFS/disconnectedness issues here. + */ + iput(inode); + return found; + } + /* + * Negative dentry: instantiate it unless the inode is a directory and + * has a 'disconnected' dentry (i.e. IS_ROOT and DCACHE_DISCONNECTED), + * in which case d_move() that in place of the found dentry. + */ + if (!S_ISDIR(inode->i_mode)) { + /* Not a directory; everything is easy. */ + d_instantiate(found, inode); + return found; + } + spin_lock(&dcache_lock); + if (list_empty(&inode->i_dentry)) { + /* + * Directory without a 'disconnected' dentry; we need to do + * d_instantiate() by hand because it takes dcache_lock which + * we already hold. + */ + list_add(&found->d_alias, &inode->i_dentry); + found->d_inode = inode; + spin_unlock(&dcache_lock); + security_d_instantiate(found, inode); + return found; + } + /* + * Directory with a 'disconnected' dentry; get a reference to the + * 'disconnected' dentry. + */ + new = list_entry(inode->i_dentry.next, struct dentry, d_alias); + dget_locked(new); + spin_unlock(&dcache_lock); + /* Do security vodoo. */ + security_d_instantiate(found, inode); + /* Move new in place of found. */ + d_move(new, found); + /* Balance the iget() we did above. */ + iput(inode); + /* Throw away found. */ + dput(found); + /* Use new as the actual dentry. */ + return new; + +err_out: + iput(inode); + return ERR_PTR(error); +} /** * d_lookup - search for a dentry @@ -1212,7 +1342,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) * rcu_read_lock() and rcu_read_unlock() are used to disable preemption while * lookup is going on. * - * dentry_unused list is not updated even if lookup finds the required dentry + * The dentry unused LRU is not updated even if lookup finds the required dentry * in there. It is updated in places such as prune_dcache, shrink_dcache_sb, * select_parent and __dget_locked. This laziness saves lookup from dcache_lock * acquisition. @@ -1265,6 +1395,10 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) if (dentry->d_parent != parent) goto next; + /* non-existing due to RCU? */ + if (d_unhashed(dentry)) + goto next; + /* * It is safe to compare names since d_move() cannot * change the qstr (protected by d_lock). @@ -1280,10 +1414,8 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) goto next; } - if (!d_unhashed(dentry)) { - atomic_inc(&dentry->d_count); - found = dentry; - } + atomic_inc(&dentry->d_count); + found = dentry; spin_unlock(&dentry->d_lock); break; next: @@ -1604,10 +1736,9 @@ static int d_isparent(struct dentry *p1, struct dentry *p2) * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... - * - * On return, dcache_lock will have been unlocked. */ static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias) + __releases(dcache_lock) { struct mutex *m1 = NULL, *m2 = NULL; struct dentry *ret; @@ -1743,11 +1874,9 @@ out_nolock: shouldnt_be_hashed: spin_unlock(&dcache_lock); BUG(); - goto shouldnt_be_hashed; } -static int prepend(char **buffer, int *buflen, const char *str, - int namelen) +static int prepend(char **buffer, int *buflen, const char *str, int namelen) { *buflen -= namelen; if (*buflen < 0) @@ -1757,8 +1886,13 @@ static int prepend(char **buffer, int *buflen, const char *str, return 0; } +static int prepend_name(char **buffer, int *buflen, struct qstr *name) +{ + return prepend(buffer, buflen, name->name, name->len); +} + /** - * d_path - return the path of a dentry + * __d_path - return the path of a dentry * @path: the dentry/vfsmount to report * @root: root vfsmnt/dentry (may be modified by this function) * @buffer: buffer to return value in @@ -1779,9 +1913,10 @@ char *__d_path(const struct path *path, struct path *root, { struct dentry *dentry = path->dentry; struct vfsmount *vfsmnt = path->mnt; - char * end = buffer+buflen; - char * retval; + char *end = buffer + buflen; + char *retval; + spin_lock(&vfsmount_lock); prepend(&end, &buflen, "\0", 1); if (!IS_ROOT(dentry) && d_unhashed(dentry) && (prepend(&end, &buflen, " (deleted)", 10) != 0)) @@ -1800,38 +1935,37 @@ char *__d_path(const struct path *path, struct path *root, break; if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { /* Global root? */ - spin_lock(&vfsmount_lock); if (vfsmnt->mnt_parent == vfsmnt) { - spin_unlock(&vfsmount_lock); goto global_root; } dentry = vfsmnt->mnt_mountpoint; vfsmnt = vfsmnt->mnt_parent; - spin_unlock(&vfsmount_lock); continue; } parent = dentry->d_parent; prefetch(parent); - if ((prepend(&end, &buflen, dentry->d_name.name, - dentry->d_name.len) != 0) || + if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) || (prepend(&end, &buflen, "/", 1) != 0)) goto Elong; retval = end; dentry = parent; } +out: + spin_unlock(&vfsmount_lock); return retval; global_root: retval += 1; /* hit the slash */ - if (prepend(&retval, &buflen, dentry->d_name.name, - dentry->d_name.len) != 0) + if (prepend_name(&retval, &buflen, &dentry->d_name) != 0) goto Elong; root->mnt = vfsmnt; root->dentry = dentry; - return retval; + goto out; + Elong: - return ERR_PTR(-ENAMETOOLONG); + retval = ERR_PTR(-ENAMETOOLONG); + goto out; } /** @@ -1845,9 +1979,9 @@ Elong: * * Returns the buffer or an error code if the path was too long. * - * "buflen" should be positive. Caller holds the dcache_lock. + * "buflen" should be positive. */ -char *d_path(struct path *path, char *buf, int buflen) +char *d_path(const struct path *path, char *buf, int buflen) { char *res; struct path root; @@ -1915,16 +2049,11 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen) retval = end-1; *retval = '/'; - for (;;) { - struct dentry *parent; - if (IS_ROOT(dentry)) - break; + while (!IS_ROOT(dentry)) { + struct dentry *parent = dentry->d_parent; - parent = dentry->d_parent; prefetch(parent); - - if ((prepend(&end, &buflen, dentry->d_name.name, - dentry->d_name.len) != 0) || + if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) || (prepend(&end, &buflen, "/", 1) != 0)) goto Elong; @@ -1975,7 +2104,7 @@ asmlinkage long sys_getcwd(char __user *buf, unsigned long size) error = -ENOENT; /* Has the current directory has been unlinked? */ spin_lock(&dcache_lock); - if (pwd.dentry->d_parent == pwd.dentry || !d_unhashed(pwd.dentry)) { + if (IS_ROOT(pwd.dentry) || !d_unhashed(pwd.dentry)) { unsigned long len; struct path tmp = root; char * cwd; @@ -2228,6 +2357,7 @@ EXPORT_SYMBOL(d_path); EXPORT_SYMBOL(d_prune_aliases); EXPORT_SYMBOL(d_rehash); EXPORT_SYMBOL(d_splice_alias); +EXPORT_SYMBOL(d_add_ci); EXPORT_SYMBOL(d_validate); EXPORT_SYMBOL(dget_locked); EXPORT_SYMBOL(dput); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index e9602d85c11d..3dbe2169cf36 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -26,8 +26,7 @@ #include <linux/debugfs.h> #include <linux/fsnotify.h> #include <linux/string.h> - -#define DEBUGFS_MAGIC 0x64626720 +#include <linux/magic.h> static struct vfsmount *debugfs_mount; static int debugfs_mount_count; @@ -309,6 +308,31 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, } EXPORT_SYMBOL_GPL(debugfs_create_symlink); +static void __debugfs_remove(struct dentry *dentry, struct dentry *parent) +{ + int ret = 0; + + if (debugfs_positive(dentry)) { + if (dentry->d_inode) { + dget(dentry); + switch (dentry->d_inode->i_mode & S_IFMT) { + case S_IFDIR: + ret = simple_rmdir(parent->d_inode, dentry); + break; + case S_IFLNK: + kfree(dentry->d_inode->i_private); + /* fall through */ + default: + simple_unlink(parent->d_inode, dentry); + break; + } + if (!ret) + d_delete(dentry); + dput(dentry); + } + } +} + /** * debugfs_remove - removes a file or directory from the debugfs filesystem * @dentry: a pointer to a the dentry of the file or directory to be @@ -325,7 +349,6 @@ EXPORT_SYMBOL_GPL(debugfs_create_symlink); void debugfs_remove(struct dentry *dentry) { struct dentry *parent; - int ret = 0; if (!dentry) return; @@ -335,29 +358,83 @@ void debugfs_remove(struct dentry *dentry) return; mutex_lock(&parent->d_inode->i_mutex); - if (debugfs_positive(dentry)) { - if (dentry->d_inode) { - dget(dentry); - switch (dentry->d_inode->i_mode & S_IFMT) { - case S_IFDIR: - ret = simple_rmdir(parent->d_inode, dentry); - break; - case S_IFLNK: - kfree(dentry->d_inode->i_private); - /* fall through */ - default: - simple_unlink(parent->d_inode, dentry); + __debugfs_remove(dentry, parent); + mutex_unlock(&parent->d_inode->i_mutex); + simple_release_fs(&debugfs_mount, &debugfs_mount_count); +} +EXPORT_SYMBOL_GPL(debugfs_remove); + +/** + * debugfs_remove_recursive - recursively removes a directory + * @dentry: a pointer to a the dentry of the directory to be removed. + * + * This function recursively removes a directory tree in debugfs that + * was previously created with a call to another debugfs function + * (like debugfs_create_file() or variants thereof.) + * + * This function is required to be called in order for the file to be + * removed, no automatic cleanup of files will happen when a module is + * removed, you are responsible here. + */ +void debugfs_remove_recursive(struct dentry *dentry) +{ + struct dentry *child; + struct dentry *parent; + + if (!dentry) + return; + + parent = dentry->d_parent; + if (!parent || !parent->d_inode) + return; + + parent = dentry; + mutex_lock(&parent->d_inode->i_mutex); + + while (1) { + /* + * When all dentries under "parent" has been removed, + * walk up the tree until we reach our starting point. + */ + if (list_empty(&parent->d_subdirs)) { + mutex_unlock(&parent->d_inode->i_mutex); + if (parent == dentry) break; - } - if (!ret) - d_delete(dentry); - dput(dentry); + parent = parent->d_parent; + mutex_lock(&parent->d_inode->i_mutex); + } + child = list_entry(parent->d_subdirs.next, struct dentry, + d_u.d_child); + + /* + * If "child" isn't empty, walk down the tree and + * remove all its descendants first. + */ + if (!list_empty(&child->d_subdirs)) { + mutex_unlock(&parent->d_inode->i_mutex); + parent = child; + mutex_lock(&parent->d_inode->i_mutex); + continue; } + __debugfs_remove(child, parent); + if (parent->d_subdirs.next == &child->d_u.d_child) { + /* + * Avoid infinite loop if we fail to remove + * one dentry. + */ + mutex_unlock(&parent->d_inode->i_mutex); + break; + } + simple_release_fs(&debugfs_mount, &debugfs_mount_count); } + + parent = dentry->d_parent; + mutex_lock(&parent->d_inode->i_mutex); + __debugfs_remove(dentry, parent); mutex_unlock(&parent->d_inode->i_mutex); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } -EXPORT_SYMBOL_GPL(debugfs_remove); +EXPORT_SYMBOL_GPL(debugfs_remove_recursive); /** * debugfs_rename - rename a file/directory in the debugfs filesystem diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 285b64a8b06e..4a714f6c1bed 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -27,9 +27,10 @@ #define DEVPTS_SUPER_MAGIC 0x1cd1 #define DEVPTS_DEFAULT_MODE 0600 +#define PTMX_MINOR 2 extern int pty_limit; /* Config limit on Unix98 ptys */ -static DEFINE_IDR(allocated_ptys); +static DEFINE_IDA(allocated_ptys); static DEFINE_MUTEX(allocated_ptys_lock); static struct vfsmount *devpts_mnt; @@ -48,7 +49,7 @@ enum { Opt_err }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_mode, "mode=%o"}, @@ -169,35 +170,27 @@ static struct file_system_type devpts_fs_type = { * to the System V naming convention */ -static struct dentry *get_node(int num) -{ - char s[12]; - struct dentry *root = devpts_root; - mutex_lock(&root->d_inode->i_mutex); - return lookup_one_len(s, root, sprintf(s, "%d", num)); -} - -int devpts_new_index(void) +int devpts_new_index(struct inode *ptmx_inode) { int index; - int idr_ret; + int ida_ret; retry: - if (!idr_pre_get(&allocated_ptys, GFP_KERNEL)) { + if (!ida_pre_get(&allocated_ptys, GFP_KERNEL)) { return -ENOMEM; } mutex_lock(&allocated_ptys_lock); - idr_ret = idr_get_new(&allocated_ptys, NULL, &index); - if (idr_ret < 0) { + ida_ret = ida_get_new(&allocated_ptys, &index); + if (ida_ret < 0) { mutex_unlock(&allocated_ptys_lock); - if (idr_ret == -EAGAIN) + if (ida_ret == -EAGAIN) goto retry; return -EIO; } if (index >= pty_limit) { - idr_remove(&allocated_ptys, index); + ida_remove(&allocated_ptys, index); mutex_unlock(&allocated_ptys_lock); return -EIO; } @@ -205,20 +198,21 @@ retry: return index; } -void devpts_kill_index(int idx) +void devpts_kill_index(struct inode *ptmx_inode, int idx) { mutex_lock(&allocated_ptys_lock); - idr_remove(&allocated_ptys, idx); + ida_remove(&allocated_ptys, idx); mutex_unlock(&allocated_ptys_lock); } -int devpts_pty_new(struct tty_struct *tty) +int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty) { int number = tty->index; /* tty layer puts index from devpts_new_index() in here */ struct tty_driver *driver = tty->driver; dev_t device = MKDEV(driver->major, driver->minor_start+number); struct dentry *dentry; struct inode *inode = new_inode(devpts_mnt->mnt_sb); + char s[12]; /* We're supposed to be given the slave end of a pty */ BUG_ON(driver->type != TTY_DRIVER_TYPE_PTY); @@ -233,10 +227,15 @@ int devpts_pty_new(struct tty_struct *tty) inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; init_special_inode(inode, S_IFCHR|config.mode, device); inode->i_private = tty; + tty->driver_data = inode; - dentry = get_node(number); - if (!IS_ERR(dentry) && !dentry->d_inode) { - d_instantiate(dentry, inode); + sprintf(s, "%d", number); + + mutex_lock(&devpts_root->d_inode->i_mutex); + + dentry = d_alloc_name(devpts_root, s); + if (!IS_ERR(dentry)) { + d_add(dentry, inode); fsnotify_create(devpts_root->d_inode, dentry); } @@ -245,36 +244,31 @@ int devpts_pty_new(struct tty_struct *tty) return 0; } -struct tty_struct *devpts_get_tty(int number) +struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number) { - struct dentry *dentry = get_node(number); - struct tty_struct *tty; - - tty = NULL; - if (!IS_ERR(dentry)) { - if (dentry->d_inode) - tty = dentry->d_inode->i_private; - dput(dentry); - } + BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); - mutex_unlock(&devpts_root->d_inode->i_mutex); - - return tty; + if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) + return (struct tty_struct *)pts_inode->i_private; + return NULL; } -void devpts_pty_kill(int number) +void devpts_pty_kill(struct tty_struct *tty) { - struct dentry *dentry = get_node(number); + struct inode *inode = tty->driver_data; + struct dentry *dentry; - if (!IS_ERR(dentry)) { - struct inode *inode = dentry->d_inode; - if (inode) { - inode->i_nlink--; - d_delete(dentry); - dput(dentry); - } + BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); + + mutex_lock(&devpts_root->d_inode->i_mutex); + + dentry = d_find_alias(inode); + if (dentry && !IS_ERR(dentry)) { + inode->i_nlink--; + d_delete(dentry); dput(dentry); } + mutex_unlock(&devpts_root->d_inode->i_mutex); } diff --git a/fs/direct-io.c b/fs/direct-io.c index 9e81addbd6ea..9606ee848fd8 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -150,17 +150,11 @@ static int dio_refill_pages(struct dio *dio) int nr_pages; nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES); - down_read(¤t->mm->mmap_sem); - ret = get_user_pages( - current, /* Task for fault acounting */ - current->mm, /* whose pages? */ + ret = get_user_pages_fast( dio->curr_user_address, /* Where from? */ nr_pages, /* How many pages? */ dio->rw == READ, /* Write to memory? */ - 0, /* force (?) */ - &dio->pages[0], - NULL); /* vmas */ - up_read(¤t->mm->mmap_sem); + &dio->pages[0]); /* Put results here */ if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) { struct page *page = ZERO_PAGE(0); diff --git a/fs/dlm/config.c b/fs/dlm/config.c index eac23bd288b2..fd9859f92fad 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -14,6 +14,9 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/configfs.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <net/ipv6.h> #include <net/sock.h> #include "config.h" @@ -30,16 +33,16 @@ static struct config_group *space_list; static struct config_group *comm_list; -static struct comm *local_comm; +static struct dlm_comm *local_comm; -struct clusters; -struct cluster; -struct spaces; -struct space; -struct comms; -struct comm; -struct nodes; -struct node; +struct dlm_clusters; +struct dlm_cluster; +struct dlm_spaces; +struct dlm_space; +struct dlm_comms; +struct dlm_comm; +struct dlm_nodes; +struct dlm_node; static struct config_group *make_cluster(struct config_group *, const char *); static void drop_cluster(struct config_group *, struct config_item *); @@ -68,17 +71,22 @@ static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, static ssize_t store_node(struct config_item *i, struct configfs_attribute *a, const char *buf, size_t len); -static ssize_t comm_nodeid_read(struct comm *cm, char *buf); -static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len); -static ssize_t comm_local_read(struct comm *cm, char *buf); -static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len); -static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len); -static ssize_t node_nodeid_read(struct node *nd, char *buf); -static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len); -static ssize_t node_weight_read(struct node *nd, char *buf); -static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len); - -struct cluster { +static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf); +static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf, + size_t len); +static ssize_t comm_local_read(struct dlm_comm *cm, char *buf); +static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf, + size_t len); +static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, + size_t len); +static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf); +static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf, + size_t len); +static ssize_t node_weight_read(struct dlm_node *nd, char *buf); +static ssize_t node_weight_write(struct dlm_node *nd, const char *buf, + size_t len); + +struct dlm_cluster { struct config_group group; unsigned int cl_tcp_port; unsigned int cl_buffer_size; @@ -109,11 +117,11 @@ enum { struct cluster_attribute { struct configfs_attribute attr; - ssize_t (*show)(struct cluster *, char *); - ssize_t (*store)(struct cluster *, const char *, size_t); + ssize_t (*show)(struct dlm_cluster *, char *); + ssize_t (*store)(struct dlm_cluster *, const char *, size_t); }; -static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field, +static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, int *info_field, int check_zero, const char *buf, size_t len) { @@ -134,12 +142,12 @@ static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field, } #define CLUSTER_ATTR(name, check_zero) \ -static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len) \ +static ssize_t name##_write(struct dlm_cluster *cl, const char *buf, size_t len) \ { \ return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \ check_zero, buf, len); \ } \ -static ssize_t name##_read(struct cluster *cl, char *buf) \ +static ssize_t name##_read(struct dlm_cluster *cl, char *buf) \ { \ return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name); \ } \ @@ -181,8 +189,8 @@ enum { struct comm_attribute { struct configfs_attribute attr; - ssize_t (*show)(struct comm *, char *); - ssize_t (*store)(struct comm *, const char *, size_t); + ssize_t (*show)(struct dlm_comm *, char *); + ssize_t (*store)(struct dlm_comm *, const char *, size_t); }; static struct comm_attribute comm_attr_nodeid = { @@ -222,8 +230,8 @@ enum { struct node_attribute { struct configfs_attribute attr; - ssize_t (*show)(struct node *, char *); - ssize_t (*store)(struct node *, const char *, size_t); + ssize_t (*show)(struct dlm_node *, char *); + ssize_t (*store)(struct dlm_node *, const char *, size_t); }; static struct node_attribute node_attr_nodeid = { @@ -248,26 +256,26 @@ static struct configfs_attribute *node_attrs[] = { NULL, }; -struct clusters { +struct dlm_clusters { struct configfs_subsystem subsys; }; -struct spaces { +struct dlm_spaces { struct config_group ss_group; }; -struct space { +struct dlm_space { struct config_group group; struct list_head members; struct mutex members_lock; int members_count; }; -struct comms { +struct dlm_comms { struct config_group cs_group; }; -struct comm { +struct dlm_comm { struct config_item item; int nodeid; int local; @@ -275,11 +283,11 @@ struct comm { struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; }; -struct nodes { +struct dlm_nodes { struct config_group ns_group; }; -struct node { +struct dlm_node { struct config_item item; struct list_head list; /* space->members */ int nodeid; @@ -372,38 +380,40 @@ static struct config_item_type node_type = { .ct_owner = THIS_MODULE, }; -static struct cluster *to_cluster(struct config_item *i) +static struct dlm_cluster *config_item_to_cluster(struct config_item *i) { - return i ? container_of(to_config_group(i), struct cluster, group):NULL; + return i ? container_of(to_config_group(i), struct dlm_cluster, group) : + NULL; } -static struct space *to_space(struct config_item *i) +static struct dlm_space *config_item_to_space(struct config_item *i) { - return i ? container_of(to_config_group(i), struct space, group) : NULL; + return i ? container_of(to_config_group(i), struct dlm_space, group) : + NULL; } -static struct comm *to_comm(struct config_item *i) +static struct dlm_comm *config_item_to_comm(struct config_item *i) { - return i ? container_of(i, struct comm, item) : NULL; + return i ? container_of(i, struct dlm_comm, item) : NULL; } -static struct node *to_node(struct config_item *i) +static struct dlm_node *config_item_to_node(struct config_item *i) { - return i ? container_of(i, struct node, item) : NULL; + return i ? container_of(i, struct dlm_node, item) : NULL; } static struct config_group *make_cluster(struct config_group *g, const char *name) { - struct cluster *cl = NULL; - struct spaces *sps = NULL; - struct comms *cms = NULL; + struct dlm_cluster *cl = NULL; + struct dlm_spaces *sps = NULL; + struct dlm_comms *cms = NULL; void *gps = NULL; - cl = kzalloc(sizeof(struct cluster), GFP_KERNEL); + cl = kzalloc(sizeof(struct dlm_cluster), GFP_KERNEL); gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); - sps = kzalloc(sizeof(struct spaces), GFP_KERNEL); - cms = kzalloc(sizeof(struct comms), GFP_KERNEL); + sps = kzalloc(sizeof(struct dlm_spaces), GFP_KERNEL); + cms = kzalloc(sizeof(struct dlm_comms), GFP_KERNEL); if (!cl || !gps || !sps || !cms) goto fail; @@ -438,12 +448,12 @@ static struct config_group *make_cluster(struct config_group *g, kfree(gps); kfree(sps); kfree(cms); - return NULL; + return ERR_PTR(-ENOMEM); } static void drop_cluster(struct config_group *g, struct config_item *i) { - struct cluster *cl = to_cluster(i); + struct dlm_cluster *cl = config_item_to_cluster(i); struct config_item *tmp; int j; @@ -461,20 +471,20 @@ static void drop_cluster(struct config_group *g, struct config_item *i) static void release_cluster(struct config_item *i) { - struct cluster *cl = to_cluster(i); + struct dlm_cluster *cl = config_item_to_cluster(i); kfree(cl->group.default_groups); kfree(cl); } static struct config_group *make_space(struct config_group *g, const char *name) { - struct space *sp = NULL; - struct nodes *nds = NULL; + struct dlm_space *sp = NULL; + struct dlm_nodes *nds = NULL; void *gps = NULL; - sp = kzalloc(sizeof(struct space), GFP_KERNEL); + sp = kzalloc(sizeof(struct dlm_space), GFP_KERNEL); gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL); - nds = kzalloc(sizeof(struct nodes), GFP_KERNEL); + nds = kzalloc(sizeof(struct dlm_nodes), GFP_KERNEL); if (!sp || !gps || !nds) goto fail; @@ -495,12 +505,12 @@ static struct config_group *make_space(struct config_group *g, const char *name) kfree(sp); kfree(gps); kfree(nds); - return NULL; + return ERR_PTR(-ENOMEM); } static void drop_space(struct config_group *g, struct config_item *i) { - struct space *sp = to_space(i); + struct dlm_space *sp = config_item_to_space(i); struct config_item *tmp; int j; @@ -517,18 +527,18 @@ static void drop_space(struct config_group *g, struct config_item *i) static void release_space(struct config_item *i) { - struct space *sp = to_space(i); + struct dlm_space *sp = config_item_to_space(i); kfree(sp->group.default_groups); kfree(sp); } static struct config_item *make_comm(struct config_group *g, const char *name) { - struct comm *cm; + struct dlm_comm *cm; - cm = kzalloc(sizeof(struct comm), GFP_KERNEL); + cm = kzalloc(sizeof(struct dlm_comm), GFP_KERNEL); if (!cm) - return NULL; + return ERR_PTR(-ENOMEM); config_item_init_type_name(&cm->item, name, &comm_type); cm->nodeid = -1; @@ -539,7 +549,7 @@ static struct config_item *make_comm(struct config_group *g, const char *name) static void drop_comm(struct config_group *g, struct config_item *i) { - struct comm *cm = to_comm(i); + struct dlm_comm *cm = config_item_to_comm(i); if (local_comm == cm) local_comm = NULL; dlm_lowcomms_close(cm->nodeid); @@ -550,18 +560,18 @@ static void drop_comm(struct config_group *g, struct config_item *i) static void release_comm(struct config_item *i) { - struct comm *cm = to_comm(i); + struct dlm_comm *cm = config_item_to_comm(i); kfree(cm); } static struct config_item *make_node(struct config_group *g, const char *name) { - struct space *sp = to_space(g->cg_item.ci_parent); - struct node *nd; + struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); + struct dlm_node *nd; - nd = kzalloc(sizeof(struct node), GFP_KERNEL); + nd = kzalloc(sizeof(struct dlm_node), GFP_KERNEL); if (!nd) - return NULL; + return ERR_PTR(-ENOMEM); config_item_init_type_name(&nd->item, name, &node_type); nd->nodeid = -1; @@ -578,8 +588,8 @@ static struct config_item *make_node(struct config_group *g, const char *name) static void drop_node(struct config_group *g, struct config_item *i) { - struct space *sp = to_space(g->cg_item.ci_parent); - struct node *nd = to_node(i); + struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); + struct dlm_node *nd = config_item_to_node(i); mutex_lock(&sp->members_lock); list_del(&nd->list); @@ -591,11 +601,11 @@ static void drop_node(struct config_group *g, struct config_item *i) static void release_node(struct config_item *i) { - struct node *nd = to_node(i); + struct dlm_node *nd = config_item_to_node(i); kfree(nd); } -static struct clusters clusters_root = { +static struct dlm_clusters clusters_root = { .subsys = { .su_group = { .cg_item = { @@ -625,7 +635,7 @@ void dlm_config_exit(void) static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a, char *buf) { - struct cluster *cl = to_cluster(i); + struct dlm_cluster *cl = config_item_to_cluster(i); struct cluster_attribute *cla = container_of(a, struct cluster_attribute, attr); return cla->show ? cla->show(cl, buf) : 0; @@ -635,7 +645,7 @@ static ssize_t store_cluster(struct config_item *i, struct configfs_attribute *a, const char *buf, size_t len) { - struct cluster *cl = to_cluster(i); + struct dlm_cluster *cl = config_item_to_cluster(i); struct cluster_attribute *cla = container_of(a, struct cluster_attribute, attr); return cla->store ? cla->store(cl, buf, len) : -EINVAL; @@ -644,7 +654,7 @@ static ssize_t store_cluster(struct config_item *i, static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, char *buf) { - struct comm *cm = to_comm(i); + struct dlm_comm *cm = config_item_to_comm(i); struct comm_attribute *cma = container_of(a, struct comm_attribute, attr); return cma->show ? cma->show(cm, buf) : 0; @@ -653,29 +663,31 @@ static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a, const char *buf, size_t len) { - struct comm *cm = to_comm(i); + struct dlm_comm *cm = config_item_to_comm(i); struct comm_attribute *cma = container_of(a, struct comm_attribute, attr); return cma->store ? cma->store(cm, buf, len) : -EINVAL; } -static ssize_t comm_nodeid_read(struct comm *cm, char *buf) +static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf) { return sprintf(buf, "%d\n", cm->nodeid); } -static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len) +static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf, + size_t len) { cm->nodeid = simple_strtol(buf, NULL, 0); return len; } -static ssize_t comm_local_read(struct comm *cm, char *buf) +static ssize_t comm_local_read(struct dlm_comm *cm, char *buf) { return sprintf(buf, "%d\n", cm->local); } -static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len) +static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf, + size_t len) { cm->local= simple_strtol(buf, NULL, 0); if (cm->local && !local_comm) @@ -683,7 +695,7 @@ static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len) return len; } -static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len) +static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len) { struct sockaddr_storage *addr; @@ -705,7 +717,7 @@ static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len) static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, char *buf) { - struct node *nd = to_node(i); + struct dlm_node *nd = config_item_to_node(i); struct node_attribute *nda = container_of(a, struct node_attribute, attr); return nda->show ? nda->show(nd, buf) : 0; @@ -714,29 +726,31 @@ static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, static ssize_t store_node(struct config_item *i, struct configfs_attribute *a, const char *buf, size_t len) { - struct node *nd = to_node(i); + struct dlm_node *nd = config_item_to_node(i); struct node_attribute *nda = container_of(a, struct node_attribute, attr); return nda->store ? nda->store(nd, buf, len) : -EINVAL; } -static ssize_t node_nodeid_read(struct node *nd, char *buf) +static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf) { return sprintf(buf, "%d\n", nd->nodeid); } -static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len) +static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf, + size_t len) { nd->nodeid = simple_strtol(buf, NULL, 0); return len; } -static ssize_t node_weight_read(struct node *nd, char *buf) +static ssize_t node_weight_read(struct dlm_node *nd, char *buf) { return sprintf(buf, "%d\n", nd->weight); } -static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len) +static ssize_t node_weight_write(struct dlm_node *nd, const char *buf, + size_t len) { nd->weight = simple_strtol(buf, NULL, 0); return len; @@ -746,7 +760,7 @@ static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len) * Functions for the dlm to get the info that's been configured */ -static struct space *get_space(char *name) +static struct dlm_space *get_space(char *name) { struct config_item *i; @@ -757,18 +771,45 @@ static struct space *get_space(char *name) i = config_group_find_item(space_list, name); mutex_unlock(&space_list->cg_subsys->su_mutex); - return to_space(i); + return config_item_to_space(i); } -static void put_space(struct space *sp) +static void put_space(struct dlm_space *sp) { config_item_put(&sp->group.cg_item); } -static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr) +static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y) +{ + switch (x->ss_family) { + case AF_INET: { + struct sockaddr_in *sinx = (struct sockaddr_in *)x; + struct sockaddr_in *siny = (struct sockaddr_in *)y; + if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr) + return 0; + if (sinx->sin_port != siny->sin_port) + return 0; + break; + } + case AF_INET6: { + struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x; + struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y; + if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr)) + return 0; + if (sinx->sin6_port != siny->sin6_port) + return 0; + break; + } + default: + return 0; + } + return 1; +} + +static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr) { struct config_item *i; - struct comm *cm = NULL; + struct dlm_comm *cm = NULL; int found = 0; if (!comm_list) @@ -777,7 +818,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr) mutex_lock(&clusters_root.subsys.su_mutex); list_for_each_entry(i, &comm_list->cg_children, ci_entry) { - cm = to_comm(i); + cm = config_item_to_comm(i); if (nodeid) { if (cm->nodeid != nodeid) @@ -786,8 +827,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr) config_item_get(i); break; } else { - if (!cm->addr_count || - memcmp(cm->addr[0], addr, sizeof(*addr))) + if (!cm->addr_count || !addr_compare(cm->addr[0], addr)) continue; found = 1; config_item_get(i); @@ -801,7 +841,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr) return cm; } -static void put_comm(struct comm *cm) +static void put_comm(struct dlm_comm *cm) { config_item_put(&cm->item); } @@ -810,8 +850,8 @@ static void put_comm(struct comm *cm) int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, int **new_out, int *new_count_out) { - struct space *sp; - struct node *nd; + struct dlm_space *sp; + struct dlm_node *nd; int i = 0, rv = 0, ids_count = 0, new_count = 0; int *ids, *new; @@ -874,8 +914,8 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, int dlm_node_weight(char *lsname, int nodeid) { - struct space *sp; - struct node *nd; + struct dlm_space *sp; + struct dlm_node *nd; int w = -EEXIST; sp = get_space(lsname); @@ -897,7 +937,7 @@ int dlm_node_weight(char *lsname, int nodeid) int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr) { - struct comm *cm = get_comm(nodeid, NULL); + struct dlm_comm *cm = get_comm(nodeid, NULL); if (!cm) return -EEXIST; if (!cm->addr_count) @@ -909,7 +949,7 @@ int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr) int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid) { - struct comm *cm = get_comm(0, addr); + struct dlm_comm *cm = get_comm(0, addr); if (!cm) return -EEXIST; *nodeid = cm->nodeid; diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 5a7ac33b629c..868e4c9ef127 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -441,8 +441,11 @@ struct dlm_ls { uint32_t ls_global_id; /* global unique lockspace ID */ uint32_t ls_exflags; int ls_lvblen; - int ls_count; /* reference count */ + int ls_count; /* refcount of processes in + the dlm using this ls */ + int ls_create_count; /* create/release refcount */ unsigned long ls_flags; /* LSFL_ */ + unsigned long ls_scan_time; struct kobject ls_kobj; struct dlm_rsbtable *ls_rsbtbl; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 2d3d1027ce2b..724ddac91538 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -363,6 +363,7 @@ static int search_rsb_list(struct list_head *head, char *name, int len, if (len == r->res_length && !memcmp(name, r->res_name, len)) goto found; } + *r_ret = NULL; return -EBADR; found: @@ -1782,7 +1783,8 @@ static void grant_pending_locks(struct dlm_rsb *r) list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) { if (lkb->lkb_bastfn && lock_requires_bast(lkb, high, cw)) { - if (cw && high == DLM_LOCK_PR) + if (cw && high == DLM_LOCK_PR && + lkb->lkb_grmode == DLM_LOCK_PR) queue_bast(r, lkb, DLM_LOCK_CW); else queue_bast(r, lkb, high); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 499e16759e96..d910501de6d2 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -23,6 +23,7 @@ #include "lock.h" #include "recover.h" #include "requestqueue.h" +#include "user.h" static int ls_count; static struct mutex ls_lock; @@ -211,19 +212,41 @@ void dlm_lockspace_exit(void) kset_unregister(dlm_kset); } +static struct dlm_ls *find_ls_to_scan(void) +{ + struct dlm_ls *ls; + + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + if (time_after_eq(jiffies, ls->ls_scan_time + + dlm_config.ci_scan_secs * HZ)) { + spin_unlock(&lslist_lock); + return ls; + } + } + spin_unlock(&lslist_lock); + return NULL; +} + static int dlm_scand(void *data) { struct dlm_ls *ls; + int timeout_jiffies = dlm_config.ci_scan_secs * HZ; while (!kthread_should_stop()) { - list_for_each_entry(ls, &lslist, ls_list) { + ls = find_ls_to_scan(); + if (ls) { if (dlm_lock_recovery_try(ls)) { + ls->ls_scan_time = jiffies; dlm_scan_rsbs(ls); dlm_scan_timeout(ls); dlm_unlock_recovery(ls); + } else { + ls->ls_scan_time += HZ; } + } else { + schedule_timeout_interruptible(timeout_jiffies); } - schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ); } return 0; } @@ -246,23 +269,6 @@ static void dlm_scand_stop(void) kthread_stop(scand_task); } -static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen) -{ - struct dlm_ls *ls; - - spin_lock(&lslist_lock); - - list_for_each_entry(ls, &lslist, ls_list) { - if (ls->ls_namelen == namelen && - memcmp(ls->ls_name, name, namelen) == 0) - goto out; - } - ls = NULL; - out: - spin_unlock(&lslist_lock); - return ls; -} - struct dlm_ls *dlm_find_lockspace_global(uint32_t id) { struct dlm_ls *ls; @@ -327,6 +333,7 @@ static void remove_lockspace(struct dlm_ls *ls) for (;;) { spin_lock(&lslist_lock); if (ls->ls_count == 0) { + WARN_ON(ls->ls_create_count != 0); list_del(&ls->ls_list); spin_unlock(&lslist_lock); return; @@ -381,7 +388,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace, uint32_t flags, int lvblen) { struct dlm_ls *ls; - int i, size, error = -ENOMEM; + int i, size, error; int do_unreg = 0; if (namelen > DLM_LOCKSPACE_LEN) @@ -393,12 +400,37 @@ static int new_lockspace(char *name, int namelen, void **lockspace, if (!try_module_get(THIS_MODULE)) return -EINVAL; - ls = dlm_find_lockspace_name(name, namelen); - if (ls) { - *lockspace = ls; + if (!dlm_user_daemon_available()) { + module_put(THIS_MODULE); + return -EUNATCH; + } + + error = 0; + + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + WARN_ON(ls->ls_create_count <= 0); + if (ls->ls_namelen != namelen) + continue; + if (memcmp(ls->ls_name, name, namelen)) + continue; + if (flags & DLM_LSFL_NEWEXCL) { + error = -EEXIST; + break; + } + ls->ls_create_count++; module_put(THIS_MODULE); - return -EEXIST; + error = 1; /* not an error, return 0 */ + break; } + spin_unlock(&lslist_lock); + + if (error < 0) + goto out; + if (error) + goto ret_zero; + + error = -ENOMEM; ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL); if (!ls) @@ -408,6 +440,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace, ls->ls_lvblen = lvblen; ls->ls_count = 0; ls->ls_flags = 0; + ls->ls_scan_time = jiffies; if (flags & DLM_LSFL_TIMEWARN) set_bit(LSFL_TIMEWARN, &ls->ls_flags); @@ -418,8 +451,9 @@ static int new_lockspace(char *name, int namelen, void **lockspace, ls->ls_allocation = GFP_KERNEL; /* ls_exflags are forced to match among nodes, and we don't - need to require all nodes to have TIMEWARN or FS set */ - ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS)); + need to require all nodes to have some flags set */ + ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS | + DLM_LSFL_NEWEXCL)); size = dlm_config.ci_rsbtbl_size; ls->ls_rsbtbl_size = size; @@ -510,6 +544,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace, down_write(&ls->ls_in_recovery); spin_lock(&lslist_lock); + ls->ls_create_count = 1; list_add(&ls->ls_list, &lslist); spin_unlock(&lslist_lock); @@ -548,7 +583,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace, dlm_create_debug_file(ls); log_debug(ls, "join complete"); - + ret_zero: *lockspace = ls; return 0; @@ -635,13 +670,34 @@ static int release_lockspace(struct dlm_ls *ls, int force) struct dlm_lkb *lkb; struct dlm_rsb *rsb; struct list_head *head; - int i; - int busy = lockspace_busy(ls); + int i, busy, rv; + + busy = lockspace_busy(ls); + + spin_lock(&lslist_lock); + if (ls->ls_create_count == 1) { + if (busy > force) + rv = -EBUSY; + else { + /* remove_lockspace takes ls off lslist */ + ls->ls_create_count = 0; + rv = 0; + } + } else if (ls->ls_create_count > 1) { + rv = --ls->ls_create_count; + } else { + rv = -EINVAL; + } + spin_unlock(&lslist_lock); - if (busy > force) - return -EBUSY; + if (rv) { + log_debug(ls, "release_lockspace no remove %d", rv); + return rv; + } + + dlm_device_deregister(ls); - if (force < 3) + if (force < 3 && dlm_user_daemon_available()) do_uevent(ls, 0); dlm_recoverd_stop(ls); @@ -720,15 +776,10 @@ static int release_lockspace(struct dlm_ls *ls, int force) dlm_clear_members(ls); dlm_clear_members_gone(ls); kfree(ls->ls_node_array); + log_debug(ls, "release_lockspace final free"); kobject_put(&ls->ls_kobj); /* The ls structure will be freed when the kobject is done with */ - mutex_lock(&ls_lock); - ls_count--; - if (!ls_count) - threads_stop(); - mutex_unlock(&ls_lock); - module_put(THIS_MODULE); return 0; } @@ -750,11 +801,38 @@ static int release_lockspace(struct dlm_ls *ls, int force) int dlm_release_lockspace(void *lockspace, int force) { struct dlm_ls *ls; + int error; ls = dlm_find_lockspace_local(lockspace); if (!ls) return -EINVAL; dlm_put_lockspace(ls); - return release_lockspace(ls, force); + + mutex_lock(&ls_lock); + error = release_lockspace(ls, force); + if (!error) + ls_count--; + else if (!ls_count) + threads_stop(); + mutex_unlock(&ls_lock); + + return error; +} + +void dlm_stop_lockspaces(void) +{ + struct dlm_ls *ls; + + restart: + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) + continue; + spin_unlock(&lslist_lock); + log_error(ls, "no userland control daemon, stopping lockspace"); + dlm_ls_stop(ls); + goto restart; + } + spin_unlock(&lslist_lock); } diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h index 891eabbdd021..f879f87901f8 100644 --- a/fs/dlm/lockspace.h +++ b/fs/dlm/lockspace.h @@ -20,6 +20,7 @@ struct dlm_ls *dlm_find_lockspace_global(uint32_t id); struct dlm_ls *dlm_find_lockspace_local(void *id); struct dlm_ls *dlm_find_lockspace_device(int minor); void dlm_put_lockspace(struct dlm_ls *ls); +void dlm_stop_lockspaces(void); #endif /* __LOCKSPACE_DOT_H__ */ diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 637018c891ef..3962262f991a 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -891,8 +891,10 @@ static void tcp_connect_to_sock(struct connection *con) goto out_err; memset(&saddr, 0, sizeof(saddr)); - if (dlm_nodeid_to_addr(con->nodeid, &saddr)) + if (dlm_nodeid_to_addr(con->nodeid, &saddr)) { + sock_release(sock); goto out_err; + } sock->sk->sk_user_data = con; con->rx_action = receive_from_sock; diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index 78878c5781ca..eba87ff3177b 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -116,7 +116,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, if (xop->callback == NULL) wait_event(recv_wq, (op->done != 0)); else { - rv = -EINPROGRESS; + rv = FILE_LOCK_DEFERRED; goto out; } diff --git a/fs/dlm/user.c b/fs/dlm/user.c index ebbcf38fd33b..b3832c67194a 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2006-2007 Red Hat, Inc. All rights reserved. + * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -26,6 +26,8 @@ static const char name_prefix[] = "dlm"; static const struct file_operations device_fops; +static atomic_t dlm_monitor_opened; +static int dlm_monitor_unused = 1; #ifdef CONFIG_COMPAT @@ -339,10 +341,15 @@ static int device_user_deadlock(struct dlm_user_proc *proc, return error; } -static int create_misc_device(struct dlm_ls *ls, char *name) +static int dlm_device_register(struct dlm_ls *ls, char *name) { int error, len; + /* The device is already registered. This happens when the + lockspace is created multiple times from userspace. */ + if (ls->ls_device.name) + return 0; + error = -ENOMEM; len = strlen(name) + strlen(name_prefix) + 2; ls->ls_device.name = kzalloc(len, GFP_KERNEL); @@ -362,6 +369,22 @@ fail: return error; } +int dlm_device_deregister(struct dlm_ls *ls) +{ + int error; + + /* The device is not registered. This happens when the lockspace + was never used from userspace, or when device_create_lockspace() + calls dlm_release_lockspace() after the register fails. */ + if (!ls->ls_device.name) + return 0; + + error = misc_deregister(&ls->ls_device); + if (!error) + kfree(ls->ls_device.name); + return error; +} + static int device_user_purge(struct dlm_user_proc *proc, struct dlm_purge_params *params) { @@ -396,7 +419,7 @@ static int device_create_lockspace(struct dlm_lspace_params *params) if (!ls) return -ENOENT; - error = create_misc_device(ls, params->name); + error = dlm_device_register(ls, params->name); dlm_put_lockspace(ls); if (error) @@ -420,31 +443,22 @@ static int device_remove_lockspace(struct dlm_lspace_params *params) if (!ls) return -ENOENT; - /* Deregister the misc device first, so we don't have - * a device that's not attached to a lockspace. If - * dlm_release_lockspace fails then we can recreate it - */ - error = misc_deregister(&ls->ls_device); - if (error) { - dlm_put_lockspace(ls); - goto out; - } - kfree(ls->ls_device.name); - if (params->flags & DLM_USER_LSFLG_FORCEFREE) force = 2; lockspace = ls->ls_local_handle; + dlm_put_lockspace(ls); - /* dlm_release_lockspace waits for references to go to zero, - so all processes will need to close their device for the ls - before the release will procede */ + /* The final dlm_release_lockspace waits for references to go to + zero, so all processes will need to close their device for the + ls before the release will proceed. release also calls the + device_deregister above. Converting a positive return value + from release to zero means that userspace won't know when its + release was the final one, but it shouldn't need to know. */ - dlm_put_lockspace(ls); error = dlm_release_lockspace(lockspace, force); - if (error) - create_misc_device(ls, ls->ls_name); - out: + if (error > 0) + error = 0; return error; } @@ -526,8 +540,10 @@ static ssize_t device_write(struct file *file, const char __user *buf, k32buf = (struct dlm_write_request32 *)kbuf; kbuf = kmalloc(count + 1 + (sizeof(struct dlm_write_request) - sizeof(struct dlm_write_request32)), GFP_KERNEL); - if (!kbuf) + if (!kbuf) { + kfree(k32buf); return -ENOMEM; + } if (proc) set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags); @@ -538,8 +554,10 @@ static ssize_t device_write(struct file *file, const char __user *buf, /* do we really need this? can a write happen after a close? */ if ((kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) && - test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags)) - return -EINVAL; + (proc && test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))) { + error = -EINVAL; + goto out_free; + } sigfillset(&allsigs); sigprocmask(SIG_BLOCK, &allsigs, &tmpsig); @@ -868,6 +886,26 @@ static unsigned int device_poll(struct file *file, poll_table *wait) return 0; } +int dlm_user_daemon_available(void) +{ + /* dlm_controld hasn't started (or, has started, but not + properly populated configfs) */ + + if (!dlm_our_nodeid()) + return 0; + + /* This is to deal with versions of dlm_controld that don't + know about the monitor device. We assume that if the + dlm_controld was started (above), but the monitor device + was never opened, that it's an old version. dlm_controld + should open the monitor device before populating configfs. */ + + if (dlm_monitor_unused) + return 1; + + return atomic_read(&dlm_monitor_opened) ? 1 : 0; +} + static int ctl_device_open(struct inode *inode, struct file *file) { file->private_data = NULL; @@ -879,6 +917,20 @@ static int ctl_device_close(struct inode *inode, struct file *file) return 0; } +static int monitor_device_open(struct inode *inode, struct file *file) +{ + atomic_inc(&dlm_monitor_opened); + dlm_monitor_unused = 0; + return 0; +} + +static int monitor_device_close(struct inode *inode, struct file *file) +{ + if (atomic_dec_and_test(&dlm_monitor_opened)) + dlm_stop_lockspaces(); + return 0; +} + static const struct file_operations device_fops = { .open = device_open, .release = device_close, @@ -902,19 +954,42 @@ static struct miscdevice ctl_device = { .minor = MISC_DYNAMIC_MINOR, }; +static const struct file_operations monitor_device_fops = { + .open = monitor_device_open, + .release = monitor_device_close, + .owner = THIS_MODULE, +}; + +static struct miscdevice monitor_device = { + .name = "dlm-monitor", + .fops = &monitor_device_fops, + .minor = MISC_DYNAMIC_MINOR, +}; + int __init dlm_user_init(void) { int error; + atomic_set(&dlm_monitor_opened, 0); + error = misc_register(&ctl_device); - if (error) + if (error) { log_print("misc_register failed for control device"); + goto out; + } + error = misc_register(&monitor_device); + if (error) { + log_print("misc_register failed for monitor device"); + misc_deregister(&ctl_device); + } + out: return error; } void dlm_user_exit(void) { misc_deregister(&ctl_device); + misc_deregister(&monitor_device); } diff --git a/fs/dlm/user.h b/fs/dlm/user.h index d38e9f3e4151..35eb6a13d616 100644 --- a/fs/dlm/user.h +++ b/fs/dlm/user.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2006 Red Hat, Inc. All rights reserved. + * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -12,5 +12,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type); int dlm_user_init(void); void dlm_user_exit(void); +int dlm_device_deregister(struct dlm_ls *ls); +int dlm_user_daemon_available(void); #endif diff --git a/fs/dquot.c b/fs/dquot.c index 5ac77da19959..ad7e59003e04 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -562,6 +562,8 @@ static struct shrinker dqcache_shrinker = { */ static void dqput(struct dquot *dquot) { + int ret; + if (!dquot) return; #ifdef __DQUOT_PARANOIA @@ -594,7 +596,19 @@ we_slept: if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && dquot_dirty(dquot)) { spin_unlock(&dq_list_lock); /* Commit dquot before releasing */ - dquot->dq_sb->dq_op->write_dquot(dquot); + ret = dquot->dq_sb->dq_op->write_dquot(dquot); + if (ret < 0) { + printk(KERN_ERR "VFS: cannot write quota structure on " + "device %s (error %d). Quota may get out of " + "sync!\n", dquot->dq_sb->s_id, ret); + /* + * We clear dirty bit anyway, so that we avoid + * infinite loop here + */ + spin_lock(&dq_list_lock); + clear_dquot_dirty(dquot); + spin_unlock(&dq_list_lock); + } goto we_slept; } /* Clear flag in case dquot was inactive (something bad happened) */ @@ -875,13 +889,15 @@ static void print_warning(struct dquot *dquot, const int warntype) char *msg = NULL; struct tty_struct *tty; - if (!need_print_warning(dquot)) + if (warntype == QUOTA_NL_IHARDBELOW || + warntype == QUOTA_NL_ISOFTBELOW || + warntype == QUOTA_NL_BHARDBELOW || + warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(dquot)) return; - mutex_lock(&tty_mutex); tty = get_current_tty(); if (!tty) - goto out_lock; + return; tty_write_message(tty, dquot->dq_sb->s_id); if (warntype == QUOTA_NL_ISOFTWARN || warntype == QUOTA_NL_BSOFTWARN) tty_write_message(tty, ": warning, "); @@ -909,8 +925,7 @@ static void print_warning(struct dquot *dquot, const int warntype) break; } tty_write_message(tty, msg); -out_lock: - mutex_unlock(&tty_mutex); + tty_kref_put(tty); } #endif @@ -1083,6 +1098,35 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war return QUOTA_OK; } +static int info_idq_free(struct dquot *dquot, ulong inodes) +{ + if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || + dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit) + return QUOTA_NL_NOWARN; + + if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit) + return QUOTA_NL_ISOFTBELOW; + if (dquot->dq_dqb.dqb_curinodes >= dquot->dq_dqb.dqb_ihardlimit && + dquot->dq_dqb.dqb_curinodes - inodes < dquot->dq_dqb.dqb_ihardlimit) + return QUOTA_NL_IHARDBELOW; + return QUOTA_NL_NOWARN; +} + +static int info_bdq_free(struct dquot *dquot, qsize_t space) +{ + if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || + toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit) + return QUOTA_NL_NOWARN; + + if (toqb(dquot->dq_dqb.dqb_curspace - space) <= + dquot->dq_dqb.dqb_bsoftlimit) + return QUOTA_NL_BSOFTBELOW; + if (toqb(dquot->dq_dqb.dqb_curspace) >= dquot->dq_dqb.dqb_bhardlimit && + toqb(dquot->dq_dqb.dqb_curspace - space) < + dquot->dq_dqb.dqb_bhardlimit) + return QUOTA_NL_BHARDBELOW; + return QUOTA_NL_NOWARN; +} /* * Initialize quota pointers in inode * Transaction must be started at entry @@ -1139,6 +1183,28 @@ int dquot_drop(struct inode *inode) return 0; } +/* Wrapper to remove references to quota structures from inode */ +void vfs_dq_drop(struct inode *inode) +{ + /* Here we can get arbitrary inode from clear_inode() so we have + * to be careful. OTOH we don't need locking as quota operations + * are allowed to change only at mount time */ + if (!IS_NOQUOTA(inode) && inode->i_sb && inode->i_sb->dq_op + && inode->i_sb->dq_op->drop) { + int cnt; + /* Test before calling to rule out calls from proc and such + * where we are not allowed to block. Note that this is + * actually reliable test even without the lock - the caller + * must assure that nobody can come after the DQUOT_DROP and + * add quota pointers back anyway */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if (inode->i_dquot[cnt] != NODQUOT) + break; + if (cnt < MAXQUOTAS) + inode->i_sb->dq_op->drop(inode); + } +} + /* * Following four functions update i_blocks+i_bytes fields and * quota information (together with appropriate checks) @@ -1248,6 +1314,7 @@ warn_put_all: int dquot_free_space(struct inode *inode, qsize_t number) { unsigned int cnt; + char warntype[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ @@ -1256,6 +1323,7 @@ out_sub: inode_sub_bytes(inode, number); return QUOTA_OK; } + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); /* Now recheck reliably when holding dqptr_sem */ if (IS_NOQUOTA(inode)) { @@ -1266,6 +1334,7 @@ out_sub: for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (inode->i_dquot[cnt] == NODQUOT) continue; + warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number); dquot_decr_space(inode->i_dquot[cnt], number); } inode_sub_bytes(inode, number); @@ -1274,6 +1343,7 @@ out_sub: for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (inode->i_dquot[cnt]) mark_dquot_dirty(inode->i_dquot[cnt]); + flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return QUOTA_OK; } @@ -1284,11 +1354,13 @@ out_sub: int dquot_free_inode(const struct inode *inode, unsigned long number) { unsigned int cnt; + char warntype[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ if (IS_NOQUOTA(inode)) return QUOTA_OK; + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); /* Now recheck reliably when holding dqptr_sem */ if (IS_NOQUOTA(inode)) { @@ -1299,6 +1371,7 @@ int dquot_free_inode(const struct inode *inode, unsigned long number) for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (inode->i_dquot[cnt] == NODQUOT) continue; + warntype[cnt] = info_idq_free(inode->i_dquot[cnt], number); dquot_decr_inodes(inode->i_dquot[cnt], number); } spin_unlock(&dq_data_lock); @@ -1306,6 +1379,7 @@ int dquot_free_inode(const struct inode *inode, unsigned long number) for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (inode->i_dquot[cnt]) mark_dquot_dirty(inode->i_dquot[cnt]); + flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return QUOTA_OK; } @@ -1323,7 +1397,8 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) struct dquot *transfer_to[MAXQUOTAS]; int cnt, ret = NO_QUOTA, chuid = (iattr->ia_valid & ATTR_UID) && inode->i_uid != iattr->ia_uid, chgid = (iattr->ia_valid & ATTR_GID) && inode->i_gid != iattr->ia_gid; - char warntype[MAXQUOTAS]; + char warntype_to[MAXQUOTAS]; + char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ @@ -1332,7 +1407,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) /* Clear the arrays */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { transfer_to[cnt] = transfer_from[cnt] = NODQUOT; - warntype[cnt] = QUOTA_NL_NOWARN; + warntype_to[cnt] = QUOTA_NL_NOWARN; } down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); /* Now recheck reliably when holding dqptr_sem */ @@ -1364,8 +1439,9 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) if (transfer_to[cnt] == NODQUOT) continue; transfer_from[cnt] = inode->i_dquot[cnt]; - if (check_idq(transfer_to[cnt], 1, warntype+cnt) == NO_QUOTA || - check_bdq(transfer_to[cnt], space, 0, warntype+cnt) == NO_QUOTA) + if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) == + NO_QUOTA || check_bdq(transfer_to[cnt], space, 0, + warntype_to + cnt) == NO_QUOTA) goto warn_put_all; } @@ -1381,6 +1457,10 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) /* Due to IO error we might not have transfer_from[] structure */ if (transfer_from[cnt]) { + warntype_from_inodes[cnt] = + info_idq_free(transfer_from[cnt], 1); + warntype_from_space[cnt] = + info_bdq_free(transfer_from[cnt], space); dquot_decr_inodes(transfer_from[cnt], 1); dquot_decr_space(transfer_from[cnt], space); } @@ -1400,7 +1480,9 @@ warn_put_all: if (transfer_to[cnt]) mark_dquot_dirty(transfer_to[cnt]); } - flush_warnings(transfer_to, warntype); + flush_warnings(transfer_to, warntype_to); + flush_warnings(transfer_from, warntype_from_inodes); + flush_warnings(transfer_from, warntype_from_space); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (ret == QUOTA_OK && transfer_from[cnt] != NODQUOT) @@ -1412,6 +1494,18 @@ warn_put_all: return ret; } +/* Wrapper for transferring ownership of an inode */ +int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) +{ + if (sb_any_quota_enabled(inode->i_sb) && !IS_NOQUOTA(inode)) { + vfs_dq_init(inode); + if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA) + return 1; + } + return 0; +} + + /* * Write info of quota file to disk */ @@ -1697,6 +1791,21 @@ static int vfs_quota_on_remount(struct super_block *sb, int type) return ret; } +int vfs_quota_on_path(struct super_block *sb, int type, int format_id, + struct path *path) +{ + int error = security_quota_on(path->dentry); + if (error) + return error; + /* Quota file not on the same filesystem? */ + if (path->mnt->mnt_sb != sb) + error = -EXDEV; + else + error = vfs_quota_on_inode(path->dentry->d_inode, type, + format_id); + return error; +} + /* Actual function called from quotactl() */ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path, int remount) @@ -1708,19 +1817,10 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path, return vfs_quota_on_remount(sb, type); error = path_lookup(path, LOOKUP_FOLLOW, &nd); - if (error < 0) - return error; - error = security_quota_on(nd.path.dentry); - if (error) - goto out_path; - /* Quota file not on the same filesystem? */ - if (nd.path.mnt->mnt_sb != sb) - error = -EXDEV; - else - error = vfs_quota_on_inode(nd.path.dentry->d_inode, type, - format_id); -out_path: - path_put(&nd.path); + if (!error) { + error = vfs_quota_on_path(sb, type, format_id, &nd.path); + path_put(&nd.path); + } return error; } @@ -1752,6 +1852,22 @@ out: return error; } +/* Wrapper to turn on quotas when remounting rw */ +int vfs_dq_quota_on_remount(struct super_block *sb) +{ + int cnt; + int ret = 0, err; + + if (!sb->s_qcop || !sb->s_qcop->quota_on) + return -ENOSYS; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + err = sb->s_qcop->quota_on(sb, cnt, 0, NULL, 1); + if (err < 0 && !ret) + ret = err; + } + return ret; +} + /* Generic routine for getting common part of quota structure */ static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di) { @@ -2073,6 +2189,7 @@ EXPORT_SYMBOL(unregister_quota_format); EXPORT_SYMBOL(dqstats); EXPORT_SYMBOL(dq_data_lock); EXPORT_SYMBOL(vfs_quota_on); +EXPORT_SYMBOL(vfs_quota_on_path); EXPORT_SYMBOL(vfs_quota_on_mount); EXPORT_SYMBOL(vfs_quota_off); EXPORT_SYMBOL(vfs_quota_sync); @@ -2087,8 +2204,11 @@ EXPORT_SYMBOL(dquot_release); EXPORT_SYMBOL(dquot_mark_dquot_dirty); EXPORT_SYMBOL(dquot_initialize); EXPORT_SYMBOL(dquot_drop); +EXPORT_SYMBOL(vfs_dq_drop); EXPORT_SYMBOL(dquot_alloc_space); EXPORT_SYMBOL(dquot_alloc_inode); EXPORT_SYMBOL(dquot_free_space); EXPORT_SYMBOL(dquot_free_inode); EXPORT_SYMBOL(dquot_transfer); +EXPORT_SYMBOL(vfs_dq_transfer); +EXPORT_SYMBOL(vfs_dq_quota_on_remount); diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile index 1e34a7fd4884..b4755a85996e 100644 --- a/fs/ecryptfs/Makefile +++ b/fs/ecryptfs/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o -ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o netlink.o miscdev.o debug.o +ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o netlink.o miscdev.o kthread.o debug.o diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index e2832bc7869a..06db79d05c12 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -33,6 +33,7 @@ #include <linux/crypto.h> #include <linux/file.h> #include <linux/scatterlist.h> +#include <asm/unaligned.h> #include "ecryptfs_kernel.h" static int @@ -474,8 +475,8 @@ int ecryptfs_encrypt_page(struct page *page) { struct inode *ecryptfs_inode; struct ecryptfs_crypt_stat *crypt_stat; - char *enc_extent_virt = NULL; - struct page *enc_extent_page; + char *enc_extent_virt; + struct page *enc_extent_page = NULL; loff_t extent_offset; int rc = 0; @@ -491,14 +492,14 @@ int ecryptfs_encrypt_page(struct page *page) page->index); goto out; } - enc_extent_virt = kmalloc(PAGE_CACHE_SIZE, GFP_USER); - if (!enc_extent_virt) { + enc_extent_page = alloc_page(GFP_USER); + if (!enc_extent_page) { rc = -ENOMEM; ecryptfs_printk(KERN_ERR, "Error allocating memory for " "encrypted extent\n"); goto out; } - enc_extent_page = virt_to_page(enc_extent_virt); + enc_extent_virt = kmap(enc_extent_page); for (extent_offset = 0; extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); extent_offset++) { @@ -526,7 +527,10 @@ int ecryptfs_encrypt_page(struct page *page) } } out: - kfree(enc_extent_virt); + if (enc_extent_page) { + kunmap(enc_extent_page); + __free_page(enc_extent_page); + } return rc; } @@ -608,8 +612,8 @@ int ecryptfs_decrypt_page(struct page *page) { struct inode *ecryptfs_inode; struct ecryptfs_crypt_stat *crypt_stat; - char *enc_extent_virt = NULL; - struct page *enc_extent_page; + char *enc_extent_virt; + struct page *enc_extent_page = NULL; unsigned long extent_offset; int rc = 0; @@ -626,14 +630,14 @@ int ecryptfs_decrypt_page(struct page *page) page->index); goto out; } - enc_extent_virt = kmalloc(PAGE_CACHE_SIZE, GFP_USER); - if (!enc_extent_virt) { + enc_extent_page = alloc_page(GFP_USER); + if (!enc_extent_page) { rc = -ENOMEM; ecryptfs_printk(KERN_ERR, "Error allocating memory for " "encrypted extent\n"); goto out; } - enc_extent_page = virt_to_page(enc_extent_virt); + enc_extent_virt = kmap(enc_extent_page); for (extent_offset = 0; extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); extent_offset++) { @@ -661,7 +665,10 @@ int ecryptfs_decrypt_page(struct page *page) } } out: - kfree(enc_extent_virt); + if (enc_extent_page) { + kunmap(enc_extent_page); + __free_page(enc_extent_page); + } return rc; } @@ -1032,10 +1039,8 @@ static int contains_ecryptfs_marker(char *data) { u32 m_1, m_2; - memcpy(&m_1, data, 4); - m_1 = be32_to_cpu(m_1); - memcpy(&m_2, (data + 4), 4); - m_2 = be32_to_cpu(m_2); + m_1 = get_unaligned_be32(data); + m_2 = get_unaligned_be32(data + 4); if ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2) return 1; ecryptfs_printk(KERN_DEBUG, "m_1 = [0x%.8x]; m_2 = [0x%.8x]; " @@ -1073,8 +1078,7 @@ static int ecryptfs_process_flags(struct ecryptfs_crypt_stat *crypt_stat, int i; u32 flags; - memcpy(&flags, page_virt, 4); - flags = be32_to_cpu(flags); + flags = get_unaligned_be32(page_virt); for (i = 0; i < ((sizeof(ecryptfs_flag_map) / sizeof(struct ecryptfs_flag_map_elem))); i++) if (flags & ecryptfs_flag_map[i].file_flag) { @@ -1100,11 +1104,9 @@ static void write_ecryptfs_marker(char *page_virt, size_t *written) get_random_bytes(&m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2)); m_2 = (m_1 ^ MAGIC_ECRYPTFS_MARKER); - m_1 = cpu_to_be32(m_1); - memcpy(page_virt, &m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2)); - m_2 = cpu_to_be32(m_2); - memcpy(page_virt + (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2), &m_2, - (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2)); + put_unaligned_be32(m_1, page_virt); + page_virt += (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2); + put_unaligned_be32(m_2, page_virt); (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; } @@ -1121,8 +1123,7 @@ write_ecryptfs_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat, flags |= ecryptfs_flag_map[i].file_flag; /* Version is in top 8 bits of the 32-bit flag vector */ flags |= ((((u8)crypt_stat->file_version) << 24) & 0xFF000000); - flags = cpu_to_be32(flags); - memcpy(page_virt, &flags, 4); + put_unaligned_be32(flags, page_virt); (*written) = 4; } @@ -1238,11 +1239,9 @@ ecryptfs_write_header_metadata(char *virt, num_header_extents_at_front = (u16)(crypt_stat->num_header_bytes_at_front / crypt_stat->extent_size); - header_extent_size = cpu_to_be32(header_extent_size); - memcpy(virt, &header_extent_size, 4); + put_unaligned_be32(header_extent_size, virt); virt += 4; - num_header_extents_at_front = cpu_to_be16(num_header_extents_at_front); - memcpy(virt, &num_header_extents_at_front, 2); + put_unaligned_be16(num_header_extents_at_front, virt); (*written) = 6; } @@ -1410,15 +1409,13 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat, u32 header_extent_size; u16 num_header_extents_at_front; - memcpy(&header_extent_size, virt, sizeof(u32)); - header_extent_size = be32_to_cpu(header_extent_size); - virt += sizeof(u32); - memcpy(&num_header_extents_at_front, virt, sizeof(u16)); - num_header_extents_at_front = be16_to_cpu(num_header_extents_at_front); + header_extent_size = get_unaligned_be32(virt); + virt += sizeof(__be32); + num_header_extents_at_front = get_unaligned_be16(virt); crypt_stat->num_header_bytes_at_front = (((size_t)num_header_extents_at_front * (size_t)header_extent_size)); - (*bytes_read) = (sizeof(u32) + sizeof(u16)); + (*bytes_read) = (sizeof(__be32) + sizeof(__be16)); if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE) && (crypt_stat->num_header_bytes_at_front < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) { diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index c15c25745e05..b73fb752c5f8 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -559,10 +559,25 @@ extern struct kmem_cache *ecryptfs_key_record_cache; extern struct kmem_cache *ecryptfs_key_sig_cache; extern struct kmem_cache *ecryptfs_global_auth_tok_cache; extern struct kmem_cache *ecryptfs_key_tfm_cache; +extern struct kmem_cache *ecryptfs_open_req_cache; +struct ecryptfs_open_req { +#define ECRYPTFS_REQ_PROCESSED 0x00000001 +#define ECRYPTFS_REQ_DROPPED 0x00000002 +#define ECRYPTFS_REQ_ZOMBIE 0x00000004 + u32 flags; + struct file **lower_file; + struct dentry *lower_dentry; + struct vfsmount *lower_mnt; + wait_queue_head_t wait; + struct mutex mux; + struct list_head kthread_ctl_list; +}; + +#define ECRYPTFS_INTERPOSE_FLAG_D_ADD 0x00000001 int ecryptfs_interpose(struct dentry *hidden_dentry, struct dentry *this_dentry, struct super_block *sb, - int flag); + u32 flags); int ecryptfs_fill_zeros(struct file *file, loff_t new_length); int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat, const char *name, int length, @@ -690,5 +705,11 @@ void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx); int ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid, struct user_namespace *user_ns, struct pid *pid); +int ecryptfs_init_kthread(void); +void ecryptfs_destroy_kthread(void); +int ecryptfs_privileged_open(struct file **lower_file, + struct dentry *lower_dentry, + struct vfsmount *lower_mnt); +int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry); #endif /* #ifndef ECRYPTFS_KERNEL_H */ diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 2258b8f654a6..9244d653743e 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -30,6 +30,7 @@ #include <linux/security.h> #include <linux/compat.h> #include <linux/fs_stack.h> +#include <linux/smp_lock.h> #include "ecryptfs_kernel.h" /** @@ -191,6 +192,23 @@ static int ecryptfs_open(struct inode *inode, struct file *file) | ECRYPTFS_ENCRYPTED); } mutex_unlock(&crypt_stat->cs_mutex); + if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY) + && !(file->f_flags & O_RDONLY)) { + rc = -EPERM; + printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs " + "file must hence be opened RO\n", __func__); + goto out; + } + if (!ecryptfs_inode_to_private(inode)->lower_file) { + rc = ecryptfs_init_persistent_file(ecryptfs_dentry); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the persistent file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + ecryptfs_dentry->d_name.name, rc); + goto out; + } + } ecryptfs_set_file_lower( file, ecryptfs_inode_to_private(inode)->lower_file); if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { @@ -277,9 +295,11 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag) int rc = 0; struct file *lower_file = NULL; + lock_kernel(); lower_file = ecryptfs_file_to_lower(file); if (lower_file->f_op && lower_file->f_op->fasync) rc = lower_file->f_op->fasync(fd, lower_file, flag); + unlock_kernel(); return rc; } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index c92cc1c00aae..89209f00f9c7 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -31,6 +31,7 @@ #include <linux/mount.h> #include <linux/crypto.h> #include <linux/fs_stack.h> +#include <asm/unaligned.h> #include "ecryptfs_kernel.h" static struct dentry *lock_parent(struct dentry *dentry) @@ -188,6 +189,16 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) "context; rc = [%d]\n", rc); goto out; } + if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) { + rc = ecryptfs_init_persistent_file(ecryptfs_dentry); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the persistent file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + ecryptfs_dentry->d_name.name, rc); + goto out; + } + } rc = ecryptfs_write_metadata(ecryptfs_dentry); if (rc) { printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc); @@ -307,10 +318,11 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, d_add(dentry, NULL); goto out; } - rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 1); + rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, + ECRYPTFS_INTERPOSE_FLAG_D_ADD); if (rc) { ecryptfs_printk(KERN_ERR, "Error interposing\n"); - goto out_dput; + goto out; } if (S_ISDIR(lower_inode->i_mode)) { ecryptfs_printk(KERN_DEBUG, "Is a directory; returning\n"); @@ -336,11 +348,21 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, rc = -ENOMEM; ecryptfs_printk(KERN_ERR, "Cannot ecryptfs_kmalloc a page\n"); - goto out_dput; + goto out; } crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) ecryptfs_set_default_sizes(crypt_stat); + if (!ecryptfs_inode_to_private(dentry->d_inode)->lower_file) { + rc = ecryptfs_init_persistent_file(dentry); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the persistent file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + dentry->d_name.name, rc); + goto out; + } + } rc = ecryptfs_read_and_validate_header_region(page_virt, dentry->d_inode); if (rc) { @@ -364,8 +386,7 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, else file_size = i_size_read(lower_dentry->d_inode); } else { - memcpy(&file_size, page_virt, sizeof(file_size)); - file_size = be64_to_cpu(file_size); + file_size = get_unaligned_be64(page_virt); } i_size_write(dentry->d_inode, (loff_t)file_size); kmem_cache_free(ecryptfs_header_cache_2, page_virt); @@ -444,7 +465,6 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, int rc; struct dentry *lower_dentry; struct dentry *lower_dir_dentry; - umode_t mode; char *encoded_symname; int encoded_symlen; struct ecryptfs_crypt_stat *crypt_stat = NULL; @@ -452,7 +472,6 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, lower_dentry = ecryptfs_dentry_to_lower(dentry); dget(lower_dentry); lower_dir_dentry = lock_parent(lower_dentry); - mode = S_IALLUGO; encoded_symlen = ecryptfs_encode_filename(crypt_stat, symname, strlen(symname), &encoded_symname); @@ -461,7 +480,7 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, goto out_lock; } rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry, - encoded_symname, mode); + encoded_symname); kfree(encoded_symname); if (rc || !lower_dentry->d_inode) goto out_lock; @@ -809,22 +828,9 @@ out: } static int -ecryptfs_permission(struct inode *inode, int mask, struct nameidata *nd) +ecryptfs_permission(struct inode *inode, int mask) { - int rc; - - if (nd) { - struct vfsmount *vfsmnt_save = nd->path.mnt; - struct dentry *dentry_save = nd->path.dentry; - - nd->path.mnt = ecryptfs_dentry_to_lower_mnt(nd->path.dentry); - nd->path.dentry = ecryptfs_dentry_to_lower(nd->path.dentry); - rc = permission(ecryptfs_inode_to_lower(inode), mask, nd); - nd->path.mnt = vfsmnt_save; - nd->path.dentry = dentry_save; - } else - rc = permission(ecryptfs_inode_to_lower(inode), mask, NULL); - return rc; + return inode_permission(ecryptfs_inode_to_lower(inode), mask); } /** diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index e82b457180be..f5b76a331b9c 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -44,15 +44,15 @@ static int process_request_key_err(long err_code) int rc = 0; switch (err_code) { - case ENOKEY: + case -ENOKEY: ecryptfs_printk(KERN_WARNING, "No key\n"); rc = -ENOENT; break; - case EKEYEXPIRED: + case -EKEYEXPIRED: ecryptfs_printk(KERN_WARNING, "Key expired\n"); rc = -ETIME; break; - case EKEYREVOKED: + case -EKEYREVOKED: ecryptfs_printk(KERN_WARNING, "Key revoked\n"); rc = -EINVAL; break; @@ -963,8 +963,7 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) { printk(KERN_ERR "Could not find key with description: [%s]\n", sig); - process_request_key_err(PTR_ERR(*auth_tok_key)); - rc = -EINVAL; + rc = process_request_key_err(PTR_ERR(*auth_tok_key)); goto out; } (*auth_tok) = ecryptfs_get_key_payload_data(*auth_tok_key); diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c new file mode 100644 index 000000000000..c440c6b58b2d --- /dev/null +++ b/fs/ecryptfs/kthread.c @@ -0,0 +1,203 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 2008 International Business Machines Corp. + * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include <linux/kthread.h> +#include <linux/freezer.h> +#include <linux/wait.h> +#include <linux/mount.h> +#include "ecryptfs_kernel.h" + +struct kmem_cache *ecryptfs_open_req_cache; + +static struct ecryptfs_kthread_ctl { +#define ECRYPTFS_KTHREAD_ZOMBIE 0x00000001 + u32 flags; + struct mutex mux; + struct list_head req_list; + wait_queue_head_t wait; +} ecryptfs_kthread_ctl; + +static struct task_struct *ecryptfs_kthread; + +/** + * ecryptfs_threadfn + * @ignored: ignored + * + * The eCryptfs kernel thread that has the responsibility of getting + * the lower persistent file with RW permissions. + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_threadfn(void *ignored) +{ + set_freezable(); + while (1) { + struct ecryptfs_open_req *req; + + wait_event_freezable( + ecryptfs_kthread_ctl.wait, + (!list_empty(&ecryptfs_kthread_ctl.req_list) + || kthread_should_stop())); + mutex_lock(&ecryptfs_kthread_ctl.mux); + if (ecryptfs_kthread_ctl.flags & ECRYPTFS_KTHREAD_ZOMBIE) { + mutex_unlock(&ecryptfs_kthread_ctl.mux); + goto out; + } + while (!list_empty(&ecryptfs_kthread_ctl.req_list)) { + req = list_first_entry(&ecryptfs_kthread_ctl.req_list, + struct ecryptfs_open_req, + kthread_ctl_list); + mutex_lock(&req->mux); + list_del(&req->kthread_ctl_list); + if (!(req->flags & ECRYPTFS_REQ_ZOMBIE)) { + dget(req->lower_dentry); + mntget(req->lower_mnt); + (*req->lower_file) = dentry_open( + req->lower_dentry, req->lower_mnt, + (O_RDWR | O_LARGEFILE)); + req->flags |= ECRYPTFS_REQ_PROCESSED; + } + wake_up(&req->wait); + mutex_unlock(&req->mux); + } + mutex_unlock(&ecryptfs_kthread_ctl.mux); + } +out: + return 0; +} + +int ecryptfs_init_kthread(void) +{ + int rc = 0; + + mutex_init(&ecryptfs_kthread_ctl.mux); + init_waitqueue_head(&ecryptfs_kthread_ctl.wait); + INIT_LIST_HEAD(&ecryptfs_kthread_ctl.req_list); + ecryptfs_kthread = kthread_run(&ecryptfs_threadfn, NULL, + "ecryptfs-kthread"); + if (IS_ERR(ecryptfs_kthread)) { + rc = PTR_ERR(ecryptfs_kthread); + printk(KERN_ERR "%s: Failed to create kernel thread; rc = [%d]" + "\n", __func__, rc); + } + return rc; +} + +void ecryptfs_destroy_kthread(void) +{ + struct ecryptfs_open_req *req; + + mutex_lock(&ecryptfs_kthread_ctl.mux); + ecryptfs_kthread_ctl.flags |= ECRYPTFS_KTHREAD_ZOMBIE; + list_for_each_entry(req, &ecryptfs_kthread_ctl.req_list, + kthread_ctl_list) { + mutex_lock(&req->mux); + req->flags |= ECRYPTFS_REQ_ZOMBIE; + wake_up(&req->wait); + mutex_unlock(&req->mux); + } + mutex_unlock(&ecryptfs_kthread_ctl.mux); + kthread_stop(ecryptfs_kthread); + wake_up(&ecryptfs_kthread_ctl.wait); +} + +/** + * ecryptfs_privileged_open + * @lower_file: Result of dentry_open by root on lower dentry + * @lower_dentry: Lower dentry for file to open + * @lower_mnt: Lower vfsmount for file to open + * + * This function gets a r/w file opened againt the lower dentry. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_privileged_open(struct file **lower_file, + struct dentry *lower_dentry, + struct vfsmount *lower_mnt) +{ + struct ecryptfs_open_req *req; + int rc = 0; + + /* Corresponding dput() and mntput() are done when the + * persistent file is fput() when the eCryptfs inode is + * destroyed. */ + dget(lower_dentry); + mntget(lower_mnt); + (*lower_file) = dentry_open(lower_dentry, lower_mnt, + (O_RDWR | O_LARGEFILE)); + if (!IS_ERR(*lower_file)) + goto out; + req = kmem_cache_alloc(ecryptfs_open_req_cache, GFP_KERNEL); + if (!req) { + rc = -ENOMEM; + goto out; + } + mutex_init(&req->mux); + req->lower_file = lower_file; + req->lower_dentry = lower_dentry; + req->lower_mnt = lower_mnt; + init_waitqueue_head(&req->wait); + req->flags = 0; + mutex_lock(&ecryptfs_kthread_ctl.mux); + if (ecryptfs_kthread_ctl.flags & ECRYPTFS_KTHREAD_ZOMBIE) { + rc = -EIO; + mutex_unlock(&ecryptfs_kthread_ctl.mux); + printk(KERN_ERR "%s: We are in the middle of shutting down; " + "aborting privileged request to open lower file\n", + __func__); + goto out_free; + } + list_add_tail(&req->kthread_ctl_list, &ecryptfs_kthread_ctl.req_list); + mutex_unlock(&ecryptfs_kthread_ctl.mux); + wake_up(&ecryptfs_kthread_ctl.wait); + wait_event(req->wait, (req->flags != 0)); + mutex_lock(&req->mux); + BUG_ON(req->flags == 0); + if (req->flags & ECRYPTFS_REQ_DROPPED + || req->flags & ECRYPTFS_REQ_ZOMBIE) { + rc = -EIO; + printk(KERN_WARNING "%s: Privileged open request dropped\n", + __func__); + goto out_unlock; + } + if (IS_ERR(*req->lower_file)) { + rc = PTR_ERR(*req->lower_file); + dget(lower_dentry); + mntget(lower_mnt); + (*lower_file) = dentry_open(lower_dentry, lower_mnt, + (O_RDONLY | O_LARGEFILE)); + if (IS_ERR(*lower_file)) { + rc = PTR_ERR(*req->lower_file); + (*lower_file) = NULL; + printk(KERN_WARNING "%s: Error attempting privileged " + "open of lower file with either RW or RO " + "perms; rc = [%d]. Giving up.\n", + __func__, rc); + } + } +out_unlock: + mutex_unlock(&req->mux); +out_free: + kmem_cache_free(ecryptfs_open_req_cache, req); +out: + return rc; +} diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index d603631601eb..8ebe9a5d1d99 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -117,7 +117,7 @@ void __ecryptfs_printk(const char *fmt, ...) * * Returns zero on success; non-zero otherwise */ -static int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) +int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) { struct ecryptfs_inode_info *inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); @@ -130,26 +130,12 @@ static int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) ecryptfs_dentry_to_lower_mnt(ecryptfs_dentry); lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); - /* Corresponding dput() and mntput() are done when the - * persistent file is fput() when the eCryptfs inode - * is destroyed. */ - dget(lower_dentry); - mntget(lower_mnt); - inode_info->lower_file = dentry_open(lower_dentry, - lower_mnt, - (O_RDWR | O_LARGEFILE)); - if (IS_ERR(inode_info->lower_file)) { - dget(lower_dentry); - mntget(lower_mnt); - inode_info->lower_file = dentry_open(lower_dentry, - lower_mnt, - (O_RDONLY - | O_LARGEFILE)); - } - if (IS_ERR(inode_info->lower_file)) { + rc = ecryptfs_privileged_open(&inode_info->lower_file, + lower_dentry, lower_mnt); + if (rc || IS_ERR(inode_info->lower_file)) { printk(KERN_ERR "Error opening lower persistent file " - "for lower_dentry [0x%p] and lower_mnt [0x%p]\n", - lower_dentry, lower_mnt); + "for lower_dentry [0x%p] and lower_mnt [0x%p]; " + "rc = [%d]\n", lower_dentry, lower_mnt, rc); rc = PTR_ERR(inode_info->lower_file); inode_info->lower_file = NULL; } @@ -163,14 +149,14 @@ static int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) * @lower_dentry: Existing dentry in the lower filesystem * @dentry: ecryptfs' dentry * @sb: ecryptfs's super_block - * @flag: If set to true, then d_add is called, else d_instantiate is called + * @flags: flags to govern behavior of interpose procedure * * Interposes upper and lower dentries. * * Returns zero on success; non-zero otherwise */ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, - struct super_block *sb, int flag) + struct super_block *sb, u32 flags) { struct inode *lower_inode; struct inode *inode; @@ -207,7 +193,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, init_special_inode(inode, lower_inode->i_mode, lower_inode->i_rdev); dentry->d_op = &ecryptfs_dops; - if (flag) + if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD) d_add(dentry, inode); else d_instantiate(dentry, inode); @@ -215,13 +201,6 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, /* This size will be overwritten for real files w/ headers and * other metadata */ fsstack_copy_inode_size(inode, lower_inode); - rc = ecryptfs_init_persistent_file(dentry); - if (rc) { - printk(KERN_ERR "%s: Error attempting to initialize the " - "persistent file for the dentry with name [%s]; " - "rc = [%d]\n", __func__, dentry->d_name.name, rc); - goto out; - } out: return rc; } @@ -232,7 +211,7 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, ecryptfs_opt_encrypted_view, ecryptfs_opt_err }; -static match_table_t tokens = { +static const match_table_t tokens = { {ecryptfs_opt_sig, "sig=%s"}, {ecryptfs_opt_ecryptfs_sig, "ecryptfs_sig=%s"}, {ecryptfs_opt_cipher, "cipher=%s"}, @@ -262,10 +241,11 @@ static int ecryptfs_init_global_auth_toks( "session keyring for sig specified in mount " "option: [%s]\n", global_auth_tok->sig); global_auth_tok->flags |= ECRYPTFS_AUTH_TOK_INVALID; - rc = 0; + goto out; } else global_auth_tok->flags &= ~ECRYPTFS_AUTH_TOK_INVALID; } +out: return rc; } @@ -314,7 +294,6 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) char *cipher_name_dst; char *cipher_name_src; char *cipher_key_bytes_src; - int cipher_name_len; if (!options) { rc = -EINVAL; @@ -395,17 +374,12 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) goto out; } if (!cipher_name_set) { - cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); - if (unlikely(cipher_name_len - >= ECRYPTFS_MAX_CIPHER_NAME_SIZE)) { - rc = -EINVAL; - BUG(); - goto out; - } - memcpy(mount_crypt_stat->global_default_cipher_name, - ECRYPTFS_DEFAULT_CIPHER, cipher_name_len); - mount_crypt_stat->global_default_cipher_name[cipher_name_len] - = '\0'; + int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); + + BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE); + + strcpy(mount_crypt_stat->global_default_cipher_name, + ECRYPTFS_DEFAULT_CIPHER); } if (!cipher_key_bytes_set) { mount_crypt_stat->global_default_cipher_key_size = 0; @@ -430,7 +404,6 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) printk(KERN_WARNING "One or more global auth toks could not " "properly register; rc = [%d]\n", rc); } - rc = 0; out: return rc; } @@ -605,7 +578,7 @@ static struct file_system_type ecryptfs_fs_type = { * Initializes the ecryptfs_inode_info_cache when it is created */ static void -inode_info_init_once(struct kmem_cache *cachep, void *vptr) +inode_info_init_once(void *vptr) { struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr; @@ -616,7 +589,7 @@ static struct ecryptfs_cache_info { struct kmem_cache **cache; const char *name; size_t size; - void (*ctor)(struct kmem_cache *cache, void *obj); + void (*ctor)(void *obj); } ecryptfs_cache_infos[] = { { .cache = &ecryptfs_auth_tok_list_item_cache, @@ -679,6 +652,11 @@ static struct ecryptfs_cache_info { .name = "ecryptfs_key_tfm_cache", .size = sizeof(struct ecryptfs_key_tfm), }, + { + .cache = &ecryptfs_open_req_cache, + .name = "ecryptfs_open_req_cache", + .size = sizeof(struct ecryptfs_open_req), + }, }; static void ecryptfs_free_kmem_caches(void) @@ -795,11 +773,17 @@ static int __init ecryptfs_init(void) printk(KERN_ERR "sysfs registration failed\n"); goto out_unregister_filesystem; } + rc = ecryptfs_init_kthread(); + if (rc) { + printk(KERN_ERR "%s: kthread initialization failed; " + "rc = [%d]\n", __func__, rc); + goto out_do_sysfs_unregistration; + } rc = ecryptfs_init_messaging(ecryptfs_transport); if (rc) { - ecryptfs_printk(KERN_ERR, "Failure occured while attempting to " + printk(KERN_ERR "Failure occured while attempting to " "initialize the eCryptfs netlink socket\n"); - goto out_do_sysfs_unregistration; + goto out_destroy_kthread; } rc = ecryptfs_init_crypto(); if (rc) { @@ -814,6 +798,8 @@ static int __init ecryptfs_init(void) goto out; out_release_messaging: ecryptfs_release_messaging(ecryptfs_transport); +out_destroy_kthread: + ecryptfs_destroy_kthread(); out_do_sysfs_unregistration: do_sysfs_unregistration(); out_unregister_filesystem: @@ -833,6 +819,7 @@ static void __exit ecryptfs_exit(void) printk(KERN_ERR "Failure whilst attempting to destroy crypto; " "rc = [%d]\n", rc); ecryptfs_release_messaging(ecryptfs_transport); + ecryptfs_destroy_kthread(); do_sysfs_unregistration(); unregister_filesystem(&ecryptfs_fs_type); ecryptfs_free_kmem_caches(); diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index 50c994a249a5..b484792a0996 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -358,46 +358,6 @@ out_unlock_daemon: } /** - * ecryptfs_miscdev_helo - * @euid: effective user id of miscdevess sending helo packet - * @user_ns: The namespace in which @euid applies - * @pid: miscdevess id of miscdevess sending helo packet - * - * Returns zero on success; non-zero otherwise - */ -static int ecryptfs_miscdev_helo(uid_t euid, struct user_namespace *user_ns, - struct pid *pid) -{ - int rc; - - rc = ecryptfs_process_helo(ECRYPTFS_TRANSPORT_MISCDEV, euid, user_ns, - pid); - if (rc) - printk(KERN_WARNING "Error processing HELO; rc = [%d]\n", rc); - return rc; -} - -/** - * ecryptfs_miscdev_quit - * @euid: effective user id of miscdevess sending quit packet - * @user_ns: The namespace in which @euid applies - * @pid: miscdevess id of miscdevess sending quit packet - * - * Returns zero on success; non-zero otherwise - */ -static int ecryptfs_miscdev_quit(uid_t euid, struct user_namespace *user_ns, - struct pid *pid) -{ - int rc; - - rc = ecryptfs_process_quit(euid, user_ns, pid); - if (rc) - printk(KERN_WARNING - "Error processing QUIT message; rc = [%d]\n", rc); - return rc; -} - -/** * ecryptfs_miscdev_response - miscdevess response to message previously sent to daemon * @data: Bytes comprising struct ecryptfs_message * @data_size: sizeof(struct ecryptfs_message) + data len @@ -512,26 +472,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, __func__, rc); break; case ECRYPTFS_MSG_HELO: - rc = ecryptfs_miscdev_helo(current->euid, - current->nsproxy->user_ns, - task_pid(current)); - if (rc) { - printk(KERN_ERR "%s: Error attempting to process " - "helo from pid [0x%p]; rc = [%d]\n", __func__, - task_pid(current), rc); - goto out_free; - } - break; case ECRYPTFS_MSG_QUIT: - rc = ecryptfs_miscdev_quit(current->euid, - current->nsproxy->user_ns, - task_pid(current)); - if (rc) { - printk(KERN_ERR "%s: Error attempting to process " - "quit from pid [0x%p]; rc = [%d]\n", __func__, - task_pid(current), rc); - goto out_free; - } break; default: ecryptfs_printk(KERN_WARNING, "Dropping miscdev " @@ -575,13 +516,11 @@ int ecryptfs_init_ecryptfs_miscdev(void) int rc; atomic_set(&ecryptfs_num_miscdev_opens, 0); - mutex_lock(&ecryptfs_daemon_hash_mux); rc = misc_register(&ecryptfs_miscdev); if (rc) printk(KERN_ERR "%s: Failed to register miscellaneous device " "for communications with userspace daemons; rc = [%d]\n", __func__, rc); - mutex_unlock(&ecryptfs_daemon_hash_mux); return rc; } diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 2b6fe1e6e8ba..245c2dc02d5c 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -32,6 +32,7 @@ #include <linux/file.h> #include <linux/crypto.h> #include <linux/scatterlist.h> +#include <asm/unaligned.h> #include "ecryptfs_kernel.h" /** @@ -372,7 +373,6 @@ out: */ static int ecryptfs_write_inode_size_to_header(struct inode *ecryptfs_inode) { - u64 file_size; char *file_size_virt; int rc; @@ -381,9 +381,7 @@ static int ecryptfs_write_inode_size_to_header(struct inode *ecryptfs_inode) rc = -ENOMEM; goto out; } - file_size = (u64)i_size_read(ecryptfs_inode); - file_size = cpu_to_be64(file_size); - memcpy(file_size_virt, &file_size, sizeof(u64)); + put_unaligned_be64(i_size_read(ecryptfs_inode), file_size_virt); rc = ecryptfs_write_lower(ecryptfs_inode, file_size_virt, 0, sizeof(u64)); kfree(file_size_virt); @@ -403,7 +401,6 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) struct dentry *lower_dentry = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_dentry; struct inode *lower_inode = lower_dentry->d_inode; - u64 file_size; int rc; if (!lower_inode->i_op->getxattr || !lower_inode->i_op->setxattr) { @@ -424,9 +421,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) xattr_virt, PAGE_CACHE_SIZE); if (size < 0) size = 8; - file_size = (u64)i_size_read(ecryptfs_inode); - file_size = cpu_to_be64(file_size); - memcpy(xattr_virt, &file_size, sizeof(u64)); + put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt); rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME, xattr_virt, size, 0); mutex_unlock(&lower_inode->i_mutex); diff --git a/fs/efs/namei.c b/fs/efs/namei.c index 3a404e7fad53..291abb11e20e 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c @@ -74,8 +74,7 @@ struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct namei } unlock_kernel(); - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); } static struct inode *efs_nfs_get_inode(struct super_block *sb, u64 ino, diff --git a/fs/efs/super.c b/fs/efs/super.c index d733531b55e2..73b19cfc91fc 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -70,7 +70,7 @@ static void efs_destroy_inode(struct inode *inode) kmem_cache_free(efs_inode_cachep, INODE_INFO(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct efs_inode_info *ei = (struct efs_inode_info *) foo; @@ -341,8 +341,6 @@ static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { sb->inode_blocks * (EFS_BLOCKSIZE / sizeof(struct efs_dinode)); buf->f_ffree = sb->inode_free; /* free inodes */ - buf->f_fsid.val[0] = (sb->fs_magic >> 16) & 0xffff; /* fs ID */ - buf->f_fsid.val[1] = sb->fs_magic & 0xffff; /* fs ID */ buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */ return 0; diff --git a/fs/eventfd.c b/fs/eventfd.c index 343942deeec1..08bf558d0408 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -198,11 +198,18 @@ struct file *eventfd_fget(int fd) return file; } -asmlinkage long sys_eventfd(unsigned int count) +asmlinkage long sys_eventfd2(unsigned int count, int flags) { int fd; struct eventfd_ctx *ctx; + /* Check the EFD_* constants for consistency. */ + BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); + + if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK)) + return -EINVAL; + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; @@ -214,9 +221,15 @@ asmlinkage long sys_eventfd(unsigned int count) * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. */ - fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx); + fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx, + flags & (O_CLOEXEC | O_NONBLOCK)); if (fd < 0) kfree(ctx); return fd; } +asmlinkage long sys_eventfd(unsigned int count) +{ + return sys_eventfd2(count, 0); +} + diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 990c01d2d66b..7cc0eb756b55 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1041,25 +1041,27 @@ retry: } /* - * It opens an eventpoll file descriptor. The "size" parameter is there - * for historical reasons, when epoll was using an hash instead of an - * RB tree. With the current implementation, the "size" parameter is ignored - * (besides sanity checks). + * Open an eventpoll file descriptor. */ -asmlinkage long sys_epoll_create(int size) +asmlinkage long sys_epoll_create1(int flags) { int error, fd = -1; struct eventpoll *ep; + /* Check the EPOLL_* constant for consistency. */ + BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); + + if (flags & ~EPOLL_CLOEXEC) + return -EINVAL; + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n", - current, size)); + current, flags)); /* - * Sanity check on the size parameter, and create the internal data - * structure ( "struct eventpoll" ). + * Create the internal data structure ( "struct eventpoll" ). */ - error = -EINVAL; - if (size <= 0 || (error = ep_alloc(&ep)) < 0) { + error = ep_alloc(&ep); + if (error < 0) { fd = error; goto error_return; } @@ -1068,17 +1070,26 @@ asmlinkage long sys_epoll_create(int size) * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. */ - fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep); + fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, + flags & O_CLOEXEC); if (fd < 0) ep_free(ep); error_return: DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", - current, size, fd)); + current, flags, fd)); return fd; } +asmlinkage long sys_epoll_create(int size) +{ + if (size < 0) + return -EINVAL; + + return sys_epoll_create1(0); +} + /* * The following function implements the controller interface for * the eventpoll file that enables the insertion/removal/change of diff --git a/fs/exec.c b/fs/exec.c index da94a6f05df3..cecee501ce78 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -25,10 +25,11 @@ #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> -#include <linux/mman.h> +#include <linux/mm.h> #include <linux/stat.h> #include <linux/fcntl.h> #include <linux/smp_lock.h> +#include <linux/swap.h> #include <linux/string.h> #include <linux/init.h> #include <linux/pagemap.h> @@ -37,20 +38,18 @@ #include <linux/key.h> #include <linux/personality.h> #include <linux/binfmts.h> -#include <linux/swap.h> #include <linux/utsname.h> #include <linux/pid_namespace.h> #include <linux/module.h> #include <linux/namei.h> #include <linux/proc_fs.h> -#include <linux/ptrace.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/syscalls.h> -#include <linux/rmap.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/audit.h> +#include <linux/tracehook.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -108,11 +107,17 @@ static inline void put_binfmt(struct linux_binfmt * fmt) */ asmlinkage long sys_uselib(const char __user * library) { - struct file * file; + struct file *file; struct nameidata nd; - int error; - - error = __user_path_lookup_open(library, LOOKUP_FOLLOW, &nd, FMODE_READ|FMODE_EXEC); + char *tmp = getname(library); + int error = PTR_ERR(tmp); + + if (!IS_ERR(tmp)) { + error = path_lookup_open(AT_FDCWD, tmp, + LOOKUP_FOLLOW, &nd, + FMODE_READ|FMODE_EXEC); + putname(tmp); + } if (error) goto out; @@ -120,7 +125,11 @@ asmlinkage long sys_uselib(const char __user * library) if (!S_ISREG(nd.path.dentry->d_inode->i_mode)) goto exit; - error = vfs_permission(&nd, MAY_READ | MAY_EXEC); + error = -EACCES; + if (nd.path.mnt->mnt_flags & MNT_NOEXEC) + goto exit; + + error = vfs_permission(&nd, MAY_READ | MAY_EXEC | MAY_OPEN); if (error) goto exit; @@ -541,7 +550,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) /* * when the old and new regions overlap clear from new_end. */ - free_pgd_range(&tlb, new_end, old_end, new_end, + free_pgd_range(tlb, new_end, old_end, new_end, vma->vm_next ? vma->vm_next->vm_start : 0); } else { /* @@ -550,7 +559,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * have constraints on va-space that make this illegal (IA64) - * for the others its just a little faster. */ - free_pgd_range(&tlb, old_start, old_end, new_end, + free_pgd_range(tlb, old_start, old_end, new_end, vma->vm_next ? vma->vm_next->vm_start : 0); } tlb_finish_mmu(tlb, new_end, old_end); @@ -610,7 +619,7 @@ int setup_arg_pages(struct linux_binprm *bprm, bprm->exec -= stack_shift; down_write(&mm->mmap_sem); - vm_flags = vma->vm_flags; + vm_flags = VM_STACK_FLAGS; /* * Adjust stack execute permissions; explicitly enable for @@ -658,38 +667,43 @@ EXPORT_SYMBOL(setup_arg_pages); struct file *open_exec(const char *name) { struct nameidata nd; - int err; struct file *file; + int err; - err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd, FMODE_READ|FMODE_EXEC); - file = ERR_PTR(err); - - if (!err) { - struct inode *inode = nd.path.dentry->d_inode; - file = ERR_PTR(-EACCES); - if (S_ISREG(inode->i_mode)) { - int err = vfs_permission(&nd, MAY_EXEC); - file = ERR_PTR(err); - if (!err) { - file = nameidata_to_filp(&nd, - O_RDONLY|O_LARGEFILE); - if (!IS_ERR(file)) { - err = deny_write_access(file); - if (err) { - fput(file); - file = ERR_PTR(err); - } - } -out: - return file; - } - } - release_open_intent(&nd); - path_put(&nd.path); + err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd, + FMODE_READ|FMODE_EXEC); + if (err) + goto out; + + err = -EACCES; + if (!S_ISREG(nd.path.dentry->d_inode->i_mode)) + goto out_path_put; + + if (nd.path.mnt->mnt_flags & MNT_NOEXEC) + goto out_path_put; + + err = vfs_permission(&nd, MAY_EXEC | MAY_OPEN); + if (err) + goto out_path_put; + + file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE); + if (IS_ERR(file)) + return file; + + err = deny_write_access(file); + if (err) { + fput(file); + goto out; } - goto out; -} + return file; + + out_path_put: + release_open_intent(&nd); + path_put(&nd.path); + out: + return ERR_PTR(err); +} EXPORT_SYMBOL(open_exec); int kernel_read(struct file *file, unsigned long offset, @@ -724,12 +738,10 @@ static int exec_mmap(struct mm_struct *mm) * Make sure that if there is a core dump in progress * for the old mm, we get out and die instead of going * through with the exec. We must hold mmap_sem around - * checking core_waiters and changing tsk->mm. The - * core-inducing thread will increment core_waiters for - * each thread whose ->mm == old_mm. + * checking core_state and changing tsk->mm. */ down_read(&old_mm->mmap_sem); - if (unlikely(old_mm->core_waiters)) { + if (unlikely(old_mm->core_state)) { up_read(&old_mm->mmap_sem); return -EINTR; } @@ -740,11 +752,11 @@ static int exec_mmap(struct mm_struct *mm) tsk->active_mm = mm; activate_mm(active_mm, mm); task_unlock(tsk); - mm_update_next_owner(old_mm); arch_pick_mmap_layout(mm); if (old_mm) { up_read(&old_mm->mmap_sem); BUG_ON(active_mm != old_mm); + mm_update_next_owner(old_mm); mmput(old_mm); return 0; } @@ -1075,13 +1087,8 @@ EXPORT_SYMBOL(prepare_binprm); static int unsafe_exec(struct task_struct *p) { - int unsafe = 0; - if (p->ptrace & PT_PTRACED) { - if (p->ptrace & PT_PTRACE_CAP) - unsafe |= LSM_UNSAFE_PTRACE_CAP; - else - unsafe |= LSM_UNSAFE_PTRACE; - } + int unsafe = tracehook_unsafe_exec(p); + if (atomic_read(&p->fs->count) > 1 || atomic_read(&p->files->count) > 1 || atomic_read(&p->sighand->count) > 1) @@ -1218,6 +1225,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) read_unlock(&binfmt_lock); retval = fn(bprm, regs); if (retval >= 0) { + tracehook_report_exec(fmt, bprm, regs); put_binfmt(fmt); allow_write_access(bprm->file); if (bprm->file) @@ -1328,6 +1336,7 @@ int do_execve(char * filename, if (retval < 0) goto out; + current->flags &= ~PF_KTHREAD; retval = search_binary_handler(bprm,regs); if (retval >= 0) { /* execve success */ @@ -1382,17 +1391,14 @@ EXPORT_SYMBOL(set_binfmt); * name into corename, which must have space for at least * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. */ -static int format_corename(char *corename, const char *pattern, long signr) +static int format_corename(char *corename, int nr_threads, long signr) { - const char *pat_ptr = pattern; + const char *pat_ptr = core_pattern; + int ispipe = (*pat_ptr == '|'); char *out_ptr = corename; char *const out_end = corename + CORENAME_MAX_SIZE; int rc; int pid_in_pattern = 0; - int ispipe = 0; - - if (*pattern == '|') - ispipe = 1; /* Repeat as long as we have more pattern to process and more output space */ @@ -1493,7 +1499,7 @@ static int format_corename(char *corename, const char *pattern, long signr) * and core_uses_pid is set, then .%pid will be appended to * the filename. Do not do this for piped commands. */ if (!ispipe && !pid_in_pattern - && (core_uses_pid || atomic_read(¤t->mm->mm_users) != 1)) { + && (core_uses_pid || nr_threads)) { rc = snprintf(out_ptr, out_end - out_ptr, ".%d", task_tgid_vnr(current)); if (rc > out_end - out_ptr) @@ -1505,9 +1511,10 @@ out: return ispipe; } -static void zap_process(struct task_struct *start) +static int zap_process(struct task_struct *start) { struct task_struct *t; + int nr = 0; start->signal->flags = SIGNAL_GROUP_EXIT; start->signal->group_stop_count = 0; @@ -1515,72 +1522,99 @@ static void zap_process(struct task_struct *start) t = start; do { if (t != current && t->mm) { - t->mm->core_waiters++; sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); + nr++; } - } while ((t = next_thread(t)) != start); + } while_each_thread(start, t); + + return nr; } static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, - int exit_code) + struct core_state *core_state, int exit_code) { struct task_struct *g, *p; unsigned long flags; - int err = -EAGAIN; + int nr = -EAGAIN; spin_lock_irq(&tsk->sighand->siglock); if (!signal_group_exit(tsk->signal)) { + mm->core_state = core_state; tsk->signal->group_exit_code = exit_code; - zap_process(tsk); - err = 0; + nr = zap_process(tsk); } spin_unlock_irq(&tsk->sighand->siglock); - if (err) - return err; + if (unlikely(nr < 0)) + return nr; - if (atomic_read(&mm->mm_users) == mm->core_waiters + 1) + if (atomic_read(&mm->mm_users) == nr + 1) goto done; - + /* + * We should find and kill all tasks which use this mm, and we should + * count them correctly into ->nr_threads. We don't take tasklist + * lock, but this is safe wrt: + * + * fork: + * None of sub-threads can fork after zap_process(leader). All + * processes which were created before this point should be + * visible to zap_threads() because copy_process() adds the new + * process to the tail of init_task.tasks list, and lock/unlock + * of ->siglock provides a memory barrier. + * + * do_exit: + * The caller holds mm->mmap_sem. This means that the task which + * uses this mm can't pass exit_mm(), so it can't exit or clear + * its ->mm. + * + * de_thread: + * It does list_replace_rcu(&leader->tasks, ¤t->tasks), + * we must see either old or new leader, this does not matter. + * However, it can change p->sighand, so lock_task_sighand(p) + * must be used. Since p->mm != NULL and we hold ->mmap_sem + * it can't fail. + * + * Note also that "g" can be the old leader with ->mm == NULL + * and already unhashed and thus removed from ->thread_group. + * This is OK, __unhash_process()->list_del_rcu() does not + * clear the ->next pointer, we will find the new leader via + * next_thread(). + */ rcu_read_lock(); for_each_process(g) { if (g == tsk->group_leader) continue; - + if (g->flags & PF_KTHREAD) + continue; p = g; do { if (p->mm) { - if (p->mm == mm) { - /* - * p->sighand can't disappear, but - * may be changed by de_thread() - */ + if (unlikely(p->mm == mm)) { lock_task_sighand(p, &flags); - zap_process(p); + nr += zap_process(p); unlock_task_sighand(p, &flags); } break; } - } while ((p = next_thread(p)) != g); + } while_each_thread(g, p); } rcu_read_unlock(); done: - return mm->core_waiters; + atomic_set(&core_state->nr_threads, nr); + return nr; } -static int coredump_wait(int exit_code) +static int coredump_wait(int exit_code, struct core_state *core_state) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; - struct completion startup_done; struct completion *vfork_done; int core_waiters; - init_completion(&mm->core_done); - init_completion(&startup_done); - mm->core_startup_done = &startup_done; - - core_waiters = zap_threads(tsk, mm, exit_code); + init_completion(&core_state->startup); + core_state->dumper.task = tsk; + core_state->dumper.next = NULL; + core_waiters = zap_threads(tsk, mm, core_state, exit_code); up_write(&mm->mmap_sem); if (unlikely(core_waiters < 0)) @@ -1597,12 +1631,32 @@ static int coredump_wait(int exit_code) } if (core_waiters) - wait_for_completion(&startup_done); + wait_for_completion(&core_state->startup); fail: - BUG_ON(mm->core_waiters); return core_waiters; } +static void coredump_finish(struct mm_struct *mm) +{ + struct core_thread *curr, *next; + struct task_struct *task; + + next = mm->core_state->dumper.next; + while ((curr = next) != NULL) { + next = curr->next; + task = curr->task; + /* + * see exit_mm(), curr->task must not see + * ->task == NULL before we read ->next. + */ + smp_mb(); + curr->task = NULL; + wake_up_process(task); + } + + mm->core_state = NULL; +} + /* * set_dumpable converts traditional three-value dumpable to two flags and * stores them into mm->flags. It modifies lower two bits of mm->flags, but @@ -1654,6 +1708,7 @@ int get_dumpable(struct mm_struct *mm) int do_coredump(long signr, int exit_code, struct pt_regs * regs) { + struct core_state core_state; char corename[CORENAME_MAX_SIZE + 1]; struct mm_struct *mm = current->mm; struct linux_binfmt * binfmt; @@ -1677,7 +1732,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) /* * If another thread got here first, or we are not dumpable, bail out. */ - if (mm->core_waiters || !get_dumpable(mm)) { + if (mm->core_state || !get_dumpable(mm)) { up_write(&mm->mmap_sem); goto fail; } @@ -1692,7 +1747,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) current->fsuid = 0; /* Dump root private */ } - retval = coredump_wait(exit_code); + retval = coredump_wait(exit_code, &core_state); if (retval < 0) goto fail; @@ -1707,7 +1762,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) * uses lock_kernel() */ lock_kernel(); - ispipe = format_corename(corename, core_pattern, signr); + ispipe = format_corename(corename, retval, signr); unlock_kernel(); /* * Don't bother to check the RLIMIT_CORE value if core_pattern points @@ -1786,7 +1841,7 @@ fail_unlock: argv_free(helper_argv); current->fsuid = fsuid; - complete_all(&mm->core_done); + coredump_finish(mm); fail: return retval; } diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index e58669e1b87c..ae8c4f850b27 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -294,7 +294,7 @@ ext2_check_acl(struct inode *inode, int mask) } int -ext2_permission(struct inode *inode, int mask, struct nameidata *nd) +ext2_permission(struct inode *inode, int mask) { return generic_permission(inode, mask, ext2_check_acl); } diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 0bde85bafe38..b42cf578554b 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -58,7 +58,7 @@ static inline int ext2_acl_count(size_t size) #define EXT2_ACL_NOT_CACHED ((void *)-1) /* acl.c */ -extern int ext2_permission (struct inode *, int, struct nameidata *); +extern int ext2_permission (struct inode *, int); extern int ext2_acl_chmod (struct inode *); extern int ext2_init_acl (struct inode *, struct inode *); diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 47d88da2d33b..bae998c1e44e 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -133,6 +133,8 @@ extern void ext2_truncate (struct inode *); extern int ext2_setattr (struct dentry *, struct iattr *); extern void ext2_set_inode_flags(struct inode *inode); extern void ext2_get_inode_flags(struct ext2_inode_info *); +extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); int __ext2_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 5f2fa9c36293..45ed07122182 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -86,4 +86,5 @@ const struct inode_operations ext2_file_inode_operations = { #endif .setattr = ext2_setattr, .permission = ext2_permission, + .fiemap = ext2_fiemap, }; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 384fc0d1dd74..7658b33e2653 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -31,6 +31,7 @@ #include <linux/writeback.h> #include <linux/buffer_head.h> #include <linux/mpage.h> +#include <linux/fiemap.h> #include "ext2.h" #include "acl.h" #include "xip.h" @@ -704,6 +705,13 @@ int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_ } +int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + return generic_block_fiemap(inode, fieinfo, start, len, + ext2_get_block); +} + static int ext2_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, ext2_get_block, wbc); @@ -791,6 +799,7 @@ const struct address_space_operations ext2_aops = { .direct_IO = ext2_direct_IO, .writepages = ext2_writepages, .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, }; const struct address_space_operations ext2_aops_xip = { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index ef50cbc792db..647cd888ac87 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -31,6 +31,7 @@ #include <linux/seq_file.h> #include <linux/mount.h> #include <linux/log2.h> +#include <linux/quotaops.h> #include <asm/uaccess.h> #include "ext2.h" #include "xattr.h" @@ -158,7 +159,7 @@ static void ext2_destroy_inode(struct inode *inode) kmem_cache_free(ext2_inode_cachep, EXT2_I(inode)); } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct ext2_inode_info *ei = (struct ext2_inode_info *) foo; @@ -392,7 +393,7 @@ enum { Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_bsd_df, "bsddf"}, {Opt_minix_df, "minixdf"}, {Opt_grpid, "grpid"}, diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index eaa23d2d5213..70c0dbdcdcb7 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -14,7 +14,7 @@ static size_t ext2_xattr_security_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const int prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; + const int prefix_len = XATTR_SECURITY_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (list && total_len <= list_size) { diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 83ee149f353d..e8219f8eae9f 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -12,13 +12,11 @@ #include <linux/ext2_fs.h> #include "xattr.h" -#define XATTR_TRUSTED_PREFIX "trusted." - static size_t ext2_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const int prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1; + const int prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (!capable(CAP_SYS_ADMIN)) diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index f383e7c3a7b5..92495d28c62f 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -11,13 +11,11 @@ #include "ext2.h" #include "xattr.h" -#define XATTR_USER_PREFIX "user." - static size_t ext2_xattr_user_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1; + const size_t prefix_len = XATTR_USER_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (!test_opt(inode->i_sb, XATTR_USER)) diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index a754d1848173..b60bb241880c 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -299,7 +299,7 @@ ext3_check_acl(struct inode *inode, int mask) } int -ext3_permission(struct inode *inode, int mask, struct nameidata *nd) +ext3_permission(struct inode *inode, int mask) { return generic_permission(inode, mask, ext3_check_acl); } diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index 0d1e6279cbfd..42da16b8cac0 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -58,7 +58,7 @@ static inline int ext3_acl_count(size_t size) #define EXT3_ACL_NOT_CACHED ((void *)-1) /* acl.c */ -extern int ext3_permission (struct inode *, int, struct nameidata *); +extern int ext3_permission (struct inode *, int); extern int ext3_acl_chmod (struct inode *); extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 8ca3bfd72427..2eea96ec78ed 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -272,7 +272,7 @@ static void free_rb_tree_fname(struct rb_root *root) while (n) { /* Do the node's children first */ - if ((n)->rb_left) { + if (n->rb_left) { n = n->rb_left; continue; } @@ -301,24 +301,18 @@ static void free_rb_tree_fname(struct rb_root *root) parent->rb_right = NULL; n = parent; } - root->rb_node = NULL; } -static struct dir_private_info *create_dir_info(loff_t pos) +static struct dir_private_info *ext3_htree_create_dir_info(loff_t pos) { struct dir_private_info *p; - p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); + p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); if (!p) return NULL; - p->root.rb_node = NULL; - p->curr_node = NULL; - p->extra_fname = NULL; - p->last_pos = 0; p->curr_hash = pos2maj_hash(pos); p->curr_minor_hash = pos2min_hash(pos); - p->next_hash = 0; return p; } @@ -433,7 +427,7 @@ static int ext3_dx_readdir(struct file * filp, int ret; if (!info) { - info = create_dir_info(filp->f_pos); + info = ext3_htree_create_dir_info(filp->f_pos); if (!info) return -ENOMEM; filp->private_data = info; diff --git a/fs/ext3/file.c b/fs/ext3/file.c index acc4913d3019..3be1e0689c9a 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -134,5 +134,6 @@ const struct inode_operations ext3_file_inode_operations = { .removexattr = generic_removexattr, #endif .permission = ext3_permission, + .fiemap = ext3_fiemap, }; diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 77126821b2e9..47b678d73e7a 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -669,6 +669,14 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino) if (IS_ERR(inode)) goto iget_failed; + /* + * If the orphans has i_nlinks > 0 then it should be able to be + * truncated, otherwise it won't be removed from the orphan list + * during processing and an infinite loop will result. + */ + if (inode->i_nlink && !ext3_can_truncate(inode)) + goto bad_orphan; + if (NEXT_ORPHAN(inode) > max_ino) goto bad_orphan; brelse(bitmap_bh); @@ -690,6 +698,7 @@ bad_orphan: printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", NEXT_ORPHAN(inode)); printk(KERN_NOTICE "max_ino=%lu\n", max_ino); + printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); /* Avoid freeing blocks if we got a bad deleted inode */ if (inode->i_nlink == 0) inode->i_blocks = 0; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 6ae4ecf3ce40..ebfec4d0148e 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -36,6 +36,7 @@ #include <linux/mpage.h> #include <linux/uio.h> #include <linux/bio.h> +#include <linux/fiemap.h> #include "xattr.h" #include "acl.h" @@ -981,6 +982,13 @@ out: return ret; } +int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + return generic_block_fiemap(inode, fieinfo, start, len, + ext3_get_block); +} + /* * `handle' can be NULL if create is zero */ @@ -1767,44 +1775,47 @@ static int ext3_journalled_set_page_dirty(struct page *page) } static const struct address_space_operations ext3_ordered_aops = { - .readpage = ext3_readpage, - .readpages = ext3_readpages, - .writepage = ext3_ordered_writepage, - .sync_page = block_sync_page, - .write_begin = ext3_write_begin, - .write_end = ext3_ordered_write_end, - .bmap = ext3_bmap, - .invalidatepage = ext3_invalidatepage, - .releasepage = ext3_releasepage, - .direct_IO = ext3_direct_IO, - .migratepage = buffer_migrate_page, + .readpage = ext3_readpage, + .readpages = ext3_readpages, + .writepage = ext3_ordered_writepage, + .sync_page = block_sync_page, + .write_begin = ext3_write_begin, + .write_end = ext3_ordered_write_end, + .bmap = ext3_bmap, + .invalidatepage = ext3_invalidatepage, + .releasepage = ext3_releasepage, + .direct_IO = ext3_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, }; static const struct address_space_operations ext3_writeback_aops = { - .readpage = ext3_readpage, - .readpages = ext3_readpages, - .writepage = ext3_writeback_writepage, - .sync_page = block_sync_page, - .write_begin = ext3_write_begin, - .write_end = ext3_writeback_write_end, - .bmap = ext3_bmap, - .invalidatepage = ext3_invalidatepage, - .releasepage = ext3_releasepage, - .direct_IO = ext3_direct_IO, - .migratepage = buffer_migrate_page, + .readpage = ext3_readpage, + .readpages = ext3_readpages, + .writepage = ext3_writeback_writepage, + .sync_page = block_sync_page, + .write_begin = ext3_write_begin, + .write_end = ext3_writeback_write_end, + .bmap = ext3_bmap, + .invalidatepage = ext3_invalidatepage, + .releasepage = ext3_releasepage, + .direct_IO = ext3_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, }; static const struct address_space_operations ext3_journalled_aops = { - .readpage = ext3_readpage, - .readpages = ext3_readpages, - .writepage = ext3_journalled_writepage, - .sync_page = block_sync_page, - .write_begin = ext3_write_begin, - .write_end = ext3_journalled_write_end, - .set_page_dirty = ext3_journalled_set_page_dirty, - .bmap = ext3_bmap, - .invalidatepage = ext3_invalidatepage, - .releasepage = ext3_releasepage, + .readpage = ext3_readpage, + .readpages = ext3_readpages, + .writepage = ext3_journalled_writepage, + .sync_page = block_sync_page, + .write_begin = ext3_write_begin, + .write_end = ext3_journalled_write_end, + .set_page_dirty = ext3_journalled_set_page_dirty, + .bmap = ext3_bmap, + .invalidatepage = ext3_invalidatepage, + .releasepage = ext3_releasepage, + .is_partially_uptodate = block_is_partially_uptodate, }; void ext3_set_aops(struct inode *inode) @@ -2127,7 +2138,21 @@ static void ext3_free_data(handle_t *handle, struct inode *inode, if (this_bh) { BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, this_bh); + + /* + * The buffer head should have an attached journal head at this + * point. However, if the data is corrupted and an indirect + * block pointed to itself, it would have been detached when + * the block was cleared. Check for this instead of OOPSing. + */ + if (bh2jh(this_bh)) + ext3_journal_dirty_metadata(handle, this_bh); + else + ext3_error(inode->i_sb, "ext3_free_data", + "circular indirect block detected, " + "inode=%lu, block=%llu", + inode->i_ino, + (unsigned long long)this_bh->b_blocknr); } } @@ -2253,6 +2278,19 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, } } +int ext3_can_truncate(struct inode *inode) +{ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return 0; + if (S_ISREG(inode->i_mode)) + return 1; + if (S_ISDIR(inode->i_mode)) + return 1; + if (S_ISLNK(inode->i_mode)) + return !ext3_inode_is_fast_symlink(inode); + return 0; +} + /* * ext3_truncate() * @@ -2297,12 +2335,7 @@ void ext3_truncate(struct inode *inode) unsigned blocksize = inode->i_sb->s_blocksize; struct page *page; - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return; - if (ext3_inode_is_fast_symlink(inode)) - return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + if (!ext3_can_truncate(inode)) return; /* @@ -2513,6 +2546,16 @@ static int __ext3_get_inode_loc(struct inode *inode, } if (!buffer_uptodate(bh)) { lock_buffer(bh); + + /* + * If the buffer has the write error flag, we have failed + * to write out another inode in the same block. In this + * case, we don't have to read the block because we may + * read the old inode data successfully. + */ + if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) + set_buffer_uptodate(bh); + if (buffer_uptodate(bh)) { /* someone brought it uptodate while we waited */ unlock_buffer(bh); diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 0b8cf80154f1..de13e919cd81 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -240,13 +240,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) { unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - EXT3_DIR_REC_LEN(2) - infosize; - return 0? 20: entry_space / sizeof(struct dx_entry); + return entry_space / sizeof(struct dx_entry); } static inline unsigned dx_node_limit (struct inode *dir) { unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); - return 0? 22: entry_space / sizeof(struct dx_entry); + return entry_space / sizeof(struct dx_entry); } /* @@ -991,19 +991,21 @@ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, de = (struct ext3_dir_entry_2 *) bh->b_data; top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize - EXT3_DIR_REC_LEN(0)); - for (; de < top; de = ext3_next_entry(de)) - if (ext3_match (namelen, name, de)) { - if (!ext3_check_dir_entry("ext3_find_entry", - dir, de, bh, - (block<<EXT3_BLOCK_SIZE_BITS(sb)) - +((char *)de - bh->b_data))) { - brelse (bh); + for (; de < top; de = ext3_next_entry(de)) { + int off = (block << EXT3_BLOCK_SIZE_BITS(sb)) + + ((char *) de - bh->b_data); + + if (!ext3_check_dir_entry(__func__, dir, de, bh, off)) { + brelse(bh); *err = ERR_BAD_DX_DIR; goto errout; } - *res_dir = de; - dx_release (frames); - return bh; + + if (ext3_match(namelen, name, de)) { + *res_dir = de; + dx_release(frames); + return bh; + } } brelse (bh); /* Check to see if we should continue to search */ diff --git a/fs/ext3/super.c b/fs/ext3/super.c index fe3119a71ada..399a96a6c556 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -472,7 +472,7 @@ static void ext3_destroy_inode(struct inode *inode) kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct ext3_inode_info *ei = (struct ext3_inode_info *) foo; @@ -760,7 +760,7 @@ enum { Opt_grpquota }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_bsd_df, "bsddf"}, {Opt_minix_df, "minixdf"}, {Opt_grpid, "grpid"}, @@ -842,7 +842,7 @@ static int parse_options (char *options, struct super_block *sb, int data_opt = 0; int option; #ifdef CONFIG_QUOTA - int qtype; + int qtype, qfmt; char *qname; #endif @@ -1018,9 +1018,11 @@ static int parse_options (char *options, struct super_block *sb, case Opt_grpjquota: qtype = GRPQUOTA; set_qf_name: - if (sb_any_quota_enabled(sb)) { + if ((sb_any_quota_enabled(sb) || + sb_any_quota_suspended(sb)) && + !sbi->s_qf_names[qtype]) { printk(KERN_ERR - "EXT3-fs: Cannot change journalled " + "EXT3-fs: Cannot change journaled " "quota options when quota turned on.\n"); return 0; } @@ -1056,9 +1058,11 @@ set_qf_name: case Opt_offgrpjquota: qtype = GRPQUOTA; clear_qf_name: - if (sb_any_quota_enabled(sb)) { + if ((sb_any_quota_enabled(sb) || + sb_any_quota_suspended(sb)) && + sbi->s_qf_names[qtype]) { printk(KERN_ERR "EXT3-fs: Cannot change " - "journalled quota options when " + "journaled quota options when " "quota turned on.\n"); return 0; } @@ -1069,10 +1073,20 @@ clear_qf_name: sbi->s_qf_names[qtype] = NULL; break; case Opt_jqfmt_vfsold: - sbi->s_jquota_fmt = QFMT_VFS_OLD; - break; + qfmt = QFMT_VFS_OLD; + goto set_qf_format; case Opt_jqfmt_vfsv0: - sbi->s_jquota_fmt = QFMT_VFS_V0; + qfmt = QFMT_VFS_V0; +set_qf_format: + if ((sb_any_quota_enabled(sb) || + sb_any_quota_suspended(sb)) && + sbi->s_jquota_fmt != qfmt) { + printk(KERN_ERR "EXT3-fs: Cannot change " + "journaled quota options when " + "quota turned on.\n"); + return 0; + } + sbi->s_jquota_fmt = qfmt; break; case Opt_quota: case Opt_usrquota: @@ -1084,7 +1098,8 @@ clear_qf_name: set_opt(sbi->s_mount_opt, GRPQUOTA); break; case Opt_noquota: - if (sb_any_quota_enabled(sb)) { + if (sb_any_quota_enabled(sb) || + sb_any_quota_suspended(sb)) { printk(KERN_ERR "EXT3-fs: Cannot change quota " "options when quota turned on.\n"); return 0; @@ -1169,14 +1184,14 @@ clear_qf_name: } if (!sbi->s_jquota_fmt) { - printk(KERN_ERR "EXT3-fs: journalled quota format " + printk(KERN_ERR "EXT3-fs: journaled quota format " "not specified.\n"); return 0; } } else { if (sbi->s_jquota_fmt) { - printk(KERN_ERR "EXT3-fs: journalled quota format " - "specified with no journalling " + printk(KERN_ERR "EXT3-fs: journaled quota format " + "specified with no journaling " "enabled.\n"); return 0; } @@ -1370,7 +1385,7 @@ static void ext3_orphan_cleanup (struct super_block * sb, int ret = ext3_quota_on_mount(sb, i); if (ret < 0) printk(KERN_ERR - "EXT3-fs: Cannot turn on journalled " + "EXT3-fs: Cannot turn on journaled " "quota: error %d\n", ret); } } @@ -2712,7 +2727,7 @@ static int ext3_release_dquot(struct dquot *dquot) static int ext3_mark_dquot_dirty(struct dquot *dquot) { - /* Are we journalling quotas? */ + /* Are we journaling quotas? */ if (EXT3_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || EXT3_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { dquot_mark_dquot_dirty(dquot); @@ -2759,25 +2774,45 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id, if (!test_opt(sb, QUOTA)) return -EINVAL; - /* Not journalling quota or remount? */ - if ((!EXT3_SB(sb)->s_qf_names[USRQUOTA] && - !EXT3_SB(sb)->s_qf_names[GRPQUOTA]) || remount) + /* When remounting, no checks are needed and in fact, path is NULL */ + if (remount) return vfs_quota_on(sb, type, format_id, path, remount); + err = path_lookup(path, LOOKUP_FOLLOW, &nd); if (err) return err; + /* Quotafile not on the same filesystem? */ if (nd.path.mnt->mnt_sb != sb) { path_put(&nd.path); return -EXDEV; } - /* Quotafile not in fs root? */ - if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) - printk(KERN_WARNING - "EXT3-fs: Quota file not on filesystem root. " - "Journalled quota will not work.\n"); + /* Journaling quota? */ + if (EXT3_SB(sb)->s_qf_names[type]) { + /* Quotafile not of fs root? */ + if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) + printk(KERN_WARNING + "EXT3-fs: Quota file not on filesystem root. " + "Journaled quota will not work.\n"); + } + + /* + * When we journal data on quota file, we have to flush journal to see + * all updates to the file when we bypass pagecache... + */ + if (ext3_should_journal_data(nd.path.dentry->d_inode)) { + /* + * We don't need to lock updates but journal_flush() could + * otherwise be livelocked... + */ + journal_lock_updates(EXT3_SB(sb)->s_journal); + journal_flush(EXT3_SB(sb)->s_journal); + journal_unlock_updates(EXT3_SB(sb)->s_journal); + } + + err = vfs_quota_on_path(sb, type, format_id, &nd.path); path_put(&nd.path); - return vfs_quota_on(sb, type, format_id, path, remount); + return err; } /* Read data from quotafile - avoid pagecache and such because we cannot afford @@ -2875,8 +2910,10 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, blk++; } out: - if (len == towrite) + if (len == towrite) { + mutex_unlock(&inode->i_mutex); return err; + } if (inode->i_size < off+len-towrite) { i_size_write(inode, off+len-towrite); EXT3_I(inode)->i_disksize = inode->i_size; diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index 821efaf2b94e..37b81097bdf2 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -15,7 +15,7 @@ static size_t ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; + const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c index 0327497a55ce..c7c41a410c4b 100644 --- a/fs/ext3/xattr_trusted.c +++ b/fs/ext3/xattr_trusted.c @@ -13,13 +13,11 @@ #include <linux/ext3_fs.h> #include "xattr.h" -#define XATTR_TRUSTED_PREFIX "trusted." - static size_t ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1; + const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (!capable(CAP_SYS_ADMIN)) diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c index 1abd8f92c440..430fe63b31b3 100644 --- a/fs/ext3/xattr_user.c +++ b/fs/ext3/xattr_user.c @@ -12,13 +12,11 @@ #include <linux/ext3_fs.h> #include "xattr.h" -#define XATTR_USER_PREFIX "user." - static size_t ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1; + const size_t prefix_len = XATTR_USER_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (!test_opt(inode->i_sb, XATTR_USER)) diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index ac6fa8ca0a2f..a8ff003a00f7 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -2,12 +2,12 @@ # Makefile for the linux ext4-filesystem routines. # -obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o +obj-$(CONFIG_EXT4_FS) += ext4.o -ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ +ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ext4_jbd2.o migrate.o mballoc.o -ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o -ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o -ext4dev-$(CONFIG_EXT4DEV_FS_SECURITY) += xattr_security.o +ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o +ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o +ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 3c8dab880d91..694ed6fadcc8 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -40,34 +40,35 @@ ext4_acl_from_disk(const void *value, size_t size) acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) return ERR_PTR(-ENOMEM); - for (n=0; n < count; n++) { + for (n = 0; n < count; n++) { ext4_acl_entry *entry = (ext4_acl_entry *)value; if ((char *)value + sizeof(ext4_acl_entry_short) > end) goto fail; acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); - switch(acl->a_entries[n].e_tag) { - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - value = (char *)value + - sizeof(ext4_acl_entry_short); - acl->a_entries[n].e_id = ACL_UNDEFINED_ID; - break; - - case ACL_USER: - case ACL_GROUP: - value = (char *)value + sizeof(ext4_acl_entry); - if ((char *)value > end) - goto fail; - acl->a_entries[n].e_id = - le32_to_cpu(entry->e_id); - break; - - default: + + switch (acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(ext4_acl_entry_short); + acl->a_entries[n].e_id = ACL_UNDEFINED_ID; + break; + + case ACL_USER: + case ACL_GROUP: + value = (char *)value + sizeof(ext4_acl_entry); + if ((char *)value > end) goto fail; + acl->a_entries[n].e_id = + le32_to_cpu(entry->e_id); + break; + + default: + goto fail; } } if (value != end) @@ -96,27 +97,26 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size) return ERR_PTR(-ENOMEM); ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); e = (char *)ext_acl + sizeof(ext4_acl_header); - for (n=0; n < acl->a_count; n++) { + for (n = 0; n < acl->a_count; n++) { ext4_acl_entry *entry = (ext4_acl_entry *)e; entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); - switch(acl->a_entries[n].e_tag) { - case ACL_USER: - case ACL_GROUP: - entry->e_id = - cpu_to_le32(acl->a_entries[n].e_id); - e += sizeof(ext4_acl_entry); - break; - - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - e += sizeof(ext4_acl_entry_short); - break; - - default: - goto fail; + switch (acl->a_entries[n].e_tag) { + case ACL_USER: + case ACL_GROUP: + entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); + e += sizeof(ext4_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(ext4_acl_entry_short); + break; + + default: + goto fail; } } return (char *)ext_acl; @@ -167,23 +167,23 @@ ext4_get_acl(struct inode *inode, int type) if (!test_opt(inode->i_sb, POSIX_ACL)) return NULL; - switch(type) { - case ACL_TYPE_ACCESS: - acl = ext4_iget_acl(inode, &ei->i_acl); - if (acl != EXT4_ACL_NOT_CACHED) - return acl; - name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; - break; - - case ACL_TYPE_DEFAULT: - acl = ext4_iget_acl(inode, &ei->i_default_acl); - if (acl != EXT4_ACL_NOT_CACHED) - return acl; - name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; - break; - - default: - return ERR_PTR(-EINVAL); + switch (type) { + case ACL_TYPE_ACCESS: + acl = ext4_iget_acl(inode, &ei->i_acl); + if (acl != EXT4_ACL_NOT_CACHED) + return acl; + name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + + case ACL_TYPE_DEFAULT: + acl = ext4_iget_acl(inode, &ei->i_default_acl); + if (acl != EXT4_ACL_NOT_CACHED) + return acl; + name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + + default: + return ERR_PTR(-EINVAL); } retval = ext4_xattr_get(inode, name_index, "", NULL, 0); if (retval > 0) { @@ -201,14 +201,14 @@ ext4_get_acl(struct inode *inode, int type) kfree(value); if (!IS_ERR(acl)) { - switch(type) { - case ACL_TYPE_ACCESS: - ext4_iset_acl(inode, &ei->i_acl, acl); - break; - - case ACL_TYPE_DEFAULT: - ext4_iset_acl(inode, &ei->i_default_acl, acl); - break; + switch (type) { + case ACL_TYPE_ACCESS: + ext4_iset_acl(inode, &ei->i_acl, acl); + break; + + case ACL_TYPE_DEFAULT: + ext4_iset_acl(inode, &ei->i_default_acl, acl); + break; } } return acl; @@ -232,31 +232,31 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - switch(type) { - case ACL_TYPE_ACCESS: - name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - mode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); - if (error < 0) - return error; - else { - inode->i_mode = mode; - ext4_mark_inode_dirty(handle, inode); - if (error == 0) - acl = NULL; - } + switch (type) { + case ACL_TYPE_ACCESS: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl) { + mode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + return error; + else { + inode->i_mode = mode; + ext4_mark_inode_dirty(handle, inode); + if (error == 0) + acl = NULL; } - break; + } + break; - case ACL_TYPE_DEFAULT: - name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; - if (!S_ISDIR(inode->i_mode)) - return acl ? -EACCES : 0; - break; + case ACL_TYPE_DEFAULT: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; - default: - return -EINVAL; + default: + return -EINVAL; } if (acl) { value = ext4_acl_to_disk(acl, &size); @@ -269,14 +269,14 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, kfree(value); if (!error) { - switch(type) { - case ACL_TYPE_ACCESS: - ext4_iset_acl(inode, &ei->i_acl, acl); - break; - - case ACL_TYPE_DEFAULT: - ext4_iset_acl(inode, &ei->i_default_acl, acl); - break; + switch (type) { + case ACL_TYPE_ACCESS: + ext4_iset_acl(inode, &ei->i_acl, acl); + break; + + case ACL_TYPE_DEFAULT: + ext4_iset_acl(inode, &ei->i_default_acl, acl); + break; } } return error; @@ -299,7 +299,7 @@ ext4_check_acl(struct inode *inode, int mask) } int -ext4_permission(struct inode *inode, int mask, struct nameidata *nd) +ext4_permission(struct inode *inode, int mask) { return generic_permission(inode, mask, ext4_check_acl); } diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 26a5c1abf147..cb45257a246e 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -51,18 +51,18 @@ static inline int ext4_acl_count(size_t size) } } -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL +#ifdef CONFIG_EXT4_FS_POSIX_ACL /* Value for inode->u.ext4_i.i_acl and inode->u.ext4_i.i_default_acl if the ACL has not been cached */ #define EXT4_ACL_NOT_CACHED ((void *)-1) /* acl.c */ -extern int ext4_permission (struct inode *, int, struct nameidata *); -extern int ext4_acl_chmod (struct inode *); -extern int ext4_init_acl (handle_t *, struct inode *, struct inode *); +extern int ext4_permission(struct inode *, int); +extern int ext4_acl_chmod(struct inode *); +extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); -#else /* CONFIG_EXT4DEV_FS_POSIX_ACL */ +#else /* CONFIG_EXT4_FS_POSIX_ACL */ #include <linux/sched.h> #define ext4_permission NULL @@ -77,5 +77,5 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) { return 0; } -#endif /* CONFIG_EXT4DEV_FS_POSIX_ACL */ +#endif /* CONFIG_EXT4_FS_POSIX_ACL */ diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 9cc80b9cc8d8..bd2ece228827 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -47,7 +47,7 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, ext4_group_t block_group) { ext4_group_t actual_group; - ext4_get_group_no_and_offset(sb, block, &actual_group, 0); + ext4_get_group_no_and_offset(sb, block, &actual_group, NULL); if (actual_group == block_group) return 1; return 0; @@ -83,6 +83,7 @@ static int ext4_group_used_meta_blocks(struct super_block *sb, } return used_blocks; } + /* Initializes an uninitialized block bitmap if given, and returns the * number of blocks free in the group. */ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, @@ -121,12 +122,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); } } else { /* For META_BG_BLOCK_GROUPS */ - int group_rel = (block_group - - le32_to_cpu(sbi->s_es->s_first_meta_bg)) % - EXT4_DESC_PER_BLOCK(sb); - if (group_rel == 0 || group_rel == 1 || - (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1)) - bit_max += 1; + bit_max += ext4_bg_num_gdb(sb, block_group); } if (block_group == sbi->s_groups_count - 1) { @@ -137,7 +133,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, */ group_blocks = ext4_blocks_count(sbi->s_es) - le32_to_cpu(sbi->s_es->s_first_data_block) - - (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count -1)); + (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1)); } else { group_blocks = EXT4_BLOCKS_PER_GROUP(sb); } @@ -205,20 +201,20 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, * @bh: pointer to the buffer head to store the block * group descriptor */ -struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, +struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, ext4_group_t block_group, - struct buffer_head ** bh) + struct buffer_head **bh) { unsigned long group_desc; unsigned long offset; - struct ext4_group_desc * desc; + struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); if (block_group >= sbi->s_groups_count) { - ext4_error (sb, "ext4_get_group_desc", - "block_group >= groups_count - " - "block_group = %lu, groups_count = %lu", - block_group, sbi->s_groups_count); + ext4_error(sb, "ext4_get_group_desc", + "block_group >= groups_count - " + "block_group = %lu, groups_count = %lu", + block_group, sbi->s_groups_count); return NULL; } @@ -227,10 +223,10 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); if (!sbi->s_group_desc[group_desc]) { - ext4_error (sb, "ext4_get_group_desc", - "Group descriptor not loaded - " - "block_group = %lu, group_desc = %lu, desc = %lu", - block_group, group_desc, offset); + ext4_error(sb, "ext4_get_group_desc", + "Group descriptor not loaded - " + "block_group = %lu, group_desc = %lu, desc = %lu", + block_group, group_desc, offset); return NULL; } @@ -295,7 +291,7 @@ err_out: return 0; } /** - * read_block_bitmap() + * ext4_read_block_bitmap() * @sb: super block * @block_group: given block group * @@ -305,10 +301,10 @@ err_out: * Return buffer_head on success or NULL in case of failure. */ struct buffer_head * -read_block_bitmap(struct super_block *sb, ext4_group_t block_group) +ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) { - struct ext4_group_desc * desc; - struct buffer_head * bh = NULL; + struct ext4_group_desc *desc; + struct buffer_head *bh = NULL; ext4_fsblk_t bitmap_blk; desc = ext4_get_group_desc(sb, block_group, NULL); @@ -319,25 +315,30 @@ read_block_bitmap(struct super_block *sb, ext4_group_t block_group) if (unlikely(!bh)) { ext4_error(sb, __func__, "Cannot read block bitmap - " - "block_group = %d, block_bitmap = %llu", - (int)block_group, (unsigned long long)bitmap_blk); + "block_group = %lu, block_bitmap = %llu", + block_group, bitmap_blk); return NULL; } - if (bh_uptodate_or_lock(bh)) + if (buffer_uptodate(bh) && + !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) return bh; + lock_buffer(bh); + spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { ext4_init_block_bitmap(sb, bh, block_group, desc); set_buffer_uptodate(bh); unlock_buffer(bh); + spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); return bh; } + spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); if (bh_submit_read(bh) < 0) { put_bh(bh); ext4_error(sb, __func__, "Cannot read block bitmap - " - "block_group = %d, block_bitmap = %llu", - (int)block_group, (unsigned long long)bitmap_blk); + "block_group = %lu, block_bitmap = %llu", + block_group, bitmap_blk); return NULL; } ext4_valid_block_bitmap(sb, desc, block_group, bh); @@ -347,302 +348,6 @@ read_block_bitmap(struct super_block *sb, ext4_group_t block_group) */ return bh; } -/* - * The reservation window structure operations - * -------------------------------------------- - * Operations include: - * dump, find, add, remove, is_empty, find_next_reservable_window, etc. - * - * We use a red-black tree to represent per-filesystem reservation - * windows. - * - */ - -/** - * __rsv_window_dump() -- Dump the filesystem block allocation reservation map - * @rb_root: root of per-filesystem reservation rb tree - * @verbose: verbose mode - * @fn: function which wishes to dump the reservation map - * - * If verbose is turned on, it will print the whole block reservation - * windows(start, end). Otherwise, it will only print out the "bad" windows, - * those windows that overlap with their immediate neighbors. - */ -#if 1 -static void __rsv_window_dump(struct rb_root *root, int verbose, - const char *fn) -{ - struct rb_node *n; - struct ext4_reserve_window_node *rsv, *prev; - int bad; - -restart: - n = rb_first(root); - bad = 0; - prev = NULL; - - printk("Block Allocation Reservation Windows Map (%s):\n", fn); - while (n) { - rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node); - if (verbose) - printk("reservation window 0x%p " - "start: %llu, end: %llu\n", - rsv, rsv->rsv_start, rsv->rsv_end); - if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) { - printk("Bad reservation %p (start >= end)\n", - rsv); - bad = 1; - } - if (prev && prev->rsv_end >= rsv->rsv_start) { - printk("Bad reservation %p (prev->end >= start)\n", - rsv); - bad = 1; - } - if (bad) { - if (!verbose) { - printk("Restarting reservation walk in verbose mode\n"); - verbose = 1; - goto restart; - } - } - n = rb_next(n); - prev = rsv; - } - printk("Window map complete.\n"); - if (bad) - BUG(); -} -#define rsv_window_dump(root, verbose) \ - __rsv_window_dump((root), (verbose), __func__) -#else -#define rsv_window_dump(root, verbose) do {} while (0) -#endif - -/** - * goal_in_my_reservation() - * @rsv: inode's reservation window - * @grp_goal: given goal block relative to the allocation block group - * @group: the current allocation block group - * @sb: filesystem super block - * - * Test if the given goal block (group relative) is within the file's - * own block reservation window range. - * - * If the reservation window is outside the goal allocation group, return 0; - * grp_goal (given goal block) could be -1, which means no specific - * goal block. In this case, always return 1. - * If the goal block is within the reservation window, return 1; - * otherwise, return 0; - */ -static int -goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal, - ext4_group_t group, struct super_block *sb) -{ - ext4_fsblk_t group_first_block, group_last_block; - - group_first_block = ext4_group_first_block_no(sb, group); - group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1); - - if ((rsv->_rsv_start > group_last_block) || - (rsv->_rsv_end < group_first_block)) - return 0; - if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start) - || (grp_goal + group_first_block > rsv->_rsv_end))) - return 0; - return 1; -} - -/** - * search_reserve_window() - * @rb_root: root of reservation tree - * @goal: target allocation block - * - * Find the reserved window which includes the goal, or the previous one - * if the goal is not in any window. - * Returns NULL if there are no windows or if all windows start after the goal. - */ -static struct ext4_reserve_window_node * -search_reserve_window(struct rb_root *root, ext4_fsblk_t goal) -{ - struct rb_node *n = root->rb_node; - struct ext4_reserve_window_node *rsv; - - if (!n) - return NULL; - - do { - rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node); - - if (goal < rsv->rsv_start) - n = n->rb_left; - else if (goal > rsv->rsv_end) - n = n->rb_right; - else - return rsv; - } while (n); - /* - * We've fallen off the end of the tree: the goal wasn't inside - * any particular node. OK, the previous node must be to one - * side of the interval containing the goal. If it's the RHS, - * we need to back up one. - */ - if (rsv->rsv_start > goal) { - n = rb_prev(&rsv->rsv_node); - rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node); - } - return rsv; -} - -/** - * ext4_rsv_window_add() -- Insert a window to the block reservation rb tree. - * @sb: super block - * @rsv: reservation window to add - * - * Must be called with rsv_lock hold. - */ -void ext4_rsv_window_add(struct super_block *sb, - struct ext4_reserve_window_node *rsv) -{ - struct rb_root *root = &EXT4_SB(sb)->s_rsv_window_root; - struct rb_node *node = &rsv->rsv_node; - ext4_fsblk_t start = rsv->rsv_start; - - struct rb_node ** p = &root->rb_node; - struct rb_node * parent = NULL; - struct ext4_reserve_window_node *this; - - while (*p) - { - parent = *p; - this = rb_entry(parent, struct ext4_reserve_window_node, rsv_node); - - if (start < this->rsv_start) - p = &(*p)->rb_left; - else if (start > this->rsv_end) - p = &(*p)->rb_right; - else { - rsv_window_dump(root, 1); - BUG(); - } - } - - rb_link_node(node, parent, p); - rb_insert_color(node, root); -} - -/** - * ext4_rsv_window_remove() -- unlink a window from the reservation rb tree - * @sb: super block - * @rsv: reservation window to remove - * - * Mark the block reservation window as not allocated, and unlink it - * from the filesystem reservation window rb tree. Must be called with - * rsv_lock hold. - */ -static void rsv_window_remove(struct super_block *sb, - struct ext4_reserve_window_node *rsv) -{ - rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED; - rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED; - rsv->rsv_alloc_hit = 0; - rb_erase(&rsv->rsv_node, &EXT4_SB(sb)->s_rsv_window_root); -} - -/* - * rsv_is_empty() -- Check if the reservation window is allocated. - * @rsv: given reservation window to check - * - * returns 1 if the end block is EXT4_RESERVE_WINDOW_NOT_ALLOCATED. - */ -static inline int rsv_is_empty(struct ext4_reserve_window *rsv) -{ - /* a valid reservation end block could not be 0 */ - return rsv->_rsv_end == EXT4_RESERVE_WINDOW_NOT_ALLOCATED; -} - -/** - * ext4_init_block_alloc_info() - * @inode: file inode structure - * - * Allocate and initialize the reservation window structure, and - * link the window to the ext4 inode structure at last - * - * The reservation window structure is only dynamically allocated - * and linked to ext4 inode the first time the open file - * needs a new block. So, before every ext4_new_block(s) call, for - * regular files, we should check whether the reservation window - * structure exists or not. In the latter case, this function is called. - * Fail to do so will result in block reservation being turned off for that - * open file. - * - * This function is called from ext4_get_blocks_handle(), also called - * when setting the reservation window size through ioctl before the file - * is open for write (needs block allocation). - * - * Needs down_write(i_data_sem) protection prior to call this function. - */ -void ext4_init_block_alloc_info(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info; - struct super_block *sb = inode->i_sb; - - block_i = kmalloc(sizeof(*block_i), GFP_NOFS); - if (block_i) { - struct ext4_reserve_window_node *rsv = &block_i->rsv_window_node; - - rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED; - rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED; - - /* - * if filesystem is mounted with NORESERVATION, the goal - * reservation window size is set to zero to indicate - * block reservation is off - */ - if (!test_opt(sb, RESERVATION)) - rsv->rsv_goal_size = 0; - else - rsv->rsv_goal_size = EXT4_DEFAULT_RESERVE_BLOCKS; - rsv->rsv_alloc_hit = 0; - block_i->last_alloc_logical_block = 0; - block_i->last_alloc_physical_block = 0; - } - ei->i_block_alloc_info = block_i; -} - -/** - * ext4_discard_reservation() - * @inode: inode - * - * Discard(free) block reservation window on last file close, or truncate - * or at last iput(). - * - * It is being called in three cases: - * ext4_release_file(): last writer close the file - * ext4_clear_inode(): last iput(), when nobody link to this file. - * ext4_truncate(): when the block indirect map is about to change. - * - */ -void ext4_discard_reservation(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info; - struct ext4_reserve_window_node *rsv; - spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock; - - ext4_mb_discard_inode_preallocations(inode); - - if (!block_i) - return; - - rsv = &block_i->rsv_window_node; - if (!rsv_is_empty(&rsv->rsv_window)) { - spin_lock(rsv_lock); - if (!rsv_is_empty(&rsv->rsv_window)) - rsv_window_remove(inode->i_sb, rsv); - spin_unlock(rsv_lock); - } -} /** * ext4_free_blocks_sb() -- Free given blocks and update quota @@ -651,6 +356,13 @@ void ext4_discard_reservation(struct inode *inode) * @block: start physcial block to free * @count: number of blocks to free * @pdquot_freed_blocks: pointer to quota + * + * XXX This function is only used by the on-line resizing code, which + * should probably be fixed up to call the mballoc variant. There + * this needs to be cleaned up later; in fact, I'm not convinced this + * is 100% correct in the face of the mballoc code. The online resizing + * code needs to be fixed up to more tightly (and correctly) interlock + * with the mballoc code. */ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count, @@ -662,8 +374,8 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, ext4_grpblk_t bit; unsigned long i; unsigned long overflow; - struct ext4_group_desc * desc; - struct ext4_super_block * es; + struct ext4_group_desc *desc; + struct ext4_super_block *es; struct ext4_sb_info *sbi; int err = 0, ret; ext4_grpblk_t group_freed; @@ -674,13 +386,13 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, if (block < le32_to_cpu(es->s_first_data_block) || block + count < block || block + count > ext4_blocks_count(es)) { - ext4_error (sb, "ext4_free_blocks", - "Freeing blocks not in datazone - " - "block = %llu, count = %lu", block, count); + ext4_error(sb, "ext4_free_blocks", + "Freeing blocks not in datazone - " + "block = %llu, count = %lu", block, count); goto error_return; } - ext4_debug ("freeing block(s) %llu-%llu\n", block, block + count - 1); + ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1); do_more: overflow = 0; @@ -694,10 +406,10 @@ do_more: count -= overflow; } brelse(bitmap_bh); - bitmap_bh = read_block_bitmap(sb, block_group); + bitmap_bh = ext4_read_block_bitmap(sb, block_group); if (!bitmap_bh) goto error_return; - desc = ext4_get_group_desc (sb, block_group, &gd_bh); + desc = ext4_get_group_desc(sb, block_group, &gd_bh); if (!desc) goto error_return; @@ -706,10 +418,10 @@ do_more: in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || in_range(block + count - 1, ext4_inode_table(sb, desc), sbi->s_itb_per_group)) { - ext4_error (sb, "ext4_free_blocks", - "Freeing blocks in system zones - " - "Block = %llu, count = %lu", - block, count); + ext4_error(sb, "ext4_free_blocks", + "Freeing blocks in system zones - " + "Block = %llu, count = %lu", + block, count); goto error_return; } @@ -810,6 +522,13 @@ do_more: spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_add(&sbi->s_freeblocks_counter, count); + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_blocks += count; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } + /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); err = ext4_journal_dirty_metadata(handle, bitmap_bh); @@ -844,7 +563,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t block, unsigned long count, int metadata) { - struct super_block * sb; + struct super_block *sb; unsigned long dquot_freed_blocks; /* this isn't the right place to decide whether block is metadata @@ -855,766 +574,90 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, sb = inode->i_sb; - if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info) - ext4_free_blocks_sb(handle, sb, block, count, - &dquot_freed_blocks); - else - ext4_mb_free_blocks(handle, inode, block, count, - metadata, &dquot_freed_blocks); + ext4_mb_free_blocks(handle, inode, block, count, + metadata, &dquot_freed_blocks); if (dquot_freed_blocks) DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); return; } -/** - * ext4_test_allocatable() - * @nr: given allocation block group - * @bh: bufferhead contains the bitmap of the given block group - * - * For ext4 allocations, we must not reuse any blocks which are - * allocated in the bitmap buffer's "last committed data" copy. This - * prevents deletes from freeing up the page for reuse until we have - * committed the delete transaction. - * - * If we didn't do this, then deleting something and reallocating it as - * data would allow the old block to be overwritten before the - * transaction committed (because we force data to disk before commit). - * This would lead to corruption if we crashed between overwriting the - * data and committing the delete. - * - * @@@ We may want to make this allocation behaviour conditional on - * data-writes at some point, and disable it for metadata allocations or - * sync-data inodes. - */ -static int ext4_test_allocatable(ext4_grpblk_t nr, struct buffer_head *bh) -{ - int ret; - struct journal_head *jh = bh2jh(bh); - - if (ext4_test_bit(nr, bh->b_data)) - return 0; - - jbd_lock_bh_state(bh); - if (!jh->b_committed_data) - ret = 1; - else - ret = !ext4_test_bit(nr, jh->b_committed_data); - jbd_unlock_bh_state(bh); - return ret; -} - -/** - * bitmap_search_next_usable_block() - * @start: the starting block (group relative) of the search - * @bh: bufferhead contains the block group bitmap - * @maxblocks: the ending block (group relative) of the reservation - * - * The bitmap search --- search forward alternately through the actual - * bitmap on disk and the last-committed copy in journal, until we find a - * bit free in both bitmaps. - */ -static ext4_grpblk_t -bitmap_search_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh, - ext4_grpblk_t maxblocks) -{ - ext4_grpblk_t next; - struct journal_head *jh = bh2jh(bh); - - while (start < maxblocks) { - next = ext4_find_next_zero_bit(bh->b_data, maxblocks, start); - if (next >= maxblocks) - return -1; - if (ext4_test_allocatable(next, bh)) - return next; - jbd_lock_bh_state(bh); - if (jh->b_committed_data) - start = ext4_find_next_zero_bit(jh->b_committed_data, - maxblocks, next); - jbd_unlock_bh_state(bh); - } - return -1; -} - -/** - * find_next_usable_block() - * @start: the starting block (group relative) to find next - * allocatable block in bitmap. - * @bh: bufferhead contains the block group bitmap - * @maxblocks: the ending block (group relative) for the search - * - * Find an allocatable block in a bitmap. We honor both the bitmap and - * its last-committed copy (if that exists), and perform the "most - * appropriate allocation" algorithm of looking for a free block near - * the initial goal; then for a free byte somewhere in the bitmap; then - * for any free bit in the bitmap. - */ -static ext4_grpblk_t -find_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh, - ext4_grpblk_t maxblocks) -{ - ext4_grpblk_t here, next; - char *p, *r; - - if (start > 0) { - /* - * The goal was occupied; search forward for a free - * block within the next XX blocks. - * - * end_goal is more or less random, but it has to be - * less than EXT4_BLOCKS_PER_GROUP. Aligning up to the - * next 64-bit boundary is simple.. - */ - ext4_grpblk_t end_goal = (start + 63) & ~63; - if (end_goal > maxblocks) - end_goal = maxblocks; - here = ext4_find_next_zero_bit(bh->b_data, end_goal, start); - if (here < end_goal && ext4_test_allocatable(here, bh)) - return here; - ext4_debug("Bit not found near goal\n"); - } - - here = start; - if (here < 0) - here = 0; - - p = ((char *)bh->b_data) + (here >> 3); - r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3)); - next = (r - ((char *)bh->b_data)) << 3; - - if (next < maxblocks && next >= start && ext4_test_allocatable(next, bh)) - return next; - - /* - * The bitmap search --- search forward alternately through the actual - * bitmap and the last-committed copy until we find a bit free in - * both - */ - here = bitmap_search_next_usable_block(here, bh, maxblocks); - return here; -} - -/** - * claim_block() - * @block: the free block (group relative) to allocate - * @bh: the bufferhead containts the block group bitmap - * - * We think we can allocate this block in this bitmap. Try to set the bit. - * If that succeeds then check that nobody has allocated and then freed the - * block since we saw that is was not marked in b_committed_data. If it _was_ - * allocated and freed then clear the bit in the bitmap again and return - * zero (failure). - */ -static inline int -claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh) -{ - struct journal_head *jh = bh2jh(bh); - int ret; - - if (ext4_set_bit_atomic(lock, block, bh->b_data)) - return 0; - jbd_lock_bh_state(bh); - if (jh->b_committed_data && ext4_test_bit(block,jh->b_committed_data)) { - ext4_clear_bit_atomic(lock, block, bh->b_data); - ret = 0; - } else { - ret = 1; - } - jbd_unlock_bh_state(bh); - return ret; -} - -/** - * ext4_try_to_allocate() - * @sb: superblock - * @handle: handle to this transaction - * @group: given allocation block group - * @bitmap_bh: bufferhead holds the block bitmap - * @grp_goal: given target block within the group - * @count: target number of blocks to allocate - * @my_rsv: reservation window - * - * Attempt to allocate blocks within a give range. Set the range of allocation - * first, then find the first free bit(s) from the bitmap (within the range), - * and at last, allocate the blocks by claiming the found free bit as allocated. - * - * To set the range of this allocation: - * if there is a reservation window, only try to allocate block(s) from the - * file's own reservation window; - * Otherwise, the allocation range starts from the give goal block, ends at - * the block group's last block. - * - * If we failed to allocate the desired block then we may end up crossing to a - * new bitmap. In that case we must release write access to the old one via - * ext4_journal_release_buffer(), else we'll run out of credits. - */ -static ext4_grpblk_t -ext4_try_to_allocate(struct super_block *sb, handle_t *handle, - ext4_group_t group, struct buffer_head *bitmap_bh, - ext4_grpblk_t grp_goal, unsigned long *count, - struct ext4_reserve_window *my_rsv) +int ext4_claim_free_blocks(struct ext4_sb_info *sbi, + s64 nblocks) { - ext4_fsblk_t group_first_block; - ext4_grpblk_t start, end; - unsigned long num = 0; - - /* we do allocation within the reservation window if we have a window */ - if (my_rsv) { - group_first_block = ext4_group_first_block_no(sb, group); - if (my_rsv->_rsv_start >= group_first_block) - start = my_rsv->_rsv_start - group_first_block; - else - /* reservation window cross group boundary */ - start = 0; - end = my_rsv->_rsv_end - group_first_block + 1; - if (end > EXT4_BLOCKS_PER_GROUP(sb)) - /* reservation window crosses group boundary */ - end = EXT4_BLOCKS_PER_GROUP(sb); - if ((start <= grp_goal) && (grp_goal < end)) - start = grp_goal; - else - grp_goal = -1; - } else { - if (grp_goal > 0) - start = grp_goal; - else - start = 0; - end = EXT4_BLOCKS_PER_GROUP(sb); - } + s64 free_blocks, dirty_blocks; + s64 root_blocks = 0; + struct percpu_counter *fbc = &sbi->s_freeblocks_counter; + struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter; - BUG_ON(start > EXT4_BLOCKS_PER_GROUP(sb)); - -repeat: - if (grp_goal < 0 || !ext4_test_allocatable(grp_goal, bitmap_bh)) { - grp_goal = find_next_usable_block(start, bitmap_bh, end); - if (grp_goal < 0) - goto fail_access; - if (!my_rsv) { - int i; - - for (i = 0; i < 7 && grp_goal > start && - ext4_test_allocatable(grp_goal - 1, - bitmap_bh); - i++, grp_goal--) - ; - } - } - start = grp_goal; - - if (!claim_block(sb_bgl_lock(EXT4_SB(sb), group), - grp_goal, bitmap_bh)) { - /* - * The block was allocated by another thread, or it was - * allocated and then freed by another thread - */ - start++; - grp_goal++; - if (start >= end) - goto fail_access; - goto repeat; - } - num++; - grp_goal++; - while (num < *count && grp_goal < end - && ext4_test_allocatable(grp_goal, bitmap_bh) - && claim_block(sb_bgl_lock(EXT4_SB(sb), group), - grp_goal, bitmap_bh)) { - num++; - grp_goal++; - } - *count = num; - return grp_goal - num; -fail_access: - *count = num; - return -1; -} + free_blocks = percpu_counter_read_positive(fbc); + dirty_blocks = percpu_counter_read_positive(dbc); -/** - * find_next_reservable_window(): - * find a reservable space within the given range. - * It does not allocate the reservation window for now: - * alloc_new_reservation() will do the work later. - * - * @search_head: the head of the searching list; - * This is not necessarily the list head of the whole filesystem - * - * We have both head and start_block to assist the search - * for the reservable space. The list starts from head, - * but we will shift to the place where start_block is, - * then start from there, when looking for a reservable space. - * - * @size: the target new reservation window size - * - * @group_first_block: the first block we consider to start - * the real search from - * - * @last_block: - * the maximum block number that our goal reservable space - * could start from. This is normally the last block in this - * group. The search will end when we found the start of next - * possible reservable space is out of this boundary. - * This could handle the cross boundary reservation window - * request. - * - * basically we search from the given range, rather than the whole - * reservation double linked list, (start_block, last_block) - * to find a free region that is of my size and has not - * been reserved. - * - */ -static int find_next_reservable_window( - struct ext4_reserve_window_node *search_head, - struct ext4_reserve_window_node *my_rsv, - struct super_block * sb, - ext4_fsblk_t start_block, - ext4_fsblk_t last_block) -{ - struct rb_node *next; - struct ext4_reserve_window_node *rsv, *prev; - ext4_fsblk_t cur; - int size = my_rsv->rsv_goal_size; - - /* TODO: make the start of the reservation window byte-aligned */ - /* cur = *start_block & ~7;*/ - cur = start_block; - rsv = search_head; - if (!rsv) - return -1; - - while (1) { - if (cur <= rsv->rsv_end) - cur = rsv->rsv_end + 1; - - /* TODO? - * in the case we could not find a reservable space - * that is what is expected, during the re-search, we could - * remember what's the largest reservable space we could have - * and return that one. - * - * For now it will fail if we could not find the reservable - * space with expected-size (or more)... - */ - if (cur > last_block) - return -1; /* fail */ - - prev = rsv; - next = rb_next(&rsv->rsv_node); - rsv = rb_entry(next,struct ext4_reserve_window_node,rsv_node); - - /* - * Reached the last reservation, we can just append to the - * previous one. - */ - if (!next) - break; - - if (cur + size <= rsv->rsv_start) { - /* - * Found a reserveable space big enough. We could - * have a reservation across the group boundary here - */ - break; + if (!capable(CAP_SYS_RESOURCE) && + sbi->s_resuid != current->fsuid && + (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid))) + root_blocks = ext4_r_blocks_count(sbi->s_es); + + if (free_blocks - (nblocks + root_blocks + dirty_blocks) < + EXT4_FREEBLOCKS_WATERMARK) { + free_blocks = percpu_counter_sum(fbc); + dirty_blocks = percpu_counter_sum(dbc); + if (dirty_blocks < 0) { + printk(KERN_CRIT "Dirty block accounting " + "went wrong %lld\n", + dirty_blocks); } } - /* - * we come here either : - * when we reach the end of the whole list, - * and there is empty reservable space after last entry in the list. - * append it to the end of the list. - * - * or we found one reservable space in the middle of the list, - * return the reservation window that we could append to. - * succeed. + /* Check whether we have space after + * accounting for current dirty blocks */ + if (free_blocks < ((root_blocks + nblocks) + dirty_blocks)) + /* we don't have free space */ + return -ENOSPC; - if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window))) - rsv_window_remove(sb, my_rsv); - - /* - * Let's book the whole avaliable window for now. We will check the - * disk bitmap later and then, if there are free blocks then we adjust - * the window size if it's larger than requested. - * Otherwise, we will remove this node from the tree next time - * call find_next_reservable_window. - */ - my_rsv->rsv_start = cur; - my_rsv->rsv_end = cur + size - 1; - my_rsv->rsv_alloc_hit = 0; - - if (prev != my_rsv) - ext4_rsv_window_add(sb, my_rsv); - + /* Add the blocks to nblocks */ + percpu_counter_add(dbc, nblocks); return 0; } /** - * alloc_new_reservation()--allocate a new reservation window - * - * To make a new reservation, we search part of the filesystem - * reservation list (the list that inside the group). We try to - * allocate a new reservation window near the allocation goal, - * or the beginning of the group, if there is no goal. - * - * We first find a reservable space after the goal, then from - * there, we check the bitmap for the first free block after - * it. If there is no free block until the end of group, then the - * whole group is full, we failed. Otherwise, check if the free - * block is inside the expected reservable space, if so, we - * succeed. - * If the first free block is outside the reservable space, then - * start from the first free block, we search for next available - * space, and go on. - * - * on succeed, a new reservation will be found and inserted into the list - * It contains at least one free block, and it does not overlap with other - * reservation windows. - * - * failed: we failed to find a reservation window in this group - * - * @rsv: the reservation - * - * @grp_goal: The goal (group-relative). It is where the search for a - * free reservable space should start from. - * if we have a grp_goal(grp_goal >0 ), then start from there, - * no grp_goal(grp_goal = -1), we start from the first block - * of the group. - * - * @sb: the super block - * @group: the group we are trying to allocate in - * @bitmap_bh: the block group block bitmap - * - */ -static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv, - ext4_grpblk_t grp_goal, struct super_block *sb, - ext4_group_t group, struct buffer_head *bitmap_bh) -{ - struct ext4_reserve_window_node *search_head; - ext4_fsblk_t group_first_block, group_end_block, start_block; - ext4_grpblk_t first_free_block; - struct rb_root *fs_rsv_root = &EXT4_SB(sb)->s_rsv_window_root; - unsigned long size; - int ret; - spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock; - - group_first_block = ext4_group_first_block_no(sb, group); - group_end_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1); - - if (grp_goal < 0) - start_block = group_first_block; - else - start_block = grp_goal + group_first_block; - - size = my_rsv->rsv_goal_size; - - if (!rsv_is_empty(&my_rsv->rsv_window)) { - /* - * if the old reservation is cross group boundary - * and if the goal is inside the old reservation window, - * we will come here when we just failed to allocate from - * the first part of the window. We still have another part - * that belongs to the next group. In this case, there is no - * point to discard our window and try to allocate a new one - * in this group(which will fail). we should - * keep the reservation window, just simply move on. - * - * Maybe we could shift the start block of the reservation - * window to the first block of next group. - */ - - if ((my_rsv->rsv_start <= group_end_block) && - (my_rsv->rsv_end > group_end_block) && - (start_block >= my_rsv->rsv_start)) - return -1; - - if ((my_rsv->rsv_alloc_hit > - (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) { - /* - * if the previously allocation hit ratio is - * greater than 1/2, then we double the size of - * the reservation window the next time, - * otherwise we keep the same size window - */ - size = size * 2; - if (size > EXT4_MAX_RESERVE_BLOCKS) - size = EXT4_MAX_RESERVE_BLOCKS; - my_rsv->rsv_goal_size= size; - } - } - - spin_lock(rsv_lock); - /* - * shift the search start to the window near the goal block - */ - search_head = search_reserve_window(fs_rsv_root, start_block); - - /* - * find_next_reservable_window() simply finds a reservable window - * inside the given range(start_block, group_end_block). - * - * To make sure the reservation window has a free bit inside it, we - * need to check the bitmap after we found a reservable window. - */ -retry: - ret = find_next_reservable_window(search_head, my_rsv, sb, - start_block, group_end_block); - - if (ret == -1) { - if (!rsv_is_empty(&my_rsv->rsv_window)) - rsv_window_remove(sb, my_rsv); - spin_unlock(rsv_lock); - return -1; - } - - /* - * On success, find_next_reservable_window() returns the - * reservation window where there is a reservable space after it. - * Before we reserve this reservable space, we need - * to make sure there is at least a free block inside this region. - * - * searching the first free bit on the block bitmap and copy of - * last committed bitmap alternatively, until we found a allocatable - * block. Search start from the start block of the reservable space - * we just found. - */ - spin_unlock(rsv_lock); - first_free_block = bitmap_search_next_usable_block( - my_rsv->rsv_start - group_first_block, - bitmap_bh, group_end_block - group_first_block + 1); - - if (first_free_block < 0) { - /* - * no free block left on the bitmap, no point - * to reserve the space. return failed. - */ - spin_lock(rsv_lock); - if (!rsv_is_empty(&my_rsv->rsv_window)) - rsv_window_remove(sb, my_rsv); - spin_unlock(rsv_lock); - return -1; /* failed */ - } - - start_block = first_free_block + group_first_block; - /* - * check if the first free block is within the - * free space we just reserved - */ - if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end) - return 0; /* success */ - /* - * if the first free bit we found is out of the reservable space - * continue search for next reservable space, - * start from where the free block is, - * we also shift the list head to where we stopped last time - */ - search_head = my_rsv; - spin_lock(rsv_lock); - goto retry; -} - -/** - * try_to_extend_reservation() - * @my_rsv: given reservation window - * @sb: super block - * @size: the delta to extend - * - * Attempt to expand the reservation window large enough to have - * required number of free blocks - * - * Since ext4_try_to_allocate() will always allocate blocks within - * the reservation window range, if the window size is too small, - * multiple blocks allocation has to stop at the end of the reservation - * window. To make this more efficient, given the total number of - * blocks needed and the current size of the window, we try to - * expand the reservation window size if necessary on a best-effort - * basis before ext4_new_blocks() tries to allocate blocks, - */ -static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv, - struct super_block *sb, int size) -{ - struct ext4_reserve_window_node *next_rsv; - struct rb_node *next; - spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock; - - if (!spin_trylock(rsv_lock)) - return; - - next = rb_next(&my_rsv->rsv_node); - - if (!next) - my_rsv->rsv_end += size; - else { - next_rsv = rb_entry(next, struct ext4_reserve_window_node, rsv_node); - - if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size) - my_rsv->rsv_end += size; - else - my_rsv->rsv_end = next_rsv->rsv_start - 1; - } - spin_unlock(rsv_lock); -} - -/** - * ext4_try_to_allocate_with_rsv() - * @sb: superblock - * @handle: handle to this transaction - * @group: given allocation block group - * @bitmap_bh: bufferhead holds the block bitmap - * @grp_goal: given target block within the group - * @count: target number of blocks to allocate - * @my_rsv: reservation window - * @errp: pointer to store the error code - * - * This is the main function used to allocate a new block and its reservation - * window. - * - * Each time when a new block allocation is need, first try to allocate from - * its own reservation. If it does not have a reservation window, instead of - * looking for a free bit on bitmap first, then look up the reservation list to - * see if it is inside somebody else's reservation window, we try to allocate a - * reservation window for it starting from the goal first. Then do the block - * allocation within the reservation window. - * - * This will avoid keeping on searching the reservation list again and - * again when somebody is looking for a free block (without - * reservation), and there are lots of free blocks, but they are all - * being reserved. - * - * We use a red-black tree for the per-filesystem reservation list. + * ext4_has_free_blocks() + * @sbi: in-core super block structure. + * @nblocks: number of neeed blocks * + * Check if filesystem has free blocks available for allocation. + * Return the number of blocks avaible for allocation for this request + * On success, return nblocks */ -static ext4_grpblk_t -ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, - ext4_group_t group, struct buffer_head *bitmap_bh, - ext4_grpblk_t grp_goal, - struct ext4_reserve_window_node * my_rsv, - unsigned long *count, int *errp) +ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, + s64 nblocks) { - ext4_fsblk_t group_first_block, group_last_block; - ext4_grpblk_t ret = 0; - int fatal; - unsigned long num = *count; - - *errp = 0; + s64 free_blocks, dirty_blocks; + s64 root_blocks = 0; + struct percpu_counter *fbc = &sbi->s_freeblocks_counter; + struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter; - /* - * Make sure we use undo access for the bitmap, because it is critical - * that we do the frozen_data COW on bitmap buffers in all cases even - * if the buffer is in BJ_Forget state in the committing transaction. - */ - BUFFER_TRACE(bitmap_bh, "get undo access for new block"); - fatal = ext4_journal_get_undo_access(handle, bitmap_bh); - if (fatal) { - *errp = fatal; - return -1; - } + free_blocks = percpu_counter_read_positive(fbc); + dirty_blocks = percpu_counter_read_positive(dbc); - /* - * we don't deal with reservation when - * filesystem is mounted without reservation - * or the file is not a regular file - * or last attempt to allocate a block with reservation turned on failed - */ - if (my_rsv == NULL ) { - ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh, - grp_goal, count, NULL); - goto out; - } - /* - * grp_goal is a group relative block number (if there is a goal) - * 0 <= grp_goal < EXT4_BLOCKS_PER_GROUP(sb) - * first block is a filesystem wide block number - * first block is the block number of the first block in this group - */ - group_first_block = ext4_group_first_block_no(sb, group); - group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1); - - /* - * Basically we will allocate a new block from inode's reservation - * window. - * - * We need to allocate a new reservation window, if: - * a) inode does not have a reservation window; or - * b) last attempt to allocate a block from existing reservation - * failed; or - * c) we come here with a goal and with a reservation window - * - * We do not need to allocate a new reservation window if we come here - * at the beginning with a goal and the goal is inside the window, or - * we don't have a goal but already have a reservation window. - * then we could go to allocate from the reservation window directly. - */ - while (1) { - if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) || - !goal_in_my_reservation(&my_rsv->rsv_window, - grp_goal, group, sb)) { - if (my_rsv->rsv_goal_size < *count) - my_rsv->rsv_goal_size = *count; - ret = alloc_new_reservation(my_rsv, grp_goal, sb, - group, bitmap_bh); - if (ret < 0) - break; /* failed */ - - if (!goal_in_my_reservation(&my_rsv->rsv_window, - grp_goal, group, sb)) - grp_goal = -1; - } else if (grp_goal >= 0) { - int curr = my_rsv->rsv_end - - (grp_goal + group_first_block) + 1; - - if (curr < *count) - try_to_extend_reservation(my_rsv, sb, - *count - curr); - } + if (!capable(CAP_SYS_RESOURCE) && + sbi->s_resuid != current->fsuid && + (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid))) + root_blocks = ext4_r_blocks_count(sbi->s_es); - if ((my_rsv->rsv_start > group_last_block) || - (my_rsv->rsv_end < group_first_block)) { - rsv_window_dump(&EXT4_SB(sb)->s_rsv_window_root, 1); - BUG(); - } - ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh, - grp_goal, &num, &my_rsv->rsv_window); - if (ret >= 0) { - my_rsv->rsv_alloc_hit += num; - *count = num; - break; /* succeed */ - } - num = *count; - } -out: - if (ret >= 0) { - BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for " - "bitmap block"); - fatal = ext4_journal_dirty_metadata(handle, bitmap_bh); - if (fatal) { - *errp = fatal; - return -1; - } - return ret; + if (free_blocks - (nblocks + root_blocks + dirty_blocks) < + EXT4_FREEBLOCKS_WATERMARK) { + free_blocks = percpu_counter_sum(fbc); + dirty_blocks = percpu_counter_sum(dbc); } + if (free_blocks <= (root_blocks + dirty_blocks)) + /* we don't have free space */ + return 0; - BUFFER_TRACE(bitmap_bh, "journal_release_buffer"); - ext4_journal_release_buffer(handle, bitmap_bh); - return ret; + if (free_blocks - (root_blocks + dirty_blocks) < nblocks) + return free_blocks - (root_blocks + dirty_blocks); + return nblocks; } -/** - * ext4_has_free_blocks() - * @sbi: in-core super block structure. - * - * Check if filesystem has at least 1 free block available for allocation. - */ -static int ext4_has_free_blocks(struct ext4_sb_info *sbi) -{ - ext4_fsblk_t free_blocks, root_blocks; - - free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); - root_blocks = ext4_r_blocks_count(sbi->s_es); - if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && - sbi->s_resuid != current->fsuid && - (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { - return 0; - } - return 1; -} /** * ext4_should_retry_alloc() @@ -1630,7 +673,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi) */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { - if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3) + if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3) return 0; jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); @@ -1638,323 +681,100 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); } -/** - * ext4_new_blocks_old() -- core block(s) allocation function - * @handle: handle to this transaction - * @inode: file inode - * @goal: given target block(filesystem wide) - * @count: target number of blocks to allocate - * @errp: error code - * - * ext4_new_blocks uses a goal block to assist allocation. It tries to - * allocate block(s) from the block group contains the goal block first. If that - * fails, it will try to allocate block(s) from other block groups without - * any specific goal block. - * - */ -ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, unsigned long *count, int *errp) -{ - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *gdp_bh; - ext4_group_t group_no; - ext4_group_t goal_group; - ext4_grpblk_t grp_target_blk; /* blockgroup relative goal block */ - ext4_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/ - ext4_fsblk_t ret_block; /* filesyetem-wide allocated block */ - ext4_group_t bgi; /* blockgroup iteration index */ - int fatal = 0, err; - int performed_allocation = 0; - ext4_grpblk_t free_blocks; /* number of free blocks in a group */ - struct super_block *sb; - struct ext4_group_desc *gdp; - struct ext4_super_block *es; - struct ext4_sb_info *sbi; - struct ext4_reserve_window_node *my_rsv = NULL; - struct ext4_block_alloc_info *block_i; - unsigned short windowsz = 0; - ext4_group_t ngroups; - unsigned long num = *count; - - *errp = -ENOSPC; - sb = inode->i_sb; - if (!sb) { - printk("ext4_new_block: nonexistent device"); - return 0; - } - - /* - * Check quota for allocation of this block. - */ - if (DQUOT_ALLOC_BLOCK(inode, num)) { - *errp = -EDQUOT; - return 0; - } +#define EXT4_META_BLOCK 0x1 - sbi = EXT4_SB(sb); - es = EXT4_SB(sb)->s_es; - ext4_debug("goal=%llu.\n", goal); - /* - * Allocate a block from reservation only when - * filesystem is mounted with reservation(default,-o reservation), and - * it's a regular file, and - * the desired window size is greater than 0 (One could use ioctl - * command EXT4_IOC_SETRSVSZ to set the window size to 0 to turn off - * reservation on that particular file) - */ - block_i = EXT4_I(inode)->i_block_alloc_info; - if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) - my_rsv = &block_i->rsv_window_node; - - if (!ext4_has_free_blocks(sbi)) { - *errp = -ENOSPC; - goto out; - } - - /* - * First, test whether the goal block is free. - */ - if (goal < le32_to_cpu(es->s_first_data_block) || - goal >= ext4_blocks_count(es)) - goal = le32_to_cpu(es->s_first_data_block); - ext4_get_group_no_and_offset(sb, goal, &group_no, &grp_target_blk); - goal_group = group_no; -retry_alloc: - gdp = ext4_get_group_desc(sb, group_no, &gdp_bh); - if (!gdp) - goto io_error; - - free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); - /* - * if there is not enough free blocks to make a new resevation - * turn off reservation for this allocation - */ - if (my_rsv && (free_blocks < windowsz) - && (rsv_is_empty(&my_rsv->rsv_window))) - my_rsv = NULL; - - if (free_blocks > 0) { - bitmap_bh = read_block_bitmap(sb, group_no); - if (!bitmap_bh) - goto io_error; - grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle, - group_no, bitmap_bh, grp_target_blk, - my_rsv, &num, &fatal); - if (fatal) - goto out; - if (grp_alloc_blk >= 0) - goto allocated; - } - - ngroups = EXT4_SB(sb)->s_groups_count; - smp_rmb(); - - /* - * Now search the rest of the groups. We assume that - * group_no and gdp correctly point to the last group visited. - */ - for (bgi = 0; bgi < ngroups; bgi++) { - group_no++; - if (group_no >= ngroups) - group_no = 0; - gdp = ext4_get_group_desc(sb, group_no, &gdp_bh); - if (!gdp) - goto io_error; - free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); - /* - * skip this group if the number of - * free blocks is less than half of the reservation - * window size. - */ - if (free_blocks <= (windowsz/2)) - continue; - - brelse(bitmap_bh); - bitmap_bh = read_block_bitmap(sb, group_no); - if (!bitmap_bh) - goto io_error; - /* - * try to allocate block(s) from this group, without a goal(-1). - */ - grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle, - group_no, bitmap_bh, -1, my_rsv, - &num, &fatal); - if (fatal) - goto out; - if (grp_alloc_blk >= 0) - goto allocated; - } - /* - * We may end up a bogus ealier ENOSPC error due to - * filesystem is "full" of reservations, but - * there maybe indeed free blocks avaliable on disk - * In this case, we just forget about the reservations - * just do block allocation as without reservations. - */ - if (my_rsv) { - my_rsv = NULL; - windowsz = 0; - group_no = goal_group; - goto retry_alloc; - } - /* No space left on the device */ - *errp = -ENOSPC; - goto out; - -allocated: - - ext4_debug("using block group %lu(%d)\n", - group_no, gdp->bg_free_blocks_count); - - BUFFER_TRACE(gdp_bh, "get_write_access"); - fatal = ext4_journal_get_write_access(handle, gdp_bh); - if (fatal) - goto out; - - ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no); - - if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) || - in_range(ext4_inode_bitmap(sb, gdp), ret_block, num) || - in_range(ret_block, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group) || - in_range(ret_block + num - 1, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group)) { - ext4_error(sb, "ext4_new_block", - "Allocating block in system zone - " - "blocks from %llu, length %lu", - ret_block, num); - /* - * claim_block marked the blocks we allocated - * as in use. So we may want to selectively - * mark some of the blocks as free - */ - goto retry_alloc; - } - - performed_allocation = 1; - -#ifdef CONFIG_JBD2_DEBUG - { - struct buffer_head *debug_bh; - - /* Record bitmap buffer state in the newly allocated block */ - debug_bh = sb_find_get_block(sb, ret_block); - if (debug_bh) { - BUFFER_TRACE(debug_bh, "state when allocated"); - BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state"); - brelse(debug_bh); - } - } - jbd_lock_bh_state(bitmap_bh); - spin_lock(sb_bgl_lock(sbi, group_no)); - if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) { - int i; - - for (i = 0; i < num; i++) { - if (ext4_test_bit(grp_alloc_blk+i, - bh2jh(bitmap_bh)->b_committed_data)) { - printk("%s: block was unexpectedly set in " - "b_committed_data\n", __func__); - } - } - } - ext4_debug("found bit %d\n", grp_alloc_blk); - spin_unlock(sb_bgl_lock(sbi, group_no)); - jbd_unlock_bh_state(bitmap_bh); -#endif - - if (ret_block + num - 1 >= ext4_blocks_count(es)) { - ext4_error(sb, "ext4_new_block", - "block(%llu) >= blocks count(%llu) - " - "block_group = %lu, es == %p ", ret_block, - ext4_blocks_count(es), group_no, es); - goto out; - } - - /* - * It is up to the caller to add the new buffer to a journal - * list of some description. We don't know in advance whether - * the caller wants to use it as metadata or data. - */ - spin_lock(sb_bgl_lock(sbi, group_no)); - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - le16_add_cpu(&gdp->bg_free_blocks_count, -num); - gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp); - spin_unlock(sb_bgl_lock(sbi, group_no)); - percpu_counter_sub(&sbi->s_freeblocks_counter, num); - - BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); - err = ext4_journal_dirty_metadata(handle, gdp_bh); - if (!fatal) - fatal = err; - - sb->s_dirt = 1; - if (fatal) - goto out; - - *errp = 0; - brelse(bitmap_bh); - DQUOT_FREE_BLOCK(inode, *count-num); - *count = num; - return ret_block; - -io_error: - *errp = -EIO; -out: - if (fatal) { - *errp = fatal; - ext4_std_error(sb, fatal); - } - /* - * Undo the block allocation - */ - if (!performed_allocation) - DQUOT_FREE_BLOCK(inode, *count); - brelse(bitmap_bh); - return 0; -} - -ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, int *errp) +static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, ext4_fsblk_t goal, + unsigned long *count, int *errp, int flags) { struct ext4_allocation_request ar; ext4_fsblk_t ret; - if (!test_opt(inode->i_sb, MBALLOC)) { - unsigned long count = 1; - ret = ext4_new_blocks_old(handle, inode, goal, &count, errp); - return ret; - } - memset(&ar, 0, sizeof(ar)); + /* Fill with neighbour allocated blocks */ + ar.inode = inode; ar.goal = goal; - ar.len = 1; + ar.len = *count; + ar.logical = iblock; + + if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK)) + /* enable in-core preallocation for data block allocation */ + ar.flags = EXT4_MB_HINT_DATA; + else + /* disable in-core preallocation for non-regular files */ + ar.flags = 0; + ret = ext4_mb_new_blocks(handle, &ar, errp); + *count = ar.len; return ret; } -ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, +/* + * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks + * + * @handle: handle to this transaction + * @inode: file inode + * @goal: given target block(filesystem wide) + * @count: total number of blocks need + * @errp: error code + * + * Return 1st allocated block numberon success, *count stores total account + * error stores in errp pointer + */ +ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp) { - struct ext4_allocation_request ar; ext4_fsblk_t ret; - - if (!test_opt(inode->i_sb, MBALLOC)) { - ret = ext4_new_blocks_old(handle, inode, goal, count, errp); - return ret; + ret = do_blk_alloc(handle, inode, 0, goal, + count, errp, EXT4_META_BLOCK); + /* + * Account for the allocated meta blocks + */ + if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + EXT4_I(inode)->i_allocated_meta_blocks += *count; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); } - - memset(&ar, 0, sizeof(ar)); - ar.inode = inode; - ar.goal = goal; - ar.len = *count; - ret = ext4_mb_new_blocks(handle, &ar, errp); - *count = ar.len; return ret; } +/* + * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks + * + * @handle: handle to this transaction + * @inode: file inode + * @goal: given target block(filesystem wide) + * @errp: error code + * + * Return allocated block number on success + */ +ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, int *errp) +{ + unsigned long count = 1; + return ext4_new_meta_blocks(handle, inode, goal, &count, errp); +} + +/* + * ext4_new_blocks() -- allocate data blocks + * + * @handle: handle to this transaction + * @inode: file inode + * @goal: given target block(filesystem wide) + * @count: total number of blocks need + * @errp: error code + * + * Return 1st allocated block numberon success, *count stores total account + * error stores in errp pointer + */ + +ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, ext4_fsblk_t goal, + unsigned long *count, int *errp) +{ + return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0); +} /** * ext4_count_free_blocks() -- count filesystem free blocks @@ -1986,7 +806,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) continue; desc_count += le16_to_cpu(gdp->bg_free_blocks_count); brelse(bitmap_bh); - bitmap_bh = read_block_bitmap(sb, i); + bitmap_bh = ext4_read_block_bitmap(sb, i); if (bitmap_bh == NULL) continue; @@ -1996,10 +816,9 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) bitmap_count += x; } brelse(bitmap_bh); - printk("ext4_count_free_blocks: stored = %llu" - ", computed = %llu, %llu\n", - ext4_free_blocks_count(es), - desc_count, bitmap_count); + printk(KERN_DEBUG "ext4_count_free_blocks: stored = %llu" + ", computed = %llu, %llu\n", ext4_free_blocks_count(es), + desc_count, bitmap_count); return bitmap_count; #else desc_count = 0; @@ -2086,8 +905,9 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) || metagroup < first_meta_bg) - return ext4_bg_num_gdb_nometa(sb,group); + return ext4_bg_num_gdb_nometa(sb, group); return ext4_bg_num_gdb_meta(sb,group); } + diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index d37ea6750454..0a7a6663c190 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -15,17 +15,17 @@ static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; -unsigned long ext4_count_free (struct buffer_head * map, unsigned int numchars) +unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars) { unsigned int i; unsigned long sum = 0; if (!map) - return (0); + return 0; for (i = 0; i < numchars; i++) sum += nibblemap[map->b_data[i] & 0xf] + nibblemap[(map->b_data[i] >> 4) & 0xf]; - return (sum); + return sum; } #endif /* EXT4FS_DEBUG */ diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 2bf0331ea194..3ca6a2b7632d 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -33,10 +33,10 @@ static unsigned char ext4_filetype_table[] = { }; static int ext4_readdir(struct file *, void *, filldir_t); -static int ext4_dx_readdir(struct file * filp, - void * dirent, filldir_t filldir); -static int ext4_release_dir (struct inode * inode, - struct file * filp); +static int ext4_dx_readdir(struct file *filp, + void *dirent, filldir_t filldir); +static int ext4_release_dir(struct inode *inode, + struct file *filp); const struct file_operations ext4_dir_operations = { .llseek = generic_file_llseek, @@ -61,12 +61,12 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) } -int ext4_check_dir_entry (const char * function, struct inode * dir, - struct ext4_dir_entry_2 * de, - struct buffer_head * bh, - unsigned long offset) +int ext4_check_dir_entry(const char *function, struct inode *dir, + struct ext4_dir_entry_2 *de, + struct buffer_head *bh, + unsigned long offset) { - const char * error_msg = NULL; + const char *error_msg = NULL; const int rlen = ext4_rec_len_from_disk(de->rec_len); if (rlen < EXT4_DIR_REC_LEN(1)) @@ -82,7 +82,7 @@ int ext4_check_dir_entry (const char * function, struct inode * dir, error_msg = "inode out of bounds"; if (error_msg != NULL) - ext4_error (dir->i_sb, function, + ext4_error(dir->i_sb, function, "bad entry in directory #%lu: %s - " "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", dir->i_ino, error_msg, offset, @@ -91,8 +91,8 @@ int ext4_check_dir_entry (const char * function, struct inode * dir, return error_msg == NULL ? 1 : 0; } -static int ext4_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int ext4_readdir(struct file *filp, + void *dirent, filldir_t filldir) { int error = 0; unsigned long offset; @@ -102,6 +102,7 @@ static int ext4_readdir(struct file * filp, int err; struct inode *inode = filp->f_path.dentry->d_inode; int ret = 0; + int dir_has_error = 0; sb = inode->i_sb; @@ -129,7 +130,8 @@ static int ext4_readdir(struct file * filp, struct buffer_head *bh = NULL; map_bh.b_state = 0; - err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0); + err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, + 0, 0, 0); if (err > 0) { pgoff_t index = map_bh.b_blocknr >> (PAGE_CACHE_SHIFT - inode->i_blkbits); @@ -147,9 +149,13 @@ static int ext4_readdir(struct file * filp, * of recovering data when there's a bad sector */ if (!bh) { - ext4_error (sb, "ext4_readdir", - "directory #%lu contains a hole at offset %lu", - inode->i_ino, (unsigned long)filp->f_pos); + if (!dir_has_error) { + ext4_error(sb, __func__, "directory #%lu " + "contains a hole at offset %Lu", + inode->i_ino, + (unsigned long long) filp->f_pos); + dir_has_error = 1; + } /* corrupt size? Maybe no more blocks to read */ if (filp->f_pos > inode->i_blocks << 9) break; @@ -186,14 +192,14 @@ revalidate: while (!error && filp->f_pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); - if (!ext4_check_dir_entry ("ext4_readdir", inode, de, - bh, offset)) { + if (!ext4_check_dir_entry("ext4_readdir", inode, de, + bh, offset)) { /* * On error, skip the f_pos to the next block */ filp->f_pos = (filp->f_pos | (sb->s_blocksize - 1)) + 1; - brelse (bh); + brelse(bh); ret = stored; goto out; } @@ -217,12 +223,12 @@ revalidate: break; if (version != filp->f_version) goto revalidate; - stored ++; + stored++; } filp->f_pos += ext4_rec_len_from_disk(de->rec_len); } offset = 0; - brelse (bh); + brelse(bh); } out: return ret; @@ -272,7 +278,7 @@ static void free_rb_tree_fname(struct rb_root *root) while (n) { /* Do the node's children first */ - if ((n)->rb_left) { + if (n->rb_left) { n = n->rb_left; continue; } @@ -289,9 +295,9 @@ static void free_rb_tree_fname(struct rb_root *root) parent = rb_parent(n); fname = rb_entry(n, struct fname, rb_hash); while (fname) { - struct fname * old = fname; + struct fname *old = fname; fname = fname->next; - kfree (old); + kfree(old); } if (!parent) root->rb_node = NULL; @@ -301,24 +307,18 @@ static void free_rb_tree_fname(struct rb_root *root) parent->rb_right = NULL; n = parent; } - root->rb_node = NULL; } -static struct dir_private_info *create_dir_info(loff_t pos) +static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos) { struct dir_private_info *p; - p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); + p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); if (!p) return NULL; - p->root.rb_node = NULL; - p->curr_node = NULL; - p->extra_fname = NULL; - p->last_pos = 0; p->curr_hash = pos2maj_hash(pos); p->curr_minor_hash = pos2min_hash(pos); - p->next_hash = 0; return p; } @@ -336,7 +336,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, struct ext4_dir_entry_2 *dirent) { struct rb_node **p, *parent = NULL; - struct fname * fname, *new_fn; + struct fname *fname, *new_fn; struct dir_private_info *info; int len; @@ -393,19 +393,20 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, * for all entres on the fname linked list. (Normally there is only * one entry on the linked list, unless there are 62 bit hash collisions.) */ -static int call_filldir(struct file * filp, void * dirent, +static int call_filldir(struct file *filp, void *dirent, filldir_t filldir, struct fname *fname) { struct dir_private_info *info = filp->private_data; loff_t curr_pos; struct inode *inode = filp->f_path.dentry->d_inode; - struct super_block * sb; + struct super_block *sb; int error; sb = inode->i_sb; if (!fname) { - printk("call_filldir: called with null fname?!?\n"); + printk(KERN_ERR "ext4: call_filldir: called with " + "null fname?!?\n"); return 0; } curr_pos = hash2pos(fname->hash, fname->minor_hash); @@ -416,7 +417,7 @@ static int call_filldir(struct file * filp, void * dirent, get_dtype(sb, fname->file_type)); if (error) { filp->f_pos = curr_pos; - info->extra_fname = fname->next; + info->extra_fname = fname; return error; } fname = fname->next; @@ -424,8 +425,8 @@ static int call_filldir(struct file * filp, void * dirent, return 0; } -static int ext4_dx_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int ext4_dx_readdir(struct file *filp, + void *dirent, filldir_t filldir) { struct dir_private_info *info = filp->private_data; struct inode *inode = filp->f_path.dentry->d_inode; @@ -433,7 +434,7 @@ static int ext4_dx_readdir(struct file * filp, int ret; if (!info) { - info = create_dir_info(filp->f_pos); + info = ext4_htree_create_dir_info(filp->f_pos); if (!info) return -ENOMEM; filp->private_data = info; @@ -455,11 +456,21 @@ static int ext4_dx_readdir(struct file * filp, * If there are any leftover names on the hash collision * chain, return them first. */ - if (info->extra_fname && - call_filldir(filp, dirent, filldir, info->extra_fname)) - goto finished; + if (info->extra_fname) { + if (call_filldir(filp, dirent, filldir, info->extra_fname)) + goto finished; - if (!info->curr_node) + info->extra_fname = NULL; + info->curr_node = rb_next(info->curr_node); + if (!info->curr_node) { + if (info->next_hash == ~0) { + filp->f_pos = EXT4_HTREE_EOF; + goto finished; + } + info->curr_hash = info->next_hash; + info->curr_minor_hash = 0; + } + } else if (!info->curr_node) info->curr_node = rb_first(&info->root); while (1) { @@ -506,7 +517,7 @@ finished: return 0; } -static int ext4_release_dir (struct inode * inode, struct file * filp) +static int ext4_release_dir(struct inode *inode, struct file *filp) { if (filp->private_data) ext4_htree_free_dir_info(filp->private_data); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8158083f7ac0..6690a41cdd9f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -22,7 +22,7 @@ #include "ext4_i.h" /* - * The second extended filesystem constants/structures + * The fourth extended filesystem constants/structures */ /* @@ -44,9 +44,9 @@ #ifdef EXT4FS_DEBUG #define ext4_debug(f, a...) \ do { \ - printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ - __FILE__, __LINE__, __FUNCTION__); \ - printk (KERN_DEBUG f, ## a); \ + printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __func__); \ + printk(KERN_DEBUG f, ## a); \ } while (0) #else #define ext4_debug(f, a...) do {} while (0) @@ -74,6 +74,9 @@ #define EXT4_MB_HINT_GOAL_ONLY 256 /* goal is meaningful */ #define EXT4_MB_HINT_TRY_GOAL 512 +/* blocks already pre-reserved by delayed allocation */ +#define EXT4_MB_DELALLOC_RESERVED 1024 + struct ext4_allocation_request { /* target inode for block we're allocating */ @@ -125,7 +128,7 @@ struct ext4_allocation_request { #else # define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) #endif -#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof (__u32)) +#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) #ifdef __KERNEL__ # define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) #else @@ -170,6 +173,15 @@ struct ext4_group_desc __u32 bg_reserved2[3]; }; +/* + * Structure of a flex block group info + */ + +struct flex_groups { + __u32 free_inodes; + __u32 free_blocks; +}; + #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ @@ -233,7 +245,7 @@ struct ext4_group_desc #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ +#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */ /* * Inode dynamic state flags @@ -279,8 +291,6 @@ struct ext4_new_group_data { #define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS #define EXT4_IOC_GETVERSION _IOR('f', 3, long) #define EXT4_IOC_SETVERSION _IOW('f', 4, long) -#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) -#define EXT4_IOC_GROUP_ADD _IOW('f', 8,struct ext4_new_group_input) #define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION #define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION #ifdef CONFIG_JBD2_DEBUG @@ -288,7 +298,10 @@ struct ext4_new_group_data { #endif #define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) #define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) -#define EXT4_IOC_MIGRATE _IO('f', 7) +#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) +#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) +#define EXT4_IOC_MIGRATE _IO('f', 9) + /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ /* * ioctl commands in 32 bit emulation @@ -526,7 +539,9 @@ do { \ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ -#define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */ +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ +#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ + /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt @@ -647,11 +662,14 @@ struct ext4_super_block { __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ __le64 s_mmp_block; /* Block for multi-mount protection */ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ - __u32 s_reserved[163]; /* Padding to the end of the block */ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_reserved_char_pad2; + __le16 s_reserved_pad; + __u32 s_reserved[162]; /* Padding to the end of the block */ }; #ifdef __KERNEL__ -static inline struct ext4_sb_info * EXT4_SB(struct super_block *sb) +static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) { return sb->s_fs_info; } @@ -709,11 +727,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) */ #define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ - ( EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) ) + (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) #define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ - ( EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) ) + (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) #define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ - ( EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) ) + (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) #define EXT4_SET_COMPAT_FEATURE(sb,mask) \ EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) #define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ @@ -773,6 +791,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) #define EXT4_DEF_RESUID 0 #define EXT4_DEF_RESGID 0 +#define EXT4_DEF_INODE_READAHEAD_BLKS 32 + /* * Default mount options */ @@ -938,6 +958,24 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, unsigned long *blockgrpp, ext4_grpblk_t *offsetp); +extern struct proc_dir_entry *ext4_proc_root; + +#ifdef CONFIG_PROC_FS +extern const struct file_operations ext4_ui_proc_fops; + +#define EXT4_PROC_HANDLER(name, var) \ +do { \ + proc = proc_create_data(name, mode, sbi->s_proc, \ + &ext4_ui_proc_fops, &sbi->s_##var); \ + if (proc == NULL) { \ + printk(KERN_ERR "EXT4-fs: can't create %s\n", name); \ + goto err_out; \ + } \ +} while (0) +#else +#define EXT4_PROC_HANDLER(name, var) +#endif + /* * Function prototypes */ @@ -958,25 +996,27 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); extern unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group); -extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode, +extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, int *errp); -extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode, +extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp); -extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, unsigned long *count, int *errp); -extern void ext4_free_blocks (handle_t *handle, struct inode *inode, +extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, ext4_fsblk_t goal, + unsigned long *count, int *errp); +extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); +extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, + s64 nblocks); +extern void ext4_free_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t block, unsigned long count, int metadata); -extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb, - ext4_fsblk_t block, unsigned long count, +extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count, unsigned long *pdquot_freed_blocks); -extern ext4_fsblk_t ext4_count_free_blocks (struct super_block *); -extern void ext4_check_blocks_bitmap (struct super_block *); +extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); +extern void ext4_check_blocks_bitmap(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, struct buffer_head ** bh); extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); -extern void ext4_init_block_alloc_info(struct inode *); -extern void ext4_rsv_window_add(struct super_block *sb, struct ext4_reserve_window_node *rsv); /* dir.c */ extern int ext4_check_dir_entry(const char *, struct inode *, @@ -988,20 +1028,20 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, extern void ext4_htree_free_dir_info(struct dir_private_info *p); /* fsync.c */ -extern int ext4_sync_file (struct file *, struct dentry *, int); +extern int ext4_sync_file(struct file *, struct dentry *, int); /* hash.c */ extern int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo); /* ialloc.c */ -extern struct inode * ext4_new_inode (handle_t *, struct inode *, int); -extern void ext4_free_inode (handle_t *, struct inode *); -extern struct inode * ext4_orphan_get (struct super_block *, unsigned long); -extern unsigned long ext4_count_free_inodes (struct super_block *); -extern unsigned long ext4_count_dirs (struct super_block *); -extern void ext4_check_inodes_bitmap (struct super_block *); -extern unsigned long ext4_count_free (struct buffer_head *, unsigned); +extern struct inode * ext4_new_inode(handle_t *, struct inode *, int); +extern void ext4_free_inode(handle_t *, struct inode *); +extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); +extern unsigned long ext4_count_free_inodes(struct super_block *); +extern unsigned long ext4_count_dirs(struct super_block *); +extern void ext4_check_inodes_bitmap(struct super_block *); +extern unsigned long ext4_count_free(struct buffer_head *, unsigned); /* mballoc.c */ extern long ext4_mb_stats; @@ -1011,11 +1051,15 @@ extern int ext4_mb_release(struct super_block *); extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, struct ext4_allocation_request *, int *); extern int ext4_mb_reserve_blocks(struct super_block *, int); -extern void ext4_mb_discard_inode_preallocations(struct inode *); +extern void ext4_discard_preallocations(struct inode *); extern int __init init_ext4_mballoc(void); extern void exit_ext4_mballoc(void); extern void ext4_mb_free_blocks(handle_t *, struct inode *, unsigned long, unsigned long, int, unsigned long *); +extern int ext4_mb_add_more_groupinfo(struct super_block *sb, + ext4_group_t i, struct ext4_group_desc *desc); +extern void ext4_mb_update_group_info(struct ext4_group_info *grp, + ext4_grpblk_t add); /* inode.c */ @@ -1025,35 +1069,41 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int, int *); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int, int *); +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned long maxblocks, struct buffer_head *bh_result, int create, int extend_disksize); extern struct inode *ext4_iget(struct super_block *, unsigned long); -extern int ext4_write_inode (struct inode *, int); -extern int ext4_setattr (struct dentry *, struct iattr *); -extern void ext4_delete_inode (struct inode *); -extern int ext4_sync_inode (handle_t *, struct inode *); -extern void ext4_discard_reservation (struct inode *); +extern int ext4_write_inode(struct inode *, int); +extern int ext4_setattr(struct dentry *, struct iattr *); +extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +extern void ext4_delete_inode(struct inode *); +extern int ext4_sync_inode(handle_t *, struct inode *); extern void ext4_dirty_inode(struct inode *); extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); -extern void ext4_truncate (struct inode *); +extern int ext4_can_truncate(struct inode *inode); +extern void ext4_truncate(struct inode *); extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); -extern int ext4_block_truncate_page(handle_t *handle, struct page *page, +extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); +extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); +extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); -extern long ext4_compat_ioctl (struct file *, unsigned int, unsigned long); +extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); /* migrate.c */ -extern int ext4_ext_migrate(struct inode *, struct file *, unsigned int, - unsigned long); +extern int ext4_ext_migrate(struct inode *); /* namei.c */ extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); @@ -1068,14 +1118,14 @@ extern int ext4_group_extend(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ -extern void ext4_error (struct super_block *, const char *, const char *, ...) +extern void ext4_error(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); -extern void __ext4_std_error (struct super_block *, const char *, int); -extern void ext4_abort (struct super_block *, const char *, const char *, ...) +extern void __ext4_std_error(struct super_block *, const char *, int); +extern void ext4_abort(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); -extern void ext4_warning (struct super_block *, const char *, const char *, ...) +extern void ext4_warning(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); -extern void ext4_update_dynamic_rev (struct super_block *sb); +extern void ext4_update_dynamic_rev(struct super_block *sb); extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, __u32 compat); extern int ext4_update_rocompat_feature(handle_t *handle, @@ -1148,7 +1198,7 @@ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) static inline struct ext4_group_info *ext4_get_group_info(struct super_block *sb, - ext4_group_t group) + ext4_group_t group) { struct ext4_group_info ***grp_info; long indexv, indexh; @@ -1159,12 +1209,45 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb, } +static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, + ext4_group_t block_group) +{ + return block_group >> sbi->s_log_groups_per_flex; +} + +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) +{ + return 1 << sbi->s_log_groups_per_flex; +} + #define ext4_std_error(sb, errno) \ do { \ if ((errno)) \ - __ext4_std_error((sb), __FUNCTION__, (errno)); \ + __ext4_std_error((sb), __func__, (errno)); \ } while (0) +#ifdef CONFIG_SMP +/* Each CPU can accumulate FBC_BATCH blocks in their local + * counters. So we need to make sure we have free blocks more + * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times. + */ +#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids)) +#else +#define EXT4_FREEBLOCKS_WATERMARK 0 +#endif + +static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) +{ + /* + * XXX: replace with spinlock if seen contended -bzzz + */ + down_write(&EXT4_I(inode)->i_data_sem); + if (newsize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = newsize; + up_write(&EXT4_I(inode)->i_data_sem); + return ; +} + /* * Inodes and files operations */ @@ -1187,11 +1270,13 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations; /* extents.c */ extern int ext4_ext_tree_init(handle_t *handle, struct inode *); extern int ext4_ext_writepage_trans_blocks(struct inode *, int); +extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, + int chunk); extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned long max_blocks, struct buffer_head *bh_result, int create, int extend_disksize); -extern void ext4_ext_truncate(struct inode *, struct page *); +extern void ext4_ext_truncate(struct inode *); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, @@ -1199,7 +1284,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, unsigned long max_blocks, struct buffer_head *bh, int create, - int extend_disksize); + int extend_disksize, int flag); #endif /* __KERNEL__ */ #endif /* _EXT4_H */ diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 75333b595fab..bec7ce59fc0d 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -124,6 +124,19 @@ struct ext4_ext_path { #define EXT4_EXT_CACHE_GAP 1 #define EXT4_EXT_CACHE_EXTENT 2 +/* + * to be called by ext4_ext_walk_space() + * negative retcode - error + * positive retcode - signal for ext4_ext_walk_space(), see below + * callback must return valid extent (passed or newly created) + */ +typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *, + struct ext4_ext_cache *, + struct ext4_extent *, void *); + +#define EXT_CONTINUE 0 +#define EXT_BREAK 1 +#define EXT_REPEAT 2 #define EXT_MAX_BLOCK 0xffffffff @@ -212,15 +225,20 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); } +extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); extern int ext4_extent_tree_init(handle_t *, struct inode *); -extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); extern int ext4_ext_try_to_merge(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *); extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); +extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t, + ext_prepare_callback, void *); extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, struct ext4_ext_path *); extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h index 26a4ae255d79..5c124c0ac6d3 100644 --- a/fs/ext4/ext4_i.h +++ b/fs/ext4/ext4_i.h @@ -33,38 +33,6 @@ typedef __u32 ext4_lblk_t; /* data type for block group number */ typedef unsigned long ext4_group_t; -struct ext4_reserve_window { - ext4_fsblk_t _rsv_start; /* First byte reserved */ - ext4_fsblk_t _rsv_end; /* Last byte reserved or 0 */ -}; - -struct ext4_reserve_window_node { - struct rb_node rsv_node; - __u32 rsv_goal_size; - __u32 rsv_alloc_hit; - struct ext4_reserve_window rsv_window; -}; - -struct ext4_block_alloc_info { - /* information about reservation window */ - struct ext4_reserve_window_node rsv_window_node; - /* - * was i_next_alloc_block in ext4_inode_info - * is the logical (file-relative) number of the - * most-recently-allocated block in this file. - * We use this for detecting linearly ascending allocation requests. - */ - ext4_lblk_t last_alloc_logical_block; - /* - * Was i_next_alloc_goal in ext4_inode_info - * is the *physical* companion to i_next_alloc_block. - * it the physical block number of the block which was most-recentl - * allocated to this file. This give us the goal (target) for the next - * allocation when we detect linearly ascending requests. - */ - ext4_fsblk_t last_alloc_physical_block; -}; - #define rsv_start rsv_window._rsv_start #define rsv_end rsv_window._rsv_end @@ -79,7 +47,7 @@ struct ext4_ext_cache { }; /* - * third extended file system inode data in memory + * fourth extended file system inode data in memory */ struct ext4_inode_info { __le32 i_data[15]; /* unconverted */ @@ -97,11 +65,8 @@ struct ext4_inode_info { ext4_group_t i_block_group; __u32 i_state; /* Dynamic state flags for ext4 */ - /* block reservation info */ - struct ext4_block_alloc_info *i_block_alloc_info; - ext4_lblk_t i_dir_start_lookup; -#ifdef CONFIG_EXT4DEV_FS_XATTR +#ifdef CONFIG_EXT4_FS_XATTR /* * Extended attributes can be read independently of the main file * data. Taking i_mutex even when reading would cause contention @@ -111,7 +76,7 @@ struct ext4_inode_info { */ struct rw_semaphore xattr_sem; #endif -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL +#ifdef CONFIG_EXT4_FS_POSIX_ACL struct posix_acl *i_acl; struct posix_acl *i_default_acl; #endif @@ -150,6 +115,7 @@ struct ext4_inode_info { */ struct rw_semaphore i_data_sem; struct inode vfs_inode; + struct jbd2_inode jinode; unsigned long i_ext_generation; struct ext4_ext_cache i_cached_extent; @@ -162,6 +128,13 @@ struct ext4_inode_info { /* mballoc */ struct list_head i_prealloc_list; spinlock_t i_prealloc_lock; + + /* allocation reservation info for delalloc */ + unsigned long i_reserved_data_blocks; + unsigned long i_reserved_meta_blocks; + unsigned long i_allocated_meta_blocks; + unsigned short i_delalloc_reserved_flag; + spinlock_t i_block_reservation_lock; }; #endif /* _EXT4_I */ diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 9255a7d28b24..b455c685a98b 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -51,6 +51,14 @@ EXT4_XATTR_TRANS_BLOCKS - 2 + \ 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) +/* + * Define the number of metadata blocks we need to account to modify data. + * + * This include super block, inode block, quota blocks and xattr blocks + */ +#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ + 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) + /* Delete operations potentially hit one directory's namespace plus an * entire inode, plus arbitrary amounts of bitmap/indirection data. Be * generous. We can grow the delete transaction later if necessary. */ @@ -142,19 +150,17 @@ int __ext4_journal_dirty_metadata(const char *where, handle_t *handle, struct buffer_head *bh); #define ext4_journal_get_undo_access(handle, bh) \ - __ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh)) + __ext4_journal_get_undo_access(__func__, (handle), (bh)) #define ext4_journal_get_write_access(handle, bh) \ - __ext4_journal_get_write_access(__FUNCTION__, (handle), (bh)) + __ext4_journal_get_write_access(__func__, (handle), (bh)) #define ext4_journal_revoke(handle, blocknr, bh) \ - __ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh)) + __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) #define ext4_journal_get_create_access(handle, bh) \ - __ext4_journal_get_create_access(__FUNCTION__, (handle), (bh)) + __ext4_journal_get_create_access(__func__, (handle), (bh)) #define ext4_journal_dirty_metadata(handle, bh) \ - __ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh)) + __ext4_journal_dirty_metadata(__func__, (handle), (bh)) #define ext4_journal_forget(handle, bh) \ - __ext4_journal_forget(__FUNCTION__, (handle), (bh)) - -int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh); + __ext4_journal_forget(__func__, (handle), (bh)) handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); int __ext4_journal_stop(const char *where, handle_t *handle); @@ -165,7 +171,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) } #define ext4_journal_stop(handle) \ - __ext4_journal_stop(__FUNCTION__, (handle)) + __ext4_journal_stop(__func__, (handle)) static inline handle_t *ext4_journal_current_handle(void) { @@ -192,6 +198,11 @@ static inline int ext4_journal_force_commit(journal_t *journal) return jbd2_journal_force_commit(journal); } +static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) +{ + return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); +} + /* super.c */ int ext4_force_commit(struct super_block *sb); diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 5802e69f2191..6a0b40d43264 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -25,7 +25,7 @@ #include <linux/rbtree.h> /* - * third extended-fs super-block data in memory + * fourth extended-fs super-block data in memory */ struct ext4_sb_info { unsigned long s_desc_size; /* Size of a group descriptor in bytes */ @@ -40,8 +40,8 @@ struct ext4_sb_info { unsigned long s_blocks_last; /* Last seen block count */ loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ struct buffer_head * s_sbh; /* Buffer containing the super block */ - struct ext4_super_block * s_es; /* Pointer to the super block in the buffer */ - struct buffer_head ** s_group_desc; + struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ + struct buffer_head **s_group_desc; unsigned long s_mount_opt; ext4_fsblk_t s_sb_block; uid_t s_resuid; @@ -52,6 +52,7 @@ struct ext4_sb_info { int s_desc_per_block_bits; int s_inode_size; int s_first_ino; + unsigned int s_inode_readahead_blks; spinlock_t s_next_gen_lock; u32 s_next_generation; u32 s_hash_seed[4]; @@ -59,16 +60,17 @@ struct ext4_sb_info { struct percpu_counter s_freeblocks_counter; struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; + struct percpu_counter s_dirtyblocks_counter; struct blockgroup_lock s_blockgroup_lock; + struct proc_dir_entry *s_proc; /* root of the per fs reservation window tree */ spinlock_t s_rsv_window_lock; struct rb_root s_rsv_window_root; - struct ext4_reserve_window_node s_rsv_window_head; /* Journaling */ - struct inode * s_journal_inode; - struct journal_s * s_journal; + struct inode *s_journal_inode; + struct journal_s *s_journal; struct list_head s_orphan; unsigned long s_commit_interval; struct block_device *journal_bdev; @@ -106,12 +108,12 @@ struct ext4_sb_info { /* tunables */ unsigned long s_stripe; - unsigned long s_mb_stream_request; - unsigned long s_mb_max_to_scan; - unsigned long s_mb_min_to_scan; - unsigned long s_mb_stats; - unsigned long s_mb_order2_reqs; - unsigned long s_mb_group_prealloc; + unsigned int s_mb_stream_request; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; + unsigned int s_mb_group_prealloc; /* where last allocation was done - for stream allocation */ unsigned long s_mb_last_group; unsigned long s_mb_last_start; @@ -121,7 +123,6 @@ struct ext4_sb_info { int s_mb_history_cur; int s_mb_history_max; int s_mb_history_num; - struct proc_dir_entry *s_mb_proc; spinlock_t s_mb_history_lock; int s_mb_history_filter; @@ -143,6 +144,9 @@ struct ext4_sb_info { /* locality groups */ struct ext4_locality_group *s_locality_groups; + + unsigned int s_log_groups_per_flex; + struct flex_groups *s_flex_groups; }; #endif /* _EXT4_SB */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 47929c4e3dae..ea2ce3c0ae66 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -40,6 +40,7 @@ #include <linux/slab.h> #include <linux/falloc.h> #include <asm/uaccess.h> +#include <linux/fiemap.h> #include "ext4_jbd2.h" #include "ext4_extents.h" @@ -92,17 +93,16 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); } -static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed) +static int ext4_ext_journal_restart(handle_t *handle, int needed) { int err; if (handle->h_buffer_credits > needed) - return handle; - if (!ext4_journal_extend(handle, needed)) - return handle; - err = ext4_journal_restart(handle, needed); - - return handle; + return 0; + err = ext4_journal_extend(handle, needed); + if (err <= 0) + return err; + return ext4_journal_restart(handle, needed); } /* @@ -180,15 +180,18 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, return bg_start + colour + block; } +/* + * Allocation for a meta data block + */ static ext4_fsblk_t -ext4_ext_new_block(handle_t *handle, struct inode *inode, +ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex, int *err) { ext4_fsblk_t goal, newblock; goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); - newblock = ext4_new_block(handle, inode, goal, err); + newblock = ext4_new_meta_block(handle, inode, goal, err); return newblock; } @@ -246,6 +249,36 @@ static int ext4_ext_space_root_idx(struct inode *inode) return size; } +/* + * Calculate the number of metadata blocks needed + * to allocate @blocks + * Worse case is one block per extent + */ +int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks) +{ + int lcap, icap, rcap, leafs, idxs, num; + int newextents = blocks; + + rcap = ext4_ext_space_root_idx(inode); + lcap = ext4_ext_space_block(inode); + icap = ext4_ext_space_block_idx(inode); + + /* number of new leaf blocks needed */ + num = leafs = (newextents + lcap - 1) / lcap; + + /* + * Worse case, we need separate index block(s) + * to link all new leaf blocks + */ + idxs = (leafs + icap - 1) / icap; + do { + num += idxs; + idxs = (idxs + icap - 1) / icap; + } while (idxs > rcap); + + return num; +} + static int ext4_ext_max_entries(struct inode *inode, int depth) { @@ -351,8 +384,8 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) ext_debug("\n"); } #else -#define ext4_ext_show_path(inode,path) -#define ext4_ext_show_leaf(inode,path) +#define ext4_ext_show_path(inode, path) +#define ext4_ext_show_leaf(inode, path) #endif void ext4_ext_drop_refs(struct ext4_ext_path *path) @@ -408,9 +441,10 @@ ext4_ext_binsearch_idx(struct inode *inode, for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { if (k != 0 && le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { - printk("k=%d, ix=0x%p, first=0x%p\n", k, - ix, EXT_FIRST_INDEX(eh)); - printk("%u <= %u\n", + printk(KERN_DEBUG "k=%d, ix=0x%p, " + "first=0x%p\n", k, + ix, EXT_FIRST_INDEX(eh)); + printk(KERN_DEBUG "%u <= %u\n", le32_to_cpu(ix->ei_block), le32_to_cpu(ix[-1].ei_block)); } @@ -524,6 +558,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, alloc = 1; } path[0].p_hdr = eh; + path[0].p_bh = NULL; i = depth; /* walk through the tree */ @@ -552,12 +587,14 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, } path[ppos].p_depth = i; - path[ppos].p_hdr = eh; path[ppos].p_ext = NULL; path[ppos].p_idx = NULL; /* find extent */ ext4_ext_binsearch(inode, path + ppos, block); + /* if not an empty leaf */ + if (path[ppos].p_ext) + path[ppos].p_block = ext_pblock(path[ppos].p_ext); ext4_ext_show_path(inode, path); @@ -688,7 +725,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, /* allocate all needed blocks */ ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); for (a = 0; a < depth - at; a++) { - newblock = ext4_ext_new_block(handle, inode, path, newext, &err); + newblock = ext4_ext_new_meta_block(handle, inode, path, + newext, &err); if (newblock == 0) goto cleanup; ablocks[a] = newblock; @@ -884,7 +922,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, ext4_fsblk_t newblock; int err = 0; - newblock = ext4_ext_new_block(handle, inode, path, newext, &err); + newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err); if (newblock == 0) return err; @@ -981,6 +1019,8 @@ repeat: /* if we found index with free entry, then use that * entry: create all needed subtree and add new leaf */ err = ext4_ext_split(handle, inode, path, newext, i); + if (err) + goto out; /* refill path */ ext4_ext_drop_refs(path); @@ -1403,7 +1443,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode, /* * get the next allocated block if the extent in the path - * is before the requested block(s) + * is before the requested block(s) */ if (b2 < b1) { b2 = ext4_ext_next_allocated_block(path); @@ -1437,7 +1477,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *newext) { - struct ext4_extent_header * eh; + struct ext4_extent_header *eh; struct ext4_extent *ex, *fex; struct ext4_extent *nearex; /* nearest extent */ struct ext4_ext_path *npath = NULL; @@ -1587,6 +1627,113 @@ cleanup: return err; } +int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, + ext4_lblk_t num, ext_prepare_callback func, + void *cbdata) +{ + struct ext4_ext_path *path = NULL; + struct ext4_ext_cache cbex; + struct ext4_extent *ex; + ext4_lblk_t next, start = 0, end = 0; + ext4_lblk_t last = block + num; + int depth, exists, err = 0; + + BUG_ON(func == NULL); + BUG_ON(inode == NULL); + + while (block < last && block != EXT_MAX_BLOCK) { + num = last - block; + /* find extent for this block */ + path = ext4_ext_find_extent(inode, block, path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + break; + } + + depth = ext_depth(inode); + BUG_ON(path[depth].p_hdr == NULL); + ex = path[depth].p_ext; + next = ext4_ext_next_allocated_block(path); + + exists = 0; + if (!ex) { + /* there is no extent yet, so try to allocate + * all requested space */ + start = block; + end = block + num; + } else if (le32_to_cpu(ex->ee_block) > block) { + /* need to allocate space before found extent */ + start = block; + end = le32_to_cpu(ex->ee_block); + if (block + num < end) + end = block + num; + } else if (block >= le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex)) { + /* need to allocate space after found extent */ + start = block; + end = block + num; + if (end >= next) + end = next; + } else if (block >= le32_to_cpu(ex->ee_block)) { + /* + * some part of requested space is covered + * by found extent + */ + start = block; + end = le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex); + if (block + num < end) + end = block + num; + exists = 1; + } else { + BUG(); + } + BUG_ON(end <= start); + + if (!exists) { + cbex.ec_block = start; + cbex.ec_len = end - start; + cbex.ec_start = 0; + cbex.ec_type = EXT4_EXT_CACHE_GAP; + } else { + cbex.ec_block = le32_to_cpu(ex->ee_block); + cbex.ec_len = ext4_ext_get_actual_len(ex); + cbex.ec_start = ext_pblock(ex); + cbex.ec_type = EXT4_EXT_CACHE_EXTENT; + } + + BUG_ON(cbex.ec_len == 0); + err = func(inode, path, &cbex, ex, cbdata); + ext4_ext_drop_refs(path); + + if (err < 0) + break; + + if (err == EXT_REPEAT) + continue; + else if (err == EXT_BREAK) { + err = 0; + break; + } + + if (ext_depth(inode) != depth) { + /* depth was changed. we have to realloc path */ + kfree(path); + path = NULL; + } + + block = cbex.ec_block + cbex.ec_len; + } + + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + + return err; +} + static void ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, __u32 len, ext4_fsblk_t start, int type) @@ -1709,54 +1856,61 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, } /* - * ext4_ext_calc_credits_for_insert: - * This routine returns max. credits that the extent tree can consume. - * It should be OK for low-performance paths like ->writepage() - * To allow many writing processes to fit into a single transaction, - * the caller should calculate credits under i_data_sem and - * pass the actual path. + * ext4_ext_calc_credits_for_single_extent: + * This routine returns max. credits that needed to insert an extent + * to the extent tree. + * When pass the actual path, the caller should calculate credits + * under i_data_sem. */ -int ext4_ext_calc_credits_for_insert(struct inode *inode, +int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, struct ext4_ext_path *path) { - int depth, needed; - if (path) { + int depth = ext_depth(inode); + int ret = 0; + /* probably there is space in leaf? */ - depth = ext_depth(inode); if (le16_to_cpu(path[depth].p_hdr->eh_entries) - < le16_to_cpu(path[depth].p_hdr->eh_max)) - return 1; - } - - /* - * given 32-bit logical block (4294967296 blocks), max. tree - * can be 4 levels in depth -- 4 * 340^4 == 53453440000. - * Let's also add one more level for imbalance. - */ - depth = 5; + < le16_to_cpu(path[depth].p_hdr->eh_max)) { - /* allocation of new data block(s) */ - needed = 2; + /* + * There are some space in the leaf tree, no + * need to account for leaf block credit + * + * bitmaps and block group descriptor blocks + * and other metadat blocks still need to be + * accounted. + */ + /* 1 bitmap, 1 block group descriptor */ + ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); + } + } - /* - * tree can be full, so it would need to grow in depth: - * we need one credit to modify old root, credits for - * new root will be added in split accounting - */ - needed += 1; + return ext4_chunk_trans_blocks(inode, nrblocks); +} - /* - * Index split can happen, we would need: - * allocate intermediate indexes (bitmap + group) - * + change two blocks at each level, but root (already included) - */ - needed += (depth * 2) + (depth * 2); +/* + * How many index/leaf blocks need to change/allocate to modify nrblocks? + * + * if nrblocks are fit in a single extent (chunk flag is 1), then + * in the worse case, each tree level index/leaf need to be changed + * if the tree split due to insert a new extent, then the old tree + * index/leaf need to be updated too + * + * If the nrblocks are discontiguous, they could cause + * the whole tree split more than once, but this is really rare. + */ +int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + int index; + int depth = ext_depth(inode); - /* any allocation modifies superblock */ - needed += 1; + if (chunk) + index = depth * 2; + else + index = depth * 3; - return needed; + return index; } static int ext4_remove_blocks(handle_t *handle, struct inode *inode, @@ -1872,22 +2026,22 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, BUG_ON(b != ex_ee_block + ex_ee_len - 1); } - /* at present, extent can't cross block group: */ - /* leaf + bitmap + group desc + sb + inode */ - credits = 5; + /* + * 3 for leaf, sb, and inode plus 2 (bmap and group + * descriptor) for each block group; assume two block + * groups plus ex_ee_len/blocks_per_block_group for + * the worst case + */ + credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); if (ex == EXT_FIRST_EXTENT(eh)) { correct_index = 1; credits += (ext_depth(inode)) + 1; } -#ifdef CONFIG_QUOTA credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); -#endif - handle = ext4_ext_journal_restart(handle, credits); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); + err = ext4_ext_journal_restart(handle, credits); + if (err) goto out; - } err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -2097,7 +2251,7 @@ void ext4_ext_init(struct super_block *sb) */ if (test_opt(sb, EXTENTS)) { - printk("EXT4-fs: file extents enabled"); + printk(KERN_INFO "EXT4-fs: file extents enabled"); #ifdef AGGRESSIVE_TEST printk(", aggressive tests"); #endif @@ -2287,7 +2441,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, unsigned int newdepth; /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */ if (allocated <= EXT4_EXT_ZERO_LEN) { - /* Mark first half uninitialized. + /* + * iblock == ee_block is handled by the zerouout + * at the beginning. + * Mark first half uninitialized. * Mark second half initialized and zero out the * initialized extent */ @@ -2310,7 +2467,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ex->ee_len = orig_ex.ee_len; ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); - /* zeroed the full extent */ + /* blocks available from iblock */ return allocated; } else if (err) @@ -2338,6 +2495,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, err = PTR_ERR(path); return err; } + /* get the second half extent details */ ex = path[depth].p_ext; err = ext4_ext_get_access(handle, inode, path + depth); @@ -2367,6 +2525,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* zeroed the full extent */ + /* blocks available from iblock */ return allocated; } else if (err) @@ -2382,23 +2541,22 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, */ orig_ex.ee_len = cpu_to_le16(ee_len - ext4_ext_get_actual_len(ex3)); - if (newdepth != depth) { - depth = newdepth; - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, iblock, path); - if (IS_ERR(path)) { - err = PTR_ERR(path); - goto out; - } - eh = path[depth].p_hdr; - ex = path[depth].p_ext; - if (ex2 != &newex) - ex2 = ex; - - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; + depth = newdepth; + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, iblock, path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; } + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + if (ex2 != &newex) + ex2 = ex; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + allocated = max_blocks; /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying @@ -2416,6 +2574,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* zero out the first half */ + /* blocks available from iblock */ return allocated; } } @@ -2529,6 +2688,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, int err = 0, depth, ret; unsigned long allocated = 0; struct ext4_allocation_request ar; + loff_t disksize; __clear_bit(BH_New, &bh_result->b_state); ext_debug("blocks %u/%lu requested for inode %u\n", @@ -2616,8 +2776,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, */ if (allocated > max_blocks) allocated = max_blocks; - /* mark the buffer unwritten */ - __set_bit(BH_Unwritten, &bh_result->b_state); + set_buffer_unwritten(bh_result); goto out2; } @@ -2646,11 +2805,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, goto out2; } /* - * Okay, we need to do block allocation. Lazily initialize the block - * allocation info here if necessary. + * Okay, we need to do block allocation. */ - if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info)) - ext4_init_block_alloc_info(inode); /* find neighbour allocated blocks */ ar.lleft = iblock; @@ -2710,20 +2866,25 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* free data blocks we just allocated */ /* not a good idea to call discard here directly, * but otherwise we'd need to call it every free() */ - ext4_mb_discard_inode_preallocations(inode); + ext4_discard_preallocations(inode); ext4_free_blocks(handle, inode, ext_pblock(&newex), ext4_ext_get_actual_len(&newex), 0); goto out2; } - if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = inode->i_size; - /* previous routine could use block we allocated */ newblock = ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); outnew: - __set_bit(BH_New, &bh_result->b_state); + if (extend_disksize) { + disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; + } + + set_buffer_new(bh_result); /* Cache only when it is _not_ an uninitialized extent */ if (create != EXT4_CREATE_UNINITIALIZED_EXT) @@ -2733,7 +2894,7 @@ out: if (allocated > max_blocks) allocated = max_blocks; ext4_ext_show_leaf(inode, path); - __set_bit(BH_Mapped, &bh_result->b_state); + set_buffer_mapped(bh_result); bh_result->b_bdev = inode->i_sb->s_bdev; bh_result->b_blocknr = newblock; out2: @@ -2744,7 +2905,7 @@ out2: return err ? err : allocated; } -void ext4_ext_truncate(struct inode * inode, struct page *page) +void ext4_ext_truncate(struct inode *inode) { struct address_space *mapping = inode->i_mapping; struct super_block *sb = inode->i_sb; @@ -2755,33 +2916,27 @@ void ext4_ext_truncate(struct inode * inode, struct page *page) /* * probably first extent we're gonna free will be last in block */ - err = ext4_writepage_trans_blocks(inode) + 3; + err = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, err); - if (IS_ERR(handle)) { - if (page) { - clear_highpage(page); - flush_dcache_page(page); - unlock_page(page); - page_cache_release(page); - } + if (IS_ERR(handle)) return; - } - if (page) - ext4_block_truncate_page(handle, page, mapping, inode->i_size); + if (inode->i_size & (sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, mapping, inode->i_size); + + if (ext4_orphan_add(handle, inode)) + goto out_stop; down_write(&EXT4_I(inode)->i_data_sem); ext4_ext_invalidate_cache(inode); - ext4_mb_discard_inode_preallocations(inode); + ext4_discard_preallocations(inode); /* * TODO: optimization is possible here. * Probably we need not scan at all, * because page truncation is enough. */ - if (ext4_orphan_add(handle, inode)) - goto out_stop; /* we have to know where to truncate from in crash case */ EXT4_I(inode)->i_disksize = inode->i_size; @@ -2798,6 +2953,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page) handle->h_sync = 1; out_stop: + up_write(&EXT4_I(inode)->i_data_sem); /* * If this was a simple ftruncate() and the file will remain alive, * then we need to clear up the orphan record which we created above. @@ -2808,33 +2964,11 @@ out_stop: if (inode->i_nlink) ext4_orphan_del(handle, inode); - up_write(&EXT4_I(inode)->i_data_sem); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); } -/* - * ext4_ext_writepage_trans_blocks: - * calculate max number of blocks we could modify - * in order to allocate new block for an inode - */ -int ext4_ext_writepage_trans_blocks(struct inode *inode, int num) -{ - int needed; - - needed = ext4_ext_calc_credits_for_insert(inode, NULL); - - /* caller wants to allocate num blocks, but note it includes sb */ - needed = needed * num - (num - 1); - -#ifdef CONFIG_QUOTA - needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); -#endif - - return needed; -} - static void ext4_falloc_update_inode(struct inode *inode, int mode, loff_t new_size, int update_ctime) { @@ -2849,10 +2983,11 @@ static void ext4_falloc_update_inode(struct inode *inode, * Update only when preallocation was requested beyond * the file size. */ - if (!(mode & FALLOC_FL_KEEP_SIZE) && - new_size > i_size_read(inode)) { - i_size_write(inode, new_size); - EXT4_I(inode)->i_disksize = new_size; + if (!(mode & FALLOC_FL_KEEP_SIZE)) { + if (new_size > i_size_read(inode)) + i_size_write(inode, new_size); + if (new_size > EXT4_I(inode)->i_disksize) + ext4_update_i_disksize(inode, new_size); } } @@ -2895,10 +3030,9 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - block; /* - * credits to insert 1 extent into extent tree + buffers to be able to - * modify 1 super block, 1 block bitmap and 1 group descriptor. + * credits to insert 1 extent into extent tree */ - credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3; + credits = ext4_chunk_trans_blocks(inode, max_blocks); mutex_lock(&inode->i_mutex); retry: while (ret >= 0 && ret < max_blocks) { @@ -2911,7 +3045,7 @@ retry: } ret = ext4_get_blocks_wrap(handle, inode, block, max_blocks, &map_bh, - EXT4_CREATE_UNINITIALIZED_EXT, 0); + EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); if (ret <= 0) { #ifdef EXT4FS_DEBUG WARN_ON(ret <= 0); @@ -2945,3 +3079,143 @@ retry: mutex_unlock(&inode->i_mutex); return ret > 0 ? ret2 : ret; } + +/* + * Callback function called for each extent to gather FIEMAP information. + */ +int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, + struct ext4_ext_cache *newex, struct ext4_extent *ex, + void *data) +{ + struct fiemap_extent_info *fieinfo = data; + unsigned long blksize_bits = inode->i_sb->s_blocksize_bits; + __u64 logical; + __u64 physical; + __u64 length; + __u32 flags = 0; + int error; + + logical = (__u64)newex->ec_block << blksize_bits; + + if (newex->ec_type == EXT4_EXT_CACHE_GAP) { + pgoff_t offset; + struct page *page; + struct buffer_head *bh = NULL; + + offset = logical >> PAGE_SHIFT; + page = find_get_page(inode->i_mapping, offset); + if (!page || !page_has_buffers(page)) + return EXT_CONTINUE; + + bh = page_buffers(page); + + if (!bh) + return EXT_CONTINUE; + + if (buffer_delay(bh)) { + flags |= FIEMAP_EXTENT_DELALLOC; + page_cache_release(page); + } else { + page_cache_release(page); + return EXT_CONTINUE; + } + } + + physical = (__u64)newex->ec_start << blksize_bits; + length = (__u64)newex->ec_len << blksize_bits; + + if (ex && ext4_ext_is_uninitialized(ex)) + flags |= FIEMAP_EXTENT_UNWRITTEN; + + /* + * If this extent reaches EXT_MAX_BLOCK, it must be last. + * + * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK, + * this also indicates no more allocated blocks. + * + * XXX this might miss a single-block extent at EXT_MAX_BLOCK + */ + if (logical + length - 1 == EXT_MAX_BLOCK || + ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK) + flags |= FIEMAP_EXTENT_LAST; + + error = fiemap_fill_next_extent(fieinfo, logical, physical, + length, flags); + if (error < 0) + return error; + if (error == 1) + return EXT_BREAK; + + return EXT_CONTINUE; +} + +/* fiemap flags we can handle specified here */ +#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) + +int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) +{ + __u64 physical = 0; + __u64 length; + __u32 flags = FIEMAP_EXTENT_LAST; + int blockbits = inode->i_sb->s_blocksize_bits; + int error = 0; + + /* in-inode? */ + if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) { + struct ext4_iloc iloc; + int offset; /* offset of xattr in inode */ + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + physical = iloc.bh->b_blocknr << blockbits; + offset = EXT4_GOOD_OLD_INODE_SIZE + + EXT4_I(inode)->i_extra_isize; + physical += offset; + length = EXT4_SB(inode->i_sb)->s_inode_size - offset; + flags |= FIEMAP_EXTENT_DATA_INLINE; + } else { /* external block */ + physical = EXT4_I(inode)->i_file_acl << blockbits; + length = inode->i_sb->s_blocksize; + } + + if (physical) + error = fiemap_fill_next_extent(fieinfo, 0, physical, + length, flags); + return (error < 0 ? error : 0); +} + +int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + ext4_lblk_t start_blk; + ext4_lblk_t len_blks; + int error = 0; + + /* fallback to generic here if not in extents fmt */ + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + return generic_block_fiemap(inode, fieinfo, start, len, + ext4_get_block); + + if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) + return -EBADR; + + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { + error = ext4_xattr_fiemap(inode, fieinfo); + } else { + start_blk = start >> inode->i_sb->s_blocksize_bits; + len_blks = len >> inode->i_sb->s_blocksize_bits; + + /* + * Walk the extent tree gathering extent information. + * ext4_ext_fiemap_cb will push extents back to user. + */ + down_write(&EXT4_I(inode)->i_data_sem); + error = ext4_ext_walk_space(inode, start_blk, len_blks, + ext4_ext_fiemap_cb, fieinfo); + up_write(&EXT4_I(inode)->i_data_sem); + } + + return error; +} + diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 4159be6366ab..6bd11fba71f7 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -31,14 +31,14 @@ * from ext4_file_open: open gets called at every open, but release * gets called only when /all/ the files are closed. */ -static int ext4_release_file (struct inode * inode, struct file * filp) +static int ext4_release_file(struct inode *inode, struct file *filp) { /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && (atomic_read(&inode->i_writecount) == 1)) { down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_reservation(inode); + ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); } if (is_dx(inode) && filp->private_data) @@ -123,6 +123,26 @@ force_commit: return ret; } +static struct vm_operations_struct ext4_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = ext4_page_mkwrite, +}; + +static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct address_space *mapping = file->f_mapping; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + file_accessed(file); + vma->vm_ops = &ext4_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + return 0; +} + +extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); + const struct file_operations ext4_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -133,7 +153,7 @@ const struct file_operations ext4_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ext4_file_mmap, .open = generic_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, @@ -144,7 +164,8 @@ const struct file_operations ext4_file_operations = { const struct inode_operations ext4_file_inode_operations = { .truncate = ext4_truncate, .setattr = ext4_setattr, -#ifdef CONFIG_EXT4DEV_FS_XATTR + .getattr = ext4_getattr, +#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, @@ -152,5 +173,6 @@ const struct inode_operations ext4_file_inode_operations = { #endif .permission = ext4_permission, .fallocate = ext4_fallocate, + .fiemap = ext4_fiemap, }; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 1c8ba48d4f8d..5afe4370840b 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -27,6 +27,8 @@ #include <linux/sched.h> #include <linux/writeback.h> #include <linux/jbd2.h> +#include <linux/blkdev.h> +#include <linux/marker.h> #include "ext4.h" #include "ext4_jbd2.h" @@ -42,13 +44,18 @@ * inode to disk. */ -int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) +int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) { struct inode *inode = dentry->d_inode; + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; int ret = 0; J_ASSERT(ext4_journal_current_handle() == NULL); + trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld", + inode->i_sb->s_id, datasync, inode->i_ino, + dentry->d_parent->d_inode->i_ino); + /* * data=writeback: * The caller's filemap_fdatawrite()/wait will sync the data. @@ -85,6 +92,8 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) .nr_to_write = 0, /* sys_fsync did this */ }; ret = sync_inode(inode, &wbc); + if (journal && (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); } out: return ret; diff --git a/fs/ext4/group.h b/fs/ext4/group.h index 7eb0604e7eea..c2c0a8d06d0e 100644 --- a/fs/ext4/group.h +++ b/fs/ext4/group.h @@ -13,7 +13,7 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, struct ext4_group_desc *gdp); extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, struct ext4_group_desc *gdp); -struct buffer_head *read_block_bitmap(struct super_block *sb, +struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group); extern unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 1d6329dbe390..556ca8eba3db 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -27,7 +27,7 @@ static void TEA_transform(__u32 buf[4], __u32 const in[]) sum += DELTA; b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); - } while(--n); + } while (--n); buf[0] += b0; buf[1] += b1; @@ -35,7 +35,7 @@ static void TEA_transform(__u32 buf[4], __u32 const in[]) /* The old legacy hash */ -static __u32 dx_hack_hash (const char *name, int len) +static __u32 dx_hack_hash(const char *name, int len) { __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; while (len--) { @@ -59,7 +59,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) val = pad; if (len > num*4) len = num * 4; - for (i=0; i < len; i++) { + for (i = 0; i < len; i++) { if ((i % 4) == 0) val = pad; val = msg[i] + (val << 8); @@ -104,7 +104,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) /* Check to see if the seed is all zero's */ if (hinfo->seed) { - for (i=0; i < 4; i++) { + for (i = 0; i < 4; i++) { if (hinfo->seed[i]) break; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index c6efbab0c801..fe34d74cfb19 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -97,34 +97,46 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, * Return buffer_head of bitmap on success or NULL. */ static struct buffer_head * -read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) +ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) { struct ext4_group_desc *desc; struct buffer_head *bh = NULL; + ext4_fsblk_t bitmap_blk; desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) - goto error_out; + return NULL; + bitmap_blk = ext4_inode_bitmap(sb, desc); + bh = sb_getblk(sb, bitmap_blk); + if (unlikely(!bh)) { + ext4_error(sb, __func__, + "Cannot read inode bitmap - " + "block_group = %lu, inode_bitmap = %llu", + block_group, bitmap_blk); + return NULL; + } + if (buffer_uptodate(bh) && + !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) + return bh; + + lock_buffer(bh); + spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { - bh = sb_getblk(sb, ext4_inode_bitmap(sb, desc)); - if (!buffer_uptodate(bh)) { - lock_buffer(bh); - if (!buffer_uptodate(bh)) { - ext4_init_inode_bitmap(sb, bh, block_group, - desc); - set_buffer_uptodate(bh); - } - unlock_buffer(bh); - } - } else { - bh = sb_bread(sb, ext4_inode_bitmap(sb, desc)); + ext4_init_inode_bitmap(sb, bh, block_group, desc); + set_buffer_uptodate(bh); + unlock_buffer(bh); + spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + return bh; } - if (!bh) - ext4_error(sb, "read_inode_bitmap", + spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + if (bh_submit_read(bh) < 0) { + put_bh(bh); + ext4_error(sb, __func__, "Cannot read inode bitmap - " "block_group = %lu, inode_bitmap = %llu", - block_group, ext4_inode_bitmap(sb, desc)); -error_out: + block_group, bitmap_blk); + return NULL; + } return bh; } @@ -144,38 +156,40 @@ error_out: * though), and then we'd have two inodes sharing the * same inode number and space on the harddisk. */ -void ext4_free_inode (handle_t *handle, struct inode * inode) +void ext4_free_inode(handle_t *handle, struct inode *inode) { - struct super_block * sb = inode->i_sb; + struct super_block *sb = inode->i_sb; int is_directory; unsigned long ino; struct buffer_head *bitmap_bh = NULL; struct buffer_head *bh2; ext4_group_t block_group; unsigned long bit; - struct ext4_group_desc * gdp; - struct ext4_super_block * es; + struct ext4_group_desc *gdp; + struct ext4_super_block *es; struct ext4_sb_info *sbi; int fatal = 0, err; + ext4_group_t flex_group; if (atomic_read(&inode->i_count) > 1) { - printk ("ext4_free_inode: inode has count=%d\n", - atomic_read(&inode->i_count)); + printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", + atomic_read(&inode->i_count)); return; } if (inode->i_nlink) { - printk ("ext4_free_inode: inode has nlink=%d\n", - inode->i_nlink); + printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n", + inode->i_nlink); return; } if (!sb) { - printk("ext4_free_inode: inode on nonexistent device\n"); + printk(KERN_ERR "ext4_free_inode: inode on " + "nonexistent device\n"); return; } sbi = EXT4_SB(sb); ino = inode->i_ino; - ext4_debug ("freeing inode %lu\n", ino); + ext4_debug("freeing inode %lu\n", ino); /* * Note: we must free any quota before locking the superblock, @@ -189,17 +203,17 @@ void ext4_free_inode (handle_t *handle, struct inode * inode) is_directory = S_ISDIR(inode->i_mode); /* Do this BEFORE marking the inode not in use or returning an error */ - clear_inode (inode); + clear_inode(inode); es = EXT4_SB(sb)->s_es; if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { - ext4_error (sb, "ext4_free_inode", - "reserved or nonexistent inode %lu", ino); + ext4_error(sb, "ext4_free_inode", + "reserved or nonexistent inode %lu", ino); goto error_return; } block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); - bitmap_bh = read_inode_bitmap(sb, block_group); + bitmap_bh = ext4_read_inode_bitmap(sb, block_group); if (!bitmap_bh) goto error_return; @@ -211,10 +225,10 @@ void ext4_free_inode (handle_t *handle, struct inode * inode) /* Ok, now we can actually update the inode bitmaps.. */ if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), bit, bitmap_bh->b_data)) - ext4_error (sb, "ext4_free_inode", - "bit already cleared for inode %lu", ino); + ext4_error(sb, "ext4_free_inode", + "bit already cleared for inode %lu", ino); else { - gdp = ext4_get_group_desc (sb, block_group, &bh2); + gdp = ext4_get_group_desc(sb, block_group, &bh2); BUFFER_TRACE(bh2, "get_write_access"); fatal = ext4_journal_get_write_access(handle, bh2); @@ -232,6 +246,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode) if (is_directory) percpu_counter_dec(&sbi->s_dirs_counter); + if (sbi->s_log_groups_per_flex) { + flex_group = ext4_flex_group(sbi, block_group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_inodes++; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } } BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); err = ext4_journal_dirty_metadata(handle, bh2); @@ -270,7 +290,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent, avefreei = freei / ngroups; for (group = 0; group < ngroups; group++) { - desc = ext4_get_group_desc (sb, group, NULL); + desc = ext4_get_group_desc(sb, group, NULL); if (!desc || !desc->bg_free_inodes_count) continue; if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) @@ -286,6 +306,80 @@ static int find_group_dir(struct super_block *sb, struct inode *parent, return ret; } +#define free_block_ratio 10 + +static int find_group_flex(struct super_block *sb, struct inode *parent, + ext4_group_t *best_group) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *desc; + struct buffer_head *bh; + struct flex_groups *flex_group = sbi->s_flex_groups; + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; + ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); + ext4_group_t ngroups = sbi->s_groups_count; + int flex_size = ext4_flex_bg_size(sbi); + ext4_group_t best_flex = parent_fbg_group; + int blocks_per_flex = sbi->s_blocks_per_group * flex_size; + int flexbg_free_blocks; + int flex_freeb_ratio; + ext4_group_t n_fbg_groups; + ext4_group_t i; + + n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >> + sbi->s_log_groups_per_flex; + +find_close_to_parent: + flexbg_free_blocks = flex_group[best_flex].free_blocks; + flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; + if (flex_group[best_flex].free_inodes && + flex_freeb_ratio > free_block_ratio) + goto found_flexbg; + + if (best_flex && best_flex == parent_fbg_group) { + best_flex--; + goto find_close_to_parent; + } + + for (i = 0; i < n_fbg_groups; i++) { + if (i == parent_fbg_group || i == parent_fbg_group - 1) + continue; + + flexbg_free_blocks = flex_group[i].free_blocks; + flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; + + if (flex_freeb_ratio > free_block_ratio && + flex_group[i].free_inodes) { + best_flex = i; + goto found_flexbg; + } + + if (flex_group[best_flex].free_inodes == 0 || + (flex_group[i].free_blocks > + flex_group[best_flex].free_blocks && + flex_group[i].free_inodes)) + best_flex = i; + } + + if (!flex_group[best_flex].free_inodes || + !flex_group[best_flex].free_blocks) + return -1; + +found_flexbg: + for (i = best_flex * flex_size; i < ngroups && + i < (best_flex + 1) * flex_size; i++) { + desc = ext4_get_group_desc(sb, i, &bh); + if (le16_to_cpu(desc->bg_free_inodes_count)) { + *best_group = i; + goto out; + } + } + + return -1; +out: + return 0; +} + /* * Orlov's allocator for directories. * @@ -485,22 +579,23 @@ static int find_group_other(struct super_block *sb, struct inode *parent, * For other inodes, search forward from the parent directory's block * group to find a free inode. */ -struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) +struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) { struct super_block *sb; struct buffer_head *bitmap_bh = NULL; struct buffer_head *bh2; ext4_group_t group = 0; unsigned long ino = 0; - struct inode * inode; - struct ext4_group_desc * gdp = NULL; - struct ext4_super_block * es; + struct inode *inode; + struct ext4_group_desc *gdp = NULL; + struct ext4_super_block *es; struct ext4_inode_info *ei; struct ext4_sb_info *sbi; int ret2, err = 0; struct inode *ret; ext4_group_t i; int free = 0; + ext4_group_t flex_group; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) @@ -514,14 +609,21 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) sbi = EXT4_SB(sb); es = sbi->s_es; + + if (sbi->s_log_groups_per_flex) { + ret2 = find_group_flex(sb, dir, &group); + goto got_group; + } + if (S_ISDIR(mode)) { - if (test_opt (sb, OLDALLOC)) + if (test_opt(sb, OLDALLOC)) ret2 = find_group_dir(sb, dir, &group); else ret2 = find_group_orlov(sb, dir, &group); } else ret2 = find_group_other(sb, dir, &group); +got_group: err = -ENOSPC; if (ret2 == -1) goto out; @@ -534,7 +636,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) goto fail; brelse(bitmap_bh); - bitmap_bh = read_inode_bitmap(sb, group); + bitmap_bh = ext4_read_inode_bitmap(sb, group); if (!bitmap_bh) goto fail; @@ -600,7 +702,7 @@ got: /* We may have to initialize the block bitmap if it isn't already */ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - struct buffer_head *block_bh = read_block_bitmap(sb, group); + struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group); BUFFER_TRACE(block_bh, "get block bitmap access"); err = ext4_journal_get_write_access(handle, block_bh); @@ -639,7 +741,7 @@ got: /* When marking the block group with * ~EXT4_BG_INODE_UNINIT we don't want to depend - * on the value of bg_itable_unsed even though + * on the value of bg_itable_unused even though * mke2fs could have initialized the same for us. * Instead we calculated the value below */ @@ -676,8 +778,15 @@ got: percpu_counter_inc(&sbi->s_dirs_counter); sb->s_dirt = 1; + if (sbi->s_log_groups_per_flex) { + flex_group = ext4_flex_group(sbi, group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_inodes--; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } + inode->i_uid = current->fsuid; - if (test_opt (sb, GRPID)) + if (test_opt(sb, GRPID)) inode->i_gid = dir->i_gid; else if (dir->i_mode & S_ISGID) { inode->i_gid = dir->i_gid; @@ -710,7 +819,6 @@ got: ei->i_flags &= ~EXT4_DIRSYNC_FL; ei->i_file_acl = 0; ei->i_dtime = 0; - ei->i_block_alloc_info = NULL; ei->i_block_group = group; ext4_set_inode_flags(inode); @@ -726,7 +834,7 @@ got: ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; ret = inode; - if(DQUOT_ALLOC_INODE(inode)) { + if (DQUOT_ALLOC_INODE(inode)) { err = -EDQUOT; goto fail_drop; } @@ -735,19 +843,15 @@ got: if (err) goto fail_free_drop; - err = ext4_init_security(handle,inode, dir); + err = ext4_init_security(handle, inode, dir); if (err) goto fail_free_drop; if (test_opt(sb, EXTENTS)) { - /* set extent flag only for diretory, file and normal symlink*/ + /* set extent flag only for directory, file and normal symlink*/ if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; ext4_ext_tree_init(handle, inode); - err = ext4_update_incompat_feature(handle, sb, - EXT4_FEATURE_INCOMPAT_EXTENTS); - if (err) - goto fail_free_drop; } } @@ -799,7 +903,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); - bitmap_bh = read_inode_bitmap(sb, block_group); + bitmap_bh = ext4_read_inode_bitmap(sb, block_group); if (!bitmap_bh) { ext4_warning(sb, __func__, "inode bitmap error for orphan %lu", ino); @@ -817,6 +921,14 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) if (IS_ERR(inode)) goto iget_failed; + /* + * If the orphans has i_nlinks > 0 then it should be able to be + * truncated, otherwise it won't be removed from the orphan list + * during processing and an infinite loop will result. + */ + if (inode->i_nlink && !ext4_can_truncate(inode)) + goto bad_orphan; + if (NEXT_ORPHAN(inode) > max_ino) goto bad_orphan; brelse(bitmap_bh); @@ -838,6 +950,7 @@ bad_orphan: printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", NEXT_ORPHAN(inode)); printk(KERN_NOTICE "max_ino=%lu\n", max_ino); + printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); /* Avoid freeing blocks if we got a bad deleted inode */ if (inode->i_nlink == 0) inode->i_blocks = 0; @@ -848,7 +961,7 @@ error: return ERR_PTR(err); } -unsigned long ext4_count_free_inodes (struct super_block * sb) +unsigned long ext4_count_free_inodes(struct super_block *sb) { unsigned long desc_count; struct ext4_group_desc *gdp; @@ -863,12 +976,12 @@ unsigned long ext4_count_free_inodes (struct super_block * sb) bitmap_count = 0; gdp = NULL; for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { - gdp = ext4_get_group_desc (sb, i, NULL); + gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; desc_count += le16_to_cpu(gdp->bg_free_inodes_count); brelse(bitmap_bh); - bitmap_bh = read_inode_bitmap(sb, i); + bitmap_bh = ext4_read_inode_bitmap(sb, i); if (!bitmap_bh) continue; @@ -878,13 +991,14 @@ unsigned long ext4_count_free_inodes (struct super_block * sb) bitmap_count += x; } brelse(bitmap_bh); - printk("ext4_count_free_inodes: stored = %u, computed = %lu, %lu\n", - le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); + printk(KERN_DEBUG "ext4_count_free_inodes: " + "stored = %u, computed = %lu, %lu\n", + le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); return desc_count; #else desc_count = 0; for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { - gdp = ext4_get_group_desc (sb, i, NULL); + gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; desc_count += le16_to_cpu(gdp->bg_free_inodes_count); @@ -895,13 +1009,13 @@ unsigned long ext4_count_free_inodes (struct super_block * sb) } /* Called at mount-time, super-block is locked */ -unsigned long ext4_count_dirs (struct super_block * sb) +unsigned long ext4_count_dirs(struct super_block * sb) { unsigned long count = 0; ext4_group_t i; for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { - struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL); + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; count += le16_to_cpu(gdp->bg_used_dirs_count); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8d9707746413..9b4ec9decfd1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -32,12 +32,25 @@ #include <linux/string.h> #include <linux/buffer_head.h> #include <linux/writeback.h> +#include <linux/pagevec.h> #include <linux/mpage.h> #include <linux/uio.h> #include <linux/bio.h> #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" +#include "ext4_extents.h" + +#define MPAGE_DA_EXTENT_TAIL 0x01 + +static inline int ext4_begin_ordered_truncate(struct inode *inode, + loff_t new_size) +{ + return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode, + new_size); +} + +static void ext4_invalidatepage(struct page *page, unsigned long offset); /* * Test whether an inode is a fast symlink. @@ -177,17 +190,21 @@ static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) /* * Called at the last iput() if i_nlink is zero. */ -void ext4_delete_inode (struct inode * inode) +void ext4_delete_inode(struct inode *inode) { handle_t *handle; + int err; + if (ext4_should_order_data(inode)) + ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) goto no_delete; - handle = start_transaction(inode); + handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); if (IS_ERR(handle)) { + ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* * If we're going to skip the normal cleanup, we still need to * make sure that the in-core orphan linked list is properly @@ -200,8 +217,34 @@ void ext4_delete_inode (struct inode * inode) if (IS_SYNC(inode)) handle->h_sync = 1; inode->i_size = 0; + err = ext4_mark_inode_dirty(handle, inode); + if (err) { + ext4_warning(inode->i_sb, __func__, + "couldn't mark inode dirty (err %d)", err); + goto stop_handle; + } if (inode->i_blocks) ext4_truncate(inode); + + /* + * ext4_ext_truncate() doesn't reserve any slop when it + * restarts journal transactions; therefore there may not be + * enough credits left in the handle to remove the inode from + * the orphan list and set the dtime field. + */ + if (handle->h_buffer_credits < 3) { + err = ext4_journal_extend(handle, 3); + if (err > 0) + err = ext4_journal_restart(handle, 3); + if (err != 0) { + ext4_warning(inode->i_sb, __func__, + "couldn't extend journal (err %d)", err); + stop_handle: + ext4_journal_stop(handle); + goto no_delete; + } + } + /* * Kill off the orphan record which ext4_truncate created. * AKPM: I think this can be inside the above `if'. @@ -287,11 +330,11 @@ static int ext4_block_to_path(struct inode *inode, int final = 0; if (i_block < 0) { - ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0"); + ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0"); } else if (i_block < direct_blocks) { offsets[n++] = i_block; final = direct_blocks; - } else if ( (i_block -= direct_blocks) < indirect_blocks) { + } else if ((i_block -= direct_blocks) < indirect_blocks) { offsets[n++] = EXT4_IND_BLOCK; offsets[n++] = i_block; final = ptrs; @@ -357,14 +400,14 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, *err = 0; /* i_data is not going away, no lock needed */ - add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets); + add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); if (!p->key) goto no_block; while (--depth) { bh = sb_bread(sb, le32_to_cpu(p->key)); if (!bh) goto failure; - add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); + add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); /* Reader: end */ if (!p->key) goto no_block; @@ -400,7 +443,7 @@ no_block: static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) { struct ext4_inode_info *ei = EXT4_I(inode); - __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; + __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; __le32 *p; ext4_fsblk_t bg_start; ext4_fsblk_t last_block; @@ -443,18 +486,9 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, Indirect *partial) { - struct ext4_block_alloc_info *block_i; - - block_i = EXT4_I(inode)->i_block_alloc_info; - /* - * try the heuristic for sequential allocation, - * failing that at least try to get decent locality. + * XXX need to get goal block from mballoc's data structures */ - if (block_i && (block == block_i->last_alloc_logical_block + 1) - && (block_i->last_alloc_physical_block != 0)) { - return block_i->last_alloc_physical_block + 1; - } return ext4_find_near(inode, partial); } @@ -508,11 +542,12 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks, * direct blocks */ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, int indirect_blks, int blks, - ext4_fsblk_t new_blocks[4], int *err) + ext4_lblk_t iblock, ext4_fsblk_t goal, + int indirect_blks, int blks, + ext4_fsblk_t new_blocks[4], int *err) { int target, i; - unsigned long count = 0; + unsigned long count = 0, blk_allocated = 0; int index = 0; ext4_fsblk_t current_block = 0; int ret = 0; @@ -525,12 +560,13 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, * the first direct block of this branch. That's the * minimum number of blocks need to allocate(required) */ - target = blks + indirect_blks; - - while (1) { + /* first we try to allocate the indirect blocks */ + target = indirect_blks; + while (target > 0) { count = target; /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext4_new_blocks(handle,inode,goal,&count,err); + current_block = ext4_new_meta_blocks(handle, inode, + goal, &count, err); if (*err) goto failed_out; @@ -540,20 +576,52 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, new_blocks[index++] = current_block++; count--; } - - if (count > 0) + if (count > 0) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + printk(KERN_INFO "%s returned more blocks than " + "requested\n", __func__); + WARN_ON(1); break; + } } - /* save the new block number for the first direct block */ - new_blocks[index] = current_block; - + target = blks - count ; + blk_allocated = count; + if (!target) + goto allocated; + /* Now allocate data blocks */ + count = target; + /* allocating blocks for data blocks */ + current_block = ext4_new_blocks(handle, inode, iblock, + goal, &count, err); + if (*err && (target == blks)) { + /* + * if the allocation failed and we didn't allocate + * any blocks before + */ + goto failed_out; + } + if (!*err) { + if (target == blks) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + } + blk_allocated += count; + } +allocated: /* total number of blocks allocated for direct blocks */ - ret = count; + ret = blk_allocated; *err = 0; return ret; failed_out: - for (i = 0; i <index; i++) + for (i = 0; i < index; i++) ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); return ret; } @@ -584,8 +652,9 @@ failed_out: * as described above and return 0. */ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, - int indirect_blks, int *blks, ext4_fsblk_t goal, - ext4_lblk_t *offsets, Indirect *branch) + ext4_lblk_t iblock, int indirect_blks, + int *blks, ext4_fsblk_t goal, + ext4_lblk_t *offsets, Indirect *branch) { int blocksize = inode->i_sb->s_blocksize; int i, n = 0; @@ -595,7 +664,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, ext4_fsblk_t new_blocks[4]; ext4_fsblk_t current_block; - num = ext4_alloc_blocks(handle, inode, goal, indirect_blks, + num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, *blks, new_blocks, &err); if (err) return err; @@ -625,7 +694,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, branch[n].p = (__le32 *) bh->b_data + offsets[n]; branch[n].key = cpu_to_le32(new_blocks[n]); *branch[n].p = branch[n].key; - if ( n == indirect_blks) { + if (n == indirect_blks) { current_block = new_blocks[n]; /* * End of chain, update the last new metablock of @@ -652,7 +721,7 @@ failed: BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); ext4_journal_forget(handle, branch[i].bh); } - for (i = 0; i <indirect_blks; i++) + for (i = 0; i < indirect_blks; i++) ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); ext4_free_blocks(handle, inode, new_blocks[i], num, 0); @@ -679,10 +748,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, { int i; int err = 0; - struct ext4_block_alloc_info *block_i; ext4_fsblk_t current_block; - block_i = EXT4_I(inode)->i_block_alloc_info; /* * If we're splicing into a [td]indirect block (as opposed to the * inode) then we need to get write access to the [td]indirect block @@ -705,18 +772,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, if (num == 0 && blks > 1) { current_block = le32_to_cpu(where->key) + 1; for (i = 1; i < blks; i++) - *(where->p + i ) = cpu_to_le32(current_block++); - } - - /* - * update the most recently allocated logical & physical block - * in i_block_alloc_info, to assist find the proper goal block for next - * allocation - */ - if (block_i) { - block_i->last_alloc_logical_block = block + blks - 1; - block_i->last_alloc_physical_block = - le32_to_cpu(where[num].key) + blks - 1; + *(where->p + i) = cpu_to_le32(current_block++); } /* We are done with atomic stuff, now do the rest of housekeeping */ @@ -799,6 +855,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, struct ext4_inode_info *ei = EXT4_I(inode); int count = 0; ext4_fsblk_t first_block = 0; + loff_t disksize; J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); @@ -835,12 +892,8 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, goto cleanup; /* - * Okay, we need to do block allocation. Lazily initialize the block - * allocation info here if necessary + * Okay, we need to do block allocation. */ - if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info)) - ext4_init_block_alloc_info(inode); - goal = ext4_find_goal(inode, iblock, partial); /* the number of blocks need to allocate for [d,t]indirect blocks */ @@ -855,8 +908,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, /* * Block out ext4_truncate while we alter the tree */ - err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal, - offsets + (partial - chain), partial); + err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, + &count, goal, + offsets + (partial - chain), partial); /* * The ext4_splice_branch call will free and forget any buffers @@ -873,8 +927,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, * protect it if you're about to implement concurrent * ext4_get_block() -bzzz */ - if (!err && extend_disksize && inode->i_size > ei->i_disksize) - ei->i_disksize = inode->i_size; + if (!err && extend_disksize) { + disksize = ((loff_t) iblock + count) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > ei->i_disksize) + ei->i_disksize = disksize; + } if (err) goto cleanup; @@ -897,23 +956,75 @@ out: return err; } -/* Maximum number of blocks we map for direct IO at once. */ -#define DIO_MAX_BLOCKS 4096 /* - * Number of credits we need for writing DIO_MAX_BLOCKS: - * We need sb + group descriptor + bitmap + inode -> 4 - * For B blocks with A block pointers per block we need: - * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect). - * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25. + * Calculate the number of metadata blocks need to reserve + * to allocate @blocks for non extent file based file */ -#define DIO_CREDITS 25 +static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) +{ + int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ind_blks, dind_blks, tind_blks; + + /* number of new indirect blocks needed */ + ind_blks = (blocks + icap - 1) / icap; + + dind_blks = (ind_blks + icap - 1) / icap; + tind_blks = 1; + + return ind_blks + dind_blks + tind_blks; +} /* + * Calculate the number of metadata blocks need to reserve + * to allocate given number of blocks + */ +static int ext4_calc_metadata_amount(struct inode *inode, int blocks) +{ + if (!blocks) + return 0; + + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) + return ext4_ext_calc_metadata_amount(inode, blocks); + + return ext4_indirect_calc_metadata_amount(inode, blocks); +} + +static void ext4_da_update_reserve_space(struct inode *inode, int used) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int total, mdb, mdb_free; + + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + /* recalculate the number of metablocks still need to be reserved */ + total = EXT4_I(inode)->i_reserved_data_blocks - used; + mdb = ext4_calc_metadata_amount(inode, total); + + /* figure out how many metablocks to release */ + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); + mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; + + if (mdb_free) { + /* Account for allocated meta_blocks */ + mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; + + /* update fs dirty blocks counter */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); + EXT4_I(inode)->i_allocated_meta_blocks = 0; + EXT4_I(inode)->i_reserved_meta_blocks = mdb; + } + + /* update per-inode reservations */ + BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); + EXT4_I(inode)->i_reserved_data_blocks -= used; + + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); +} + +/* + * The ext4_get_blocks_wrap() function try to look up the requested blocks, + * and returns if the blocks are already mapped. * - * - * ext4_ext4 get_block() wrapper function - * It will do a look up first, and returns if the blocks already mapped. * Otherwise it takes the write lock of the i_data_sem and allocate blocks * and store the allocated blocks in the result buffer head and mark it * mapped. @@ -934,7 +1045,7 @@ out: */ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, unsigned long max_blocks, struct buffer_head *bh, - int create, int extend_disksize) + int create, int extend_disksize, int flag) { int retval; @@ -975,6 +1086,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, * with create == 1 flag. */ down_write((&EXT4_I(inode)->i_data_sem)); + + /* + * if the caller is from delayed allocation writeout path + * we have already reserved fs blocks for allocation + * let the underlying get_block() function know to + * avoid double accounting + */ + if (flag) + EXT4_I(inode)->i_delalloc_reserved_flag = 1; /* * We need to check for EXT4 here because migrate * could have changed the inode type in between @@ -996,23 +1116,39 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, ~EXT4_EXT_MIGRATE; } } + + if (flag) { + EXT4_I(inode)->i_delalloc_reserved_flag = 0; + /* + * Update reserved blocks/metadata blocks + * after successful block allocation + * which were deferred till now + */ + if ((retval > 0) && buffer_delay(bh)) + ext4_da_update_reserve_space(inode, retval); + } + up_write((&EXT4_I(inode)->i_data_sem)); return retval; } -static int ext4_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +/* Maximum number of blocks we map for direct IO at once. */ +#define DIO_MAX_BLOCKS 4096 + +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) { handle_t *handle = ext4_journal_current_handle(); int ret = 0, started = 0; unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + int dio_credits; if (create && !handle) { /* Direct IO write... */ if (max_blocks > DIO_MAX_BLOCKS) max_blocks = DIO_MAX_BLOCKS; - handle = ext4_journal_start(inode, DIO_CREDITS + - 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb)); + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); + handle = ext4_journal_start(inode, dio_credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; @@ -1021,7 +1157,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock, } ret = ext4_get_blocks_wrap(handle, inode, iblock, - max_blocks, bh_result, create, 0); + max_blocks, bh_result, create, 0, 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -1047,7 +1183,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); err = ext4_get_blocks_wrap(handle, inode, block, 1, - &dummy, create, 1); + &dummy, create, 1, 0); /* * ext4_get_blocks_handle() returns number of blocks * mapped. 0 in case of a HOLE. @@ -1080,7 +1216,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, BUFFER_TRACE(bh, "call get_create_access"); fatal = ext4_journal_get_create_access(handle, bh); if (!fatal && !buffer_uptodate(bh)) { - memset(bh->b_data,0,inode->i_sb->s_blocksize); + memset(bh->b_data, 0, inode->i_sb->s_blocksize); set_buffer_uptodate(bh); } unlock_buffer(bh); @@ -1105,7 +1241,7 @@ err: struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, ext4_lblk_t block, int create, int *err) { - struct buffer_head * bh; + struct buffer_head *bh; bh = ext4_getblk(handle, inode, block, create, err); if (!bh) @@ -1121,13 +1257,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, return NULL; } -static int walk_page_buffers( handle_t *handle, - struct buffer_head *head, - unsigned from, - unsigned to, - int *partial, - int (*fn)( handle_t *handle, - struct buffer_head *bh)) +static int walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)) { struct buffer_head *bh; unsigned block_start, block_end; @@ -1135,9 +1271,9 @@ static int walk_page_buffers( handle_t *handle, int err, ret = 0; struct buffer_head *next; - for ( bh = head, block_start = 0; - ret == 0 && (bh != head || !block_start); - block_start = block_end, bh = next) + for (bh = head, block_start = 0; + ret == 0 && (bh != head || !block_start); + block_start = block_end, bh = next) { next = bh->b_this_page; block_end = block_start + blocksize; @@ -1190,31 +1326,32 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { - struct inode *inode = mapping->host; + struct inode *inode = mapping->host; int ret, needed_blocks = ext4_writepage_trans_blocks(inode); handle_t *handle; int retries = 0; - struct page *page; + struct page *page; pgoff_t index; - unsigned from, to; + unsigned from, to; index = pos >> PAGE_CACHE_SHIFT; - from = pos & (PAGE_CACHE_SIZE - 1); - to = from + len; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; retry: - page = __grab_cache_page(mapping, index); - if (!page) - return -ENOMEM; - *pagep = page; + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } - handle = ext4_journal_start(inode, needed_blocks); - if (IS_ERR(handle)) { - unlock_page(page); - page_cache_release(page); - ret = PTR_ERR(handle); - goto out; + page = __grab_cache_page(mapping, index); + if (!page) { + ext4_journal_stop(handle); + ret = -ENOMEM; + goto out; } + *pagep = page; ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, ext4_get_block); @@ -1225,9 +1362,16 @@ retry: } if (ret) { + unlock_page(page); ext4_journal_stop(handle); - unlock_page(page); - page_cache_release(page); + page_cache_release(page); + /* + * block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + */ + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) @@ -1236,15 +1380,6 @@ out: return ret; } -int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh) -{ - int err = jbd2_journal_dirty_data(handle, bh); - if (err) - ext4_journal_abort_handle(__func__, __func__, - bh, handle, err); - return err; -} - /* For write_end() in data=journal mode */ static int write_end_fn(handle_t *handle, struct buffer_head *bh) { @@ -1255,29 +1390,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh) } /* - * Generic write_end handler for ordered and writeback ext4 journal modes. - * We can't use generic_write_end, because that unlocks the page and we need to - * unlock the page after ext4_journal_stop, but ext4_journal_stop must run - * after block_write_end. - */ -static int ext4_generic_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = file->f_mapping->host; - - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); - - if (pos+copied > inode->i_size) { - i_size_write(inode, pos+copied); - mark_inode_dirty(inode); - } - - return copied; -} - -/* * We need to pick up the new inode size which generic_commit_write gave us * `file' can be NULL - eg, when called from page_symlink(). * @@ -1290,28 +1402,25 @@ static int ext4_ordered_write_end(struct file *file, struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = file->f_mapping->host; - unsigned from, to; + struct inode *inode = mapping->host; int ret = 0, ret2; - from = pos & (PAGE_CACHE_SIZE - 1); - to = from + len; - - ret = walk_page_buffers(handle, page_buffers(page), - from, to, NULL, ext4_journal_dirty_data); + ret = ext4_jbd2_file_inode(handle, inode); if (ret == 0) { - /* - * generic_write_end() will run mark_inode_dirty() if i_size - * changes. So let's piggyback the i_disksize mark_inode_dirty - * into that. - */ loff_t new_i_size; new_i_size = pos + copied; - if (new_i_size > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = new_i_size; - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, + if (new_i_size > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, new_i_size); + /* We need to mark inode dirty even if + * new_i_size is less that inode->i_size + * bu greater than i_disksize.(hint delalloc) + */ + ext4_mark_inode_dirty(handle, inode); + } + + ret2 = generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; if (ret2 < 0) @@ -1320,8 +1429,6 @@ static int ext4_ordered_write_end(struct file *file, ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - unlock_page(page); - page_cache_release(page); return ret ? ret : copied; } @@ -1332,15 +1439,21 @@ static int ext4_writeback_write_end(struct file *file, struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = file->f_mapping->host; + struct inode *inode = mapping->host; int ret = 0, ret2; loff_t new_i_size; new_i_size = pos + copied; - if (new_i_size > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = new_i_size; + if (new_i_size > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, new_i_size); + /* We need to mark inode dirty even if + * new_i_size is less that inode->i_size + * bu greater than i_disksize.(hint delalloc) + */ + ext4_mark_inode_dirty(handle, inode); + } - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, + ret2 = generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; if (ret2 < 0) @@ -1349,8 +1462,6 @@ static int ext4_writeback_write_end(struct file *file, ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - unlock_page(page); - page_cache_release(page); return ret ? ret : copied; } @@ -1365,6 +1476,7 @@ static int ext4_journalled_write_end(struct file *file, int ret = 0, ret2; int partial = 0; unsigned from, to; + loff_t new_i_size; from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1379,25 +1491,1181 @@ static int ext4_journalled_write_end(struct file *file, to, &partial, write_end_fn); if (!partial) SetPageUptodate(page); - if (pos+copied > inode->i_size) + new_i_size = pos + copied; + if (new_i_size > inode->i_size) i_size_write(inode, pos+copied); EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; - if (inode->i_size > EXT4_I(inode)->i_disksize) { - EXT4_I(inode)->i_disksize = inode->i_size; + if (new_i_size > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, new_i_size); ret2 = ext4_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; } + unlock_page(page); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - unlock_page(page); page_cache_release(page); return ret ? ret : copied; } +static int ext4_da_reserve_space(struct inode *inode, int nrblocks) +{ + int retries = 0; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned long md_needed, mdblocks, total = 0; + + /* + * recalculate the amount of metadata blocks to reserve + * in order to allocate nrblocks + * worse case is one extent per block + */ +repeat: + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; + mdblocks = ext4_calc_metadata_amount(inode, total); + BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); + + md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; + total = md_needed + nrblocks; + + if (ext4_claim_free_blocks(sbi, total)) { + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + if (ext4_should_retry_alloc(inode->i_sb, &retries)) { + yield(); + goto repeat; + } + return -ENOSPC; + } + EXT4_I(inode)->i_reserved_data_blocks += nrblocks; + EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; + + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return 0; /* success */ +} + +static void ext4_da_release_space(struct inode *inode, int to_free) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int total, mdb, mdb_free, release; + + if (!to_free) + return; /* Nothing to release, exit */ + + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + + if (!EXT4_I(inode)->i_reserved_data_blocks) { + /* + * if there is no reserved blocks, but we try to free some + * then the counter is messed up somewhere. + * but since this function is called from invalidate + * page, it's harmless to return without any action + */ + printk(KERN_INFO "ext4 delalloc try to release %d reserved " + "blocks for inode %lu, but there is no reserved " + "data blocks\n", to_free, inode->i_ino); + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return; + } + + /* recalculate the number of metablocks still need to be reserved */ + total = EXT4_I(inode)->i_reserved_data_blocks - to_free; + mdb = ext4_calc_metadata_amount(inode, total); + + /* figure out how many metablocks to release */ + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); + mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; + + release = to_free + mdb_free; + + /* update fs dirty blocks counter for truncate case */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, release); + + /* update per-inode reservations */ + BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); + EXT4_I(inode)->i_reserved_data_blocks -= to_free; + + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); + EXT4_I(inode)->i_reserved_meta_blocks = mdb; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); +} + +static void ext4_da_page_release_reservation(struct page *page, + unsigned long offset) +{ + int to_release = 0; + struct buffer_head *head, *bh; + unsigned int curr_off = 0; + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + + if ((offset <= curr_off) && (buffer_delay(bh))) { + to_release++; + clear_buffer_delay(bh); + } + curr_off = next_off; + } while ((bh = bh->b_this_page) != head); + ext4_da_release_space(page->mapping->host, to_release); +} + +/* + * Delayed allocation stuff + */ + +struct mpage_da_data { + struct inode *inode; + struct buffer_head lbh; /* extent of blocks */ + unsigned long first_page, next_page; /* extent of pages */ + get_block_t *get_block; + struct writeback_control *wbc; + int io_done; + long pages_written; + int retval; +}; + +/* + * mpage_da_submit_io - walks through extent of pages and try to write + * them with writepage() call back + * + * @mpd->inode: inode + * @mpd->first_page: first page of the extent + * @mpd->next_page: page after the last page of the extent + * @mpd->get_block: the filesystem's block mapper function + * + * By the time mpage_da_submit_io() is called we expect all blocks + * to be allocated. this may be wrong if allocation failed. + * + * As pages are already locked by write_cache_pages(), we can't use it + */ +static int mpage_da_submit_io(struct mpage_da_data *mpd) +{ + struct address_space *mapping = mpd->inode->i_mapping; + int ret = 0, err, nr_pages, i; + unsigned long index, end; + struct pagevec pvec; + + BUG_ON(mpd->next_page <= mpd->first_page); + pagevec_init(&pvec, 0); + index = mpd->first_page; + end = mpd->next_page - 1; + + while (index <= end) { + /* XXX: optimize tail */ + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + index = page->index; + if (index > end) + break; + index++; + + err = mapping->a_ops->writepage(page, mpd->wbc); + if (!err) + mpd->pages_written++; + /* + * In error case, we have to continue because + * remaining pages are still locked + * XXX: unlock and re-dirty them? + */ + if (ret == 0) + ret = err; + } + pagevec_release(&pvec); + } + return ret; +} + +/* + * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers + * + * @mpd->inode - inode to walk through + * @exbh->b_blocknr - first block on a disk + * @exbh->b_size - amount of space in bytes + * @logical - first logical block to start assignment with + * + * the function goes through all passed space and put actual disk + * block numbers into buffer heads, dropping BH_Delay + */ +static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, + struct buffer_head *exbh) +{ + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; + int blocks = exbh->b_size >> inode->i_blkbits; + sector_t pblock = exbh->b_blocknr, cur_logical; + struct buffer_head *head, *bh; + pgoff_t index, end; + struct pagevec pvec; + int nr_pages, i; + + index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + pagevec_init(&pvec, 0); + + while (index <= end) { + /* XXX: optimize tail */ + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + index = page->index; + if (index > end) + break; + index++; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + BUG_ON(!page_has_buffers(page)); + + bh = page_buffers(page); + head = bh; + + /* skip blocks out of the range */ + do { + if (cur_logical >= logical) + break; + cur_logical++; + } while ((bh = bh->b_this_page) != head); + + do { + if (cur_logical >= logical + blocks) + break; + if (buffer_delay(bh)) { + bh->b_blocknr = pblock; + clear_buffer_delay(bh); + bh->b_bdev = inode->i_sb->s_bdev; + } else if (buffer_unwritten(bh)) { + bh->b_blocknr = pblock; + clear_buffer_unwritten(bh); + set_buffer_mapped(bh); + set_buffer_new(bh); + bh->b_bdev = inode->i_sb->s_bdev; + } else if (buffer_mapped(bh)) + BUG_ON(bh->b_blocknr != pblock); + + cur_logical++; + pblock++; + } while ((bh = bh->b_this_page) != head); + } + pagevec_release(&pvec); + } +} + + +/* + * __unmap_underlying_blocks - just a helper function to unmap + * set of blocks described by @bh + */ +static inline void __unmap_underlying_blocks(struct inode *inode, + struct buffer_head *bh) +{ + struct block_device *bdev = inode->i_sb->s_bdev; + int blocks, i; + + blocks = bh->b_size >> inode->i_blkbits; + for (i = 0; i < blocks; i++) + unmap_underlying_metadata(bdev, bh->b_blocknr + i); +} + +static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, + sector_t logical, long blk_cnt) +{ + int nr_pages, i; + pgoff_t index, end; + struct pagevec pvec; + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; + + index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + end = (logical + blk_cnt - 1) >> + (PAGE_CACHE_SHIFT - inode->i_blkbits); + while (index <= end) { + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + index = page->index; + if (index > end) + break; + index++; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + block_invalidatepage(page, 0); + ClearPageUptodate(page); + unlock_page(page); + } + } + return; +} + +static void ext4_print_free_blocks(struct inode *inode) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + printk(KERN_EMERG "Total free blocks count %lld\n", + ext4_count_free_blocks(inode->i_sb)); + printk(KERN_EMERG "Free/Dirty block details\n"); + printk(KERN_EMERG "free_blocks=%lld\n", + percpu_counter_sum(&sbi->s_freeblocks_counter)); + printk(KERN_EMERG "dirty_blocks=%lld\n", + percpu_counter_sum(&sbi->s_dirtyblocks_counter)); + printk(KERN_EMERG "Block reservation details\n"); + printk(KERN_EMERG "i_reserved_data_blocks=%lu\n", + EXT4_I(inode)->i_reserved_data_blocks); + printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n", + EXT4_I(inode)->i_reserved_meta_blocks); + return; +} + +/* + * mpage_da_map_blocks - go through given space + * + * @mpd->lbh - bh describing space + * @mpd->get_block - the filesystem's block mapper function + * + * The function skips space we know is already mapped to disk blocks. + * + */ +static int mpage_da_map_blocks(struct mpage_da_data *mpd) +{ + int err = 0; + struct buffer_head new; + struct buffer_head *lbh = &mpd->lbh; + sector_t next; + + /* + * We consider only non-mapped and non-allocated blocks + */ + if (buffer_mapped(lbh) && !buffer_delay(lbh)) + return 0; + new.b_state = lbh->b_state; + new.b_blocknr = 0; + new.b_size = lbh->b_size; + next = lbh->b_blocknr; + /* + * If we didn't accumulate anything + * to write simply return + */ + if (!new.b_size) + return 0; + err = mpd->get_block(mpd->inode, next, &new, 1); + if (err) { + + /* If get block returns with error + * we simply return. Later writepage + * will redirty the page and writepages + * will find the dirty page again + */ + if (err == -EAGAIN) + return 0; + + if (err == -ENOSPC && + ext4_count_free_blocks(mpd->inode->i_sb)) { + mpd->retval = err; + return 0; + } + + /* + * get block failure will cause us + * to loop in writepages. Because + * a_ops->writepage won't be able to + * make progress. The page will be redirtied + * by writepage and writepages will again + * try to write the same. + */ + printk(KERN_EMERG "%s block allocation failed for inode %lu " + "at logical offset %llu with max blocks " + "%zd with error %d\n", + __func__, mpd->inode->i_ino, + (unsigned long long)next, + lbh->b_size >> mpd->inode->i_blkbits, err); + printk(KERN_EMERG "This should not happen.!! " + "Data will be lost\n"); + if (err == -ENOSPC) { + ext4_print_free_blocks(mpd->inode); + } + /* invlaidate all the pages */ + ext4_da_block_invalidatepages(mpd, next, + lbh->b_size >> mpd->inode->i_blkbits); + return err; + } + BUG_ON(new.b_size == 0); + + if (buffer_new(&new)) + __unmap_underlying_blocks(mpd->inode, &new); + + /* + * If blocks are delayed marked, we need to + * put actual blocknr and drop delayed bit + */ + if (buffer_delay(lbh) || buffer_unwritten(lbh)) + mpage_put_bnr_to_bhs(mpd, next, &new); + + return 0; +} + +#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ + (1 << BH_Delay) | (1 << BH_Unwritten)) + +/* + * mpage_add_bh_to_extent - try to add one more block to extent of blocks + * + * @mpd->lbh - extent of blocks + * @logical - logical number of the block in the file + * @bh - bh of the block (used to access block's state) + * + * the function is used to collect contig. blocks in same state + */ +static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, + sector_t logical, struct buffer_head *bh) +{ + sector_t next; + size_t b_size = bh->b_size; + struct buffer_head *lbh = &mpd->lbh; + int nrblocks = lbh->b_size >> mpd->inode->i_blkbits; + + /* check if thereserved journal credits might overflow */ + if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { + if (nrblocks >= EXT4_MAX_TRANS_DATA) { + /* + * With non-extent format we are limited by the journal + * credit available. Total credit needed to insert + * nrblocks contiguous blocks is dependent on the + * nrblocks. So limit nrblocks. + */ + goto flush_it; + } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > + EXT4_MAX_TRANS_DATA) { + /* + * Adding the new buffer_head would make it cross the + * allowed limit for which we have journal credit + * reserved. So limit the new bh->b_size + */ + b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << + mpd->inode->i_blkbits; + /* we will do mpage_da_submit_io in the next loop */ + } + } + /* + * First block in the extent + */ + if (lbh->b_size == 0) { + lbh->b_blocknr = logical; + lbh->b_size = b_size; + lbh->b_state = bh->b_state & BH_FLAGS; + return; + } + + next = lbh->b_blocknr + nrblocks; + /* + * Can we merge the block to our big extent? + */ + if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { + lbh->b_size += b_size; + return; + } + +flush_it: + /* + * We couldn't merge the block to our extent, so we + * need to flush current extent and start new one + */ + if (mpage_da_map_blocks(mpd) == 0) + mpage_da_submit_io(mpd); + mpd->io_done = 1; + return; +} + +/* + * __mpage_da_writepage - finds extent of pages and blocks + * + * @page: page to consider + * @wbc: not used, we just follow rules + * @data: context + * + * The function finds extents of pages and scan them for all blocks. + */ +static int __mpage_da_writepage(struct page *page, + struct writeback_control *wbc, void *data) +{ + struct mpage_da_data *mpd = data; + struct inode *inode = mpd->inode; + struct buffer_head *bh, *head, fake; + sector_t logical; + + if (mpd->io_done) { + /* + * Rest of the page in the page_vec + * redirty then and skip then. We will + * try to to write them again after + * starting a new transaction + */ + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return MPAGE_DA_EXTENT_TAIL; + } + /* + * Can we merge this page to current extent? + */ + if (mpd->next_page != page->index) { + /* + * Nope, we can't. So, we map non-allocated blocks + * and start IO on them using writepage() + */ + if (mpd->next_page != mpd->first_page) { + if (mpage_da_map_blocks(mpd) == 0) + mpage_da_submit_io(mpd); + /* + * skip rest of the page in the page_vec + */ + mpd->io_done = 1; + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return MPAGE_DA_EXTENT_TAIL; + } + + /* + * Start next extent of pages ... + */ + mpd->first_page = page->index; + + /* + * ... and blocks + */ + mpd->lbh.b_size = 0; + mpd->lbh.b_state = 0; + mpd->lbh.b_blocknr = 0; + } + + mpd->next_page = page->index + 1; + logical = (sector_t) page->index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + + if (!page_has_buffers(page)) { + /* + * There is no attached buffer heads yet (mmap?) + * we treat the page asfull of dirty blocks + */ + bh = &fake; + bh->b_size = PAGE_CACHE_SIZE; + bh->b_state = 0; + set_buffer_dirty(bh); + set_buffer_uptodate(bh); + mpage_add_bh_to_extent(mpd, logical, bh); + if (mpd->io_done) + return MPAGE_DA_EXTENT_TAIL; + } else { + /* + * Page with regular buffer heads, just add all dirty ones + */ + head = page_buffers(page); + bh = head; + do { + BUG_ON(buffer_locked(bh)); + if (buffer_dirty(bh) && + (!buffer_mapped(bh) || buffer_delay(bh))) { + mpage_add_bh_to_extent(mpd, logical, bh); + if (mpd->io_done) + return MPAGE_DA_EXTENT_TAIL; + } + logical++; + } while ((bh = bh->b_this_page) != head); + } + + return 0; +} + +/* + * mpage_da_writepages - walk the list of dirty pages of the given + * address space, allocates non-allocated blocks, maps newly-allocated + * blocks to existing bhs and issue IO them + * + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @get_block: the filesystem's block mapper function. + * + * This is a library function, which implements the writepages() + * address_space_operation. + */ +static int mpage_da_writepages(struct address_space *mapping, + struct writeback_control *wbc, + struct mpage_da_data *mpd) +{ + long to_write; + int ret; + + if (!mpd->get_block) + return generic_writepages(mapping, wbc); + + mpd->lbh.b_size = 0; + mpd->lbh.b_state = 0; + mpd->lbh.b_blocknr = 0; + mpd->first_page = 0; + mpd->next_page = 0; + mpd->io_done = 0; + mpd->pages_written = 0; + mpd->retval = 0; + + to_write = wbc->nr_to_write; + + ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd); + + /* + * Handle last extent of pages + */ + if (!mpd->io_done && mpd->next_page != mpd->first_page) { + if (mpage_da_map_blocks(mpd) == 0) + mpage_da_submit_io(mpd); + } + + wbc->nr_to_write = to_write - mpd->pages_written; + return ret; +} + +/* + * this is a special callback for ->write_begin() only + * it's intention is to return mapped block or reserve space + */ +static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret = 0; + + BUG_ON(create == 0); + BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); + + /* + * first, we need to know whether the block is allocated already + * preallocated blocks are unmapped but should treated + * the same as allocated blocks. + */ + ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0); + if ((ret == 0) && !buffer_delay(bh_result)) { + /* the block isn't (pre)allocated yet, let's reserve space */ + /* + * XXX: __block_prepare_write() unmaps passed block, + * is it OK? + */ + ret = ext4_da_reserve_space(inode, 1); + if (ret) + /* not enough space to reserve */ + return ret; + + map_bh(bh_result, inode->i_sb, 0); + set_buffer_new(bh_result); + set_buffer_delay(bh_result); + } else if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + + return ret; +} +#define EXT4_DELALLOC_RSVED 1 +static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + loff_t disksize = EXT4_I(inode)->i_disksize; + handle_t *handle = NULL; + + handle = ext4_journal_current_handle(); + BUG_ON(!handle); + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, + bh_result, create, 0, EXT4_DELALLOC_RSVED); + if (ret > 0) { + + bh_result->b_size = (ret << inode->i_blkbits); + + if (ext4_should_order_data(inode)) { + int retval; + retval = ext4_jbd2_file_inode(handle, inode); + if (retval) + /* + * Failed to add inode for ordered + * mode. Don't update file size + */ + return retval; + } + + /* + * Update on-disk size along with block allocation + * we don't use 'extend_disksize' as size may change + * within already allocated block -bzzz + */ + disksize = ((loff_t) iblock + ret) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, disksize); + ret = ext4_mark_inode_dirty(handle, inode); + return ret; + } + ret = 0; + } + return ret; +} + +static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) +{ + /* + * unmapped buffer is possible for holes. + * delay buffer is possible with delayed allocation + */ + return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh)); +} + +static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret = 0; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + + /* + * we don't want to do block allocation in writepage + * so call get_block_wrap with create = 0 + */ + ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks, + bh_result, 0, 0, 0); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + return ret; +} + +/* + * get called vi ext4_da_writepages after taking page lock (have journal handle) + * get called via journal_submit_inode_data_buffers (no journal handle) + * get called via shrink_page_list via pdflush (no journal handle) + * or grab_page_cache when doing write_begin (have journal handle) + */ +static int ext4_da_writepage(struct page *page, + struct writeback_control *wbc) +{ + int ret = 0; + loff_t size; + unsigned long len; + struct buffer_head *page_bufs; + struct inode *inode = page->mapping->host; + + size = i_size_read(inode); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + if (page_has_buffers(page)) { + page_bufs = page_buffers(page); + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_unmapped_or_delay)) { + /* + * We don't want to do block allocation + * So redirty the page and return + * We may reach here when we do a journal commit + * via journal_submit_inode_data_buffers. + * If we don't have mapping block we just ignore + * them. We can also reach here via shrink_page_list + */ + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + } else { + /* + * The test for page_has_buffers() is subtle: + * We know the page is dirty but it lost buffers. That means + * that at some moment in time after write_begin()/write_end() + * has been called all buffers have been clean and thus they + * must have been written at least once. So they are all + * mapped and we can happily proceed with mapping them + * and writing the page. + * + * Try to initialize the buffer_heads and check whether + * all are mapped and non delay. We don't want to + * do block allocation here. + */ + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, + ext4_normal_get_block_write); + if (!ret) { + page_bufs = page_buffers(page); + /* check whether all are mapped and non delay */ + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_unmapped_or_delay)) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + } else { + /* + * We can't do block allocation here + * so just redity the page and unlock + * and return + */ + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + } + + if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) + ret = nobh_writepage(page, ext4_normal_get_block_write, wbc); + else + ret = block_write_full_page(page, + ext4_normal_get_block_write, + wbc); + + return ret; +} + +/* + * This is called via ext4_da_writepages() to + * calulate the total number of credits to reserve to fit + * a single extent allocation into a single transaction, + * ext4_da_writpeages() will loop calling this before + * the block allocation. + */ + +static int ext4_da_writepages_trans_blocks(struct inode *inode) +{ + int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; + + /* + * With non-extent format the journal credit needed to + * insert nrblocks contiguous block is dependent on + * number of contiguous block. So we will limit + * number of contiguous block to a sane value + */ + if (!(inode->i_flags & EXT4_EXTENTS_FL) && + (max_blocks > EXT4_MAX_TRANS_DATA)) + max_blocks = EXT4_MAX_TRANS_DATA; + + return ext4_chunk_trans_blocks(inode, max_blocks); +} + +static int ext4_da_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + handle_t *handle = NULL; + loff_t range_start = 0; + struct mpage_da_data mpd; + struct inode *inode = mapping->host; + int needed_blocks, ret = 0, nr_to_writebump = 0; + long to_write, pages_skipped = 0; + struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + + /* + * No pages to write? This is mainly a kludge to avoid starting + * a transaction for special inodes like journal inode on last iput() + * because that could violate lock ordering on umount + */ + if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + return 0; + /* + * Make sure nr_to_write is >= sbi->s_mb_stream_request + * This make sure small files blocks are allocated in + * single attempt. This ensure that small files + * get less fragmented. + */ + if (wbc->nr_to_write < sbi->s_mb_stream_request) { + nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; + wbc->nr_to_write = sbi->s_mb_stream_request; + } + + if (!wbc->range_cyclic) + /* + * If range_cyclic is not set force range_cont + * and save the old writeback_index + */ + wbc->range_cont = 1; + + range_start = wbc->range_start; + pages_skipped = wbc->pages_skipped; + + mpd.wbc = wbc; + mpd.inode = mapping->host; + +restart_loop: + to_write = wbc->nr_to_write; + while (!ret && to_write > 0) { + + /* + * we insert one extent at a time. So we need + * credit needed for single extent allocation. + * journalled mode is currently not supported + * by delalloc + */ + BUG_ON(ext4_should_journal_data(inode)); + needed_blocks = ext4_da_writepages_trans_blocks(inode); + + /* start a new transaction*/ + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + printk(KERN_EMERG "%s: jbd2_start: " + "%ld pages, ino %lu; err %d\n", __func__, + wbc->nr_to_write, inode->i_ino, ret); + dump_stack(); + goto out_writepages; + } + to_write -= wbc->nr_to_write; + + mpd.get_block = ext4_da_get_block_write; + ret = mpage_da_writepages(mapping, wbc, &mpd); + + ext4_journal_stop(handle); + + if (mpd.retval == -ENOSPC) + jbd2_journal_force_commit_nested(sbi->s_journal); + + /* reset the retry count */ + if (ret == MPAGE_DA_EXTENT_TAIL) { + /* + * got one extent now try with + * rest of the pages + */ + to_write += wbc->nr_to_write; + ret = 0; + } else if (wbc->nr_to_write) { + /* + * There is no more writeout needed + * or we requested for a noblocking writeout + * and we found the device congested + */ + to_write += wbc->nr_to_write; + break; + } + wbc->nr_to_write = to_write; + } + + if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) { + /* We skipped pages in this loop */ + wbc->range_start = range_start; + wbc->nr_to_write = to_write + + wbc->pages_skipped - pages_skipped; + wbc->pages_skipped = pages_skipped; + goto restart_loop; + } + +out_writepages: + wbc->nr_to_write = to_write - nr_to_writebump; + wbc->range_start = range_start; + return ret; +} + +#define FALL_BACK_TO_NONDELALLOC 1 +static int ext4_nonda_switch(struct super_block *sb) +{ + s64 free_blocks, dirty_blocks; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + /* + * switch to non delalloc mode if we are running low + * on free block. The free block accounting via percpu + * counters can get slightly wrong with FBC_BATCH getting + * accumulated on each CPU without updating global counters + * Delalloc need an accurate free block accounting. So switch + * to non delalloc when we are near to error range. + */ + free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter); + if (2 * free_blocks < 3 * dirty_blocks || + free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { + /* + * free block count is less that 150% of dirty blocks + * or free blocks is less that watermark + */ + return 1; + } + return 0; +} + +static int ext4_da_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret, retries = 0; + struct page *page; + pgoff_t index; + unsigned from, to; + struct inode *inode = mapping->host; + handle_t *handle; + + index = pos >> PAGE_CACHE_SHIFT; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + + if (ext4_nonda_switch(inode->i_sb)) { + *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; + return ext4_write_begin(file, mapping, pos, + len, flags, pagep, fsdata); + } + *fsdata = (void *)0; +retry: + /* + * With delayed allocation, we don't log the i_disksize update + * if there is delayed block allocation. But we still need + * to journalling the i_disksize update if writes to the end + * of file which has an already mapped buffer. + */ + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + page = __grab_cache_page(mapping, index); + if (!page) { + ext4_journal_stop(handle); + ret = -ENOMEM; + goto out; + } + *pagep = page; + + ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + ext4_da_get_block_prep); + if (ret < 0) { + unlock_page(page); + ext4_journal_stop(handle); + page_cache_release(page); + /* + * block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + */ + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); + } + + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; +out: + return ret; +} + +/* + * Check if we should update i_disksize + * when write to the end of file but not require block allocation + */ +static int ext4_da_should_update_i_disksize(struct page *page, + unsigned long offset) +{ + struct buffer_head *bh; + struct inode *inode = page->mapping->host; + unsigned int idx; + int i; + + bh = page_buffers(page); + idx = offset >> inode->i_blkbits; + + for (i = 0; i < idx; i++) + bh = bh->b_this_page; + + if (!buffer_mapped(bh) || (buffer_delay(bh))) + return 0; + return 1; +} + +static int ext4_da_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + int ret = 0, ret2; + handle_t *handle = ext4_journal_current_handle(); + loff_t new_i_size; + unsigned long start, end; + int write_mode = (int)(unsigned long)fsdata; + + if (write_mode == FALL_BACK_TO_NONDELALLOC) { + if (ext4_should_order_data(inode)) { + return ext4_ordered_write_end(file, mapping, pos, + len, copied, page, fsdata); + } else if (ext4_should_writeback_data(inode)) { + return ext4_writeback_write_end(file, mapping, pos, + len, copied, page, fsdata); + } else { + BUG(); + } + } + + start = pos & (PAGE_CACHE_SIZE - 1); + end = start + copied - 1; + + /* + * generic_write_end() will run mark_inode_dirty() if i_size + * changes. So let's piggyback the i_disksize mark_inode_dirty + * into that. + */ + + new_i_size = pos + copied; + if (new_i_size > EXT4_I(inode)->i_disksize) { + if (ext4_da_should_update_i_disksize(page, end)) { + down_write(&EXT4_I(inode)->i_data_sem); + if (new_i_size > EXT4_I(inode)->i_disksize) { + /* + * Updating i_disksize when extending file + * without needing block allocation + */ + if (ext4_should_order_data(inode)) + ret = ext4_jbd2_file_inode(handle, + inode); + + EXT4_I(inode)->i_disksize = new_i_size; + } + up_write(&EXT4_I(inode)->i_data_sem); + /* We need to mark inode dirty even if + * new_i_size is less that inode->i_size + * bu greater than i_disksize.(hint delalloc) + */ + ext4_mark_inode_dirty(handle, inode); + } + } + ret2 = generic_write_end(file, mapping, pos, len, copied, + page, fsdata); + copied = ret2; + if (ret2 < 0) + ret = ret2; + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + + return ret ? ret : copied; +} + +static void ext4_da_invalidatepage(struct page *page, unsigned long offset) +{ + /* + * Drop reserved blocks + */ + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + goto out; + + ext4_da_page_release_reservation(page, offset); + +out: + ext4_invalidatepage(page, offset); + + return; +} + + /* * bmap() is special. It gets used by applications such as lilo and by * the swapper to find the on-disk block of a specific piece of data. @@ -1418,6 +2686,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) journal_t *journal; int err; + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && + test_opt(inode->i_sb, DELALLOC)) { + /* + * With delalloc we want to sync the file + * so that we can make sure we allocate + * blocks for file + */ + filemap_write_and_wait(mapping); + } + if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { /* * This is a REALLY heavyweight approach, but the use of @@ -1447,7 +2725,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) return 0; } - return generic_block_bmap(mapping,block,ext4_get_block); + return generic_block_bmap(mapping, block, ext4_get_block); } static int bget_one(handle_t *handle, struct buffer_head *bh) @@ -1462,21 +2740,17 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) return 0; } -static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) -{ - if (buffer_mapped(bh)) - return ext4_journal_dirty_data(handle, bh); - return 0; -} - /* - * Note that we always start a transaction even if we're not journalling - * data. This is to preserve ordering: any hole instantiation within - * __block_write_full_page -> ext4_get_block() should be journalled - * along with the data so we don't crash and then get metadata which - * refers to old data. + * Note that we don't need to start a transaction unless we're journaling data + * because we should have holes filled from ext4_page_mkwrite(). We even don't + * need to file the inode to the transaction's list in ordered mode because if + * we are writing back data added by write(), the inode is already there and if + * we are writing back data modified via mmap(), noone guarantees in which + * transaction the data will hit the disk. In case we are journaling data, we + * cannot start transaction directly because transaction start ranks above page + * lock so we have to do some magic. * - * In all journalling modes block_write_full_page() will start the I/O. + * In all journaling modes block_write_full_page() will start the I/O. * * Problem: * @@ -1518,105 +2792,103 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) * disastrous. Any write() or metadata operation will sync the fs for * us. * - * AKPM2: if all the page's buffers are mapped to disk and !data=journal, - * we don't need to open a transaction here. */ -static int ext4_ordered_writepage(struct page *page, +static int __ext4_normal_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; - struct buffer_head *page_bufs; - handle_t *handle = NULL; - int ret = 0; - int err; - J_ASSERT(PageLocked(page)); - - /* - * We give up here if we're reentered, because it might be for a - * different filesystem. - */ - if (ext4_journal_current_handle()) - goto out_fail; - - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); + if (test_opt(inode->i_sb, NOBH)) + return nobh_writepage(page, + ext4_normal_get_block_write, wbc); + else + return block_write_full_page(page, + ext4_normal_get_block_write, + wbc); +} - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_fail; - } +static int ext4_normal_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + loff_t size = i_size_read(inode); + loff_t len; - if (!page_has_buffers(page)) { - create_empty_buffers(page, inode->i_sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); + J_ASSERT(PageLocked(page)); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + if (page_has_buffers(page)) { + /* if page has buffers it should all be mapped + * and allocated. If there are not buffers attached + * to the page we know the page is dirty but it lost + * buffers. That means that at some moment in time + * after write_begin() / write_end() has been called + * all buffers have been clean and thus they must have been + * written at least once. So they are all mapped and we can + * happily proceed with mapping them and writing the page. + */ + BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped_or_delay)); } - page_bufs = page_buffers(page); - walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, bget_one); - ret = block_write_full_page(page, ext4_get_block, wbc); - - /* - * The page can become unlocked at any point now, and - * truncate can then come in and change things. So we - * can't touch *page from now on. But *page_bufs is - * safe due to elevated refcount. - */ + if (!ext4_journal_current_handle()) + return __ext4_normal_writepage(page, wbc); - /* - * And attach them to the current transaction. But only if - * block_write_full_page() succeeded. Otherwise they are unmapped, - * and generally junk. - */ - if (ret == 0) { - err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, - NULL, jbd2_journal_dirty_data_fn); - if (!ret) - ret = err; - } - walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, bput_one); - err = ext4_journal_stop(handle); - if (!ret) - ret = err; - return ret; - -out_fail: redirty_page_for_writepage(wbc, page); unlock_page(page); - return ret; + return 0; } -static int ext4_writeback_writepage(struct page *page, +static int __ext4_journalled_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct buffer_head *page_bufs; handle_t *handle = NULL; int ret = 0; int err; - if (ext4_journal_current_handle()) - goto out_fail; + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, + ext4_normal_get_block_write); + if (ret != 0) + goto out_unlock; + + page_bufs = page_buffers(page); + walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, + bget_one); + /* As soon as we unlock the page, it can go away, but we have + * references to buffers so we are safe */ + unlock_page(page); handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out_fail; + goto out; } - if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) - ret = nobh_writepage(page, ext4_get_block, wbc); - else - ret = block_write_full_page(page, ext4_get_block, wbc); + ret = walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); + err = walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, write_end_fn); + if (ret == 0) + ret = err; err = ext4_journal_stop(handle); if (!ret) ret = err; - return ret; -out_fail: - redirty_page_for_writepage(wbc, page); + walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, bput_one); + EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; + goto out; + +out_unlock: unlock_page(page); +out: return ret; } @@ -1624,59 +2896,53 @@ static int ext4_journalled_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; - handle_t *handle = NULL; - int ret = 0; - int err; + loff_t size = i_size_read(inode); + loff_t len; - if (ext4_journal_current_handle()) - goto no_write; + J_ASSERT(PageLocked(page)); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + if (page_has_buffers(page)) { + /* if page has buffers it should all be mapped + * and allocated. If there are not buffers attached + * to the page we know the page is dirty but it lost + * buffers. That means that at some moment in time + * after write_begin() / write_end() has been called + * all buffers have been clean and thus they must have been + * written at least once. So they are all mapped and we can + * happily proceed with mapping them and writing the page. + */ + BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped_or_delay)); + } - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); + if (ext4_journal_current_handle()) goto no_write; - } - if (!page_has_buffers(page) || PageChecked(page)) { + if (PageChecked(page)) { /* * It's mmapped pagecache. Add buffers and journal it. There * doesn't seem much point in redirtying the page here. */ ClearPageChecked(page); - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, - ext4_get_block); - if (ret != 0) { - ext4_journal_stop(handle); - goto out_unlock; - } - ret = walk_page_buffers(handle, page_buffers(page), 0, - PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); - - err = walk_page_buffers(handle, page_buffers(page), 0, - PAGE_CACHE_SIZE, NULL, write_end_fn); - if (ret == 0) - ret = err; - EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; - unlock_page(page); + return __ext4_journalled_writepage(page, wbc); } else { /* * It may be a page full of checkpoint-mode buffers. We don't * really know unless we go poke around in the buffer_heads. * But block_write_full_page will do the right thing. */ - ret = block_write_full_page(page, ext4_get_block, wbc); + return block_write_full_page(page, + ext4_normal_get_block_write, + wbc); } - err = ext4_journal_stop(handle); - if (!ret) - ret = err; -out: - return ret; - no_write: redirty_page_for_writepage(wbc, page); -out_unlock: unlock_page(page); - goto out; + return 0; } static int ext4_readpage(struct file *file, struct page *page) @@ -1817,50 +3083,75 @@ static int ext4_journalled_set_page_dirty(struct page *page) } static const struct address_space_operations ext4_ordered_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_ordered_writepage, - .sync_page = block_sync_page, - .write_begin = ext4_write_begin, - .write_end = ext4_ordered_write_end, - .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, - .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, - .migratepage = buffer_migrate_page, + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_normal_writepage, + .sync_page = block_sync_page, + .write_begin = ext4_write_begin, + .write_end = ext4_ordered_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, }; static const struct address_space_operations ext4_writeback_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_writeback_writepage, - .sync_page = block_sync_page, - .write_begin = ext4_write_begin, - .write_end = ext4_writeback_write_end, - .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, - .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, - .migratepage = buffer_migrate_page, + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_normal_writepage, + .sync_page = block_sync_page, + .write_begin = ext4_write_begin, + .write_end = ext4_writeback_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, }; static const struct address_space_operations ext4_journalled_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_journalled_writepage, - .sync_page = block_sync_page, - .write_begin = ext4_write_begin, - .write_end = ext4_journalled_write_end, - .set_page_dirty = ext4_journalled_set_page_dirty, - .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, - .releasepage = ext4_releasepage, + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_journalled_writepage, + .sync_page = block_sync_page, + .write_begin = ext4_write_begin, + .write_end = ext4_journalled_write_end, + .set_page_dirty = ext4_journalled_set_page_dirty, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, + .is_partially_uptodate = block_is_partially_uptodate, +}; + +static const struct address_space_operations ext4_da_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_da_writepage, + .writepages = ext4_da_writepages, + .sync_page = block_sync_page, + .write_begin = ext4_da_write_begin, + .write_end = ext4_da_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_da_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, }; void ext4_set_aops(struct inode *inode) { - if (ext4_should_order_data(inode)) + if (ext4_should_order_data(inode) && + test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else if (ext4_should_order_data(inode)) inode->i_mapping->a_ops = &ext4_ordered_aops; + else if (ext4_should_writeback_data(inode) && + test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; else if (ext4_should_writeback_data(inode)) inode->i_mapping->a_ops = &ext4_writeback_aops; else @@ -1873,7 +3164,7 @@ void ext4_set_aops(struct inode *inode) * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ -int ext4_block_truncate_page(handle_t *handle, struct page *page, +int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from) { ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; @@ -1882,8 +3173,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page, ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; + struct page *page; int err = 0; + page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT); + if (!page) + return -EINVAL; + blocksize = inode->i_sb->s_blocksize; length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); @@ -1956,7 +3252,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page, err = ext4_journal_dirty_metadata(handle, bh); } else { if (ext4_should_order_data(inode)) - err = ext4_journal_dirty_data(handle, bh); + err = ext4_jbd2_file_inode(handle, inode); mark_buffer_dirty(bh); } @@ -2035,7 +3331,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth, if (!partial->key && *partial->p) /* Writer: end */ goto no_top; - for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) + for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) ; /* * OK, we've found the last block that must survive. The rest of our @@ -2054,7 +3350,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth, } /* Writer: end */ - while(partial > p) { + while (partial > p) { brelse(partial->bh); partial--; } @@ -2179,7 +3475,21 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, if (this_bh) { BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, this_bh); + + /* + * The buffer head should have an attached journal head at this + * point. However, if the data is corrupted and an indirect + * block pointed to itself, it would have been detached when + * the block was cleared. Check for this instead of OOPSing. + */ + if (bh2jh(this_bh)) + ext4_journal_dirty_metadata(handle, this_bh); + else + ext4_error(inode->i_sb, __func__, + "circular indirect block detected, " + "inode=%lu, block=%llu", + inode->i_ino, + (unsigned long long) this_bh->b_blocknr); } } @@ -2232,9 +3542,9 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, /* This zaps the entire block. Bottom up. */ BUFFER_TRACE(bh, "free child branches"); ext4_free_branches(handle, inode, bh, - (__le32*)bh->b_data, - (__le32*)bh->b_data + addr_per_block, - depth); + (__le32 *) bh->b_data, + (__le32 *) bh->b_data + addr_per_block, + depth); /* * We've probably journalled the indirect block several @@ -2305,6 +3615,19 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, } } +int ext4_can_truncate(struct inode *inode) +{ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return 0; + if (S_ISREG(inode->i_mode)) + return 1; + if (S_ISDIR(inode->i_mode)) + return 1; + if (S_ISLNK(inode->i_mode)) + return !ext4_inode_is_fast_symlink(inode); + return 0; +} + /* * ext4_truncate() * @@ -2347,51 +3670,25 @@ void ext4_truncate(struct inode *inode) int n; ext4_lblk_t last_block; unsigned blocksize = inode->i_sb->s_blocksize; - struct page *page; - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) + if (!ext4_can_truncate(inode)) return; - if (ext4_inode_is_fast_symlink(inode)) - return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; - - /* - * We have to lock the EOF page here, because lock_page() nests - * outside jbd2_journal_start(). - */ - if ((inode->i_size & (blocksize - 1)) == 0) { - /* Block boundary? Nothing to do */ - page = NULL; - } else { - page = grab_cache_page(mapping, - inode->i_size >> PAGE_CACHE_SHIFT); - if (!page) - return; - } if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { - ext4_ext_truncate(inode, page); + ext4_ext_truncate(inode); return; } handle = start_transaction(inode); - if (IS_ERR(handle)) { - if (page) { - clear_highpage(page); - flush_dcache_page(page); - unlock_page(page); - page_cache_release(page); - } + if (IS_ERR(handle)) return; /* AKPM: return what? */ - } last_block = (inode->i_size + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - if (page) - ext4_block_truncate_page(handle, page, mapping, inode->i_size); + if (inode->i_size & (blocksize - 1)) + if (ext4_block_truncate_page(handle, mapping, inode->i_size)) + goto out_stop; n = ext4_block_to_path(inode, last_block, offsets, NULL); if (n == 0) @@ -2410,6 +3707,14 @@ void ext4_truncate(struct inode *inode) goto out_stop; /* + * From here we block out all ext4_get_block() callers who want to + * modify the block allocation tree. + */ + down_write(&ei->i_data_sem); + + ext4_discard_preallocations(inode); + + /* * The orphan list entry will now protect us from any crash which * occurs before the truncate completes, so it is now safe to propagate * the new, shorter inode size (held for now in i_size) into the @@ -2418,12 +3723,6 @@ void ext4_truncate(struct inode *inode) */ ei->i_disksize = inode->i_size; - /* - * From here we block out all ext4_get_block() callers who want to - * modify the block allocation tree. - */ - down_write(&ei->i_data_sem); - if (n == 1) { /* direct blocks */ ext4_free_data(handle, inode, NULL, i_data+offsets[0], i_data + EXT4_NDIR_BLOCKS); @@ -2484,8 +3783,6 @@ do_indirects: ; } - ext4_discard_reservation(inode); - up_write(&ei->i_data_sem); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); @@ -2510,41 +3807,6 @@ out_stop: ext4_journal_stop(handle); } -static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb, - unsigned long ino, struct ext4_iloc *iloc) -{ - ext4_group_t block_group; - unsigned long offset; - ext4_fsblk_t block; - struct ext4_group_desc *gdp; - - if (!ext4_valid_inum(sb, ino)) { - /* - * This error is already checked for in namei.c unless we are - * looking at an NFS filehandle, in which case no error - * report is needed - */ - return 0; - } - - block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); - gdp = ext4_get_group_desc(sb, block_group, NULL); - if (!gdp) - return 0; - - /* - * Figure out the offset within the block group inode table - */ - offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) * - EXT4_INODE_SIZE(sb); - block = ext4_inode_table(sb, gdp) + - (offset >> EXT4_BLOCK_SIZE_BITS(sb)); - - iloc->block_group = block_group; - iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1); - return block; -} - /* * ext4_get_inode_loc returns with an extra refcount against the inode's * underlying buffer_head on success. If 'in_mem' is true, we have all @@ -2554,23 +3816,49 @@ static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb, static int __ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc, int in_mem) { - ext4_fsblk_t block; - struct buffer_head *bh; + struct ext4_group_desc *gdp; + struct buffer_head *bh; + struct super_block *sb = inode->i_sb; + ext4_fsblk_t block; + int inodes_per_block, inode_offset; + + iloc->bh = 0; + if (!ext4_valid_inum(sb, inode->i_ino)) + return -EIO; - block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc); - if (!block) + iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); + gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); + if (!gdp) return -EIO; - bh = sb_getblk(inode->i_sb, block); + /* + * Figure out the offset within the block group inode table + */ + inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb)); + inode_offset = ((inode->i_ino - 1) % + EXT4_INODES_PER_GROUP(sb)); + block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); + iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); + + bh = sb_getblk(sb, block); if (!bh) { - ext4_error (inode->i_sb, "ext4_get_inode_loc", - "unable to read inode block - " - "inode=%lu, block=%llu", - inode->i_ino, block); + ext4_error(sb, "ext4_get_inode_loc", "unable to read " + "inode block - inode=%lu, block=%llu", + inode->i_ino, block); return -EIO; } if (!buffer_uptodate(bh)) { lock_buffer(bh); + + /* + * If the buffer has the write error flag, we have failed + * to write out another inode in the same block. In this + * case, we don't have to read the block because we may + * read the old inode data successfully. + */ + if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) + set_buffer_uptodate(bh); + if (buffer_uptodate(bh)) { /* someone brought it uptodate while we waited */ unlock_buffer(bh); @@ -2584,28 +3872,12 @@ static int __ext4_get_inode_loc(struct inode *inode, */ if (in_mem) { struct buffer_head *bitmap_bh; - struct ext4_group_desc *desc; - int inodes_per_buffer; - int inode_offset, i; - ext4_group_t block_group; - int start; - - block_group = (inode->i_ino - 1) / - EXT4_INODES_PER_GROUP(inode->i_sb); - inodes_per_buffer = bh->b_size / - EXT4_INODE_SIZE(inode->i_sb); - inode_offset = ((inode->i_ino - 1) % - EXT4_INODES_PER_GROUP(inode->i_sb)); - start = inode_offset & ~(inodes_per_buffer - 1); + int i, start; - /* Is the inode bitmap in cache? */ - desc = ext4_get_group_desc(inode->i_sb, - block_group, NULL); - if (!desc) - goto make_io; + start = inode_offset & ~(inodes_per_block - 1); - bitmap_bh = sb_getblk(inode->i_sb, - ext4_inode_bitmap(inode->i_sb, desc)); + /* Is the inode bitmap in cache? */ + bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); if (!bitmap_bh) goto make_io; @@ -2618,14 +3890,14 @@ static int __ext4_get_inode_loc(struct inode *inode, brelse(bitmap_bh); goto make_io; } - for (i = start; i < start + inodes_per_buffer; i++) { + for (i = start; i < start + inodes_per_block; i++) { if (i == inode_offset) continue; if (ext4_test_bit(i, bitmap_bh->b_data)) break; } brelse(bitmap_bh); - if (i == start + inodes_per_buffer) { + if (i == start + inodes_per_block) { /* all other inodes are free, so skip I/O */ memset(bh->b_data, 0, bh->b_size); set_buffer_uptodate(bh); @@ -2636,6 +3908,36 @@ static int __ext4_get_inode_loc(struct inode *inode, make_io: /* + * If we need to do any I/O, try to pre-readahead extra + * blocks from the inode table. + */ + if (EXT4_SB(sb)->s_inode_readahead_blks) { + ext4_fsblk_t b, end, table; + unsigned num; + + table = ext4_inode_table(sb, gdp); + /* Make sure s_inode_readahead_blks is a power of 2 */ + while (EXT4_SB(sb)->s_inode_readahead_blks & + (EXT4_SB(sb)->s_inode_readahead_blks-1)) + EXT4_SB(sb)->s_inode_readahead_blks = + (EXT4_SB(sb)->s_inode_readahead_blks & + (EXT4_SB(sb)->s_inode_readahead_blks-1)); + b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); + if (table > b) + b = table; + end = b + EXT4_SB(sb)->s_inode_readahead_blks; + num = EXT4_INODES_PER_GROUP(sb); + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + num -= le16_to_cpu(gdp->bg_itable_unused); + table += num / inodes_per_block; + if (end > table) + end = table; + while (b <= end) + sb_breadahead(sb, b++); + } + + /* * There are other valid inodes in the buffer, this inode * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. @@ -2645,10 +3947,9 @@ make_io: submit_bh(READ_META, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - ext4_error(inode->i_sb, "ext4_get_inode_loc", - "unable to read inode block - " - "inode=%lu, block=%llu", - inode->i_ino, block); + ext4_error(sb, __func__, + "unable to read inode block - inode=%lu, " + "block=%llu", inode->i_ino, block); brelse(bh); return -EIO; } @@ -2740,11 +4041,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) return inode; ei = EXT4_I(inode); -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL +#ifdef CONFIG_EXT4_FS_POSIX_ACL ei->i_acl = EXT4_ACL_NOT_CACHED; ei->i_default_acl = EXT4_ACL_NOT_CACHED; #endif - ei->i_block_alloc_info = NULL; ret = __ext4_get_inode_loc(inode, &iloc, 0); if (ret < 0) @@ -2754,7 +4054,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_mode = le16_to_cpu(raw_inode->i_mode); inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); - if(!(test_opt (inode->i_sb, NO_UID32))) { + if (!(test_opt(inode->i_sb, NO_UID32))) { inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } @@ -2772,7 +4072,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (inode->i_mode == 0 || !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { /* this inode is deleted */ - brelse (bh); + brelse(bh); ret = -ESTALE; goto bad_inode; } @@ -2805,7 +4105,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > EXT4_INODE_SIZE(inode->i_sb)) { - brelse (bh); + brelse(bh); ret = -EIO; goto bad_inode; } @@ -2858,7 +4158,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } - brelse (iloc.bh); + brelse(iloc.bh); ext4_set_inode_flags(inode); unlock_new_inode(inode); return inode; @@ -2940,14 +4240,14 @@ static int ext4_do_update_inode(handle_t *handle, ext4_get_inode_flags(ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); - if(!(test_opt(inode->i_sb, NO_UID32))) { + if (!(test_opt(inode->i_sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); /* * Fix up interoperability with old kernels. Otherwise, old inodes get * re-used with the upper 16 bits of the uid/gid intact */ - if(!ei->i_dtime) { + if (!ei->i_dtime) { raw_inode->i_uid_high = cpu_to_le16(high_16_bits(inode->i_uid)); raw_inode->i_gid_high = @@ -3035,7 +4335,7 @@ static int ext4_do_update_inode(handle_t *handle, ei->i_state &= ~EXT4_STATE_NEW; out_brelse: - brelse (bh); + brelse(bh); ext4_std_error(inode->i_sb, err); return err; } @@ -3107,7 +4407,14 @@ int ext4_write_inode(struct inode *inode, int wait) * be freed, so we have a strong guarantee that no future commit will * leave these blocks visible to the user.) * - * Called with inode->sem down. + * Another thing we have to assure is that if we are in ordered mode + * and inode is still attached to the committing transaction, we must + * we start writeout of all the dirty pages which are being truncated. + * This way we are sure that all the data written in the previous + * transaction are already on disk (truncate waits for pages under + * writeback). + * + * Called with inode->i_mutex down. */ int ext4_setattr(struct dentry *dentry, struct iattr *attr) { @@ -3173,6 +4480,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (!error) error = rc; ext4_journal_stop(handle); + + if (ext4_should_order_data(inode)) { + error = ext4_begin_ordered_truncate(inode, + attr->ia_size); + if (error) { + /* Do as much error cleanup as possible */ + handle = ext4_journal_start(inode, 3); + if (IS_ERR(handle)) { + ext4_orphan_del(NULL, inode); + goto err_out; + } + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + goto err_out; + } + } } rc = inode_setattr(inode, attr); @@ -3193,58 +4516,156 @@ err_out: return error; } +int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode; + unsigned long delalloc_blocks; + + inode = dentry->d_inode; + generic_fillattr(inode, stat); + /* + * We can't update i_blocks if the block allocation is delayed + * otherwise in the case of system crash before the real block + * allocation is done, we will have i_blocks inconsistent with + * on-disk file blocks. + * We always keep i_blocks updated together with real + * allocation. But to not confuse with user, stat + * will return the blocks that include the delayed allocation + * blocks for this file. + */ + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + + stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; + return 0; +} + +static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, + int chunk) +{ + int indirects; + + /* if nrblocks are contiguous */ + if (chunk) { + /* + * With N contiguous data blocks, it need at most + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks + * 2 dindirect blocks + * 1 tindirect block + */ + indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); + return indirects + 3; + } + /* + * if nrblocks are not contiguous, worse case, each block touch + * a indirect block, and each indirect block touch a double indirect + * block, plus a triple indirect block + */ + indirects = nrblocks * 2 + 1; + return indirects; +} + +static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + return ext4_indirect_trans_blocks(inode, nrblocks, 0); + return ext4_ext_index_trans_blocks(inode, nrblocks, 0); +} /* - * How many blocks doth make a writepage()? - * - * With N blocks per page, it may be: - * N data blocks - * 2 indirect block - * 2 dindirect - * 1 tindirect - * N+5 bitmap blocks (from the above) - * N+5 group descriptor summary blocks - * 1 inode block - * 1 superblock. - * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files - * - * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS - * - * With ordered or writeback data it's the same, less the N data blocks. - * - * If the inode's direct blocks can hold an integral number of pages then a - * page cannot straddle two indirect blocks, and we can only touch one indirect - * and dindirect block, and the "5" above becomes "3". - * - * This still overestimates under most circumstances. If we were to pass the - * start and end offsets in here as well we could do block_to_path() on each - * block and work out the exact number of indirects which are touched. Pah. + * Account for index blocks, block groups bitmaps and block group + * descriptor blocks if modify datablocks and index blocks + * worse case, the indexs blocks spread over different block groups + * + * If datablocks are discontiguous, they are possible to spread over + * different block groups too. If they are contiugous, with flexbg, + * they could still across block group boundary. + * + * Also account for superblock, inode, quota and xattr blocks */ +int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + int groups, gdpblocks; + int idxblocks; + int ret = 0; + + /* + * How many index blocks need to touch to modify nrblocks? + * The "Chunk" flag indicating whether the nrblocks is + * physically contiguous on disk + * + * For Direct IO and fallocate, they calls get_block to allocate + * one single extent at a time, so they could set the "Chunk" flag + */ + idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); + + ret = idxblocks; + + /* + * Now let's see how many group bitmaps and group descriptors need + * to account + */ + groups = idxblocks; + if (chunk) + groups += 1; + else + groups += nrblocks; + + gdpblocks = groups; + if (groups > EXT4_SB(inode->i_sb)->s_groups_count) + groups = EXT4_SB(inode->i_sb)->s_groups_count; + if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) + gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; + /* bitmaps and block group descriptor blocks */ + ret += groups + gdpblocks; + + /* Blocks for super block, inode, quota and xattr blocks */ + ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); + + return ret; +} + +/* + * Calulate the total number of credits to reserve to fit + * the modification of a single pages into a single transaction, + * which may include multiple chunks of block allocations. + * + * This could be called via ext4_write_begin() + * + * We need to consider the worse case, when + * one new block per extent. + */ int ext4_writepage_trans_blocks(struct inode *inode) { int bpp = ext4_journal_blocks_per_page(inode); - int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3; int ret; - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) - return ext4_ext_writepage_trans_blocks(inode, bpp); + ret = ext4_meta_trans_blocks(inode, bpp, 0); + /* Account for data blocks for journalled mode */ if (ext4_should_journal_data(inode)) - ret = 3 * (bpp + indirects) + 2; - else - ret = 2 * (bpp + indirects) + 2; - -#ifdef CONFIG_QUOTA - /* We know that structure was already allocated during DQUOT_INIT so - * we will be updating only the data blocks + inodes */ - ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); -#endif - + ret += bpp; return ret; } /* + * Calculate the journal credits for a chunk of data modification. + * + * This is called from DIO, fallocate or whoever calling + * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks. + * + * journal buffers for data blocks are not included here, as DIO + * and fallocate do no need to journal data buffers. + */ +int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) +{ + return ext4_meta_trans_blocks(inode, nrblocks, 1); +} + +/* * The caller must have previously called ext4_reserve_inode_write(). * Give this, we know that the caller already has write access to iloc->bh. */ @@ -3506,3 +4927,65 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return err; } + +static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) +{ + return !buffer_mapped(bh); +} + +int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) +{ + loff_t size; + unsigned long len; + int ret = -EINVAL; + void *fsdata; + struct file *file = vma->vm_file; + struct inode *inode = file->f_path.dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + + /* + * Get i_alloc_sem to stop truncates messing with the inode. We cannot + * get i_mutex because we are already holding mmap_sem. + */ + down_read(&inode->i_alloc_sem); + size = i_size_read(inode); + if (page->mapping != mapping || size <= page_offset(page) + || !PageUptodate(page)) { + /* page got truncated from under us? */ + goto out_unlock; + } + ret = 0; + if (PageMappedToDisk(page)) + goto out_unlock; + + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + if (page_has_buffers(page)) { + /* return if we have all the buffers mapped */ + if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped)) + goto out_unlock; + } + /* + * OK, we need to fill the hole... Do write_begin write_end + * to do block allocation/reservation.We are not holding + * inode.i__mutex here. That allow * parallel write_begin, + * write_end call. lock_page prevent this from happening + * on the same page though + */ + ret = mapping->a_ops->write_begin(file, mapping, page_offset(page), + len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); + if (ret < 0) + goto out_unlock; + ret = mapping->a_ops->write_end(file, mapping, page_offset(page), + len, len, page, fsdata); + if (ret < 0) + goto out_unlock; + ret = 0; +out_unlock: + up_read(&inode->i_alloc_sem); + return ret; +} diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 7a6c2f1faba6..dc99b4776d58 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -23,9 +23,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) struct inode *inode = filp->f_dentry->d_inode; struct ext4_inode_info *ei = EXT4_I(inode); unsigned int flags; - unsigned short rsv_window_size; - ext4_debug ("cmd = %u, arg = %lu\n", cmd, arg); + ext4_debug("cmd = %u, arg = %lu\n", cmd, arg); switch (cmd) { case EXT4_IOC_GETFLAGS: @@ -34,7 +33,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return put_user(flags, (int __user *) arg); case EXT4_IOC_SETFLAGS: { handle_t *handle = NULL; - int err; + int err, migrate = 0; struct ext4_iloc iloc; unsigned int oldflags; unsigned int jflag; @@ -82,6 +81,17 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (!capable(CAP_SYS_RESOURCE)) goto flags_out; } + if (oldflags & EXT4_EXTENTS_FL) { + /* We don't support clearning extent flags */ + if (!(flags & EXT4_EXTENTS_FL)) { + err = -EOPNOTSUPP; + goto flags_out; + } + } else if (flags & EXT4_EXTENTS_FL) { + /* migrate the file */ + migrate = 1; + flags &= ~EXT4_EXTENTS_FL; + } handle = ext4_journal_start(inode, 1); if (IS_ERR(handle)) { @@ -109,6 +119,10 @@ flags_err: if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) err = ext4_change_inode_journal_flag(inode, jflag); + if (err) + goto flags_out; + if (migrate) + err = ext4_ext_migrate(inode); flags_out: mutex_unlock(&inode->i_mutex); mnt_drop_write(filp->f_path.mnt); @@ -175,53 +189,10 @@ setversion_out: return ret; } #endif - case EXT4_IOC_GETRSVSZ: - if (test_opt(inode->i_sb, RESERVATION) - && S_ISREG(inode->i_mode) - && ei->i_block_alloc_info) { - rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size; - return put_user(rsv_window_size, (int __user *)arg); - } - return -ENOTTY; - case EXT4_IOC_SETRSVSZ: { - int err; - - if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) - return -ENOTTY; - - if (!is_owner_or_cap(inode)) - return -EACCES; - - if (get_user(rsv_window_size, (int __user *)arg)) - return -EFAULT; - - err = mnt_want_write(filp->f_path.mnt); - if (err) - return err; - - if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS) - rsv_window_size = EXT4_MAX_RESERVE_BLOCKS; - - /* - * need to allocate reservation structure for this inode - * before set the window size - */ - down_write(&ei->i_data_sem); - if (!ei->i_block_alloc_info) - ext4_init_block_alloc_info(inode); - - if (ei->i_block_alloc_info){ - struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node; - rsv->rsv_goal_size = rsv_window_size; - } - up_write(&ei->i_data_sem); - mnt_drop_write(filp->f_path.mnt); - return 0; - } case EXT4_IOC_GROUP_EXTEND: { ext4_fsblk_t n_blocks_count; struct super_block *sb = inode->i_sb; - int err; + int err, err2; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; @@ -235,8 +206,10 @@ setversion_out: err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - jbd2_journal_flush(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + if (err == 0) + err = err2; mnt_drop_write(filp->f_path.mnt); return err; @@ -244,7 +217,7 @@ setversion_out: case EXT4_IOC_GROUP_ADD: { struct ext4_new_group_data input; struct super_block *sb = inode->i_sb; - int err; + int err, err2; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; @@ -259,15 +232,36 @@ setversion_out: err = ext4_group_add(sb, &input); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - jbd2_journal_flush(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + if (err == 0) + err = err2; mnt_drop_write(filp->f_path.mnt); return err; } case EXT4_IOC_MIGRATE: - return ext4_ext_migrate(inode, filp, cmd, arg); + { + int err; + if (!is_owner_or_cap(inode)) + return -EACCES; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + /* + * inode_mutex prevent write and truncate on the file. + * Read still goes through. We take i_data_sem in + * ext4_ext_swap_inode_data before we switch the + * inode format to prevent read. + */ + mutex_lock(&(inode->i_mutex)); + err = ext4_ext_migrate(inode); + mutex_unlock(&(inode->i_mutex)); + mnt_drop_write(filp->f_path.mnt); + return err; + } default: return -ENOTTY; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c9900aade150..b580714f0d85 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -381,22 +381,28 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr) static inline int mb_find_next_zero_bit(void *addr, int max, int start) { - int fix = 0; + int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); - max += fix; + tmpmax = max + fix; start += fix; - return ext4_find_next_zero_bit(addr, max, start) - fix; + ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; + if (ret > max) + return max; + return ret; } static inline int mb_find_next_bit(void *addr, int max, int start) { - int fix = 0; + int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); - max += fix; + tmpmax = max + fix; start += fix; - return ext4_find_next_bit(addr, max, start) - fix; + ret = ext4_find_next_bit(addr, tmpmax, start) - fix; + if (ret > max) + return max; + return ret; } static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) @@ -471,9 +477,10 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) b2 = (unsigned char *) bitmap; for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { if (b1[i] != b2[i]) { - printk("corruption in group %lu at byte %u(%u):" - " %x in copy != %x on disk/prealloc\n", - e4b->bd_group, i, i * 8, b1[i], b2[i]); + printk(KERN_ERR "corruption in group %lu " + "at byte %u(%u): %x in copy != %x " + "on disk/prealloc\n", + e4b->bd_group, i, i * 8, b1[i], b2[i]); BUG(); } } @@ -527,9 +534,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, void *buddy; void *buddy2; - if (!test_opt(sb, MBALLOC)) - return 0; - { static int mb_check_counter; if (mb_check_counter++ % 100 != 0) @@ -778,16 +782,21 @@ static int ext4_mb_init_cache(struct page *page, char *incore) if (bh[i] == NULL) goto out; - if (bh_uptodate_or_lock(bh[i])) + if (buffer_uptodate(bh[i]) && + !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) continue; + lock_buffer(bh[i]); + spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { ext4_init_block_bitmap(sb, bh[i], first_group + i, desc); set_buffer_uptodate(bh[i]); unlock_buffer(bh[i]); + spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); continue; } + spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); get_bh(bh[i]); bh[i]->b_end_io = end_buffer_read_sync; submit_bh(READ, bh[i]); @@ -803,6 +812,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) if (!buffer_uptodate(bh[i])) goto out; + err = 0; first_block = page->index * blocks_per_page; for (i = 0; i < blocks_per_page; i++) { int group; @@ -883,6 +893,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, int pnum; int poff; struct page *page; + int ret; mb_debug("load group %lu\n", group); @@ -914,15 +925,21 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, if (page) { BUG_ON(page->mapping != inode->i_mapping); if (!PageUptodate(page)) { - ext4_mb_init_cache(page, NULL); + ret = ext4_mb_init_cache(page, NULL); + if (ret) { + unlock_page(page); + goto err; + } mb_cmp_bitmaps(e4b, page_address(page) + (poff * sb->s_blocksize)); } unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; goto err; + } e4b->bd_bitmap_page = page; e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); mark_page_accessed(page); @@ -938,14 +955,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (page) { BUG_ON(page->mapping != inode->i_mapping); - if (!PageUptodate(page)) - ext4_mb_init_cache(page, e4b->bd_bitmap); - + if (!PageUptodate(page)) { + ret = ext4_mb_init_cache(page, e4b->bd_bitmap); + if (ret) { + unlock_page(page); + goto err; + } + } unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; goto err; + } e4b->bd_buddy_page = page; e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); mark_page_accessed(page); @@ -962,7 +985,7 @@ err: page_cache_release(e4b->bd_buddy_page); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; - return -EIO; + return ret; } static void ext4_mb_release_desc(struct ext4_buddy *e4b) @@ -1031,7 +1054,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) } } -static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, +static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { int block = 0; @@ -1071,11 +1094,12 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, blocknr += block; blocknr += le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - + ext4_unlock_group(sb, e4b->bd_group); ext4_error(sb, __func__, "double-free of inode" " %lu's block %llu(bit %u in group %lu)\n", inode ? inode->i_ino : 0, blocknr, block, e4b->bd_group); + ext4_lock_group(sb, e4b->bd_group); } mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); e4b->bd_info->bb_counters[order]++; @@ -1113,8 +1137,6 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, } while (1); } mb_check_buddy(e4b); - - return 0; } static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, @@ -1730,10 +1752,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) ac->ac_g_ex.fe_start = sbi->s_mb_last_start; spin_unlock(&sbi->s_md_lock); } - - /* searching for the right group start from the goal value specified */ - group = ac->ac_g_ex.fe_group; - /* Let's just scan groups to find more-less suitable blocks */ cr = ac->ac_2order ? 0 : 1; /* @@ -1743,6 +1761,12 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) repeat: for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { ac->ac_criteria = cr; + /* + * searching for the right group start + * from the goal value specified + */ + group = ac->ac_g_ex.fe_group; + for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { struct ext4_group_info *grp; struct ext4_group_desc *desc; @@ -1963,6 +1987,8 @@ static int ext4_mb_seq_history_open(struct inode *inode, struct file *file) int rc; int size; + if (unlikely(sbi->s_mb_history == NULL)) + return -ENOMEM; s = kmalloc(sizeof(*s), GFP_KERNEL); if (s == NULL) return -ENOMEM; @@ -2143,9 +2169,10 @@ static void ext4_mb_history_release(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); - remove_proc_entry("mb_groups", sbi->s_mb_proc); - remove_proc_entry("mb_history", sbi->s_mb_proc); - + if (sbi->s_proc != NULL) { + remove_proc_entry("mb_groups", sbi->s_proc); + remove_proc_entry("mb_history", sbi->s_proc); + } kfree(sbi->s_mb_history); } @@ -2154,10 +2181,10 @@ static void ext4_mb_history_init(struct super_block *sb) struct ext4_sb_info *sbi = EXT4_SB(sb); int i; - if (sbi->s_mb_proc != NULL) { - proc_create_data("mb_history", S_IRUGO, sbi->s_mb_proc, + if (sbi->s_proc != NULL) { + proc_create_data("mb_history", S_IRUGO, sbi->s_proc, &ext4_mb_seq_history_fops, sb); - proc_create_data("mb_groups", S_IRUGO, sbi->s_mb_proc, + proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, &ext4_mb_seq_groups_fops, sb); } @@ -2165,9 +2192,7 @@ static void ext4_mb_history_init(struct super_block *sb) sbi->s_mb_history_cur = 0; spin_lock_init(&sbi->s_mb_history_lock); i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); - sbi->s_mb_history = kmalloc(i, GFP_KERNEL); - if (likely(sbi->s_mb_history != NULL)) - memset(sbi->s_mb_history, 0, i); + sbi->s_mb_history = kzalloc(i, GFP_KERNEL); /* if we can't allocate history, then we simple won't use it */ } @@ -2215,21 +2240,192 @@ ext4_mb_store_history(struct ext4_allocation_context *ac) #define ext4_mb_history_init(sb) #endif + +/* Create and initialize ext4_group_info data for the given group. */ +int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *desc) +{ + int i, len; + int metalen = 0; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_info **meta_group_info; + + /* + * First check if this group is the first of a reserved block. + * If it's true, we have to allocate a new table of pointers + * to ext4_group_info structures + */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { + metalen = sizeof(*meta_group_info) << + EXT4_DESC_PER_BLOCK_BITS(sb); + meta_group_info = kmalloc(metalen, GFP_KERNEL); + if (meta_group_info == NULL) { + printk(KERN_ERR "EXT4-fs: can't allocate mem for a " + "buddy group\n"); + goto exit_meta_group_info; + } + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = + meta_group_info; + } + + /* + * calculate needed size. if change bb_counters size, + * don't forget about ext4_mb_generate_buddy() + */ + len = offsetof(typeof(**meta_group_info), + bb_counters[sb->s_blocksize_bits + 2]); + + meta_group_info = + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; + i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); + + meta_group_info[i] = kzalloc(len, GFP_KERNEL); + if (meta_group_info[i] == NULL) { + printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); + goto exit_group_info; + } + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, + &(meta_group_info[i]->bb_state)); + + /* + * initialize bb_free to be able to skip + * empty groups without initialization + */ + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + meta_group_info[i]->bb_free = + ext4_free_blocks_after_init(sb, group, desc); + } else { + meta_group_info[i]->bb_free = + le16_to_cpu(desc->bg_free_blocks_count); + } + + INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + +#ifdef DOUBLE_CHECK + { + struct buffer_head *bh; + meta_group_info[i]->bb_bitmap = + kmalloc(sb->s_blocksize, GFP_KERNEL); + BUG_ON(meta_group_info[i]->bb_bitmap == NULL); + bh = ext4_read_block_bitmap(sb, group); + BUG_ON(bh == NULL); + memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, + sb->s_blocksize); + put_bh(bh); + } +#endif + + return 0; + +exit_group_info: + /* If a meta_group_info table has been allocated, release it now */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) + kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); +exit_meta_group_info: + return -ENOMEM; +} /* ext4_mb_add_groupinfo */ + +/* + * Add a group to the existing groups. + * This function is used for online resize + */ +int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *desc) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; + int blocks_per_page; + int block; + int pnum; + struct page *page; + int err; + + /* Add group based on group descriptor*/ + err = ext4_mb_add_groupinfo(sb, group, desc); + if (err) + return err; + + /* + * Cache pages containing dynamic mb_alloc datas (buddy and bitmap + * datas) are set not up to date so that they will be re-initilaized + * during the next call to ext4_mb_load_buddy + */ + + /* Set buddy page as not up to date */ + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + block = group * 2; + pnum = block / blocks_per_page; + page = find_get_page(inode->i_mapping, pnum); + if (page != NULL) { + ClearPageUptodate(page); + page_cache_release(page); + } + + /* Set bitmap page as not up to date */ + block++; + pnum = block / blocks_per_page; + page = find_get_page(inode->i_mapping, pnum); + if (page != NULL) { + ClearPageUptodate(page); + page_cache_release(page); + } + + return 0; +} + +/* + * Update an existing group. + * This function is used for online resize + */ +void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add) +{ + grp->bb_free += add; +} + static int ext4_mb_init_backend(struct super_block *sb) { ext4_group_t i; - int j, len, metalen; + int metalen; struct ext4_sb_info *sbi = EXT4_SB(sb); - int num_meta_group_infos = - (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >> - EXT4_DESC_PER_BLOCK_BITS(sb); + struct ext4_super_block *es = sbi->s_es; + int num_meta_group_infos; + int num_meta_group_infos_max; + int array_size; struct ext4_group_info **meta_group_info; + struct ext4_group_desc *desc; + /* This is the number of blocks used by GDT */ + num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - + 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); + + /* + * This is the total number of blocks used by GDT including + * the number of reserved blocks for GDT. + * The s_group_info array is allocated with this value + * to allow a clean online resize without a complex + * manipulation of pointer. + * The drawback is the unused memory when no resize + * occurs but it's very low in terms of pages + * (see comments below) + * Need to handle this properly when META_BG resizing is allowed + */ + num_meta_group_infos_max = num_meta_group_infos + + le16_to_cpu(es->s_reserved_gdt_blocks); + + /* + * array_size is the size of s_group_info array. We round it + * to the next power of two because this approximation is done + * internally by kmalloc so we can have some more memory + * for free here (e.g. may be used for META_BG resize). + */ + array_size = 1; + while (array_size < sizeof(*sbi->s_group_info) * + num_meta_group_infos_max) + array_size = array_size << 1; /* An 8TB filesystem with 64-bit pointers requires a 4096 byte * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. * So a two level scheme suffices for now. */ - sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * - num_meta_group_infos, GFP_KERNEL); + sbi->s_group_info = kmalloc(array_size, GFP_KERNEL); if (sbi->s_group_info == NULL) { printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); return -ENOMEM; @@ -2256,63 +2452,15 @@ static int ext4_mb_init_backend(struct super_block *sb) sbi->s_group_info[i] = meta_group_info; } - /* - * calculate needed size. if change bb_counters size, - * don't forget about ext4_mb_generate_buddy() - */ - len = sizeof(struct ext4_group_info); - len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); for (i = 0; i < sbi->s_groups_count; i++) { - struct ext4_group_desc *desc; - - meta_group_info = - sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)]; - j = i & (EXT4_DESC_PER_BLOCK(sb) - 1); - - meta_group_info[j] = kzalloc(len, GFP_KERNEL); - if (meta_group_info[j] == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); - goto err_freebuddy; - } desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { printk(KERN_ERR "EXT4-fs: can't read descriptor %lu\n", i); - i++; goto err_freebuddy; } - memset(meta_group_info[j], 0, len); - set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, - &(meta_group_info[j]->bb_state)); - - /* - * initialize bb_free to be able to skip - * empty groups without initialization - */ - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - meta_group_info[j]->bb_free = - ext4_free_blocks_after_init(sb, i, desc); - } else { - meta_group_info[j]->bb_free = - le16_to_cpu(desc->bg_free_blocks_count); - } - - INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list); - -#ifdef DOUBLE_CHECK - { - struct buffer_head *bh; - meta_group_info[j]->bb_bitmap = - kmalloc(sb->s_blocksize, GFP_KERNEL); - BUG_ON(meta_group_info[j]->bb_bitmap == NULL); - bh = read_block_bitmap(sb, i); - BUG_ON(bh == NULL); - memcpy(meta_group_info[j]->bb_bitmap, bh->b_data, - sb->s_blocksize); - put_bh(bh); - } -#endif - + if (ext4_mb_add_groupinfo(sb, i, desc) != 0) + goto err_freebuddy; } return 0; @@ -2333,23 +2481,19 @@ err_freesgi: int ext4_mb_init(struct super_block *sb, int needs_recovery) { struct ext4_sb_info *sbi = EXT4_SB(sb); - unsigned i; + unsigned i, j; unsigned offset; unsigned max; - - if (!test_opt(sb, MBALLOC)) - return 0; + int ret; i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_offsets == NULL) { - clear_opt(sbi->s_mount_opt, MBALLOC); return -ENOMEM; } sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { - clear_opt(sbi->s_mount_opt, MBALLOC); kfree(sbi->s_mb_maxs); return -ENOMEM; } @@ -2370,12 +2514,11 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) } while (i <= sb->s_blocksize_bits + 1); /* init file for buddy data */ - i = ext4_mb_init_backend(sb); - if (i) { - clear_opt(sbi->s_mount_opt, MBALLOC); + ret = ext4_mb_init_backend(sb); + if (ret != 0) { kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); - return i; + return ret; } spin_lock_init(&sbi->s_md_lock); @@ -2392,26 +2535,25 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; - i = sizeof(struct ext4_locality_group) * NR_CPUS; - sbi->s_locality_groups = kmalloc(i, GFP_KERNEL); + sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { - clear_opt(sbi->s_mount_opt, MBALLOC); kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); return -ENOMEM; } - for (i = 0; i < NR_CPUS; i++) { + for_each_possible_cpu(i) { struct ext4_locality_group *lg; - lg = &sbi->s_locality_groups[i]; + lg = per_cpu_ptr(sbi->s_locality_groups, i); mutex_init(&lg->lg_mutex); - INIT_LIST_HEAD(&lg->lg_prealloc_list); + for (j = 0; j < PREALLOC_TB_SIZE; j++) + INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); spin_lock_init(&lg->lg_prealloc_lock); } ext4_mb_init_per_dev_proc(sb); ext4_mb_history_init(sb); - printk("EXT4-fs: mballoc enabled\n"); + printk(KERN_INFO "EXT4-fs: mballoc enabled\n"); return 0; } @@ -2440,9 +2582,6 @@ int ext4_mb_release(struct super_block *sb) struct ext4_group_info *grinfo; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (!test_opt(sb, MBALLOC)) - return 0; - /* release freed, non-committed blocks */ spin_lock(&sbi->s_md_lock); list_splice_init(&sbi->s_closed_transaction, @@ -2498,8 +2637,7 @@ int ext4_mb_release(struct super_block *sb) atomic_read(&sbi->s_mb_discarded)); } - kfree(sbi->s_locality_groups); - + free_percpu(sbi->s_locality_groups); ext4_mb_history_release(sb); ext4_mb_destroy_per_dev_proc(sb); @@ -2548,8 +2686,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb) ext4_lock_group(sb, md->group); for (i = 0; i < md->num; i++) { mb_debug(" %u", md->blocks[i]); - err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1); - BUG_ON(err != 0); + mb_free_blocks(NULL, &e4b, md->blocks[i], 1); } mb_debug("\n"); ext4_unlock_group(sb, md->group); @@ -2573,114 +2710,46 @@ ext4_mb_free_committed_blocks(struct super_block *sb) #define EXT4_MB_STREAM_REQ "stream_req" #define EXT4_MB_GROUP_PREALLOC "group_prealloc" - - -#define MB_PROC_VALUE_READ(name) \ -static int ext4_mb_read_##name(char *page, char **start, \ - off_t off, int count, int *eof, void *data) \ -{ \ - struct ext4_sb_info *sbi = data; \ - int len; \ - *eof = 1; \ - if (off != 0) \ - return 0; \ - len = sprintf(page, "%ld\n", sbi->s_mb_##name); \ - *start = page; \ - return len; \ -} - -#define MB_PROC_VALUE_WRITE(name) \ -static int ext4_mb_write_##name(struct file *file, \ - const char __user *buf, unsigned long cnt, void *data) \ -{ \ - struct ext4_sb_info *sbi = data; \ - char str[32]; \ - long value; \ - if (cnt >= sizeof(str)) \ - return -EINVAL; \ - if (copy_from_user(str, buf, cnt)) \ - return -EFAULT; \ - value = simple_strtol(str, NULL, 0); \ - if (value <= 0) \ - return -ERANGE; \ - sbi->s_mb_##name = value; \ - return cnt; \ -} - -MB_PROC_VALUE_READ(stats); -MB_PROC_VALUE_WRITE(stats); -MB_PROC_VALUE_READ(max_to_scan); -MB_PROC_VALUE_WRITE(max_to_scan); -MB_PROC_VALUE_READ(min_to_scan); -MB_PROC_VALUE_WRITE(min_to_scan); -MB_PROC_VALUE_READ(order2_reqs); -MB_PROC_VALUE_WRITE(order2_reqs); -MB_PROC_VALUE_READ(stream_request); -MB_PROC_VALUE_WRITE(stream_request); -MB_PROC_VALUE_READ(group_prealloc); -MB_PROC_VALUE_WRITE(group_prealloc); - -#define MB_PROC_HANDLER(name, var) \ -do { \ - proc = create_proc_entry(name, mode, sbi->s_mb_proc); \ - if (proc == NULL) { \ - printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \ - goto err_out; \ - } \ - proc->data = sbi; \ - proc->read_proc = ext4_mb_read_##var ; \ - proc->write_proc = ext4_mb_write_##var; \ -} while (0) - static int ext4_mb_init_per_dev_proc(struct super_block *sb) { mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; struct ext4_sb_info *sbi = EXT4_SB(sb); struct proc_dir_entry *proc; - char devname[64]; - bdevname(sb->s_bdev, devname); - sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4); - - MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats); - MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan); - MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan); - MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs); - MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request); - MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc); + if (sbi->s_proc == NULL) + return -EINVAL; + EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats); + EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan); + EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan); + EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs); + EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request); + EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc); return 0; err_out: - printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname); - remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc); - remove_proc_entry(devname, proc_root_ext4); - sbi->s_mb_proc = NULL; - + remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc); + remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc); + remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc); + remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); + remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); + remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); return -ENOMEM; } static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); - char devname[64]; - if (sbi->s_mb_proc == NULL) + if (sbi->s_proc == NULL) return -EINVAL; - bdevname(sb->s_bdev, devname); - remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc); - remove_proc_entry(devname, proc_root_ext4); + remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc); + remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc); + remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc); + remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); + remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); + remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); return 0; } @@ -2702,11 +2771,6 @@ int __init init_ext4_mballoc(void) kmem_cache_destroy(ext4_pspace_cachep); return -ENOMEM; } -#ifdef CONFIG_PROC_FS - proc_root_ext4 = proc_mkdir("fs/ext4", NULL); - if (proc_root_ext4 == NULL) - printk(KERN_ERR "EXT4-fs: Unable to create fs/ext4\n"); -#endif return 0; } @@ -2715,9 +2779,6 @@ void exit_ext4_mballoc(void) /* XXX: synchronize_rcu(); */ kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); -#ifdef CONFIG_PROC_FS - remove_proc_entry("fs/ext4", NULL); -#endif } @@ -2727,7 +2788,7 @@ void exit_ext4_mballoc(void) */ static noinline_for_stack int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, - handle_t *handle) + handle_t *handle, unsigned long reserv_blks) { struct buffer_head *bitmap_bh = NULL; struct ext4_super_block *es; @@ -2747,7 +2808,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, err = -EIO; - bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group); + bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); if (!bitmap_bh) goto out_err; @@ -2817,6 +2878,23 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); + /* + * Now reduce the dirty block count also. Should not go negative + */ + if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) + /* release all the reserved blocks if non delalloc */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); + else + percpu_counter_sub(&sbi->s_dirtyblocks_counter, + ac->ac_b_ex.fe_len); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, + ac->ac_b_ex.fe_group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } err = ext4_journal_dirty_metadata(handle, bitmap_bh); if (err) @@ -3096,6 +3174,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { unsigned int len = ac->ac_o_ex.fe_len; + ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, &ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_start); @@ -3113,14 +3192,45 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, } /* + * Return the prealloc space that have minimal distance + * from the goal block. @cpa is the prealloc + * space that is having currently known minimal distance + * from the goal block. + */ +static struct ext4_prealloc_space * +ext4_mb_check_group_pa(ext4_fsblk_t goal_block, + struct ext4_prealloc_space *pa, + struct ext4_prealloc_space *cpa) +{ + ext4_fsblk_t cur_distance, new_distance; + + if (cpa == NULL) { + atomic_inc(&pa->pa_count); + return pa; + } + cur_distance = abs(goal_block - cpa->pa_pstart); + new_distance = abs(goal_block - pa->pa_pstart); + + if (cur_distance < new_distance) + return cpa; + + /* drop the previous reference */ + atomic_dec(&cpa->pa_count); + atomic_inc(&pa->pa_count); + return pa; +} + +/* * search goal blocks in preallocated space */ static noinline_for_stack int ext4_mb_use_preallocated(struct ext4_allocation_context *ac) { + int order, i; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; - struct ext4_prealloc_space *pa; + struct ext4_prealloc_space *pa, *cpa = NULL; + ext4_fsblk_t goal_block; /* only data can be preallocated */ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) @@ -3158,22 +3268,38 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) lg = ac->ac_lg; if (lg == NULL) return 0; + order = fls(ac->ac_o_ex.fe_len) - 1; + if (order > PREALLOC_TB_SIZE - 1) + /* The max size of hash table is PREALLOC_TB_SIZE */ + order = PREALLOC_TB_SIZE - 1; + + goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) + + ac->ac_g_ex.fe_start + + le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block); + /* + * search for the prealloc space that is having + * minimal distance from the goal block. + */ + for (i = order; i < PREALLOC_TB_SIZE; i++) { + rcu_read_lock(); + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], + pa_inode_list) { + spin_lock(&pa->pa_lock); + if (pa->pa_deleted == 0 && + pa->pa_free >= ac->ac_o_ex.fe_len) { - rcu_read_lock(); - list_for_each_entry_rcu(pa, &lg->lg_prealloc_list, pa_inode_list) { - spin_lock(&pa->pa_lock); - if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) { - atomic_inc(&pa->pa_count); - ext4_mb_use_group_pa(ac, pa); + cpa = ext4_mb_check_group_pa(goal_block, + pa, cpa); + } spin_unlock(&pa->pa_lock); - ac->ac_criteria = 20; - rcu_read_unlock(); - return 1; } - spin_unlock(&pa->pa_lock); + rcu_read_unlock(); + } + if (cpa) { + ext4_mb_use_group_pa(ac, cpa); + ac->ac_criteria = 20; + return 1; } - rcu_read_unlock(); - return 0; } @@ -3396,6 +3522,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) pa->pa_free = pa->pa_len; atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); + INIT_LIST_HEAD(&pa->pa_inode_list); pa->pa_deleted = 0; pa->pa_linear = 1; @@ -3416,10 +3543,10 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) list_add(&pa->pa_group_list, &grp->bb_prealloc_list); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); - spin_lock(pa->pa_obj_lock); - list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list); - spin_unlock(pa->pa_obj_lock); - + /* + * We will later add the new pa to the right bucket + * after updating the pa_free in ext4_mb_release_context + */ return 0; } @@ -3473,8 +3600,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, if (bit >= end) break; next = mb_find_next_bit(bitmap_bh->b_data, end, bit); - if (next > end) - next = end; start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + le32_to_cpu(sbi->s_es->s_first_data_block); mb_debug(" free preallocated %u/%u in group %u\n", @@ -3569,22 +3694,25 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, if (list_empty(&grp->bb_prealloc_list)) return 0; - bitmap_bh = read_block_bitmap(sb, group); + bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { - /* error handling here */ - ext4_mb_release_desc(&e4b); - BUG_ON(bitmap_bh == NULL); + ext4_error(sb, __func__, "Error in reading block " + "bitmap for %lu\n", group); + return 0; } err = ext4_mb_load_buddy(sb, group, &e4b); - BUG_ON(err != 0); /* error handling here */ + if (err) { + ext4_error(sb, __func__, "Error in loading buddy " + "information for %lu\n", group); + put_bh(bitmap_bh); + return 0; + } if (needed == 0) needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; - grp = ext4_get_group_info(sb, group); INIT_LIST_HEAD(&list); - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); repeat: ext4_lock_group(sb, group); @@ -3666,7 +3794,7 @@ out: * * FIXME!! Make sure it is valid at all the call sites */ -void ext4_mb_discard_inode_preallocations(struct inode *inode) +void ext4_discard_preallocations(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct super_block *sb = inode->i_sb; @@ -3678,7 +3806,7 @@ void ext4_mb_discard_inode_preallocations(struct inode *inode) struct ext4_buddy e4b; int err; - if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) { + if (!S_ISREG(inode->i_mode)) { /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ return; } @@ -3741,13 +3869,18 @@ repeat: ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); err = ext4_mb_load_buddy(sb, group, &e4b); - BUG_ON(err != 0); /* error handling here */ + if (err) { + ext4_error(sb, __func__, "Error in loading buddy " + "information for %lu\n", group); + continue; + } - bitmap_bh = read_block_bitmap(sb, group); + bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { - /* error handling here */ + ext4_error(sb, __func__, "Error in reading block " + "bitmap for %lu\n", group); ext4_mb_release_desc(&e4b); - BUG_ON(bitmap_bh == NULL); + continue; } ext4_lock_group(sb, group); @@ -3871,8 +4004,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) * per cpu locality group is to reduce the contention between block * request from multiple CPUs. */ - ac->ac_lg = &sbi->s_locality_groups[get_cpu()]; - put_cpu(); + ac->ac_lg = per_cpu_ptr(sbi->s_locality_groups, raw_smp_processor_id()); /* we're going to use group allocation */ ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; @@ -3950,22 +4082,168 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, } +static noinline_for_stack void +ext4_mb_discard_lg_preallocations(struct super_block *sb, + struct ext4_locality_group *lg, + int order, int total_entries) +{ + ext4_group_t group = 0; + struct ext4_buddy e4b; + struct list_head discard_list; + struct ext4_prealloc_space *pa, *tmp; + struct ext4_allocation_context *ac; + + mb_debug("discard locality group preallocation\n"); + + INIT_LIST_HEAD(&discard_list); + ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); + + spin_lock(&lg->lg_prealloc_lock); + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], + pa_inode_list) { + spin_lock(&pa->pa_lock); + if (atomic_read(&pa->pa_count)) { + /* + * This is the pa that we just used + * for block allocation. So don't + * free that + */ + spin_unlock(&pa->pa_lock); + continue; + } + if (pa->pa_deleted) { + spin_unlock(&pa->pa_lock); + continue; + } + /* only lg prealloc space */ + BUG_ON(!pa->pa_linear); + + /* seems this one can be freed ... */ + pa->pa_deleted = 1; + spin_unlock(&pa->pa_lock); + + list_del_rcu(&pa->pa_inode_list); + list_add(&pa->u.pa_tmp_list, &discard_list); + + total_entries--; + if (total_entries <= 5) { + /* + * we want to keep only 5 entries + * allowing it to grow to 8. This + * mak sure we don't call discard + * soon for this list. + */ + break; + } + } + spin_unlock(&lg->lg_prealloc_lock); + + list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { + + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + if (ext4_mb_load_buddy(sb, group, &e4b)) { + ext4_error(sb, __func__, "Error in loading buddy " + "information for %lu\n", group); + continue; + } + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); + ext4_mb_release_group_pa(&e4b, pa, ac); + ext4_unlock_group(sb, group); + + ext4_mb_release_desc(&e4b); + list_del(&pa->u.pa_tmp_list); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } + if (ac) + kmem_cache_free(ext4_ac_cachep, ac); +} + +/* + * We have incremented pa_count. So it cannot be freed at this + * point. Also we hold lg_mutex. So no parallel allocation is + * possible from this lg. That means pa_free cannot be updated. + * + * A parallel ext4_mb_discard_group_preallocations is possible. + * which can cause the lg_prealloc_list to be updated. + */ + +static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) +{ + int order, added = 0, lg_prealloc_count = 1; + struct super_block *sb = ac->ac_sb; + struct ext4_locality_group *lg = ac->ac_lg; + struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; + + order = fls(pa->pa_free) - 1; + if (order > PREALLOC_TB_SIZE - 1) + /* The max size of hash table is PREALLOC_TB_SIZE */ + order = PREALLOC_TB_SIZE - 1; + /* Add the prealloc space to lg */ + rcu_read_lock(); + list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], + pa_inode_list) { + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted) { + spin_unlock(&pa->pa_lock); + continue; + } + if (!added && pa->pa_free < tmp_pa->pa_free) { + /* Add to the tail of the previous entry */ + list_add_tail_rcu(&pa->pa_inode_list, + &tmp_pa->pa_inode_list); + added = 1; + /* + * we want to count the total + * number of entries in the list + */ + } + spin_unlock(&tmp_pa->pa_lock); + lg_prealloc_count++; + } + if (!added) + list_add_tail_rcu(&pa->pa_inode_list, + &lg->lg_prealloc_list[order]); + rcu_read_unlock(); + + /* Now trim the list to be not more than 8 elements */ + if (lg_prealloc_count > 8) { + ext4_mb_discard_lg_preallocations(sb, lg, + order, lg_prealloc_count); + return; + } + return ; +} + /* * release all resource we used in allocation */ static int ext4_mb_release_context(struct ext4_allocation_context *ac) { - if (ac->ac_pa) { - if (ac->ac_pa->pa_linear) { + struct ext4_prealloc_space *pa = ac->ac_pa; + if (pa) { + if (pa->pa_linear) { /* see comment in ext4_mb_use_group_pa() */ - spin_lock(&ac->ac_pa->pa_lock); - ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len; - ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len; - ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len; - ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len; - spin_unlock(&ac->ac_pa->pa_lock); + spin_lock(&pa->pa_lock); + pa->pa_pstart += ac->ac_b_ex.fe_len; + pa->pa_lstart += ac->ac_b_ex.fe_len; + pa->pa_free -= ac->ac_b_ex.fe_len; + pa->pa_len -= ac->ac_b_ex.fe_len; + spin_unlock(&pa->pa_lock); + /* + * We want to add the pa to the right bucket. + * Remove it from the list and while adding + * make sure the list to which we are adding + * doesn't grow big. + */ + if (likely(pa->pa_free)) { + spin_lock(pa->pa_obj_lock); + list_del_rcu(&pa->pa_inode_list); + spin_unlock(pa->pa_obj_lock); + ext4_mb_add_n_trim(ac); + } } - ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa); + ext4_mb_put_pa(ac, ac->ac_sb, pa); } if (ac->ac_bitmap_page) page_cache_release(ac->ac_bitmap_page); @@ -4000,22 +4278,32 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, struct ext4_allocation_request *ar, int *errp) { + int freed; struct ext4_allocation_context *ac = NULL; struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block = 0; - int freed; - int inquota; + unsigned long inquota; + unsigned long reserv_blks = 0; sb = ar->inode->i_sb; sbi = EXT4_SB(sb); - if (!test_opt(sb, MBALLOC)) { - block = ext4_new_blocks_old(handle, ar->inode, ar->goal, - &(ar->len), errp); - return block; + if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { + /* + * With delalloc we already reserved the blocks + */ + while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) { + /* let others to free the space */ + yield(); + ar->len = ar->len >> 1; + } + if (!ar->len) { + *errp = -ENOSPC; + return 0; + } + reserv_blks = ar->len; } - while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { ar->flags |= EXT4_MB_HINT_NOPREALLOC; ar->len--; @@ -4026,10 +4314,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, } inquota = ar->len; + if (EXT4_I(ar->inode)->i_delalloc_reserved_flag) + ar->flags |= EXT4_MB_DELALLOC_RESERVED; + ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); if (!ac) { + ar->len = 0; *errp = -ENOMEM; - return 0; + goto out1; } ext4_mb_poll_new_transaction(sb, handle); @@ -4037,12 +4329,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, *errp = ext4_mb_initialize_context(ac, ar); if (*errp) { ar->len = 0; - goto out; + goto out2; } ac->ac_op = EXT4_MB_HISTORY_PREALLOC; if (!ext4_mb_use_preallocated(ac)) { - ac->ac_op = EXT4_MB_HISTORY_ALLOC; ext4_mb_normalize_request(ac, ar); repeat: @@ -4058,7 +4349,7 @@ repeat: } if (likely(ac->ac_status == AC_STATUS_FOUND)) { - *errp = ext4_mb_mark_diskspace_used(ac, handle); + *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); if (*errp == -EAGAIN) { ac->ac_b_ex.fe_group = 0; ac->ac_b_ex.fe_start = 0; @@ -4085,11 +4376,12 @@ repeat: ext4_mb_release_context(ac); -out: +out2: + kmem_cache_free(ext4_ac_cachep, ac); +out1: if (ar->len < inquota) DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); - kmem_cache_free(ext4_ac_cachep, ac); return block; } static void ext4_mb_poll_new_transaction(struct super_block *sb, @@ -4242,12 +4534,16 @@ do_more: overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); count -= overflow; } - bitmap_bh = read_block_bitmap(sb, block_group); - if (!bitmap_bh) + bitmap_bh = ext4_read_block_bitmap(sb, block_group); + if (!bitmap_bh) { + err = -EIO; goto error_return; + } gdp = ext4_get_group_desc(sb, block_group, &gd_bh); - if (!gdp) + if (!gdp) { + err = -EIO; goto error_return; + } if (in_range(ext4_block_bitmap(sb, gdp), block, count) || in_range(ext4_inode_bitmap(sb, gdp), block, count) || @@ -4309,10 +4605,9 @@ do_more: ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); } else { ext4_lock_group(sb, block_group); - err = mb_free_blocks(inode, &e4b, bit, count); + mb_free_blocks(inode, &e4b, bit, count); ext4_mb_return_to_preallocation(inode, &e4b, block, count); ext4_unlock_group(sb, block_group); - BUG_ON(err != 0); } spin_lock(sb_bgl_lock(sbi, block_group)); @@ -4321,6 +4616,13 @@ do_more: spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_add(&sbi->s_freeblocks_counter, count); + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_blocks += count; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } + ext4_mb_release_desc(&e4b); *freed += count; diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index bfe6add46bcf..b3b4828f8b89 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -164,11 +164,17 @@ struct ext4_free_extent { * Locality group: * we try to group all related changes together * so that writeback can flush/allocate them together as well + * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC + * (512). We store prealloc space into the hash based on the pa_free blocks + * order value.ie, fls(pa_free)-1; */ +#define PREALLOC_TB_SIZE 10 struct ext4_locality_group { /* for allocator */ - struct mutex lg_mutex; /* to serialize allocates */ - struct list_head lg_prealloc_list;/* list of preallocations */ + /* to serialize allocates */ + struct mutex lg_mutex; + /* list of preallocations */ + struct list_head lg_prealloc_list[PREALLOC_TB_SIZE]; spinlock_t lg_prealloc_lock; }; @@ -251,7 +257,6 @@ static void ext4_mb_store_history(struct ext4_allocation_context *ac); #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) -static struct proc_dir_entry *proc_root_ext4; struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index b9e077ba07e9..f2a9cf498ecd 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -53,7 +53,8 @@ static int finish_range(handle_t *handle, struct inode *inode, * credit. But below we try to not accumalate too much * of them by restarting the journal. */ - needed = ext4_ext_calc_credits_for_insert(inode, path); + needed = ext4_ext_calc_credits_for_single_extent(inode, + lb->last_block - lb->first_block + 1, path); /* * Make sure the credit we accumalated is not really high @@ -446,8 +447,7 @@ static int free_ext_block(handle_t *handle, struct inode *inode) } -int ext4_ext_migrate(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +int ext4_ext_migrate(struct inode *inode) { handle_t *handle; int retval = 0, i; @@ -515,12 +515,6 @@ int ext4_ext_migrate(struct inode *inode, struct file *filp, * when we add extents we extent the journal */ /* - * inode_mutex prevent write and truncate on the file. Read still goes - * through. We take i_data_sem in ext4_ext_swap_inode_data before we - * switch the inode format to prevent read. - */ - mutex_lock(&(inode->i_mutex)); - /* * Even though we take i_mutex we can still cause block allocation * via mmap write to holes. If we have allocated new blocks we fail * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag. @@ -622,7 +616,6 @@ err_out: tmp_inode->i_nlink = 0; ext4_journal_stop(handle); - mutex_unlock(&(inode->i_mutex)); if (tmp_inode) iput(tmp_inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index ab16beaa830d..92db9e945147 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -151,38 +151,50 @@ struct dx_map_entry static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); -static inline unsigned dx_get_hash (struct dx_entry *entry); -static void dx_set_hash (struct dx_entry *entry, unsigned value); -static unsigned dx_get_count (struct dx_entry *entries); -static unsigned dx_get_limit (struct dx_entry *entries); -static void dx_set_count (struct dx_entry *entries, unsigned value); -static void dx_set_limit (struct dx_entry *entries, unsigned value); -static unsigned dx_root_limit (struct inode *dir, unsigned infosize); -static unsigned dx_node_limit (struct inode *dir); -static struct dx_frame *dx_probe(struct dentry *dentry, +static inline unsigned dx_get_hash(struct dx_entry *entry); +static void dx_set_hash(struct dx_entry *entry, unsigned value); +static unsigned dx_get_count(struct dx_entry *entries); +static unsigned dx_get_limit(struct dx_entry *entries); +static void dx_set_count(struct dx_entry *entries, unsigned value); +static void dx_set_limit(struct dx_entry *entries, unsigned value); +static unsigned dx_root_limit(struct inode *dir, unsigned infosize); +static unsigned dx_node_limit(struct inode *dir); +static struct dx_frame *dx_probe(const struct qstr *d_name, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame, int *err); -static void dx_release (struct dx_frame *frames); -static int dx_make_map (struct ext4_dir_entry_2 *de, int size, - struct dx_hash_info *hinfo, struct dx_map_entry map[]); +static void dx_release(struct dx_frame *frames); +static int dx_make_map(struct ext4_dir_entry_2 *de, int size, + struct dx_hash_info *hinfo, struct dx_map_entry map[]); static void dx_sort_map(struct dx_map_entry *map, unsigned count); -static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to, +static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, struct dx_map_entry *offsets, int count); -static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size); +static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size); static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block); static int ext4_htree_next_block(struct inode *dir, __u32 hash, struct dx_frame *frame, struct dx_frame *frames, __u32 *start_hash); -static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, - struct ext4_dir_entry_2 **res_dir, int *err); +static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *err); static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); /* + * p is at least 6 bytes before the end of page + */ +static inline struct ext4_dir_entry_2 * +ext4_next_entry(struct ext4_dir_entry_2 *p) +{ + return (struct ext4_dir_entry_2 *)((char *)p + + ext4_rec_len_from_disk(p->rec_len)); +} + +/* * Future: use high four bits of block for coalesce-on-delete flags * Mask them off for now. */ @@ -197,59 +209,59 @@ static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) entry->block = cpu_to_le32(value); } -static inline unsigned dx_get_hash (struct dx_entry *entry) +static inline unsigned dx_get_hash(struct dx_entry *entry) { return le32_to_cpu(entry->hash); } -static inline void dx_set_hash (struct dx_entry *entry, unsigned value) +static inline void dx_set_hash(struct dx_entry *entry, unsigned value) { entry->hash = cpu_to_le32(value); } -static inline unsigned dx_get_count (struct dx_entry *entries) +static inline unsigned dx_get_count(struct dx_entry *entries) { return le16_to_cpu(((struct dx_countlimit *) entries)->count); } -static inline unsigned dx_get_limit (struct dx_entry *entries) +static inline unsigned dx_get_limit(struct dx_entry *entries) { return le16_to_cpu(((struct dx_countlimit *) entries)->limit); } -static inline void dx_set_count (struct dx_entry *entries, unsigned value) +static inline void dx_set_count(struct dx_entry *entries, unsigned value) { ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); } -static inline void dx_set_limit (struct dx_entry *entries, unsigned value) +static inline void dx_set_limit(struct dx_entry *entries, unsigned value) { ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); } -static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) +static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) { unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - EXT4_DIR_REC_LEN(2) - infosize; - return 0? 20: entry_space / sizeof(struct dx_entry); + return entry_space / sizeof(struct dx_entry); } -static inline unsigned dx_node_limit (struct inode *dir) +static inline unsigned dx_node_limit(struct inode *dir) { unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); - return 0? 22: entry_space / sizeof(struct dx_entry); + return entry_space / sizeof(struct dx_entry); } /* * Debug */ #ifdef DX_DEBUG -static void dx_show_index (char * label, struct dx_entry *entries) +static void dx_show_index(char * label, struct dx_entry *entries) { int i, n = dx_get_count (entries); - printk("%s index ", label); + printk(KERN_DEBUG "%s index ", label); for (i = 0; i < n; i++) { - printk("%x->%lu ", i? dx_get_hash(entries + i) : + printk("%x->%lu ", i ? dx_get_hash(entries + i) : 0, (unsigned long)dx_get_block(entries + i)); } printk("\n"); @@ -296,7 +308,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, struct dx_entry *entries, int levels) { unsigned blocksize = dir->i_sb->s_blocksize; - unsigned count = dx_get_count (entries), names = 0, space = 0, i; + unsigned count = dx_get_count(entries), names = 0, space = 0, i; unsigned bcount = 0; struct buffer_head *bh; int err; @@ -315,11 +327,12 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, names += stats.names; space += stats.space; bcount += stats.bcount; - brelse (bh); + brelse(bh); } if (bcount) - printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", - names, space/bcount,(space/bcount)*100/blocksize); + printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", + levels ? "" : " ", names, space/bcount, + (space/bcount)*100/blocksize); return (struct stats) { names, space, bcount}; } #endif /* DX_DEBUG */ @@ -334,7 +347,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, * back to userspace. */ static struct dx_frame * -dx_probe(struct dentry *dentry, struct inode *dir, +dx_probe(const struct qstr *d_name, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) { unsigned count, indirect; @@ -345,8 +358,6 @@ dx_probe(struct dentry *dentry, struct inode *dir, u32 hash; frame->bh = NULL; - if (dentry) - dir = dentry->d_parent->d_inode; if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) goto fail; root = (struct dx_root *) bh->b_data; @@ -362,8 +373,8 @@ dx_probe(struct dentry *dentry, struct inode *dir, } hinfo->hash_version = root->info.hash_version; hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; - if (dentry) - ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); + if (d_name) + ext4fs_dirhash(d_name->name, d_name->len, hinfo); hash = hinfo->hash; if (root->info.unused_flags & 1) { @@ -396,7 +407,7 @@ dx_probe(struct dentry *dentry, struct inode *dir, goto fail; } - dxtrace (printk("Look up %x", hash)); + dxtrace(printk("Look up %x", hash)); while (1) { count = dx_get_count(entries); @@ -545,7 +556,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, 0, &err))) return err; /* Failure */ p++; - brelse (p->bh); + brelse(p->bh); p->bh = bh; p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; } @@ -554,15 +565,6 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, /* - * p is at least 6 bytes before the end of page - */ -static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p) -{ - return (struct ext4_dir_entry_2 *)((char *)p + - ext4_rec_len_from_disk(p->rec_len)); -} - -/* * This function fills a red-black tree with information from a * directory block. It returns the number directory entries loaded * into the tree. If there is an error it is returned in err. @@ -592,7 +594,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, /* On error, skip the f_pos to the next block. */ dir_file->f_pos = (dir_file->f_pos | (dir->i_sb->s_blocksize - 1)) + 1; - brelse (bh); + brelse(bh); return count; } ext4fs_dirhash(de->name, de->name_len, hinfo); @@ -634,8 +636,8 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, int ret, err; __u32 hashval; - dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, - start_minor_hash)); + dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", + start_hash, start_minor_hash)); dir = dir_file->f_path.dentry->d_inode; if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; @@ -647,7 +649,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, } hinfo.hash = start_hash; hinfo.minor_hash = 0; - frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err); + frame = dx_probe(NULL, dir, &hinfo, frames, &err); if (!frame) return err; @@ -693,8 +695,8 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, break; } dx_release(frames); - dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", - count, *next_hash)); + dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, " + "next hash: %x\n", count, *next_hash)); return count; errout: dx_release(frames); @@ -801,17 +803,17 @@ static inline int ext4_match (int len, const char * const name, /* * Returns 0 if not found, -1 on failure, and 1 on success */ -static inline int search_dirblock(struct buffer_head * bh, +static inline int search_dirblock(struct buffer_head *bh, struct inode *dir, - struct dentry *dentry, + const struct qstr *d_name, unsigned long offset, struct ext4_dir_entry_2 ** res_dir) { struct ext4_dir_entry_2 * de; char * dlimit; int de_len; - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; + const char *name = d_name->name; + int namelen = d_name->len; de = (struct ext4_dir_entry_2 *) bh->b_data; dlimit = bh->b_data + dir->i_sb->s_blocksize; @@ -850,12 +852,13 @@ static inline int search_dirblock(struct buffer_head * bh, * The returned buffer_head has ->b_count elevated. The caller is expected * to brelse() it when appropriate. */ -static struct buffer_head * ext4_find_entry (struct dentry *dentry, +static struct buffer_head * ext4_find_entry (struct inode *dir, + const struct qstr *d_name, struct ext4_dir_entry_2 ** res_dir) { - struct super_block * sb; - struct buffer_head * bh_use[NAMEI_RA_SIZE]; - struct buffer_head * bh, *ret = NULL; + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; + struct buffer_head *bh, *ret = NULL; ext4_lblk_t start, block, b; int ra_max = 0; /* Number of bh's in the readahead buffer, bh_use[] */ @@ -864,16 +867,15 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry, int num = 0; ext4_lblk_t nblocks; int i, err; - struct inode *dir = dentry->d_parent->d_inode; int namelen; *res_dir = NULL; sb = dir->i_sb; - namelen = dentry->d_name.len; + namelen = d_name->len; if (namelen > EXT4_NAME_LEN) return NULL; if (is_dx(dir)) { - bh = ext4_dx_find_entry(dentry, res_dir, &err); + bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); /* * On success, or if the error was file not found, * return. Otherwise, fall back to doing a search the @@ -881,7 +883,8 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry, */ if (bh || (err != ERR_BAD_DX_DIR)) return bh; - dxtrace(printk("ext4_find_entry: dx failed, falling back\n")); + dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " + "falling back\n")); } nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); start = EXT4_I(dir)->i_dir_start_lookup; @@ -925,7 +928,7 @@ restart: brelse(bh); goto next; } - i = search_dirblock(bh, dir, dentry, + i = search_dirblock(bh, dir, d_name, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (i == 1) { EXT4_I(dir)->i_dir_start_lookup = block; @@ -955,11 +958,11 @@ restart: cleanup_and_exit: /* Clean up the read-ahead blocks */ for (; ra_ptr < ra_max; ra_ptr++) - brelse (bh_use[ra_ptr]); + brelse(bh_use[ra_ptr]); return ret; } -static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, +static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *err) { struct super_block * sb; @@ -970,14 +973,13 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, struct buffer_head *bh; ext4_lblk_t block; int retval; - int namelen = dentry->d_name.len; - const u8 *name = dentry->d_name.name; - struct inode *dir = dentry->d_parent->d_inode; + int namelen = d_name->len; + const u8 *name = d_name->name; sb = dir->i_sb; /* NFS may look up ".." - look at dx_root directory block */ if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){ - if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) + if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err))) return NULL; } else { frame = frames; @@ -993,21 +995,23 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, de = (struct ext4_dir_entry_2 *) bh->b_data; top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - EXT4_DIR_REC_LEN(0)); - for (; de < top; de = ext4_next_entry(de)) - if (ext4_match (namelen, name, de)) { - if (!ext4_check_dir_entry("ext4_find_entry", - dir, de, bh, - (block<<EXT4_BLOCK_SIZE_BITS(sb)) - +((char *)de - bh->b_data))) { - brelse (bh); + for (; de < top; de = ext4_next_entry(de)) { + int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) + + ((char *) de - bh->b_data); + + if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) { + brelse(bh); *err = ERR_BAD_DX_DIR; goto errout; } - *res_dir = de; - dx_release (frames); - return bh; + + if (ext4_match(namelen, name, de)) { + *res_dir = de; + dx_release(frames); + return bh; + } } - brelse (bh); + brelse(bh); /* Check to see if we should continue to search */ retval = ext4_htree_next_block(dir, hash, frame, frames, NULL); @@ -1022,25 +1026,25 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, *err = -ENOENT; errout: - dxtrace(printk("%s not found\n", name)); + dxtrace(printk(KERN_DEBUG "%s not found\n", name)); dx_release (frames); return NULL; } -static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { - struct inode * inode; - struct ext4_dir_entry_2 * de; - struct buffer_head * bh; + struct inode *inode; + struct ext4_dir_entry_2 *de; + struct buffer_head *bh; if (dentry->d_name.len > EXT4_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - bh = ext4_find_entry(dentry, &de); + bh = ext4_find_entry(dir, &dentry->d_name, &de); inode = NULL; if (bh) { unsigned long ino = le32_to_cpu(de->inode); - brelse (bh); + brelse(bh); if (!ext4_valid_inum(dir->i_sb, ino)) { ext4_error(dir->i_sb, "ext4_lookup", "bad inode number: %lu", ino); @@ -1059,15 +1063,14 @@ struct dentry *ext4_get_parent(struct dentry *child) unsigned long ino; struct dentry *parent; struct inode *inode; - struct dentry dotdot; + static const struct qstr dotdot = { + .name = "..", + .len = 2, + }; struct ext4_dir_entry_2 * de; struct buffer_head *bh; - dotdot.d_name.name = ".."; - dotdot.d_name.len = 2; - dotdot.d_parent = child; /* confusing, isn't it! */ - - bh = ext4_find_entry(&dotdot, &de); + bh = ext4_find_entry(child->d_inode, &dotdot, &de); inode = NULL; if (!bh) return ERR_PTR(-ENOENT); @@ -1198,10 +1201,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, /* create map in the end of data2 block */ map = (struct dx_map_entry *) (data2 + blocksize); - count = dx_make_map ((struct ext4_dir_entry_2 *) data1, + count = dx_make_map((struct ext4_dir_entry_2 *) data1, blocksize, hinfo, map); map -= count; - dx_sort_map (map, count); + dx_sort_map(map, count); /* Split the existing block in the middle, size-wise */ size = 0; move = 0; @@ -1222,7 +1225,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, /* Fancy dance to stay within two buffers */ de2 = dx_move_dirents(data1, data2, map + split, count - split); - de = dx_pack_dirents(data1,blocksize); + de = dx_pack_dirents(data1, blocksize); de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2); dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); @@ -1234,15 +1237,15 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, swap(*bh, bh2); de = de2; } - dx_insert_block (frame, hash2 + continued, newblock); - err = ext4_journal_dirty_metadata (handle, bh2); + dx_insert_block(frame, hash2 + continued, newblock); + err = ext4_journal_dirty_metadata(handle, bh2); if (err) goto journal_error; - err = ext4_journal_dirty_metadata (handle, frame->bh); + err = ext4_journal_dirty_metadata(handle, frame->bh); if (err) goto journal_error; - brelse (bh2); - dxtrace(dx_show_index ("frame", frame->entries)); + brelse(bh2); + dxtrace(dx_show_index("frame", frame->entries)); return de; journal_error: @@ -1268,7 +1271,7 @@ errout: */ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, struct inode *inode, struct ext4_dir_entry_2 *de, - struct buffer_head * bh) + struct buffer_head *bh) { struct inode *dir = dentry->d_parent->d_inode; const char *name = dentry->d_name.name; @@ -1285,11 +1288,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, while ((char *) de <= top) { if (!ext4_check_dir_entry("ext4_add_entry", dir, de, bh, offset)) { - brelse (bh); + brelse(bh); return -EIO; } - if (ext4_match (namelen, name, de)) { - brelse (bh); + if (ext4_match(namelen, name, de)) { + brelse(bh); return -EEXIST; } nlen = EXT4_DIR_REC_LEN(de->name_len); @@ -1326,7 +1329,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, } else de->inode = 0; de->name_len = namelen; - memcpy (de->name, name, namelen); + memcpy(de->name, name, namelen); /* * XXX shouldn't update any times until successful * completion of syscall, but too many callers depend @@ -1374,7 +1377,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct fake_dirent *fde; blocksize = dir->i_sb->s_blocksize; - dxtrace(printk("Creating index\n")); + dxtrace(printk(KERN_DEBUG "Creating index\n")); retval = ext4_journal_get_write_access(handle, bh); if (retval) { ext4_std_error(dir->i_sb, retval); @@ -1383,7 +1386,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, } root = (struct dx_root *) bh->b_data; - bh2 = ext4_append (handle, dir, &block, &retval); + bh2 = ext4_append(handle, dir, &block, &retval); if (!(bh2)) { brelse(bh); return retval; @@ -1409,9 +1412,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, root->info.info_length = sizeof(root->info); root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; entries = root->entries; - dx_set_block (entries, 1); - dx_set_count (entries, 1); - dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); + dx_set_block(entries, 1); + dx_set_count(entries, 1); + dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info))); /* Initialize as for dx_probe */ hinfo.hash_version = root->info.hash_version; @@ -1440,14 +1443,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, * may not sleep between calling this and putting something into * the entry, as someone else might have used it while you slept. */ -static int ext4_add_entry (handle_t *handle, struct dentry *dentry, - struct inode *inode) +static int ext4_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) { struct inode *dir = dentry->d_parent->d_inode; unsigned long offset; - struct buffer_head * bh; + struct buffer_head *bh; struct ext4_dir_entry_2 *de; - struct super_block * sb; + struct super_block *sb; int retval; int dx_fallback=0; unsigned blocksize; @@ -1497,13 +1500,13 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct dx_frame frames[2], *frame; struct dx_entry *entries, *at; struct dx_hash_info hinfo; - struct buffer_head * bh; + struct buffer_head *bh; struct inode *dir = dentry->d_parent->d_inode; - struct super_block * sb = dir->i_sb; + struct super_block *sb = dir->i_sb; struct ext4_dir_entry_2 *de; int err; - frame = dx_probe(dentry, NULL, &hinfo, frames, &err); + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); if (!frame) return err; entries = frame->entries; @@ -1524,7 +1527,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, } /* Block full, should compress but for now just split */ - dxtrace(printk("using %u of %u node entries\n", + dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", dx_get_count(entries), dx_get_limit(entries))); /* Need to split index? */ if (dx_get_count(entries) == dx_get_limit(entries)) { @@ -1556,7 +1559,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (levels) { unsigned icount1 = icount/2, icount2 = icount - icount1; unsigned hash2 = dx_get_hash(entries + icount1); - dxtrace(printk("Split index %i/%i\n", icount1, icount2)); + dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", + icount1, icount2)); BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ err = ext4_journal_get_write_access(handle, @@ -1564,11 +1568,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; - memcpy ((char *) entries2, (char *) (entries + icount1), - icount2 * sizeof(struct dx_entry)); - dx_set_count (entries, icount1); - dx_set_count (entries2, icount2); - dx_set_limit (entries2, dx_node_limit(dir)); + memcpy((char *) entries2, (char *) (entries + icount1), + icount2 * sizeof(struct dx_entry)); + dx_set_count(entries, icount1); + dx_set_count(entries2, icount2); + dx_set_limit(entries2, dx_node_limit(dir)); /* Which index block gets the new entry? */ if (at - entries >= icount1) { @@ -1576,16 +1580,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, frame->entries = entries = entries2; swap(frame->bh, bh2); } - dx_insert_block (frames + 0, hash2, newblock); - dxtrace(dx_show_index ("node", frames[1].entries)); - dxtrace(dx_show_index ("node", + dx_insert_block(frames + 0, hash2, newblock); + dxtrace(dx_show_index("node", frames[1].entries)); + dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); err = ext4_journal_dirty_metadata(handle, bh2); if (err) goto journal_error; brelse (bh2); } else { - dxtrace(printk("Creating second level index...\n")); + dxtrace(printk(KERN_DEBUG + "Creating second level index...\n")); memcpy((char *) entries2, (char *) entries, icount * sizeof(struct dx_entry)); dx_set_limit(entries2, dx_node_limit(dir)); @@ -1627,12 +1632,12 @@ cleanup: * ext4_delete_entry deletes a directory entry by merging it with the * previous entry */ -static int ext4_delete_entry (handle_t *handle, - struct inode * dir, - struct ext4_dir_entry_2 * de_del, - struct buffer_head * bh) +static int ext4_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh) { - struct ext4_dir_entry_2 * de, * pde; + struct ext4_dir_entry_2 *de, *pde; int i; i = 0; @@ -1713,11 +1718,11 @@ static int ext4_add_nondir(handle_t *handle, * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ext4_create (struct inode * dir, struct dentry * dentry, int mode, - struct nameidata *nd) +static int ext4_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) { handle_t *handle; - struct inode * inode; + struct inode *inode; int err, retries = 0; retry: @@ -1744,8 +1749,8 @@ retry: return err; } -static int ext4_mknod (struct inode * dir, struct dentry *dentry, - int mode, dev_t rdev) +static int ext4_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) { handle_t *handle; struct inode *inode; @@ -1764,11 +1769,11 @@ retry: if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode = ext4_new_inode (handle, dir, mode); + inode = ext4_new_inode(handle, dir, mode); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); -#ifdef CONFIG_EXT4DEV_FS_XATTR +#ifdef CONFIG_EXT4_FS_XATTR inode->i_op = &ext4_special_inode_operations; #endif err = ext4_add_nondir(handle, dentry, inode); @@ -1779,12 +1784,12 @@ retry: return err; } -static int ext4_mkdir(struct inode * dir, struct dentry * dentry, int mode) +static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) { handle_t *handle; - struct inode * inode; - struct buffer_head * dir_block; - struct ext4_dir_entry_2 * de; + struct inode *inode; + struct buffer_head *dir_block; + struct ext4_dir_entry_2 *de; int err, retries = 0; if (EXT4_DIR_LINK_MAX(dir)) @@ -1800,7 +1805,7 @@ retry: if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode = ext4_new_inode (handle, dir, S_IFDIR | mode); + inode = ext4_new_inode(handle, dir, S_IFDIR | mode); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; @@ -1808,7 +1813,7 @@ retry: inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; - dir_block = ext4_bread (handle, inode, 0, 1, &err); + dir_block = ext4_bread(handle, inode, 0, 1, &err); if (!dir_block) goto out_clear_inode; BUFFER_TRACE(dir_block, "get_write_access"); @@ -1817,26 +1822,26 @@ retry: de->inode = cpu_to_le32(inode->i_ino); de->name_len = 1; de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len)); - strcpy (de->name, "."); + strcpy(de->name, "."); ext4_set_de_type(dir->i_sb, de, S_IFDIR); de = ext4_next_entry(de); de->inode = cpu_to_le32(dir->i_ino); de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1)); de->name_len = 2; - strcpy (de->name, ".."); + strcpy(de->name, ".."); ext4_set_de_type(dir->i_sb, de, S_IFDIR); inode->i_nlink = 2; BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata"); ext4_journal_dirty_metadata(handle, dir_block); - brelse (dir_block); + brelse(dir_block); ext4_mark_inode_dirty(handle, inode); - err = ext4_add_entry (handle, dentry, inode); + err = ext4_add_entry(handle, dentry, inode); if (err) { out_clear_inode: clear_nlink(inode); ext4_mark_inode_dirty(handle, inode); - iput (inode); + iput(inode); goto out_stop; } ext4_inc_count(handle, dir); @@ -1853,17 +1858,17 @@ out_stop: /* * routine to check that the specified directory is empty (for rmdir) */ -static int empty_dir (struct inode * inode) +static int empty_dir(struct inode *inode) { unsigned long offset; - struct buffer_head * bh; - struct ext4_dir_entry_2 * de, * de1; - struct super_block * sb; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de, *de1; + struct super_block *sb; int err = 0; sb = inode->i_sb; if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || - !(bh = ext4_bread (NULL, inode, 0, 0, &err))) { + !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { if (err) ext4_error(inode->i_sb, __func__, "error %d reading directory #%lu offset 0", @@ -1878,23 +1883,23 @@ static int empty_dir (struct inode * inode) de1 = ext4_next_entry(de); if (le32_to_cpu(de->inode) != inode->i_ino || !le32_to_cpu(de1->inode) || - strcmp (".", de->name) || - strcmp ("..", de1->name)) { - ext4_warning (inode->i_sb, "empty_dir", - "bad directory (dir #%lu) - no `.' or `..'", - inode->i_ino); - brelse (bh); + strcmp(".", de->name) || + strcmp("..", de1->name)) { + ext4_warning(inode->i_sb, "empty_dir", + "bad directory (dir #%lu) - no `.' or `..'", + inode->i_ino); + brelse(bh); return 1; } offset = ext4_rec_len_from_disk(de->rec_len) + ext4_rec_len_from_disk(de1->rec_len); de = ext4_next_entry(de1); - while (offset < inode->i_size ) { + while (offset < inode->i_size) { if (!bh || (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { err = 0; - brelse (bh); - bh = ext4_bread (NULL, inode, + brelse(bh); + bh = ext4_bread(NULL, inode, offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); if (!bh) { if (err) @@ -1914,13 +1919,13 @@ static int empty_dir (struct inode * inode) continue; } if (le32_to_cpu(de->inode)) { - brelse (bh); + brelse(bh); return 0; } offset += ext4_rec_len_from_disk(de->rec_len); de = ext4_next_entry(de); } - brelse (bh); + brelse(bh); return 1; } @@ -1951,8 +1956,8 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) * ->i_nlink. For, say it, character device. Not a regular file, * not a directory, not a symlink and ->i_nlink > 0. */ - J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); + J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); @@ -2066,12 +2071,12 @@ out_brelse: goto out_err; } -static int ext4_rmdir (struct inode * dir, struct dentry *dentry) +static int ext4_rmdir(struct inode *dir, struct dentry *dentry) { int retval; - struct inode * inode; - struct buffer_head * bh; - struct ext4_dir_entry_2 * de; + struct inode *inode; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; handle_t *handle; /* Initialize quotas before so that eventual writes go in @@ -2082,7 +2087,7 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry) return PTR_ERR(handle); retval = -ENOENT; - bh = ext4_find_entry (dentry, &de); + bh = ext4_find_entry(dir, &dentry->d_name, &de); if (!bh) goto end_rmdir; @@ -2096,16 +2101,16 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry) goto end_rmdir; retval = -ENOTEMPTY; - if (!empty_dir (inode)) + if (!empty_dir(inode)) goto end_rmdir; retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto end_rmdir; if (!EXT4_DIR_LINK_EMPTY(inode)) - ext4_warning (inode->i_sb, "ext4_rmdir", - "empty directory has too many links (%d)", - inode->i_nlink); + ext4_warning(inode->i_sb, "ext4_rmdir", + "empty directory has too many links (%d)", + inode->i_nlink); inode->i_version++; clear_nlink(inode); /* There's no need to set i_disksize: the fact that i_nlink is @@ -2121,16 +2126,16 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry) end_rmdir: ext4_journal_stop(handle); - brelse (bh); + brelse(bh); return retval; } -static int ext4_unlink(struct inode * dir, struct dentry *dentry) +static int ext4_unlink(struct inode *dir, struct dentry *dentry) { int retval; - struct inode * inode; - struct buffer_head * bh; - struct ext4_dir_entry_2 * de; + struct inode *inode; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; handle_t *handle; /* Initialize quotas before so that eventual writes go @@ -2144,7 +2149,7 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry) handle->h_sync = 1; retval = -ENOENT; - bh = ext4_find_entry (dentry, &de); + bh = ext4_find_entry(dir, &dentry->d_name, &de); if (!bh) goto end_unlink; @@ -2155,9 +2160,9 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry) goto end_unlink; if (!inode->i_nlink) { - ext4_warning (inode->i_sb, "ext4_unlink", - "Deleting nonexistent file (%lu), %d", - inode->i_ino, inode->i_nlink); + ext4_warning(inode->i_sb, "ext4_unlink", + "Deleting nonexistent file (%lu), %d", + inode->i_ino, inode->i_nlink); inode->i_nlink = 1; } retval = ext4_delete_entry(handle, dir, de, bh); @@ -2175,15 +2180,15 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry) end_unlink: ext4_journal_stop(handle); - brelse (bh); + brelse(bh); return retval; } -static int ext4_symlink (struct inode * dir, - struct dentry *dentry, const char * symname) +static int ext4_symlink(struct inode *dir, + struct dentry *dentry, const char *symname) { handle_t *handle; - struct inode * inode; + struct inode *inode; int l, err, retries = 0; l = strlen(symname)+1; @@ -2200,12 +2205,12 @@ retry: if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode = ext4_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); + inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; - if (l > sizeof (EXT4_I(inode)->i_data)) { + if (l > sizeof(EXT4_I(inode)->i_data)) { inode->i_op = &ext4_symlink_inode_operations; ext4_set_aops(inode); /* @@ -2218,14 +2223,14 @@ retry: if (err) { clear_nlink(inode); ext4_mark_inode_dirty(handle, inode); - iput (inode); + iput(inode); goto out_stop; } } else { /* clear the extent format for fast symlink */ EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL; inode->i_op = &ext4_fast_symlink_inode_operations; - memcpy((char*)&EXT4_I(inode)->i_data,symname,l); + memcpy((char *)&EXT4_I(inode)->i_data, symname, l); inode->i_size = l-1; } EXT4_I(inode)->i_disksize = inode->i_size; @@ -2237,8 +2242,8 @@ out_stop: return err; } -static int ext4_link (struct dentry * old_dentry, - struct inode * dir, struct dentry *dentry) +static int ext4_link(struct dentry *old_dentry, + struct inode *dir, struct dentry *dentry) { handle_t *handle; struct inode *inode = old_dentry->d_inode; @@ -2281,13 +2286,13 @@ retry: * Anybody can rename anything with this: the permission checks are left to the * higher-level routines. */ -static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry, - struct inode * new_dir,struct dentry *new_dentry) +static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) { handle_t *handle; - struct inode * old_inode, * new_inode; - struct buffer_head * old_bh, * new_bh, * dir_bh; - struct ext4_dir_entry_2 * old_de, * new_de; + struct inode *old_inode, *new_inode; + struct buffer_head *old_bh, *new_bh, *dir_bh; + struct ext4_dir_entry_2 *old_de, *new_de; int retval; old_bh = new_bh = dir_bh = NULL; @@ -2305,7 +2310,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry, if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) handle->h_sync = 1; - old_bh = ext4_find_entry (old_dentry, &old_de); + old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); /* * Check for inode number is _not_ due to possible IO errors. * We might rmdir the source, keep it as pwd of some process @@ -2318,32 +2323,32 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry, goto end_rename; new_inode = new_dentry->d_inode; - new_bh = ext4_find_entry (new_dentry, &new_de); + new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de); if (new_bh) { if (!new_inode) { - brelse (new_bh); + brelse(new_bh); new_bh = NULL; } } if (S_ISDIR(old_inode->i_mode)) { if (new_inode) { retval = -ENOTEMPTY; - if (!empty_dir (new_inode)) + if (!empty_dir(new_inode)) goto end_rename; } retval = -EIO; - dir_bh = ext4_bread (handle, old_inode, 0, 0, &retval); + dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval); if (!dir_bh) goto end_rename; if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) goto end_rename; retval = -EMLINK; - if (!new_inode && new_dir!=old_dir && + if (!new_inode && new_dir != old_dir && new_dir->i_nlink >= EXT4_LINK_MAX) goto end_rename; } if (!new_bh) { - retval = ext4_add_entry (handle, new_dentry, old_inode); + retval = ext4_add_entry(handle, new_dentry, old_inode); if (retval) goto end_rename; } else { @@ -2385,7 +2390,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry, struct buffer_head *old_bh2; struct ext4_dir_entry_2 *old_de2; - old_bh2 = ext4_find_entry(old_dentry, &old_de2); + old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2); if (old_bh2) { retval = ext4_delete_entry(handle, old_dir, old_de2, old_bh2); @@ -2430,9 +2435,9 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry, retval = 0; end_rename: - brelse (dir_bh); - brelse (old_bh); - brelse (new_bh); + brelse(dir_bh); + brelse(old_bh); + brelse(new_bh); ext4_journal_stop(handle); return retval; } @@ -2451,7 +2456,7 @@ const struct inode_operations ext4_dir_inode_operations = { .mknod = ext4_mknod, .rename = ext4_rename, .setattr = ext4_setattr, -#ifdef CONFIG_EXT4DEV_FS_XATTR +#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, @@ -2462,7 +2467,7 @@ const struct inode_operations ext4_dir_inode_operations = { const struct inode_operations ext4_special_inode_operations = { .setattr = ext4_setattr, -#ifdef CONFIG_EXT4DEV_FS_XATTR +#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 9ff7b1c04239..b6ec1843a015 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -73,7 +73,7 @@ static int verify_group_input(struct super_block *sb, "Inode bitmap not in group (block %llu)", (unsigned long long)input->inode_bitmap); else if (outside(input->inode_table, start, end) || - outside(itend - 1, start, end)) + outside(itend - 1, start, end)) ext4_warning(sb, __func__, "Inode table not in group (blocks %llu-%llu)", (unsigned long long)input->inode_table, itend - 1); @@ -104,7 +104,7 @@ static int verify_group_input(struct super_block *sb, (unsigned long long)input->inode_bitmap, start, metaend - 1); else if (inside(input->inode_table, start, metaend) || - inside(itend - 1, start, metaend)) + inside(itend - 1, start, metaend)) ext4_warning(sb, __func__, "Inode table (%llu-%llu) overlaps" "GDT table (%llu-%llu)", @@ -158,9 +158,9 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh, if (err) { if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) return err; - if ((err = ext4_journal_get_write_access(handle, bh))) + if ((err = ext4_journal_get_write_access(handle, bh))) return err; - } + } return 0; } @@ -418,9 +418,9 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, /* * If we are not using the primary superblock/GDT copy don't resize, - * because the user tools have no way of handling this. Probably a - * bad time to do it anyways. - */ + * because the user tools have no way of handling this. Probably a + * bad time to do it anyways. + */ if (EXT4_SB(sb)->s_sbh->b_blocknr != le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { ext4_warning(sb, __func__, @@ -507,14 +507,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, return 0; exit_inode: - //ext4_journal_release_buffer(handle, iloc.bh); + /* ext4_journal_release_buffer(handle, iloc.bh); */ brelse(iloc.bh); exit_dindj: - //ext4_journal_release_buffer(handle, dind); + /* ext4_journal_release_buffer(handle, dind); */ exit_primary: - //ext4_journal_release_buffer(handle, *primary); + /* ext4_journal_release_buffer(handle, *primary); */ exit_sbh: - //ext4_journal_release_buffer(handle, *primary); + /* ext4_journal_release_buffer(handle, *primary); */ exit_dind: brelse(dind); exit_bh: @@ -773,7 +773,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (reserved_gdb || gdb_off == 0) { if (!EXT4_HAS_COMPAT_FEATURE(sb, - EXT4_FEATURE_COMPAT_RESIZE_INODE)){ + EXT4_FEATURE_COMPAT_RESIZE_INODE) + || !le16_to_cpu(es->s_reserved_gdt_blocks)) { ext4_warning(sb, __func__, "No reserved GDT blocks, can't resize"); return -EPERM; @@ -818,12 +819,12 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) goto exit_journal; - /* - * We will only either add reserved group blocks to a backup group - * or remove reserved blocks for the first group in a new group block. - * Doing both would be mean more complex code, and sane people don't - * use non-sparse filesystems anymore. This is already checked above. - */ + /* + * We will only either add reserved group blocks to a backup group + * or remove reserved blocks for the first group in a new group block. + * Doing both would be mean more complex code, and sane people don't + * use non-sparse filesystems anymore. This is already checked above. + */ if (gdb_off) { primary = sbi->s_group_desc[gdb_num]; if ((err = ext4_journal_get_write_access(handle, primary))) @@ -835,24 +836,24 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) } else if ((err = add_new_gdb(handle, inode, input, &primary))) goto exit_journal; - /* - * OK, now we've set up the new group. Time to make it active. - * - * Current kernels don't lock all allocations via lock_super(), - * so we have to be safe wrt. concurrent accesses the group - * data. So we need to be careful to set all of the relevant - * group descriptor data etc. *before* we enable the group. - * - * The key field here is sbi->s_groups_count: as long as - * that retains its old value, nobody is going to access the new - * group. - * - * So first we update all the descriptor metadata for the new - * group; then we update the total disk blocks count; then we - * update the groups count to enable the group; then finally we - * update the free space counts so that the system can start - * using the new disk blocks. - */ + /* + * OK, now we've set up the new group. Time to make it active. + * + * Current kernels don't lock all allocations via lock_super(), + * so we have to be safe wrt. concurrent accesses the group + * data. So we need to be careful to set all of the relevant + * group descriptor data etc. *before* we enable the group. + * + * The key field here is sbi->s_groups_count: as long as + * that retains its old value, nobody is going to access the new + * group. + * + * So first we update all the descriptor metadata for the new + * group; then we update the total disk blocks count; then we + * update the groups count to enable the group; then finally we + * update the free space counts so that the system can start + * using the new disk blocks. + */ /* Update group descriptor block for new group */ gdp = (struct ext4_group_desc *)((char *)primary->b_data + @@ -866,6 +867,14 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); /* + * We can allocate memory for mb_alloc based on the new group + * descriptor + */ + err = ext4_mb_add_more_groupinfo(sb, input->group, gdp); + if (err) + goto exit_journal; + + /* * Make the new blocks and inodes valid next. We do this before * increasing the group count so that once the group is enabled, * all of its blocks and inodes are already valid. @@ -919,6 +928,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) percpu_counter_add(&sbi->s_freeinodes_counter, EXT4_INODES_PER_GROUP(sb)); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { + ext4_group_t flex_group; + flex_group = ext4_flex_group(sbi, input->group); + sbi->s_flex_groups[flex_group].free_blocks += + input->free_blocks_count; + sbi->s_flex_groups[flex_group].free_inodes += + EXT4_INODES_PER_GROUP(sb); + } + ext4_journal_dirty_metadata(handle, sbi->s_sbh); sb->s_dirt = 1; @@ -937,7 +955,8 @@ exit_put: return err; } /* ext4_group_add */ -/* Extend the filesystem to the new number of blocks specified. This entry +/* + * Extend the filesystem to the new number of blocks specified. This entry * point is only used to extend the current filesystem to the end of the last * existing group. It can be accessed via ioctl, or by "remount,resize=<size>" * for emergencies (because it has no dependencies on reserved blocks). @@ -953,10 +972,12 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_group_t o_groups_count; ext4_grpblk_t last; ext4_grpblk_t add; - struct buffer_head * bh; + struct buffer_head *bh; handle_t *handle; int err; unsigned long freed_blocks; + ext4_group_t group; + struct ext4_group_info *grp; /* We don't need to worry about locking wrt other resizers just * yet: we're going to revalidate es->s_blocks_count after @@ -988,7 +1009,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, } /* Handle the remaining blocks in the last group only. */ - ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last); + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); if (last == 0) { ext4_warning(sb, __func__, @@ -1013,7 +1034,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, o_blocks_count + add, add); /* See if the device is actually as big as what was requested */ - bh = sb_bread(sb, o_blocks_count + add -1); + bh = sb_bread(sb, o_blocks_count + add - 1); if (!bh) { ext4_warning(sb, __func__, "can't read last block, resize aborted"); @@ -1060,6 +1081,52 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, o_blocks_count + add); if ((err = ext4_journal_stop(handle))) goto exit_put; + + /* + * Mark mballoc pages as not up to date so that they will be updated + * next time they are loaded by ext4_mb_load_buddy. + * + * XXX Bad, Bad, BAD!!! We should not be overloading the + * Uptodate flag, particularly on thte bitmap bh, as way of + * hinting to ext4_mb_load_buddy() that it needs to be + * overloaded. A user could take a LVM snapshot, then do an + * on-line fsck, and clear the uptodate flag, and this would + * not be a bug in userspace, but a bug in the kernel. FIXME!!! + */ + { + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; + int blocks_per_page; + int block; + int pnum; + struct page *page; + + /* Set buddy page as not up to date */ + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + block = group * 2; + pnum = block / blocks_per_page; + page = find_get_page(inode->i_mapping, pnum); + if (page != NULL) { + ClearPageUptodate(page); + page_cache_release(page); + } + + /* Set bitmap page as not up to date */ + block++; + pnum = block / blocks_per_page; + page = find_get_page(inode->i_mapping, pnum); + if (page != NULL) { + ClearPageUptodate(page); + page_cache_release(page); + } + + /* Get the info on the last group */ + grp = ext4_get_group_info(sb, group); + + /* Update free blocks in group info */ + ext4_mb_update_group_info(grp, add); + } + if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", ext4_blocks_count(es)); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index cb96f127c366..dea8f13c2fd9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -34,6 +34,8 @@ #include <linux/namei.h> #include <linux/quotaops.h> #include <linux/seq_file.h> +#include <linux/proc_fs.h> +#include <linux/marker.h> #include <linux/log2.h> #include <linux/crc16.h> #include <asm/uaccess.h> @@ -45,24 +47,25 @@ #include "namei.h" #include "group.h" +struct proc_dir_entry *ext4_proc_root; + static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); static int ext4_create_journal(struct super_block *, struct ext4_super_block *, unsigned int); -static void ext4_commit_super (struct super_block * sb, - struct ext4_super_block * es, - int sync); -static void ext4_mark_recovery_complete(struct super_block * sb, - struct ext4_super_block * es); -static void ext4_clear_journal_err(struct super_block * sb, - struct ext4_super_block * es); +static void ext4_commit_super(struct super_block *sb, + struct ext4_super_block *es, int sync); +static void ext4_mark_recovery_complete(struct super_block *sb, + struct ext4_super_block *es); +static void ext4_clear_journal_err(struct super_block *sb, + struct ext4_super_block *es); static int ext4_sync_fs(struct super_block *sb, int wait); -static const char *ext4_decode_error(struct super_block * sb, int errno, +static const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]); -static int ext4_remount (struct super_block * sb, int * flags, char * data); -static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf); +static int ext4_remount(struct super_block *sb, int *flags, char *data); +static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); static void ext4_unlockfs(struct super_block *sb); -static void ext4_write_super (struct super_block * sb); +static void ext4_write_super(struct super_block *sb); static void ext4_write_super_lockfs(struct super_block *sb); @@ -211,15 +214,15 @@ static void ext4_handle_error(struct super_block *sb) if (sb->s_flags & MS_RDONLY) return; - if (!test_opt (sb, ERRORS_CONT)) { + if (!test_opt(sb, ERRORS_CONT)) { journal_t *journal = EXT4_SB(sb)->s_journal; EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT; if (journal) jbd2_journal_abort(journal, -EIO); } - if (test_opt (sb, ERRORS_RO)) { - printk (KERN_CRIT "Remounting filesystem read-only\n"); + if (test_opt(sb, ERRORS_RO)) { + printk(KERN_CRIT "Remounting filesystem read-only\n"); sb->s_flags |= MS_RDONLY; } ext4_commit_super(sb, es, 1); @@ -228,13 +231,13 @@ static void ext4_handle_error(struct super_block *sb) sb->s_id); } -void ext4_error (struct super_block * sb, const char * function, - const char * fmt, ...) +void ext4_error(struct super_block *sb, const char *function, + const char *fmt, ...) { va_list args; va_start(args, fmt); - printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function); + printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); vprintk(fmt, args); printk("\n"); va_end(args); @@ -242,7 +245,7 @@ void ext4_error (struct super_block * sb, const char * function, ext4_handle_error(sb); } -static const char *ext4_decode_error(struct super_block * sb, int errno, +static const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]) { char *errstr = NULL; @@ -278,8 +281,7 @@ static const char *ext4_decode_error(struct super_block * sb, int errno, /* __ext4_std_error decodes expected errors from journaling functions * automatically and invokes the appropriate error response. */ -void __ext4_std_error (struct super_block * sb, const char * function, - int errno) +void __ext4_std_error(struct super_block *sb, const char *function, int errno) { char nbuf[16]; const char *errstr; @@ -292,8 +294,8 @@ void __ext4_std_error (struct super_block * sb, const char * function, return; errstr = ext4_decode_error(sb, errno, nbuf); - printk (KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n", - sb->s_id, function, errstr); + printk(KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n", + sb->s_id, function, errstr); ext4_handle_error(sb); } @@ -308,15 +310,15 @@ void __ext4_std_error (struct super_block * sb, const char * function, * case we take the easy way out and panic immediately. */ -void ext4_abort (struct super_block * sb, const char * function, - const char * fmt, ...) +void ext4_abort(struct super_block *sb, const char *function, + const char *fmt, ...) { va_list args; - printk (KERN_CRIT "ext4_abort called.\n"); + printk(KERN_CRIT "ext4_abort called.\n"); va_start(args, fmt); - printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function); + printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); vprintk(fmt, args); printk("\n"); va_end(args); @@ -334,8 +336,8 @@ void ext4_abort (struct super_block * sb, const char * function, jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); } -void ext4_warning (struct super_block * sb, const char * function, - const char * fmt, ...) +void ext4_warning(struct super_block *sb, const char *function, + const char *fmt, ...) { va_list args; @@ -496,7 +498,7 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) } } -static void ext4_put_super (struct super_block * sb) +static void ext4_put_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; @@ -505,21 +507,27 @@ static void ext4_put_super (struct super_block * sb) ext4_mb_release(sb); ext4_ext_release(sb); ext4_xattr_put_super(sb); - jbd2_journal_destroy(sbi->s_journal); + if (jbd2_journal_destroy(sbi->s_journal) < 0) + ext4_abort(sb, __func__, "Couldn't clean up the journal"); + sbi->s_journal = NULL; if (!(sb->s_flags & MS_RDONLY)) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); es->s_state = cpu_to_le16(sbi->s_mount_state); - BUFFER_TRACE(sbi->s_sbh, "marking dirty"); - mark_buffer_dirty(sbi->s_sbh); ext4_commit_super(sb, es, 1); } + if (sbi->s_proc) { + remove_proc_entry("inode_readahead_blks", sbi->s_proc); + remove_proc_entry(sb->s_id, ext4_proc_root); + } for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); kfree(sbi->s_group_desc); + kfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyblocks_counter); brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) @@ -562,15 +570,21 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS); if (!ei) return NULL; -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL +#ifdef CONFIG_EXT4_FS_POSIX_ACL ei->i_acl = EXT4_ACL_NOT_CACHED; ei->i_default_acl = EXT4_ACL_NOT_CACHED; #endif - ei->i_block_alloc_info = NULL; ei->vfs_inode.i_version = 1; + ei->vfs_inode.i_data.writeback_index = 0; memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); + jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); + ei->i_reserved_data_blocks = 0; + ei->i_reserved_meta_blocks = 0; + ei->i_allocated_meta_blocks = 0; + ei->i_delalloc_reserved_flag = 0; + spin_lock_init(&(ei->i_block_reservation_lock)); return &ei->vfs_inode; } @@ -587,12 +601,12 @@ static void ext4_destroy_inode(struct inode *inode) kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; INIT_LIST_HEAD(&ei->i_orphan); -#ifdef CONFIG_EXT4DEV_FS_XATTR +#ifdef CONFIG_EXT4_FS_XATTR init_rwsem(&ei->xattr_sem); #endif init_rwsem(&ei->i_data_sem); @@ -618,8 +632,7 @@ static void destroy_inodecache(void) static void ext4_clear_inode(struct inode *inode) { - struct ext4_block_alloc_info *rsv = EXT4_I(inode)->i_block_alloc_info; -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL +#ifdef CONFIG_EXT4_FS_POSIX_ACL if (EXT4_I(inode)->i_acl && EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) { posix_acl_release(EXT4_I(inode)->i_acl); @@ -631,20 +644,20 @@ static void ext4_clear_inode(struct inode *inode) EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED; } #endif - ext4_discard_reservation(inode); - EXT4_I(inode)->i_block_alloc_info = NULL; - if (unlikely(rsv)) - kfree(rsv); + ext4_discard_preallocations(inode); + jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, + &EXT4_I(inode)->jinode); } -static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) +static inline void ext4_show_quota_options(struct seq_file *seq, + struct super_block *sb) { #if defined(CONFIG_QUOTA) struct ext4_sb_info *sbi = EXT4_SB(sb); if (sbi->s_jquota_fmt) seq_printf(seq, ",jqfmt=%s", - (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0"); + (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold" : "vfsv0"); if (sbi->s_qf_names[USRQUOTA]) seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); @@ -671,7 +684,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) unsigned long def_mount_opts; struct super_block *sb = vfs->mnt_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - journal_t *journal = sbi->s_journal; struct ext4_super_block *es = sbi->s_es; def_mount_opts = le32_to_cpu(es->s_default_mount_opts); @@ -709,7 +721,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",debug"); if (test_opt(sb, OLDALLOC)) seq_puts(seq, ",oldalloc"); -#ifdef CONFIG_EXT4DEV_FS_XATTR +#ifdef CONFIG_EXT4_FS_XATTR if (test_opt(sb, XATTR_USER) && !(def_mount_opts & EXT4_DEFM_XATTR_USER)) seq_puts(seq, ",user_xattr"); @@ -718,7 +730,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",nouser_xattr"); } #endif -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL +#ifdef CONFIG_EXT4_FS_POSIX_ACL if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) seq_puts(seq, ",acl"); if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) @@ -743,10 +755,11 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",nobh"); if (!test_opt(sb, EXTENTS)) seq_puts(seq, ",noextents"); - if (!test_opt(sb, MBALLOC)) - seq_puts(seq, ",nomballoc"); if (test_opt(sb, I_VERSION)) seq_puts(seq, ",i_version"); + if (!test_opt(sb, DELALLOC)) + seq_puts(seq, ",nodelalloc"); + if (sbi->s_stripe) seq_printf(seq, ",stripe=%lu", sbi->s_stripe); @@ -761,6 +774,13 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) seq_puts(seq, ",data=writeback"); + if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) + seq_printf(seq, ",inode_readahead_blks=%u", + sbi->s_inode_readahead_blks); + + if (test_opt(sb, DATA_ERR_ABORT)) + seq_puts(seq, ",data_err=abort"); + ext4_show_quota_options(seq, sb); return 0; } @@ -810,8 +830,8 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, } #ifdef CONFIG_QUOTA -#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") -#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) +#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group") +#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) static int ext4_dquot_initialize(struct inode *inode, int type); static int ext4_dquot_drop(struct inode *inode); @@ -890,14 +910,16 @@ enum { Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, - Opt_mballoc, Opt_nomballoc, Opt_stripe, + Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc, + Opt_inode_readahead_blks }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_bsd_df, "bsddf"}, {Opt_minix_df, "minixdf"}, {Opt_grpid, "grpid"}, @@ -935,6 +957,8 @@ static match_table_t tokens = { {Opt_data_journal, "data=journal"}, {Opt_data_ordered, "data=ordered"}, {Opt_data_writeback, "data=writeback"}, + {Opt_data_err_abort, "data_err=abort"}, + {Opt_data_err_ignore, "data_err=ignore"}, {Opt_offusrjquota, "usrjquota="}, {Opt_usrjquota, "usrjquota=%s"}, {Opt_offgrpjquota, "grpjquota="}, @@ -953,6 +977,9 @@ static match_table_t tokens = { {Opt_nomballoc, "nomballoc"}, {Opt_stripe, "stripe=%u"}, {Opt_resize, "resize"}, + {Opt_delalloc, "delalloc"}, + {Opt_nodelalloc, "nodelalloc"}, + {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, {Opt_err, NULL}, }; @@ -967,7 +994,7 @@ static ext4_fsblk_t get_sb_block(void **data) /*todo: use simple_strtoll with >32bit ext4 */ sb_block = simple_strtoul(options, &options, 0); if (*options && *options != ',') { - printk("EXT4-fs: Invalid sb specification: %s\n", + printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n", (char *) *data); return 1; } @@ -977,12 +1004,12 @@ static ext4_fsblk_t get_sb_block(void **data) return sb_block; } -static int parse_options (char *options, struct super_block *sb, - unsigned int *inum, unsigned long *journal_devnum, - ext4_fsblk_t *n_blocks_count, int is_remount) +static int parse_options(char *options, struct super_block *sb, + unsigned int *inum, unsigned long *journal_devnum, + ext4_fsblk_t *n_blocks_count, int is_remount) { struct ext4_sb_info *sbi = EXT4_SB(sb); - char * p; + char *p; substring_t args[MAX_OPT_ARGS]; int data_opt = 0; int option; @@ -990,11 +1017,12 @@ static int parse_options (char *options, struct super_block *sb, int qtype, qfmt; char *qname; #endif + ext4_fsblk_t last_block; if (!options) return 1; - while ((p = strsep (&options, ",")) != NULL) { + while ((p = strsep(&options, ",")) != NULL) { int token; if (!*p) continue; @@ -1002,16 +1030,16 @@ static int parse_options (char *options, struct super_block *sb, token = match_token(p, tokens, args); switch (token) { case Opt_bsd_df: - clear_opt (sbi->s_mount_opt, MINIX_DF); + clear_opt(sbi->s_mount_opt, MINIX_DF); break; case Opt_minix_df: - set_opt (sbi->s_mount_opt, MINIX_DF); + set_opt(sbi->s_mount_opt, MINIX_DF); break; case Opt_grpid: - set_opt (sbi->s_mount_opt, GRPID); + set_opt(sbi->s_mount_opt, GRPID); break; case Opt_nogrpid: - clear_opt (sbi->s_mount_opt, GRPID); + clear_opt(sbi->s_mount_opt, GRPID); break; case Opt_resuid: if (match_int(&args[0], &option)) @@ -1028,49 +1056,50 @@ static int parse_options (char *options, struct super_block *sb, /* *sb_block = match_int(&args[0]); */ break; case Opt_err_panic: - clear_opt (sbi->s_mount_opt, ERRORS_CONT); - clear_opt (sbi->s_mount_opt, ERRORS_RO); - set_opt (sbi->s_mount_opt, ERRORS_PANIC); + clear_opt(sbi->s_mount_opt, ERRORS_CONT); + clear_opt(sbi->s_mount_opt, ERRORS_RO); + set_opt(sbi->s_mount_opt, ERRORS_PANIC); break; case Opt_err_ro: - clear_opt (sbi->s_mount_opt, ERRORS_CONT); - clear_opt (sbi->s_mount_opt, ERRORS_PANIC); - set_opt (sbi->s_mount_opt, ERRORS_RO); + clear_opt(sbi->s_mount_opt, ERRORS_CONT); + clear_opt(sbi->s_mount_opt, ERRORS_PANIC); + set_opt(sbi->s_mount_opt, ERRORS_RO); break; case Opt_err_cont: - clear_opt (sbi->s_mount_opt, ERRORS_RO); - clear_opt (sbi->s_mount_opt, ERRORS_PANIC); - set_opt (sbi->s_mount_opt, ERRORS_CONT); + clear_opt(sbi->s_mount_opt, ERRORS_RO); + clear_opt(sbi->s_mount_opt, ERRORS_PANIC); + set_opt(sbi->s_mount_opt, ERRORS_CONT); break; case Opt_nouid32: - set_opt (sbi->s_mount_opt, NO_UID32); + set_opt(sbi->s_mount_opt, NO_UID32); break; case Opt_nocheck: - clear_opt (sbi->s_mount_opt, CHECK); + clear_opt(sbi->s_mount_opt, CHECK); break; case Opt_debug: - set_opt (sbi->s_mount_opt, DEBUG); + set_opt(sbi->s_mount_opt, DEBUG); break; case Opt_oldalloc: - set_opt (sbi->s_mount_opt, OLDALLOC); + set_opt(sbi->s_mount_opt, OLDALLOC); break; case Opt_orlov: - clear_opt (sbi->s_mount_opt, OLDALLOC); + clear_opt(sbi->s_mount_opt, OLDALLOC); break; -#ifdef CONFIG_EXT4DEV_FS_XATTR +#ifdef CONFIG_EXT4_FS_XATTR case Opt_user_xattr: - set_opt (sbi->s_mount_opt, XATTR_USER); + set_opt(sbi->s_mount_opt, XATTR_USER); break; case Opt_nouser_xattr: - clear_opt (sbi->s_mount_opt, XATTR_USER); + clear_opt(sbi->s_mount_opt, XATTR_USER); break; #else case Opt_user_xattr: case Opt_nouser_xattr: - printk("EXT4 (no)user_xattr options not supported\n"); + printk(KERN_ERR "EXT4 (no)user_xattr options " + "not supported\n"); break; #endif -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL +#ifdef CONFIG_EXT4_FS_POSIX_ACL case Opt_acl: set_opt(sbi->s_mount_opt, POSIX_ACL); break; @@ -1080,7 +1109,8 @@ static int parse_options (char *options, struct super_block *sb, #else case Opt_acl: case Opt_noacl: - printk("EXT4 (no)acl options not supported\n"); + printk(KERN_ERR "EXT4 (no)acl options " + "not supported\n"); break; #endif case Opt_reservation: @@ -1100,7 +1130,7 @@ static int parse_options (char *options, struct super_block *sb, "journal on remount\n"); return 0; } - set_opt (sbi->s_mount_opt, UPDATE_JOURNAL); + set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); break; case Opt_journal_inum: if (is_remount) { @@ -1130,7 +1160,7 @@ static int parse_options (char *options, struct super_block *sb, set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); break; case Opt_noload: - set_opt (sbi->s_mount_opt, NOLOAD); + set_opt(sbi->s_mount_opt, NOLOAD); break; case Opt_commit: if (match_int(&args[0], &option)) @@ -1163,6 +1193,12 @@ static int parse_options (char *options, struct super_block *sb, sbi->s_mount_opt |= data_opt; } break; + case Opt_data_err_abort: + set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; + case Opt_data_err_ignore: + clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; #ifdef CONFIG_QUOTA case Opt_usrjquota: qtype = USRQUOTA; @@ -1174,8 +1210,8 @@ set_qf_name: sb_any_quota_suspended(sb)) && !sbi->s_qf_names[qtype]) { printk(KERN_ERR - "EXT4-fs: Cannot change journaled " - "quota options when quota turned on.\n"); + "EXT4-fs: Cannot change journaled " + "quota options when quota turned on.\n"); return 0; } qname = match_strdup(&args[0]); @@ -1309,20 +1345,38 @@ set_qf_format: clear_opt(sbi->s_mount_opt, NOBH); break; case Opt_extents: - set_opt (sbi->s_mount_opt, EXTENTS); + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_EXTENTS)) { + ext4_warning(sb, __func__, + "extents feature not enabled " + "on this filesystem, use tune2fs\n"); + return 0; + } + set_opt(sbi->s_mount_opt, EXTENTS); break; case Opt_noextents: - clear_opt (sbi->s_mount_opt, EXTENTS); + /* + * When e2fsprogs support resizing an already existing + * ext3 file system to greater than 2**32 we need to + * add support to block allocator to handle growing + * already existing block mapped inode so that blocks + * allocated for them fall within 2**32 + */ + last_block = ext4_blocks_count(sbi->s_es) - 1; + if (last_block > 0xffffffffULL) { + printk(KERN_ERR "EXT4-fs: Filesystem too " + "large to mount with " + "-o noextents options\n"); + return 0; + } + clear_opt(sbi->s_mount_opt, EXTENTS); break; case Opt_i_version: set_opt(sbi->s_mount_opt, I_VERSION); sb->s_flags |= MS_I_VERSION; break; - case Opt_mballoc: - set_opt(sbi->s_mount_opt, MBALLOC); - break; - case Opt_nomballoc: - clear_opt(sbi->s_mount_opt, MBALLOC); + case Opt_nodelalloc: + clear_opt(sbi->s_mount_opt, DELALLOC); break; case Opt_stripe: if (match_int(&args[0], &option)) @@ -1331,10 +1385,20 @@ set_qf_format: return 0; sbi->s_stripe = option; break; + case Opt_delalloc: + set_opt(sbi->s_mount_opt, DELALLOC); + break; + case Opt_inode_readahead_blks: + if (match_int(&args[0], &option)) + return 0; + if (option < 0 || option > (1 << 30)) + return 0; + sbi->s_inode_readahead_blks = option; + break; default: - printk (KERN_ERR - "EXT4-fs: Unrecognized mount option \"%s\" " - "or missing value\n", p); + printk(KERN_ERR + "EXT4-fs: Unrecognized mount option \"%s\" " + "or missing value\n", p); return 0; } } @@ -1381,31 +1445,31 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, int res = 0; if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { - printk (KERN_ERR "EXT4-fs warning: revision level too high, " - "forcing read-only mode\n"); + printk(KERN_ERR "EXT4-fs warning: revision level too high, " + "forcing read-only mode\n"); res = MS_RDONLY; } if (read_only) return res; if (!(sbi->s_mount_state & EXT4_VALID_FS)) - printk (KERN_WARNING "EXT4-fs warning: mounting unchecked fs, " - "running e2fsck is recommended\n"); + printk(KERN_WARNING "EXT4-fs warning: mounting unchecked fs, " + "running e2fsck is recommended\n"); else if ((sbi->s_mount_state & EXT4_ERROR_FS)) - printk (KERN_WARNING - "EXT4-fs warning: mounting fs with errors, " - "running e2fsck is recommended\n"); + printk(KERN_WARNING + "EXT4-fs warning: mounting fs with errors, " + "running e2fsck is recommended\n"); else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && le16_to_cpu(es->s_mnt_count) >= (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) - printk (KERN_WARNING - "EXT4-fs warning: maximal mount count reached, " - "running e2fsck is recommended\n"); + printk(KERN_WARNING + "EXT4-fs warning: maximal mount count reached, " + "running e2fsck is recommended\n"); else if (le32_to_cpu(es->s_checkinterval) && (le32_to_cpu(es->s_lastcheck) + le32_to_cpu(es->s_checkinterval) <= get_seconds())) - printk (KERN_WARNING - "EXT4-fs warning: checktime reached, " - "running e2fsck is recommended\n"); + printk(KERN_WARNING + "EXT4-fs warning: checktime reached, " + "running e2fsck is recommended\n"); #if 0 /* @@@ We _will_ want to clear the valid bit if we find * inconsistencies, to force a fsck at reboot. But for @@ -1431,16 +1495,60 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, EXT4_INODES_PER_GROUP(sb), sbi->s_mount_opt); - printk(KERN_INFO "EXT4 FS on %s, ", sb->s_id); - if (EXT4_SB(sb)->s_journal->j_inode == NULL) { - char b[BDEVNAME_SIZE]; + printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n", + sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" : + "external", EXT4_SB(sb)->s_journal->j_devname); + return res; +} - printk("external journal on %s\n", - bdevname(EXT4_SB(sb)->s_journal->j_dev, b)); - } else { - printk("internal journal\n"); +static int ext4_fill_flex_info(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = NULL; + struct buffer_head *bh; + ext4_group_t flex_group_count; + ext4_group_t flex_group; + int groups_per_flex = 0; + __u64 block_bitmap = 0; + int i; + + if (!sbi->s_es->s_log_groups_per_flex) { + sbi->s_log_groups_per_flex = 0; + return 1; } - return res; + + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; + groups_per_flex = 1 << sbi->s_log_groups_per_flex; + + /* We allocate both existing and potentially added groups */ + flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + + ((sbi->s_es->s_reserved_gdt_blocks +1 ) << + EXT4_DESC_PER_BLOCK_BITS(sb))) / + groups_per_flex; + sbi->s_flex_groups = kzalloc(flex_group_count * + sizeof(struct flex_groups), GFP_KERNEL); + if (sbi->s_flex_groups == NULL) { + printk(KERN_ERR "EXT4-fs: not enough memory for " + "%lu flex groups\n", flex_group_count); + goto failed; + } + + gdp = ext4_get_group_desc(sb, 1, &bh); + block_bitmap = ext4_block_bitmap(sb, gdp) - 1; + + for (i = 0; i < sbi->s_groups_count; i++) { + gdp = ext4_get_group_desc(sb, i, &bh); + + flex_group = ext4_flex_group(sbi, i); + sbi->s_flex_groups[flex_group].free_inodes += + le16_to_cpu(gdp->bg_free_inodes_count); + sbi->s_flex_groups[flex_group].free_blocks += + le16_to_cpu(gdp->bg_free_blocks_count); + } + + return 1; +failed: + return 0; } __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, @@ -1495,7 +1603,7 @@ static int ext4_check_descriptors(struct super_block *sb) if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) flexbg_flag = 1; - ext4_debug ("Checking group descriptors"); + ext4_debug("Checking group descriptors"); for (i = 0; i < sbi->s_groups_count; i++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); @@ -1507,16 +1615,14 @@ static int ext4_check_descriptors(struct super_block *sb) (EXT4_BLOCKS_PER_GROUP(sb) - 1); block_bitmap = ext4_block_bitmap(sb, gdp); - if (block_bitmap < first_block || block_bitmap > last_block) - { + if (block_bitmap < first_block || block_bitmap > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Block bitmap for group %lu not in group " "(block %llu)!", i, block_bitmap); return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); - if (inode_bitmap < first_block || inode_bitmap > last_block) - { + if (inode_bitmap < first_block || inode_bitmap > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Inode bitmap for group %lu not in group " "(block %llu)!", i, inode_bitmap); @@ -1524,26 +1630,30 @@ static int ext4_check_descriptors(struct super_block *sb) } inode_table = ext4_inode_table(sb, gdp); if (inode_table < first_block || - inode_table + sbi->s_itb_per_group - 1 > last_block) - { + inode_table + sbi->s_itb_per_group - 1 > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Inode table for group %lu not in group " "(block %llu)!", i, inode_table); return 0; } + spin_lock(sb_bgl_lock(sbi, i)); if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Checksum for group %lu failed (%u!=%u)\n", i, le16_to_cpu(ext4_group_desc_csum(sbi, i, gdp)), le16_to_cpu(gdp->bg_checksum)); - return 0; + if (!(sb->s_flags & MS_RDONLY)) { + spin_unlock(sb_bgl_lock(sbi, i)); + return 0; + } } + spin_unlock(sb_bgl_lock(sbi, i)); if (!flexbg_flag) first_block += EXT4_BLOCKS_PER_GROUP(sb); } ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); - sbi->s_es->s_free_inodes_count=cpu_to_le32(ext4_count_free_inodes(sb)); + sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); return 1; } @@ -1564,8 +1674,8 @@ static int ext4_check_descriptors(struct super_block *sb) * e2fsck was run on this filesystem, and it must have already done the orphan * inode cleanup for us, so we can safely abort without any further action. */ -static void ext4_orphan_cleanup (struct super_block * sb, - struct ext4_super_block * es) +static void ext4_orphan_cleanup(struct super_block *sb, + struct ext4_super_block *es) { unsigned int s_flags = sb->s_flags; int nr_orphans = 0, nr_truncates = 0; @@ -1625,9 +1735,9 @@ static void ext4_orphan_cleanup (struct super_block * sb, DQUOT_INIT(inode); if (inode->i_nlink) { printk(KERN_DEBUG - "%s: truncating inode %lu to %Ld bytes\n", + "%s: truncating inode %lu to %lld bytes\n", __func__, inode->i_ino, inode->i_size); - jbd_debug(2, "truncating inode %lu to %Ld bytes\n", + jbd_debug(2, "truncating inode %lu to %lld bytes\n", inode->i_ino, inode->i_size); ext4_truncate(inode); nr_truncates++; @@ -1642,7 +1752,7 @@ static void ext4_orphan_cleanup (struct super_block * sb, iput(inode); /* The delete magic happens here! */ } -#define PLURAL(x) (x), ((x)==1) ? "" : "s" +#define PLURAL(x) (x), ((x) == 1) ? "" : "s" if (nr_orphans) printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n", @@ -1809,12 +1919,12 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) return 0; } -static int ext4_fill_super (struct super_block *sb, void *data, int silent) - __releases(kernel_sem) - __acquires(kernel_sem) +static int ext4_fill_super(struct super_block *sb, void *data, int silent) + __releases(kernel_lock) + __acquires(kernel_lock) { - struct buffer_head * bh; + struct buffer_head *bh; struct ext4_super_block *es = NULL; struct ext4_sb_info *sbi; ext4_fsblk_t block; @@ -1825,6 +1935,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) unsigned long journal_devnum = 0; unsigned long def_mount_opts; struct inode *root; + char *cp; int ret = -EINVAL; int blocksize; int db_count; @@ -1841,21 +1952,21 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) sbi->s_mount_opt = 0; sbi->s_resuid = EXT4_DEF_RESUID; sbi->s_resgid = EXT4_DEF_RESGID; + sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sb_block = sb_block; unlock_kernel(); + /* Cleanup superblock name */ + for (cp = sb->s_id; (cp = strchr(cp, '/'));) + *cp = '!'; + blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); if (!blocksize) { printk(KERN_ERR "EXT4-fs: unable to set blocksize\n"); goto out_fail; } - if (!sb_set_blocksize(sb, blocksize)) { - printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize); - goto out_fail; - } - /* * The ext4 superblock will not be buffer aligned for other than 1kB * block sizes. We need to calculate the offset from buffer start. @@ -1868,7 +1979,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) } if (!(bh = sb_bread(sb, logical_sb_block))) { - printk (KERN_ERR "EXT4-fs: unable to read superblock\n"); + printk(KERN_ERR "EXT4-fs: unable to read superblock\n"); goto out_fail; } /* @@ -1889,11 +2000,11 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, GRPID); if (def_mount_opts & EXT4_DEFM_UID16) set_opt(sbi->s_mount_opt, NO_UID32); -#ifdef CONFIG_EXT4DEV_FS_XATTR +#ifdef CONFIG_EXT4_FS_XATTR if (def_mount_opts & EXT4_DEFM_XATTR_USER) set_opt(sbi->s_mount_opt, XATTR_USER); #endif -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL +#ifdef CONFIG_EXT4_FS_POSIX_ACL if (def_mount_opts & EXT4_DEFM_ACL) set_opt(sbi->s_mount_opt, POSIX_ACL); #endif @@ -1919,17 +2030,25 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) /* * turn on extents feature by default in ext4 filesystem - * User -o noextents to turn it off + * only if feature flag already set by mkfs or tune2fs. + * Use -o noextents to turn it off */ - set_opt(sbi->s_mount_opt, EXTENTS); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) + set_opt(sbi->s_mount_opt, EXTENTS); + else + ext4_warning(sb, __func__, + "extents feature not enabled on this filesystem, " + "use tune2fs.\n"); + /* - * turn on mballoc feature by default in ext4 filesystem - * User -o nomballoc to turn it off + * enable delayed allocation by default + * Use -o nodelalloc to turn it off */ - set_opt(sbi->s_mount_opt, MBALLOC); + set_opt(sbi->s_mount_opt, DELALLOC); - if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, - NULL, 0)) + + if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum, + NULL, 0)) goto failed_mount; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | @@ -1944,16 +2063,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) "running e2fsck is recommended\n"); /* - * Since ext4 is still considered development code, we require - * that the TEST_FILESYS flag in s->flags be set. - */ - if (!(le32_to_cpu(es->s_flags) & EXT2_FLAGS_TEST_FILESYS)) { - printk(KERN_WARNING "EXT4-fs: %s: not marked " - "OK to use with test code.\n", sb->s_id); - goto failed_mount; - } - - /* * Check feature flags regardless of the revision level, since we * previously didn't change the revision level when setting the flags, * so there is a chance incompat flags are set on a rev 0 filesystem. @@ -2004,7 +2113,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount; } - brelse (bh); + brelse(bh); logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); bh = sb_bread(sb, logical_sb_block); @@ -2016,8 +2125,8 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) es = (struct ext4_super_block *)(((char *)bh->b_data) + offset); sbi->s_es = es; if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { - printk (KERN_ERR - "EXT4-fs: Magic mismatch, very weird !\n"); + printk(KERN_ERR + "EXT4-fs: Magic mismatch, very weird !\n"); goto failed_mount; } } @@ -2034,9 +2143,9 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || (!is_power_of_2(sbi->s_inode_size)) || (sbi->s_inode_size > blocksize)) { - printk (KERN_ERR - "EXT4-fs: unsupported inode size: %d\n", - sbi->s_inode_size); + printk(KERN_ERR + "EXT4-fs: unsupported inode size: %d\n", + sbi->s_inode_size); goto failed_mount; } if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) @@ -2068,20 +2177,20 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) sbi->s_mount_state = le16_to_cpu(es->s_state); sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); - for (i=0; i < 4; i++) + for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; if (sbi->s_blocks_per_group > blocksize * 8) { - printk (KERN_ERR - "EXT4-fs: #blocks per group too big: %lu\n", - sbi->s_blocks_per_group); + printk(KERN_ERR + "EXT4-fs: #blocks per group too big: %lu\n", + sbi->s_blocks_per_group); goto failed_mount; } if (sbi->s_inodes_per_group > blocksize * 8) { - printk (KERN_ERR - "EXT4-fs: #inodes per group too big: %lu\n", - sbi->s_inodes_per_group); + printk(KERN_ERR + "EXT4-fs: #inodes per group too big: %lu\n", + sbi->s_inodes_per_group); goto failed_mount; } @@ -2115,29 +2224,47 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) sbi->s_groups_count = blocks_count; db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); - sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), + sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), GFP_KERNEL); if (sbi->s_group_desc == NULL) { - printk (KERN_ERR "EXT4-fs: not enough memory\n"); + printk(KERN_ERR "EXT4-fs: not enough memory\n"); goto failed_mount; } +#ifdef CONFIG_PROC_FS + if (ext4_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); + + if (sbi->s_proc) + proc_create_data("inode_readahead_blks", 0644, sbi->s_proc, + &ext4_ui_proc_fops, + &sbi->s_inode_readahead_blks); +#endif + bgl_lock_init(&sbi->s_blockgroup_lock); for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logical_sb_block, i); sbi->s_group_desc[i] = sb_bread(sb, block); if (!sbi->s_group_desc[i]) { - printk (KERN_ERR "EXT4-fs: " - "can't read group descriptor %d\n", i); + printk(KERN_ERR "EXT4-fs: " + "can't read group descriptor %d\n", i); db_count = i; goto failed_mount2; } } - if (!ext4_check_descriptors (sb)) { + if (!ext4_check_descriptors(sb)) { printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n"); goto failed_mount2; } + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + if (!ext4_fill_flex_info(sb)) { + printk(KERN_ERR + "EXT4-fs: unable to initialize " + "flex_bg meta info!\n"); + goto failed_mount2; + } + sbi->s_gdb_count = db_count; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); @@ -2152,24 +2279,14 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) err = percpu_counter_init(&sbi->s_dirs_counter, ext4_count_dirs(sb)); } + if (!err) { + err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); + } if (err) { printk(KERN_ERR "EXT4-fs: insufficient memory\n"); goto failed_mount3; } - /* per fileystem reservation list head & lock */ - spin_lock_init(&sbi->s_rsv_window_lock); - sbi->s_rsv_window_root = RB_ROOT; - /* Add a single, static dummy reservation to the start of the - * reservation window list --- it gives us a placeholder for - * append-at-start-of-list which makes the allocation logic - * _much_ simpler. */ - sbi->s_rsv_window_head.rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED; - sbi->s_rsv_window_head.rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED; - sbi->s_rsv_window_head.rsv_alloc_hit = 0; - sbi->s_rsv_window_head.rsv_goal_size = 0; - ext4_rsv_window_add(sb, &sbi->s_rsv_window_head); - sbi->s_stripe = ext4_get_stripe_size(sbi); /* @@ -2202,11 +2319,11 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) EXT4_SB(sb)->s_journal->j_failed_commit) { printk(KERN_CRIT "EXT4-fs error (device %s): " "ext4_fill_super: Journal transaction " - "%u is corrupt\n", sb->s_id, + "%u is corrupt\n", sb->s_id, EXT4_SB(sb)->s_journal->j_failed_commit); - if (test_opt (sb, ERRORS_RO)) { - printk (KERN_CRIT - "Mounting filesystem read-only\n"); + if (test_opt(sb, ERRORS_RO)) { + printk(KERN_CRIT + "Mounting filesystem read-only\n"); sb->s_flags |= MS_RDONLY; EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); @@ -2226,9 +2343,9 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount3; } else { if (!silent) - printk (KERN_ERR - "ext4: No journal on filesystem on %s\n", - sb->s_id); + printk(KERN_ERR + "ext4: No journal on filesystem on %s\n", + sb->s_id); goto failed_mount3; } @@ -2312,7 +2429,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount4; } - ext4_setup_super (sb, es, sb->s_flags & MS_RDONLY); + ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY); /* determine the minimum size of new large inodes, if present */ if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { @@ -2351,15 +2468,27 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; if (needs_recovery) - printk (KERN_INFO "EXT4-fs: recovery complete.\n"); + printk(KERN_INFO "EXT4-fs: recovery complete.\n"); ext4_mark_recovery_complete(sb, es); - printk (KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n", - test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal": - test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": - "writeback"); + printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n", + test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal": + test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": + "writeback"); + + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " + "requested data journaling mode\n"); + clear_opt(sbi->s_mount_opt, DELALLOC); + } else if (test_opt(sb, DELALLOC)) + printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); ext4_ext_init(sb); - ext4_mb_init(sb, needs_recovery); + err = ext4_mb_init(sb, needs_recovery); + if (err) { + printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n", + err); + goto failed_mount4; + } lock_kernel(); return 0; @@ -2372,15 +2501,21 @@ cantfind_ext4: failed_mount4: jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; failed_mount3: percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyblocks_counter); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); kfree(sbi->s_group_desc); failed_mount: + if (sbi->s_proc) { + remove_proc_entry("inode_readahead_blks", sbi->s_proc); + remove_proc_entry(sb->s_id, ext4_proc_root); + } #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) kfree(sbi->s_qf_names[i]); @@ -2414,6 +2549,10 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_flags |= JBD2_BARRIER; else journal->j_flags &= ~JBD2_BARRIER; + if (test_opt(sb, DATA_ERR_ABORT)) + journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; + else + journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; spin_unlock(&journal->j_state_lock); } @@ -2439,7 +2578,7 @@ static journal_t *ext4_get_journal(struct super_block *sb, return NULL; } - jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", + jbd_debug(2, "Journal inode found at %p: %lld bytes\n", journal_inode, journal_inode->i_size); if (!S_ISREG(journal_inode->i_mode)) { printk(KERN_ERR "EXT4-fs: invalid journal inode.\n"); @@ -2461,14 +2600,14 @@ static journal_t *ext4_get_journal(struct super_block *sb, static journal_t *ext4_get_dev_journal(struct super_block *sb, dev_t j_dev) { - struct buffer_head * bh; + struct buffer_head *bh; journal_t *journal; ext4_fsblk_t start; ext4_fsblk_t len; int hblock, blocksize; ext4_fsblk_t sb_block; unsigned long offset; - struct ext4_super_block * es; + struct ext4_super_block *es; struct block_device *bdev; bdev = ext4_blkdev_get(j_dev); @@ -2583,8 +2722,8 @@ static int ext4_load_journal(struct super_block *sb, "unavailable, cannot proceed.\n"); return -EROFS; } - printk (KERN_INFO "EXT4-fs: write access will " - "be enabled during recovery.\n"); + printk(KERN_INFO "EXT4-fs: write access will " + "be enabled during recovery.\n"); } } @@ -2602,6 +2741,11 @@ static int ext4_load_journal(struct super_block *sb, return -EINVAL; } + if (journal->j_flags & JBD2_BARRIER) + printk(KERN_INFO "EXT4-fs: barriers enabled\n"); + else + printk(KERN_INFO "EXT4-fs: barriers disabled\n"); + if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { err = jbd2_journal_update_format(journal); if (err) { @@ -2637,8 +2781,8 @@ static int ext4_load_journal(struct super_block *sb, return 0; } -static int ext4_create_journal(struct super_block * sb, - struct ext4_super_block * es, +static int ext4_create_journal(struct super_block *sb, + struct ext4_super_block *es, unsigned int journal_inum) { journal_t *journal; @@ -2679,21 +2823,41 @@ static int ext4_create_journal(struct super_block * sb, return 0; } -static void ext4_commit_super (struct super_block * sb, - struct ext4_super_block * es, - int sync) +static void ext4_commit_super(struct super_block *sb, + struct ext4_super_block *es, int sync) { struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; if (!sbh) return; + if (buffer_write_io_error(sbh)) { + /* + * Oh, dear. A previous attempt to write the + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + printk(KERN_ERR "ext4: previous I/O error to " + "superblock detected for %s.\n", sb->s_id); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } es->s_wtime = cpu_to_le32(get_seconds()); ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb)); es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); - if (sync) + if (sync) { sync_dirty_buffer(sbh); + if (buffer_write_io_error(sbh)) { + printk(KERN_ERR "ext4: I/O error while writing " + "superblock for %s.\n", sb->s_id); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + } } @@ -2702,13 +2866,15 @@ static void ext4_commit_super (struct super_block * sb, * remounting) the filesystem readonly, then we will end up with a * consistent fs on disk. Record that fact. */ -static void ext4_mark_recovery_complete(struct super_block * sb, - struct ext4_super_block * es) +static void ext4_mark_recovery_complete(struct super_block *sb, + struct ext4_super_block *es) { journal_t *journal = EXT4_SB(sb)->s_journal; jbd2_journal_lock_updates(journal); - jbd2_journal_flush(journal); + if (jbd2_journal_flush(journal) < 0) + goto out; + lock_super(sb); if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && sb->s_flags & MS_RDONLY) { @@ -2717,6 +2883,8 @@ static void ext4_mark_recovery_complete(struct super_block * sb, ext4_commit_super(sb, es, 1); } unlock_super(sb); + +out: jbd2_journal_unlock_updates(journal); } @@ -2725,8 +2893,8 @@ static void ext4_mark_recovery_complete(struct super_block * sb, * has recorded an error from a previous lifetime, move that error to the * main filesystem now. */ -static void ext4_clear_journal_err(struct super_block * sb, - struct ext4_super_block * es) +static void ext4_clear_journal_err(struct super_block *sb, + struct ext4_super_block *es) { journal_t *journal; int j_errno; @@ -2751,7 +2919,7 @@ static void ext4_clear_journal_err(struct super_block * sb, EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - ext4_commit_super (sb, es, 1); + ext4_commit_super(sb, es, 1); jbd2_journal_clear_err(journal); } @@ -2784,7 +2952,7 @@ int ext4_force_commit(struct super_block *sb) * This implicitly triggers the writebehind on sync(). */ -static void ext4_write_super (struct super_block * sb) +static void ext4_write_super(struct super_block *sb) { if (mutex_trylock(&sb->s_lock) != 0) BUG(); @@ -2795,6 +2963,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) { tid_t target; + trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); sb->s_dirt = 0; if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { if (wait) @@ -2816,7 +2985,13 @@ static void ext4_write_super_lockfs(struct super_block *sb) /* Now we set up the journal barrier. */ jbd2_journal_lock_updates(journal); - jbd2_journal_flush(journal); + + /* + * We don't want to clear needs_recovery flag when we failed + * to flush the journal. + */ + if (jbd2_journal_flush(journal) < 0) + return; /* Journal blocked and flushed, clear needs_recovery flag. */ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); @@ -2840,13 +3015,14 @@ static void ext4_unlockfs(struct super_block *sb) } } -static int ext4_remount (struct super_block * sb, int * flags, char * data) +static int ext4_remount(struct super_block *sb, int *flags, char *data) { - struct ext4_super_block * es; + struct ext4_super_block *es; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t n_blocks_count = 0; unsigned long old_sb_flags; struct ext4_mount_options old_opts; + ext4_group_t g; int err; #ifdef CONFIG_QUOTA int i; @@ -2925,6 +3101,26 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data) } /* + * Make sure the group descriptor checksums + * are sane. If they aren't, refuse to + * remount r/w. + */ + for (g = 0; g < sbi->s_groups_count; g++) { + struct ext4_group_desc *gdp = + ext4_get_group_desc(sb, g, NULL); + + if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { + printk(KERN_ERR + "EXT4-fs: ext4_remount: " + "Checksum for group %lu failed (%u!=%u)\n", + g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), + le16_to_cpu(gdp->bg_checksum)); + err = -EINVAL; + goto restore_opts; + } + } + + /* * If we have an unprocessed orphan list hanging * around from a previously readonly bdev mount, * require a full umount/remount for now. @@ -2949,7 +3145,7 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data) sbi->s_mount_state = le16_to_cpu(es->s_state); if ((err = ext4_group_extend(sb, es, n_blocks_count))) goto restore_opts; - if (!ext4_setup_super (sb, es, 0)) + if (!ext4_setup_super(sb, es, 0)) sb->s_flags &= ~MS_RDONLY; } } @@ -2979,7 +3175,7 @@ restore_opts: return err; } -static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf) +static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -3029,7 +3225,8 @@ static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf) buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; - buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter); + buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - + percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); ext4_free_blocks_count_set(es, buf->f_bfree); buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); if (buf->f_bfree < ext4_r_blocks_count(es)) @@ -3217,12 +3414,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, } /* Journaling quota? */ if (EXT4_SB(sb)->s_qf_names[type]) { - /* Quotafile not of fs root? */ + /* Quotafile not in fs root? */ if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) printk(KERN_WARNING "EXT4-fs: Quota file not on filesystem root. " "Journaled quota will not work.\n"); - } + } /* * When we journal data on quota file, we have to flush journal to see @@ -3234,12 +3431,17 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, * otherwise be livelocked... */ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - jbd2_journal_flush(EXT4_SB(sb)->s_journal); + err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + if (err) { + path_put(&nd.path); + return err; + } } + err = vfs_quota_on_path(sb, type, format_id, &nd.path); path_put(&nd.path); - return vfs_quota_on(sb, type, format_id, path, remount); + return err; } /* Read data from quotafile - avoid pagecache and such because we cannot afford @@ -3298,7 +3500,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, handle_t *handle = journal_current_handle(); if (!handle) { - printk(KERN_WARNING "EXT4-fs: Quota write (off=%Lu, len=%Lu)" + printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)" " cancelled because transaction is not started.\n", (unsigned long long)off, (unsigned long long)len); return -EIO; @@ -3325,7 +3527,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, err = ext4_journal_dirty_metadata(handle, bh); else { /* Always do at least ordered writes for quotas */ - err = ext4_journal_dirty_data(handle, bh); + err = ext4_jbd2_file_inode(handle, inode); mark_buffer_dirty(bh); } brelse(bh); @@ -3337,8 +3539,10 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, blk++; } out: - if (len == towrite) + if (len == towrite) { + mutex_unlock(&inode->i_mutex); return err; + } if (inode->i_size < off+len-towrite) { i_size_write(inode, off+len-towrite); EXT4_I(inode)->i_disksize = inode->i_size; @@ -3357,18 +3561,82 @@ static int ext4_get_sb(struct file_system_type *fs_type, return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); } +#ifdef CONFIG_PROC_FS +static int ext4_ui_proc_show(struct seq_file *m, void *v) +{ + unsigned int *p = m->private; + + seq_printf(m, "%u\n", *p); + return 0; +} + +static int ext4_ui_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, ext4_ui_proc_show, PDE(inode)->data); +} + +static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf, + size_t cnt, loff_t *ppos) +{ + unsigned int *p = PDE(file->f_path.dentry->d_inode)->data; + char str[32]; + unsigned long value; + + if (cnt >= sizeof(str)) + return -EINVAL; + if (copy_from_user(str, buf, cnt)) + return -EFAULT; + value = simple_strtol(str, NULL, 0); + if (value < 0) + return -ERANGE; + *p = value; + return cnt; +} + +const struct file_operations ext4_ui_proc_fops = { + .owner = THIS_MODULE, + .open = ext4_ui_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = ext4_ui_proc_write, +}; +#endif + +static struct file_system_type ext4_fs_type = { + .owner = THIS_MODULE, + .name = "ext4", + .get_sb = ext4_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +#ifdef CONFIG_EXT4DEV_COMPAT +static int ext4dev_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) +{ + printk(KERN_WARNING "EXT4-fs: Update your userspace programs " + "to mount using ext4\n"); + printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility " + "will go away by 2.6.31\n"); + return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); +} + static struct file_system_type ext4dev_fs_type = { .owner = THIS_MODULE, .name = "ext4dev", - .get_sb = ext4_get_sb, + .get_sb = ext4dev_get_sb, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS("ext4dev"); +#endif static int __init init_ext4_fs(void) { int err; + ext4_proc_root = proc_mkdir("fs/ext4", NULL); err = init_ext4_mballoc(); if (err) return err; @@ -3379,9 +3647,16 @@ static int __init init_ext4_fs(void) err = init_inodecache(); if (err) goto out1; - err = register_filesystem(&ext4dev_fs_type); + err = register_filesystem(&ext4_fs_type); if (err) goto out; +#ifdef CONFIG_EXT4DEV_COMPAT + err = register_filesystem(&ext4dev_fs_type); + if (err) { + unregister_filesystem(&ext4_fs_type); + goto out; + } +#endif return 0; out: destroy_inodecache(); @@ -3394,10 +3669,14 @@ out2: static void __exit exit_ext4_fs(void) { + unregister_filesystem(&ext4_fs_type); +#ifdef CONFIG_EXT4DEV_COMPAT unregister_filesystem(&ext4dev_fs_type); +#endif destroy_inodecache(); exit_ext4_xattr(); exit_ext4_mballoc(); + remove_proc_entry("fs/ext4", NULL); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index e9178643dc01..00740cb32be3 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -23,10 +23,10 @@ #include "ext4.h" #include "xattr.h" -static void * ext4_follow_link(struct dentry *dentry, struct nameidata *nd) +static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) { struct ext4_inode_info *ei = EXT4_I(dentry->d_inode); - nd_set_link(nd, (char*)ei->i_data); + nd_set_link(nd, (char *) ei->i_data); return NULL; } @@ -34,7 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, -#ifdef CONFIG_EXT4DEV_FS_XATTR +#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, @@ -45,7 +45,7 @@ const struct inode_operations ext4_symlink_inode_operations = { const struct inode_operations ext4_fast_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = ext4_follow_link, -#ifdef CONFIG_EXT4DEV_FS_XATTR +#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index ff08633f398e..80626d516fee 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -99,12 +99,12 @@ static struct mb_cache *ext4_xattr_cache; static struct xattr_handler *ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL +#ifdef CONFIG_EXT4_FS_POSIX_ACL [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler, #endif [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, -#ifdef CONFIG_EXT4DEV_FS_SECURITY +#ifdef CONFIG_EXT4_FS_SECURITY [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler, #endif }; @@ -112,11 +112,11 @@ static struct xattr_handler *ext4_xattr_handler_map[] = { struct xattr_handler *ext4_xattr_handlers[] = { &ext4_xattr_user_handler, &ext4_xattr_trusted_handler, -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL +#ifdef CONFIG_EXT4_FS_POSIX_ACL &ext4_xattr_acl_access_handler, &ext4_xattr_acl_default_handler, #endif -#ifdef CONFIG_EXT4DEV_FS_SECURITY +#ifdef CONFIG_EXT4_FS_SECURITY &ext4_xattr_security_handler, #endif NULL @@ -810,7 +810,7 @@ inserted: /* We need to allocate a new block */ ext4_fsblk_t goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); - ext4_fsblk_t block = ext4_new_block(handle, inode, + ext4_fsblk_t block = ext4_new_meta_block(handle, inode, goal, &error); if (error) goto cleanup; @@ -959,6 +959,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, struct ext4_xattr_block_find bs = { .s = { .not_found = -ENODATA, }, }; + unsigned long no_expand; int error; if (!name) @@ -966,6 +967,9 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (strlen(name) > 255) return -ERANGE; down_write(&EXT4_I(inode)->xattr_sem); + no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND; + EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; + error = ext4_get_inode_loc(inode, &is.iloc); if (error) goto cleanup; @@ -1042,6 +1046,8 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, cleanup: brelse(is.iloc.bh); brelse(bs.bh); + if (no_expand == 0) + EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; up_write(&EXT4_I(inode)->xattr_sem); return error; } @@ -1512,7 +1518,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, char *name = entry->e_name; int n; - for (n=0; n < entry->e_name_len; n++) { + for (n = 0; n < entry->e_name_len; n++) { hash = (hash << NAME_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ *name++; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 5992fe979bb9..8ede88b18c29 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -51,8 +51,8 @@ struct ext4_xattr_entry { (((name_len) + EXT4_XATTR_ROUND + \ sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND) #define EXT4_XATTR_NEXT(entry) \ - ( (struct ext4_xattr_entry *)( \ - (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)) ) + ((struct ext4_xattr_entry *)( \ + (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len))) #define EXT4_XATTR_SIZE(size) \ (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND) @@ -63,7 +63,7 @@ struct ext4_xattr_entry { EXT4_I(inode)->i_extra_isize)) #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) -# ifdef CONFIG_EXT4DEV_FS_XATTR +# ifdef CONFIG_EXT4_FS_XATTR extern struct xattr_handler ext4_xattr_user_handler; extern struct xattr_handler ext4_xattr_trusted_handler; @@ -88,7 +88,7 @@ extern void exit_ext4_xattr(void); extern struct xattr_handler *ext4_xattr_handlers[]; -# else /* CONFIG_EXT4DEV_FS_XATTR */ +# else /* CONFIG_EXT4_FS_XATTR */ static inline int ext4_xattr_get(struct inode *inode, int name_index, const char *name, @@ -141,9 +141,9 @@ ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, #define ext4_xattr_handlers NULL -# endif /* CONFIG_EXT4DEV_FS_XATTR */ +# endif /* CONFIG_EXT4_FS_XATTR */ -#ifdef CONFIG_EXT4DEV_FS_SECURITY +#ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir); #else diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index fff33382cadc..ac1a52cf2a37 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -13,13 +13,11 @@ #include "ext4.h" #include "xattr.h" -#define XATTR_TRUSTED_PREFIX "trusted." - static size_t ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1; + const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (!capable(CAP_SYS_ADMIN)) diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index 67be723fcc4e..d91aa61b42aa 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -12,13 +12,11 @@ #include "ext4.h" #include "xattr.h" -#define XATTR_USER_PREFIX "user." - static size_t ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1; + const size_t prefix_len = XATTR_USER_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (!test_opt(inode->i_sb, XATTR_USER)) diff --git a/fs/fat/cache.c b/fs/fat/cache.c index fda25479af26..3222f51c41cf 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -36,7 +36,7 @@ static inline int fat_max_cache(struct inode *inode) static struct kmem_cache *fat_cache_cachep; -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct fat_cache *cache = (struct fat_cache *)foo; @@ -61,7 +61,7 @@ void fat_cache_destroy(void) static inline struct fat_cache *fat_cache_alloc(struct inode *inode) { - return kmem_cache_alloc(fat_cache_cachep, GFP_KERNEL); + return kmem_cache_alloc(fat_cache_cachep, GFP_NOFS); } static inline void fat_cache_free(struct fat_cache *cache) diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 486725ee99ae..cd4a0162e10d 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -17,7 +17,6 @@ #include <linux/slab.h> #include <linux/time.h> #include <linux/msdos_fs.h> -#include <linux/dirent.h> #include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/compat.h> @@ -124,10 +123,11 @@ static inline int fat_get_entry(struct inode *dir, loff_t *pos, * but ignore that right now. * Ahem... Stack smashing in ring 0 isn't fun. Fixed. */ -static int uni16_to_x8(unsigned char *ascii, wchar_t *uni, int len, +static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len, int uni_xlate, struct nls_table *nls) { - wchar_t *ip, ec; + const wchar_t *ip; + wchar_t ec; unsigned char *op, nc; int charlen; int k; @@ -167,6 +167,16 @@ static int uni16_to_x8(unsigned char *ascii, wchar_t *uni, int len, return (op - ascii); } +static inline int fat_uni_to_x8(struct msdos_sb_info *sbi, const wchar_t *uni, + unsigned char *buf, int size) +{ + if (sbi->options.utf8) + return utf8_wcstombs(buf, uni, size); + else + return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate, + sbi->nls_io); +} + static inline int fat_short2uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *uni) { @@ -227,6 +237,19 @@ fat_shortname2uni(struct nls_table *nls, unsigned char *buf, int buf_size, return len; } +static inline int fat_name_match(struct msdos_sb_info *sbi, + const unsigned char *a, int a_len, + const unsigned char *b, int b_len) +{ + if (a_len != b_len) + return 0; + + if (sbi->options.name_check != 's') + return !nls_strnicmp(sbi->nls_io, a, b, a_len); + else + return !memcmp(a, b, a_len); +} + enum { PARSE_INVALID = 1, PARSE_NOT_LONGNAME, PARSE_EOF, }; /** @@ -302,6 +325,19 @@ parse_long: } /* + * Maximum buffer size of short name. + * [(MSDOS_NAME + '.') * max one char + nul] + * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul] + */ +#define FAT_MAX_SHORT_SIZE ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1) +/* + * Maximum buffer size of unicode chars from slots. + * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)] + */ +#define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1) +#define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t)) + +/* * Return values: negative -> error, 0 -> not found, positive -> found, * value is the total amount of slots, including the shortname entry. */ @@ -312,29 +348,20 @@ int fat_search_long(struct inode *inode, const unsigned char *name, struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh = NULL; struct msdos_dir_entry *de; - struct nls_table *nls_io = sbi->nls_io; struct nls_table *nls_disk = sbi->nls_disk; - wchar_t bufuname[14]; unsigned char nr_slots; - int xlate_len; + wchar_t bufuname[14]; wchar_t *unicode = NULL; unsigned char work[MSDOS_NAME]; - unsigned char *bufname = NULL; - int uni_xlate = sbi->options.unicode_xlate; - int utf8 = sbi->options.utf8; - int anycase = (sbi->options.name_check != 's'); + unsigned char bufname[FAT_MAX_SHORT_SIZE]; unsigned short opt_shortname = sbi->options.shortname; loff_t cpos = 0; - int chl, i, j, last_u, err; - - bufname = __getname(); - if (!bufname) - return -ENOMEM; + int chl, i, j, last_u, err, len; err = -ENOENT; - while(1) { + while (1) { if (fat_get_entry(inode, &cpos, &bh, &de) == -1) - goto EODir; + goto end_of_dir; parse_record: nr_slots = 0; if (de->name[0] == DELETED_FLAG) @@ -353,7 +380,7 @@ parse_record: else if (status == PARSE_NOT_LONGNAME) goto parse_record; else if (status == PARSE_EOF) - goto EODir; + goto end_of_dir; } memcpy(work, de->name, sizeof(de->name)); @@ -394,30 +421,24 @@ parse_record: if (!last_u) continue; + /* Compare shortname */ bufuname[last_u] = 0x0000; - xlate_len = utf8 - ?utf8_wcstombs(bufname, bufuname, PATH_MAX) - :uni16_to_x8(bufname, bufuname, PATH_MAX, uni_xlate, nls_io); - if (xlate_len == name_len) - if ((!anycase && !memcmp(name, bufname, xlate_len)) || - (anycase && !nls_strnicmp(nls_io, name, bufname, - xlate_len))) - goto Found; + len = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname)); + if (fat_name_match(sbi, name, name_len, bufname, len)) + goto found; if (nr_slots) { - xlate_len = utf8 - ?utf8_wcstombs(bufname, unicode, PATH_MAX) - :uni16_to_x8(bufname, unicode, PATH_MAX, uni_xlate, nls_io); - if (xlate_len != name_len) - continue; - if ((!anycase && !memcmp(name, bufname, xlate_len)) || - (anycase && !nls_strnicmp(nls_io, name, bufname, - xlate_len))) - goto Found; + void *longname = unicode + FAT_MAX_UNI_CHARS; + int size = PATH_MAX - FAT_MAX_UNI_SIZE; + + /* Compare longname */ + len = fat_uni_to_x8(sbi, unicode, longname, size); + if (fat_name_match(sbi, name, name_len, longname, len)) + goto found; } } -Found: +found: nr_slots++; /* include the de */ sinfo->slot_off = cpos - nr_slots * sizeof(*de); sinfo->nr_slots = nr_slots; @@ -425,9 +446,7 @@ Found: sinfo->bh = bh; sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de); err = 0; -EODir: - if (bufname) - __putname(bufname); +end_of_dir: if (unicode) __putname(unicode); @@ -453,26 +472,23 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh; struct msdos_dir_entry *de; - struct nls_table *nls_io = sbi->nls_io; struct nls_table *nls_disk = sbi->nls_disk; - unsigned char long_slots; - const char *fill_name; - int fill_len; + unsigned char nr_slots; wchar_t bufuname[14]; wchar_t *unicode = NULL; - unsigned char c, work[MSDOS_NAME], bufname[56], *ptname = bufname; - unsigned long lpos, dummy, *furrfu = &lpos; - int uni_xlate = sbi->options.unicode_xlate; + unsigned char c, work[MSDOS_NAME]; + unsigned char bufname[FAT_MAX_SHORT_SIZE], *ptname = bufname; + unsigned short opt_shortname = sbi->options.shortname; int isvfat = sbi->options.isvfat; - int utf8 = sbi->options.utf8; int nocase = sbi->options.nocase; - unsigned short opt_shortname = sbi->options.shortname; + const char *fill_name = NULL; unsigned long inum; - int chi, chl, i, i2, j, last, last_u, dotoffset = 0; + unsigned long lpos, dummy, *furrfu = &lpos; loff_t cpos; + int chi, chl, i, i2, j, last, last_u, dotoffset = 0, fill_len = 0; int ret = 0; - lock_kernel(); + lock_super(sb); cpos = filp->f_pos; /* Fake . and .. for the root directory. */ @@ -489,43 +505,58 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, cpos = 0; } } - if (cpos & (sizeof(struct msdos_dir_entry)-1)) { + if (cpos & (sizeof(struct msdos_dir_entry) - 1)) { ret = -ENOENT; goto out; } bh = NULL; -GetNew: +get_new: if (fat_get_entry(inode, &cpos, &bh, &de) == -1) - goto EODir; + goto end_of_dir; parse_record: - long_slots = 0; - /* Check for long filename entry */ - if (isvfat) { + nr_slots = 0; + /* + * Check for long filename entry, but if short_only, we don't + * need to parse long filename. + */ + if (isvfat && !short_only) { if (de->name[0] == DELETED_FLAG) - goto RecEnd; + goto record_end; if (de->attr != ATTR_EXT && (de->attr & ATTR_VOLUME)) - goto RecEnd; + goto record_end; if (de->attr != ATTR_EXT && IS_FREE(de->name)) - goto RecEnd; + goto record_end; } else { if ((de->attr & ATTR_VOLUME) || IS_FREE(de->name)) - goto RecEnd; + goto record_end; } if (isvfat && de->attr == ATTR_EXT) { int status = fat_parse_long(inode, &cpos, &bh, &de, - &unicode, &long_slots); + &unicode, &nr_slots); if (status < 0) { filp->f_pos = cpos; ret = status; goto out; } else if (status == PARSE_INVALID) - goto RecEnd; + goto record_end; else if (status == PARSE_NOT_LONGNAME) goto parse_record; else if (status == PARSE_EOF) - goto EODir; + goto end_of_dir; + + if (nr_slots) { + void *longname = unicode + FAT_MAX_UNI_CHARS; + int size = PATH_MAX - FAT_MAX_UNI_SIZE; + int len = fat_uni_to_x8(sbi, unicode, longname, size); + + fill_name = longname; + fill_len = len; + /* !both && !short_only, so we don't need shortname. */ + if (!both) + goto start_filldir; + } } if (sbi->options.dotsOK) { @@ -587,12 +618,32 @@ parse_record: } } if (!last) - goto RecEnd; + goto record_end; i = last + dotoffset; j = last_u; - lpos = cpos - (long_slots+1)*sizeof(struct msdos_dir_entry); + if (isvfat) { + bufuname[j] = 0x0000; + i = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname)); + } + if (nr_slots) { + /* hack for fat_ioctl_filldir() */ + struct fat_ioctl_filldir_callback *p = dirent; + + p->longname = fill_name; + p->long_len = fill_len; + p->shortname = bufname; + p->short_len = i; + fill_name = NULL; + fill_len = 0; + } else { + fill_name = bufname; + fill_len = i; + } + +start_filldir: + lpos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry); if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) inum = inode->i_ino; else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) { @@ -607,54 +658,22 @@ parse_record: inum = iunique(sb, MSDOS_ROOT_INO); } - if (isvfat) { - bufuname[j] = 0x0000; - i = utf8 ? utf8_wcstombs(bufname, bufuname, sizeof(bufname)) - : uni16_to_x8(bufname, bufuname, sizeof(bufname), uni_xlate, nls_io); - } - - fill_name = bufname; - fill_len = i; - if (!short_only && long_slots) { - /* convert the unicode long name. 261 is maximum size - * of unicode buffer. (13 * slots + nul) */ - void *longname = unicode + 261; - int buf_size = PATH_MAX - (261 * sizeof(unicode[0])); - int long_len = utf8 - ? utf8_wcstombs(longname, unicode, buf_size) - : uni16_to_x8(longname, unicode, buf_size, uni_xlate, nls_io); - - if (!both) { - fill_name = longname; - fill_len = long_len; - } else { - /* hack for fat_ioctl_filldir() */ - struct fat_ioctl_filldir_callback *p = dirent; - - p->longname = longname; - p->long_len = long_len; - p->shortname = bufname; - p->short_len = i; - fill_name = NULL; - fill_len = 0; - } - } if (filldir(dirent, fill_name, fill_len, *furrfu, inum, (de->attr & ATTR_DIR) ? DT_DIR : DT_REG) < 0) - goto FillFailed; + goto fill_failed; -RecEnd: +record_end: furrfu = &lpos; filp->f_pos = cpos; - goto GetNew; -EODir: + goto get_new; +end_of_dir: filp->f_pos = cpos; -FillFailed: +fill_failed: brelse(bh); if (unicode) __putname(unicode); out: - unlock_kernel(); + unlock_super(sb); return ret; } @@ -715,7 +734,7 @@ efault: \ return -EFAULT; \ } -FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, dirent) +FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent) static int fat_ioctl_readdir(struct inode *inode, struct file *filp, void __user *dirent, filldir_t filldir, @@ -741,7 +760,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *filp, static int fat_dir_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) { - struct dirent __user *d1 = (struct dirent __user *)arg; + struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg; int short_only, both; switch (cmd) { @@ -757,7 +776,7 @@ static int fat_dir_ioctl(struct inode *inode, struct file *filp, return fat_generic_ioctl(inode, filp, cmd, arg); } - if (!access_ok(VERIFY_WRITE, d1, sizeof(struct dirent[2]))) + if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2]))) return -EFAULT; /* * Yes, we don't need this put_user() absolutely. However old @@ -1082,7 +1101,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts) goto error_free; } - fat_date_unix2dos(ts->tv_sec, &time, &date); + fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc); de = (struct msdos_dir_entry *)bhs[0]->b_data; /* filling the new directory slots ("." and ".." entries) */ diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 302e95c4af7e..fb98b3d847ed 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -6,6 +6,7 @@ #include <linux/module.h> #include <linux/fs.h> #include <linux/msdos_fs.h> +#include <linux/blkdev.h> struct fatent_operations { void (*ent_blocknr)(struct super_block *, int, int *, sector_t *); @@ -535,6 +536,7 @@ int fat_free_clusters(struct inode *inode, int cluster) struct fat_entry fatent; struct buffer_head *bhs[MAX_BUF_PER_PAGE]; int i, err, nr_bhs; + int first_cl = cluster; nr_bhs = 0; fatent_init(&fatent); @@ -551,6 +553,18 @@ int fat_free_clusters(struct inode *inode, int cluster) goto error; } + /* + * Issue discard for the sectors we no longer care about, + * batching contiguous clusters into one request + */ + if (cluster != fatent.entry + 1) { + int nr_clus = fatent.entry - first_cl + 1; + + sb_issue_discard(sb, fat_clus_to_blknr(sbi, first_cl), + nr_clus * sbi->sec_per_clus); + first_cl = cluster; + } + ops->ent_put(&fatent, FAT_ENT_FREE); if (sbi->free_clusters != -1) { sbi->free_clusters++; diff --git a/fs/fat/file.c b/fs/fat/file.c index 771326b8047e..ddde37025ca6 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -11,11 +11,12 @@ #include <linux/mount.h> #include <linux/time.h> #include <linux/msdos_fs.h> -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/blkdev.h> +#include <linux/fsnotify.h> +#include <linux/security.h> int fat_generic_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) @@ -65,6 +66,7 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp, /* Equivalent to a chmod() */ ia.ia_valid = ATTR_MODE | ATTR_CTIME; + ia.ia_ctime = current_fs_time(inode->i_sb); if (is_dir) { ia.ia_mode = MSDOS_MKMODE(attr, S_IRWXUGO & ~sbi->options.fs_dmask) @@ -91,11 +93,21 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp, } } + /* + * The security check is questionable... We single + * out the RO attribute for checking by the security + * module, just because it maps to a file mode. + */ + err = security_inode_setattr(filp->f_path.dentry, &ia); + if (err) + goto up; + /* This MUST be done before doing anything irreversible... */ - err = notify_change(filp->f_path.dentry, &ia); + err = fat_setattr(filp->f_path.dentry, &ia); if (err) goto up; + fsnotify_change(filp->f_path.dentry, ia.ia_valid); if (sbi->options.sys_immutable) { if (attr & ATTR_SYS) inode->i_flags |= S_IMMUTABLE; @@ -242,9 +254,7 @@ void fat_truncate(struct inode *inode) nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits; - lock_kernel(); fat_free(inode, nr_clusters); - unlock_kernel(); fat_flush_inodes(inode->i_sb, inode, NULL); } @@ -303,6 +313,8 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode) return 0; } +#define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) + int fat_setattr(struct dentry *dentry, struct iattr *attr) { struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); @@ -310,8 +322,6 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) int error = 0; unsigned int ia_valid; - lock_kernel(); - /* * Expand the file. Since inode_setattr() updates ->i_size * before calling the ->truncate(), but FAT needs to fill the @@ -328,9 +338,9 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) /* Check for setting the inode time. */ ia_valid = attr->ia_valid; - if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) { + if (ia_valid & TIMES_SET_FLAGS) { if (fat_allow_set_time(sbi, inode)) - attr->ia_valid &= ~(ATTR_MTIME_SET | ATTR_ATIME_SET); + attr->ia_valid &= ~TIMES_SET_FLAGS; } error = inode_change_ok(inode, attr); @@ -366,7 +376,6 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) error = inode_setattr(inode, attr); out: - unlock_kernel(); return error; } EXPORT_SYMBOL_GPL(fat_setattr); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 4e0a3dd9d677..d12cdf2a0406 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -382,17 +382,20 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1)) & ~((loff_t)sbi->cluster_size - 1)) >> 9; inode->i_mtime.tv_sec = - date_dos2unix(le16_to_cpu(de->time), le16_to_cpu(de->date)); + date_dos2unix(le16_to_cpu(de->time), le16_to_cpu(de->date), + sbi->options.tz_utc); inode->i_mtime.tv_nsec = 0; if (sbi->options.isvfat) { int secs = de->ctime_cs / 100; int csecs = de->ctime_cs % 100; inode->i_ctime.tv_sec = date_dos2unix(le16_to_cpu(de->ctime), - le16_to_cpu(de->cdate)) + secs; + le16_to_cpu(de->cdate), + sbi->options.tz_utc) + secs; inode->i_ctime.tv_nsec = csecs * 10000000; inode->i_atime.tv_sec = - date_dos2unix(0, le16_to_cpu(de->adate)); + date_dos2unix(0, le16_to_cpu(de->adate), + sbi->options.tz_utc); inode->i_atime.tv_nsec = 0; } else inode->i_ctime = inode->i_atime = inode->i_mtime; @@ -440,14 +443,13 @@ static void fat_delete_inode(struct inode *inode) static void fat_clear_inode(struct inode *inode) { - struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); - lock_kernel(); spin_lock(&sbi->inode_hash_lock); fat_cache_inval_inode(inode); hlist_del_init(&MSDOS_I(inode)->i_fat_hash); spin_unlock(&sbi->inode_hash_lock); - unlock_kernel(); } static void fat_write_super(struct super_block *sb) @@ -485,7 +487,7 @@ static struct kmem_cache *fat_inode_cachep; static struct inode *fat_alloc_inode(struct super_block *sb) { struct msdos_inode_info *ei; - ei = kmem_cache_alloc(fat_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS); if (!ei) return NULL; return &ei->vfs_inode; @@ -496,7 +498,7 @@ static void fat_destroy_inode(struct inode *inode) kmem_cache_free(fat_inode_cachep, MSDOS_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct msdos_inode_info *ei = (struct msdos_inode_info *)foo; @@ -560,26 +562,23 @@ static int fat_write_inode(struct inode *inode, int wait) struct buffer_head *bh; struct msdos_dir_entry *raw_entry; loff_t i_pos; - int err = 0; + int err; retry: i_pos = MSDOS_I(inode)->i_pos; if (inode->i_ino == MSDOS_ROOT_INO || !i_pos) return 0; - lock_kernel(); bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits); if (!bh) { printk(KERN_ERR "FAT: unable to read inode block " "for updating (i_pos %lld)\n", i_pos); - err = -EIO; - goto out; + return -EIO; } spin_lock(&sbi->inode_hash_lock); if (i_pos != MSDOS_I(inode)->i_pos) { spin_unlock(&sbi->inode_hash_lock); brelse(bh); - unlock_kernel(); goto retry; } @@ -592,21 +591,23 @@ retry: raw_entry->attr = fat_attr(inode); raw_entry->start = cpu_to_le16(MSDOS_I(inode)->i_logstart); raw_entry->starthi = cpu_to_le16(MSDOS_I(inode)->i_logstart >> 16); - fat_date_unix2dos(inode->i_mtime.tv_sec, &raw_entry->time, &raw_entry->date); + fat_date_unix2dos(inode->i_mtime.tv_sec, &raw_entry->time, + &raw_entry->date, sbi->options.tz_utc); if (sbi->options.isvfat) { __le16 atime; - fat_date_unix2dos(inode->i_ctime.tv_sec,&raw_entry->ctime,&raw_entry->cdate); - fat_date_unix2dos(inode->i_atime.tv_sec,&atime,&raw_entry->adate); + fat_date_unix2dos(inode->i_ctime.tv_sec, &raw_entry->ctime, + &raw_entry->cdate, sbi->options.tz_utc); + fat_date_unix2dos(inode->i_atime.tv_sec, &atime, + &raw_entry->adate, sbi->options.tz_utc); raw_entry->ctime_cs = (inode->i_ctime.tv_sec & 1) * 100 + inode->i_ctime.tv_nsec / 10000000; } spin_unlock(&sbi->inode_hash_lock); mark_buffer_dirty(bh); + err = 0; if (wait) err = sync_dirty_buffer(bh); brelse(bh); -out: - unlock_kernel(); return err; } @@ -736,6 +737,7 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable) static struct dentry *fat_get_parent(struct dentry *child) { + struct super_block *sb = child->d_sb; struct buffer_head *bh; struct msdos_dir_entry *de; loff_t i_pos; @@ -743,14 +745,14 @@ static struct dentry *fat_get_parent(struct dentry *child) struct inode *inode; int err; - lock_kernel(); + lock_super(sb); err = fat_get_dotdot_entry(child->d_inode, &bh, &de, &i_pos); if (err) { parent = ERR_PTR(err); goto out; } - inode = fat_build_inode(child->d_sb, de, i_pos); + inode = fat_build_inode(sb, de, i_pos); brelse(bh); if (IS_ERR(inode)) { parent = ERR_CAST(inode); @@ -762,7 +764,7 @@ static struct dentry *fat_get_parent(struct dentry *child) parent = ERR_PTR(-ENOMEM); } out: - unlock_kernel(); + unlock_super(sb); return parent; } @@ -836,6 +838,8 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt) } if (sbi->options.flush) seq_puts(m, ",flush"); + if (opts->tz_utc) + seq_puts(m, ",tz=UTC"); return 0; } @@ -848,10 +852,10 @@ enum { Opt_charset, Opt_shortname_lower, Opt_shortname_win95, Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, - Opt_obsolate, Opt_flush, Opt_err, + Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_err, }; -static match_table_t fat_tokens = { +static const match_table_t fat_tokens = { {Opt_check_r, "check=relaxed"}, {Opt_check_s, "check=strict"}, {Opt_check_n, "check=normal"}, @@ -883,16 +887,17 @@ static match_table_t fat_tokens = { {Opt_obsolate, "cvf_options=%100s"}, {Opt_obsolate, "posix"}, {Opt_flush, "flush"}, + {Opt_tz_utc, "tz=UTC"}, {Opt_err, NULL}, }; -static match_table_t msdos_tokens = { +static const match_table_t msdos_tokens = { {Opt_nodots, "nodots"}, {Opt_nodots, "dotsOK=no"}, {Opt_dots, "dots"}, {Opt_dots, "dotsOK=yes"}, {Opt_err, NULL} }; -static match_table_t vfat_tokens = { +static const match_table_t vfat_tokens = { {Opt_charset, "iocharset=%s"}, {Opt_shortname_lower, "shortname=lower"}, {Opt_shortname_win95, "shortname=win95"}, @@ -947,10 +952,11 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, opts->utf8 = opts->unicode_xlate = 0; opts->numtail = 1; opts->usefree = opts->nocase = 0; + opts->tz_utc = 0; *debug = 0; if (!options) - return 0; + goto out; while ((p = strsep(&options, ",")) != NULL) { int token; @@ -1036,6 +1042,9 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, case Opt_flush: opts->flush = 1; break; + case Opt_tz_utc: + opts->tz_utc = 1; + break; /* msdos specific */ case Opt_dots: @@ -1104,10 +1113,13 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, return -EINVAL; } } + +out: /* UTF-8 doesn't provide FAT semantics */ if (!strcmp(opts->iocharset, "utf8")) { printk(KERN_ERR "FAT: utf8 is not a recommended IO charset" - " for FAT filesystems, filesystem will be case sensitive!\n"); + " for FAT filesystems, filesystem will be " + "case sensitive!\n"); } /* If user doesn't specify allow_utime, it's initialized from dmask. */ @@ -1172,6 +1184,12 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, long error; char buf[50]; + /* + * GFP_KERNEL is ok here, because while we do hold the + * supeblock lock, memory pressure can't call back into + * the filesystem, since we're only just about to mount + * it and have no inodes etc active! + */ sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 61f23511eacf..79fb98ad36d4 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -142,7 +142,7 @@ static int day_n[] = { }; /* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */ -int date_dos2unix(unsigned short time, unsigned short date) +int date_dos2unix(unsigned short time, unsigned short date, int tz_utc) { int month, year, secs; @@ -156,16 +156,18 @@ int date_dos2unix(unsigned short time, unsigned short date) ((date & 31)-1+day_n[month]+(year/4)+year*365-((year & 3) == 0 && month < 2 ? 1 : 0)+3653); /* days since 1.1.70 plus 80's leap day */ - secs += sys_tz.tz_minuteswest*60; + if (!tz_utc) + secs += sys_tz.tz_minuteswest*60; return secs; } /* Convert linear UNIX date to a MS-DOS time/date pair. */ -void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date) +void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date, int tz_utc) { int day, year, nl_day, month; - unix_date -= sys_tz.tz_minuteswest*60; + if (!tz_utc) + unix_date -= sys_tz.tz_minuteswest*60; /* Jan 1 GMT 00:00:00 1980. But what about another time zone? */ if (unix_date < 315532800) diff --git a/fs/fcntl.c b/fs/fcntl.c index bfd776509a72..ac4f7db9f134 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -12,7 +12,6 @@ #include <linux/fdtable.h> #include <linux/capability.h> #include <linux/dnotify.h> -#include <linux/smp_lock.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/security.h> @@ -50,145 +49,94 @@ static int get_close_on_exec(unsigned int fd) return res; } -/* - * locate_fd finds a free file descriptor in the open_fds fdset, - * expanding the fd arrays if necessary. Must be called with the - * file_lock held for write. - */ - -static int locate_fd(unsigned int orig_start, int cloexec) -{ - struct files_struct *files = current->files; - unsigned int newfd; - unsigned int start; - int error; - struct fdtable *fdt; - - spin_lock(&files->file_lock); - - error = -EINVAL; - if (orig_start >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) - goto out; - -repeat: - fdt = files_fdtable(files); - /* - * Someone might have closed fd's in the range - * orig_start..fdt->next_fd - */ - start = orig_start; - if (start < files->next_fd) - start = files->next_fd; - - newfd = start; - if (start < fdt->max_fds) - newfd = find_next_zero_bit(fdt->open_fds->fds_bits, - fdt->max_fds, start); - - error = -EMFILE; - if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) - goto out; - - error = expand_files(files, newfd); - if (error < 0) - goto out; - - /* - * If we needed to expand the fs array we - * might have blocked - try again. - */ - if (error) - goto repeat; - - if (start <= files->next_fd) - files->next_fd = newfd + 1; - - FD_SET(newfd, fdt->open_fds); - if (cloexec) - FD_SET(newfd, fdt->close_on_exec); - else - FD_CLR(newfd, fdt->close_on_exec); - error = newfd; - -out: - spin_unlock(&files->file_lock); - return error; -} - -static int dupfd(struct file *file, unsigned int start, int cloexec) -{ - int fd = locate_fd(start, cloexec); - if (fd >= 0) - fd_install(fd, file); - else - fput(file); - - return fd; -} - -asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd) +asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; struct file * file, *tofree; struct files_struct * files = current->files; struct fdtable *fdt; - spin_lock(&files->file_lock); - if (!(file = fcheck(oldfd))) - goto out_unlock; - err = newfd; - if (newfd == oldfd) - goto out_unlock; - err = -EBADF; - if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) - goto out_unlock; - get_file(file); /* We are now finished with oldfd */ - - err = expand_files(files, newfd); - if (err < 0) - goto out_fput; + if ((flags & ~O_CLOEXEC) != 0) + return -EINVAL; - /* To avoid races with open() and dup(), we will mark the fd as - * in-use in the open-file bitmap throughout the entire dup2() - * process. This is quite safe: do_close() uses the fd array - * entry, not the bitmap, to decide what work needs to be - * done. --sct */ - /* Doesn't work. open() might be there first. --AV */ + if (unlikely(oldfd == newfd)) + return -EINVAL; - /* Yes. It's a race. In user space. Nothing sane to do */ + spin_lock(&files->file_lock); + err = expand_files(files, newfd); + file = fcheck(oldfd); + if (unlikely(!file)) + goto Ebadf; + if (unlikely(err < 0)) { + if (err == -EMFILE) + goto Ebadf; + goto out_unlock; + } + /* + * We need to detect attempts to do dup2() over allocated but still + * not finished descriptor. NB: OpenBSD avoids that at the price of + * extra work in their equivalent of fget() - they insert struct + * file immediately after grabbing descriptor, mark it larval if + * more work (e.g. actual opening) is needed and make sure that + * fget() treats larval files as absent. Potentially interesting, + * but while extra work in fget() is trivial, locking implications + * and amount of surgery on open()-related paths in VFS are not. + * FreeBSD fails with -EBADF in the same situation, NetBSD "solution" + * deadlocks in rather amusing ways, AFAICS. All of that is out of + * scope of POSIX or SUS, since neither considers shared descriptor + * tables and this condition does not arise without those. + */ err = -EBUSY; fdt = files_fdtable(files); tofree = fdt->fd[newfd]; if (!tofree && FD_ISSET(newfd, fdt->open_fds)) - goto out_fput; - + goto out_unlock; + get_file(file); rcu_assign_pointer(fdt->fd[newfd], file); FD_SET(newfd, fdt->open_fds); - FD_CLR(newfd, fdt->close_on_exec); + if (flags & O_CLOEXEC) + FD_SET(newfd, fdt->close_on_exec); + else + FD_CLR(newfd, fdt->close_on_exec); spin_unlock(&files->file_lock); if (tofree) filp_close(tofree, files); - err = newfd; -out: - return err; + + return newfd; + +Ebadf: + err = -EBADF; out_unlock: spin_unlock(&files->file_lock); - goto out; + return err; +} -out_fput: - spin_unlock(&files->file_lock); - fput(file); - goto out; +asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd) +{ + if (unlikely(newfd == oldfd)) { /* corner case */ + struct files_struct *files = current->files; + rcu_read_lock(); + if (!fcheck_files(files, oldfd)) + oldfd = -EBADF; + rcu_read_unlock(); + return oldfd; + } + return sys_dup3(oldfd, newfd, 0); } asmlinkage long sys_dup(unsigned int fildes) { int ret = -EBADF; - struct file * file = fget(fildes); - - if (file) - ret = dupfd(file, 0, 0); + struct file *file = fget(fildes); + + if (file) { + ret = get_unused_fd(); + if (ret >= 0) + fd_install(ret, file); + else + fput(file); + } return ret; } @@ -227,7 +175,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg) if (error) return error; - lock_kernel(); if ((arg ^ filp->f_flags) & FASYNC) { if (filp->f_op && filp->f_op->fasync) { error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0); @@ -238,7 +185,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg) filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK); out: - unlock_kernel(); return error; } @@ -313,8 +259,13 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, switch (cmd) { case F_DUPFD: case F_DUPFD_CLOEXEC: - get_file(filp); - err = dupfd(filp, arg, cmd == F_DUPFD_CLOEXEC); + if (arg >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) + break; + err = alloc_fd(arg, cmd == F_DUPFD_CLOEXEC ? O_CLOEXEC : 0); + if (err >= 0) { + get_file(filp); + fd_install(err, filp); + } break; case F_GETFD: err = get_close_on_exec(fd) ? FD_CLOEXEC : 0; diff --git a/fs/fifo.c b/fs/fifo.c index 9785e36f81e7..987bf9411495 100644 --- a/fs/fifo.c +++ b/fs/fifo.c @@ -57,7 +57,7 @@ static int fifo_open(struct inode *inode, struct file *filp) * POSIX.1 says that O_NONBLOCK means return with the FIFO * opened, even when there is no process writing the FIFO. */ - filp->f_op = &read_fifo_fops; + filp->f_op = &read_pipefifo_fops; pipe->r_counter++; if (pipe->readers++ == 0) wake_up_partner(inode); @@ -86,7 +86,7 @@ static int fifo_open(struct inode *inode, struct file *filp) if ((filp->f_flags & O_NONBLOCK) && !pipe->readers) goto err; - filp->f_op = &write_fifo_fops; + filp->f_op = &write_pipefifo_fops; pipe->w_counter++; if (!pipe->writers++) wake_up_partner(inode); @@ -105,7 +105,7 @@ static int fifo_open(struct inode *inode, struct file *filp) * This implementation will NEVER block on a O_RDWR open, since * the process can at least talk to itself. */ - filp->f_op = &rdwr_fifo_fops; + filp->f_op = &rdwr_pipefifo_fops; pipe->readers++; pipe->writers++; @@ -151,5 +151,5 @@ err_nocleanup: * depending on the access mode of the file... */ const struct file_operations def_fifo_fops = { - .open = fifo_open, /* will set read or write pipe_fops */ + .open = fifo_open, /* will set read_ or write_pipefifo_fops */ }; diff --git a/fs/file.c b/fs/file.c index 7b3887e054d0..f313314f996f 100644 --- a/fs/file.c +++ b/fs/file.c @@ -6,6 +6,7 @@ * Manage the dynamic fd arrays in the process files_struct. */ +#include <linux/module.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/time.h> @@ -250,9 +251,18 @@ int expand_files(struct files_struct *files, int nr) struct fdtable *fdt; fdt = files_fdtable(files); + + /* + * N.B. For clone tasks sharing a files structure, this test + * will limit the total number of files that can be opened. + */ + if (nr >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) + return -EMFILE; + /* Do we need to expand? */ if (nr < fdt->max_fds) return 0; + /* Can we expand? */ if (nr >= sysctl_nr_open) return -EMFILE; @@ -423,3 +433,63 @@ struct files_struct init_files = { }, .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), }; + +/* + * allocate a file descriptor, mark it busy. + */ +int alloc_fd(unsigned start, unsigned flags) +{ + struct files_struct *files = current->files; + unsigned int fd; + int error; + struct fdtable *fdt; + + spin_lock(&files->file_lock); +repeat: + fdt = files_fdtable(files); + fd = start; + if (fd < files->next_fd) + fd = files->next_fd; + + if (fd < fdt->max_fds) + fd = find_next_zero_bit(fdt->open_fds->fds_bits, + fdt->max_fds, fd); + + error = expand_files(files, fd); + if (error < 0) + goto out; + + /* + * If we needed to expand the fs array we + * might have blocked - try again. + */ + if (error) + goto repeat; + + if (start <= files->next_fd) + files->next_fd = fd + 1; + + FD_SET(fd, fdt->open_fds); + if (flags & O_CLOEXEC) + FD_SET(fd, fdt->close_on_exec); + else + FD_CLR(fd, fdt->close_on_exec); + error = fd; +#if 1 + /* Sanity check */ + if (rcu_dereference(fdt->fd[fd]) != NULL) { + printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); + rcu_assign_pointer(fdt->fd[fd], NULL); + } +#endif + +out: + spin_unlock(&files->file_lock); + return error; +} + +int get_unused_fd(void) +{ + return alloc_fd(0, 0); +} +EXPORT_SYMBOL(get_unused_fd); diff --git a/fs/file_table.c b/fs/file_table.c index 83084225b4c3..f45a4493f9e7 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -120,7 +120,7 @@ struct file *get_empty_filp(void) tsk = current; INIT_LIST_HEAD(&f->f_u.fu_list); - atomic_set(&f->f_count, 1); + atomic_long_set(&f->f_count, 1); rwlock_init(&f->f_owner.lock); f->f_uid = tsk->fsuid; f->f_gid = tsk->fsgid; @@ -219,7 +219,7 @@ EXPORT_SYMBOL(init_file); void fput(struct file *file) { - if (atomic_dec_and_test(&file->f_count)) + if (atomic_long_dec_and_test(&file->f_count)) __fput(file); } @@ -294,7 +294,7 @@ struct file *fget(unsigned int fd) rcu_read_lock(); file = fcheck_files(files, fd); if (file) { - if (!atomic_inc_not_zero(&file->f_count)) { + if (!atomic_long_inc_not_zero(&file->f_count)) { /* File object ref couldn't be taken */ rcu_read_unlock(); return NULL; @@ -326,7 +326,7 @@ struct file *fget_light(unsigned int fd, int *fput_needed) rcu_read_lock(); file = fcheck_files(files, fd); if (file) { - if (atomic_inc_not_zero(&file->f_count)) + if (atomic_long_inc_not_zero(&file->f_count)) *fput_needed = 1; else /* Didn't get the reference, someone's freed */ @@ -341,7 +341,7 @@ struct file *fget_light(unsigned int fd, int *fput_needed) void put_filp(struct file *file) { - if (atomic_dec_and_test(&file->f_count)) { + if (atomic_long_dec_and_test(&file->f_count)) { security_file_free(file); file_kill(file); file_free(file); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ae45f77765c0..25adfc3c693a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -424,8 +424,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so * that it can be located for waiting on in __writeback_single_inode(). * - * Called under inode_lock. - * * If `bdi' is non-zero then we're being asked to writeback a specific queue. * This function assumes that the blockdev superblock's inodes are backed by * a variety of queues, so all inodes are searched. For other superblocks, @@ -441,11 +439,12 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * on the writer throttling path, and we get decent balancing between many * throttled threads: we don't want them all piling up on inode_sync_wait. */ -static void -sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) +void generic_sync_sb_inodes(struct super_block *sb, + struct writeback_control *wbc) { const unsigned long start = jiffies; /* livelock avoidance */ + spin_lock(&inode_lock); if (!wbc->for_kupdate || list_empty(&sb->s_io)) queue_io(sb, wbc->older_than_this); @@ -524,8 +523,16 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) if (!list_empty(&sb->s_more_io)) wbc->more_io = 1; } + spin_unlock(&inode_lock); return; /* Leave any unwritten inodes on s_io */ } +EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); + +static void sync_sb_inodes(struct super_block *sb, + struct writeback_control *wbc) +{ + generic_sync_sb_inodes(sb, wbc); +} /* * Start writeback of dirty pagecache data against all unlocked inodes. @@ -565,11 +572,8 @@ restart: * be unmounted by the time it is released. */ if (down_read_trylock(&sb->s_umount)) { - if (sb->s_root) { - spin_lock(&inode_lock); + if (sb->s_root) sync_sb_inodes(sb, wbc); - spin_unlock(&inode_lock); - } up_read(&sb->s_umount); } spin_lock(&sb_lock); @@ -607,9 +611,7 @@ void sync_inodes_sb(struct super_block *sb, int wait) (inodes_stat.nr_inodes - inodes_stat.nr_unused) + nr_dirty + nr_unstable; wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */ - spin_lock(&inode_lock); sync_sb_inodes(sb, &wbc); - spin_unlock(&inode_lock); } /* diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 2060bf06b906..fd03330cadeb 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -97,7 +97,7 @@ void fuse_invalidate_attr(struct inode *inode) * timeout is unknown (unlink, rmdir, rename and in some cases * lookup) */ -static void fuse_invalidate_entry_cache(struct dentry *entry) +void fuse_invalidate_entry_cache(struct dentry *entry) { fuse_dentry_settime(entry, 0); } @@ -112,18 +112,16 @@ static void fuse_invalidate_entry(struct dentry *entry) fuse_invalidate_entry_cache(entry); } -static void fuse_lookup_init(struct fuse_req *req, struct inode *dir, - struct dentry *entry, +static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_req *req, + u64 nodeid, struct qstr *name, struct fuse_entry_out *outarg) { - struct fuse_conn *fc = get_fuse_conn(dir); - memset(outarg, 0, sizeof(struct fuse_entry_out)); req->in.h.opcode = FUSE_LOOKUP; - req->in.h.nodeid = get_node_id(dir); + req->in.h.nodeid = nodeid; req->in.numargs = 1; - req->in.args[0].size = entry->d_name.len + 1; - req->in.args[0].value = entry->d_name.name; + req->in.args[0].size = name->len + 1; + req->in.args[0].value = name->name; req->out.numargs = 1; if (fc->minor < 9) req->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; @@ -189,7 +187,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) attr_version = fuse_get_attr_version(fc); parent = dget_parent(entry); - fuse_lookup_init(req, parent->d_inode, entry, &outarg); + fuse_lookup_init(fc, req, get_node_id(parent->d_inode), + &entry->d_name, &outarg); request_send(fc, req); dput(parent); err = req->out.h.error; @@ -225,7 +224,7 @@ static int invalid_nodeid(u64 nodeid) return !nodeid || nodeid == FUSE_ROOT_ID; } -static struct dentry_operations fuse_dentry_operations = { +struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, }; @@ -239,85 +238,127 @@ int fuse_valid_type(int m) * Add a directory inode to a dentry, ensuring that no other dentry * refers to this inode. Called with fc->inst_mutex. */ -static int fuse_d_add_directory(struct dentry *entry, struct inode *inode) +static struct dentry *fuse_d_add_directory(struct dentry *entry, + struct inode *inode) { struct dentry *alias = d_find_alias(inode); - if (alias) { + if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) { /* This tries to shrink the subtree below alias */ fuse_invalidate_entry(alias); dput(alias); if (!list_empty(&inode->i_dentry)) - return -EBUSY; + return ERR_PTR(-EBUSY); + } else { + dput(alias); } - d_add(entry, inode); - return 0; + return d_splice_alias(inode, entry); } -static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, - struct nameidata *nd) +int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, + struct fuse_entry_out *outarg, struct inode **inode) { - int err; - struct fuse_entry_out outarg; - struct inode *inode = NULL; - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_conn *fc = get_fuse_conn_super(sb); struct fuse_req *req; struct fuse_req *forget_req; u64 attr_version; + int err; - if (entry->d_name.len > FUSE_NAME_MAX) - return ERR_PTR(-ENAMETOOLONG); + *inode = NULL; + err = -ENAMETOOLONG; + if (name->len > FUSE_NAME_MAX) + goto out; req = fuse_get_req(fc); + err = PTR_ERR(req); if (IS_ERR(req)) - return ERR_CAST(req); + goto out; forget_req = fuse_get_req(fc); + err = PTR_ERR(forget_req); if (IS_ERR(forget_req)) { fuse_put_request(fc, req); - return ERR_CAST(forget_req); + goto out; } attr_version = fuse_get_attr_version(fc); - fuse_lookup_init(req, dir, entry, &outarg); + fuse_lookup_init(fc, req, nodeid, name, outarg); request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); /* Zero nodeid is same as -ENOENT, but with valid timeout */ - if (!err && outarg.nodeid && - (invalid_nodeid(outarg.nodeid) || - !fuse_valid_type(outarg.attr.mode))) - err = -EIO; - if (!err && outarg.nodeid) { - inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, - &outarg.attr, entry_attr_timeout(&outarg), - attr_version); - if (!inode) { - fuse_send_forget(fc, forget_req, outarg.nodeid, 1); - return ERR_PTR(-ENOMEM); - } + if (err || !outarg->nodeid) + goto out_put_forget; + + err = -EIO; + if (!outarg->nodeid) + goto out_put_forget; + if (!fuse_valid_type(outarg->attr.mode)) + goto out_put_forget; + + *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, + &outarg->attr, entry_attr_timeout(outarg), + attr_version); + err = -ENOMEM; + if (!*inode) { + fuse_send_forget(fc, forget_req, outarg->nodeid, 1); + goto out; } + err = 0; + + out_put_forget: fuse_put_request(fc, forget_req); - if (err && err != -ENOENT) - return ERR_PTR(err); + out: + return err; +} + +static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, + struct nameidata *nd) +{ + int err; + struct fuse_entry_out outarg; + struct inode *inode; + struct dentry *newent; + struct fuse_conn *fc = get_fuse_conn(dir); + bool outarg_valid = true; + + err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, + &outarg, &inode); + if (err == -ENOENT) { + outarg_valid = false; + err = 0; + } + if (err) + goto out_err; + + err = -EIO; + if (inode && get_node_id(inode) == FUSE_ROOT_ID) + goto out_iput; if (inode && S_ISDIR(inode->i_mode)) { mutex_lock(&fc->inst_mutex); - err = fuse_d_add_directory(entry, inode); + newent = fuse_d_add_directory(entry, inode); mutex_unlock(&fc->inst_mutex); - if (err) { - iput(inode); - return ERR_PTR(err); - } - } else - d_add(entry, inode); + err = PTR_ERR(newent); + if (IS_ERR(newent)) + goto out_iput; + } else { + newent = d_splice_alias(inode, entry); + } + entry = newent ? newent : entry; entry->d_op = &fuse_dentry_operations; - if (!err) + if (outarg_valid) fuse_change_entry_timeout(entry, &outarg); else fuse_invalidate_entry_cache(entry); - return NULL; + + return newent; + + out_iput: + iput(inode); + out_err: + return ERR_PTR(err); } /* @@ -857,7 +898,7 @@ static int fuse_access(struct inode *inode, int mask) return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); - inarg.mask = mask; + inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC); req->in.h.opcode = FUSE_ACCESS; req->in.h.nodeid = get_node_id(inode); req->in.numargs = 1; @@ -886,7 +927,7 @@ static int fuse_access(struct inode *inode, int mask) * access request is sent. Execute permission is still checked * locally based on file mode. */ -static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd) +static int fuse_permission(struct inode *inode, int mask) { struct fuse_conn *fc = get_fuse_conn(inode); bool refreshed = false; @@ -921,7 +962,7 @@ static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd) exist. So if permissions are revoked this won't be noticed immediately, only after the attribute timeout has expired */ - } else if (nd && (nd->flags & (LOOKUP_ACCESS | LOOKUP_CHDIR))) { + } else if (mask & MAY_ACCESS) { err = fuse_access(inode, mask); } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { if (!(inode->i_mode & S_IXUGO)) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8092f0d9fd1f..2bada6bbc317 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -893,7 +893,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (count == 0) goto out; - err = remove_suid(file->f_path.dentry); + err = file_remove_suid(file); if (err) goto out; @@ -1341,6 +1341,11 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0; int err; + if (fl->fl_lmops && fl->fl_lmops->fl_grant) { + /* NLM needs asynchronous locks, which we don't support yet */ + return -ENOLCK; + } + /* Unlock on close is handled by the flush method */ if (fl->fl_flags & FL_CLOSE) return 0; @@ -1365,7 +1370,9 @@ static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl) struct fuse_conn *fc = get_fuse_conn(inode); int err; - if (cmd == F_GETLK) { + if (cmd == F_CANCELLK) { + err = 0; + } else if (cmd == F_GETLK) { if (fc->no_lock) { posix_test_lock(file, fl); err = 0; @@ -1373,7 +1380,7 @@ static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl) err = fuse_getlk(file, fl); } else { if (fc->no_lock) - err = posix_lock_file_wait(file, fl); + err = posix_lock_file(file, fl, NULL); else err = fuse_setlk(file, fl, 0); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index bae948657c4f..3a876076bdd1 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -363,6 +363,9 @@ struct fuse_conn { /** Do not send separate SETATTR request before open(O_TRUNC) */ unsigned atomic_o_trunc : 1; + /** Filesystem supports NFS exporting. Only set in INIT */ + unsigned export_support : 1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction @@ -464,6 +467,8 @@ static inline u64 get_node_id(struct inode *inode) /** Device operations */ extern const struct file_operations fuse_dev_operations; +extern struct dentry_operations fuse_dentry_operations; + /** * Get a filled in inode */ @@ -471,6 +476,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int generation, struct fuse_attr *attr, u64 attr_valid, u64 attr_version); +int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, + struct fuse_entry_out *outarg, struct inode **inode); + /** * Send FORGET command */ @@ -604,6 +612,8 @@ void fuse_abort_conn(struct fuse_conn *fc); */ void fuse_invalidate_attr(struct inode *inode); +void fuse_invalidate_entry_cache(struct dentry *entry); + /** * Acquire reference to fuse_conn */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 3141690558c8..6a84388cacff 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -18,6 +18,7 @@ #include <linux/statfs.h> #include <linux/random.h> #include <linux/sched.h> +#include <linux/exportfs.h> MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Filesystem in Userspace"); @@ -353,7 +354,7 @@ enum { OPT_ERR }; -static match_table_t tokens = { +static const match_table_t tokens = { {OPT_FD, "fd=%u"}, {OPT_ROOTMODE, "rootmode=%o"}, {OPT_USER_ID, "user_id=%u"}, @@ -552,6 +553,174 @@ static struct inode *get_root_inode(struct super_block *sb, unsigned mode) return fuse_iget(sb, 1, 0, &attr, 0, 0); } +struct fuse_inode_handle +{ + u64 nodeid; + u32 generation; +}; + +static struct dentry *fuse_get_dentry(struct super_block *sb, + struct fuse_inode_handle *handle) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + struct inode *inode; + struct dentry *entry; + int err = -ESTALE; + + if (handle->nodeid == 0) + goto out_err; + + inode = ilookup5(sb, handle->nodeid, fuse_inode_eq, &handle->nodeid); + if (!inode) { + struct fuse_entry_out outarg; + struct qstr name; + + if (!fc->export_support) + goto out_err; + + name.len = 1; + name.name = "."; + err = fuse_lookup_name(sb, handle->nodeid, &name, &outarg, + &inode); + if (err && err != -ENOENT) + goto out_err; + if (err || !inode) { + err = -ESTALE; + goto out_err; + } + err = -EIO; + if (get_node_id(inode) != handle->nodeid) + goto out_iput; + } + err = -ESTALE; + if (inode->i_generation != handle->generation) + goto out_iput; + + entry = d_alloc_anon(inode); + err = -ENOMEM; + if (!entry) + goto out_iput; + + if (get_node_id(inode) != FUSE_ROOT_ID) { + entry->d_op = &fuse_dentry_operations; + fuse_invalidate_entry_cache(entry); + } + + return entry; + + out_iput: + iput(inode); + out_err: + return ERR_PTR(err); +} + +static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, + int connectable) +{ + struct inode *inode = dentry->d_inode; + bool encode_parent = connectable && !S_ISDIR(inode->i_mode); + int len = encode_parent ? 6 : 3; + u64 nodeid; + u32 generation; + + if (*max_len < len) + return 255; + + nodeid = get_fuse_inode(inode)->nodeid; + generation = inode->i_generation; + + fh[0] = (u32)(nodeid >> 32); + fh[1] = (u32)(nodeid & 0xffffffff); + fh[2] = generation; + + if (encode_parent) { + struct inode *parent; + + spin_lock(&dentry->d_lock); + parent = dentry->d_parent->d_inode; + nodeid = get_fuse_inode(parent)->nodeid; + generation = parent->i_generation; + spin_unlock(&dentry->d_lock); + + fh[3] = (u32)(nodeid >> 32); + fh[4] = (u32)(nodeid & 0xffffffff); + fh[5] = generation; + } + + *max_len = len; + return encode_parent ? 0x82 : 0x81; +} + +static struct dentry *fuse_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct fuse_inode_handle handle; + + if ((fh_type != 0x81 && fh_type != 0x82) || fh_len < 3) + return NULL; + + handle.nodeid = (u64) fid->raw[0] << 32; + handle.nodeid |= (u64) fid->raw[1]; + handle.generation = fid->raw[2]; + return fuse_get_dentry(sb, &handle); +} + +static struct dentry *fuse_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct fuse_inode_handle parent; + + if (fh_type != 0x82 || fh_len < 6) + return NULL; + + parent.nodeid = (u64) fid->raw[3] << 32; + parent.nodeid |= (u64) fid->raw[4]; + parent.generation = fid->raw[5]; + return fuse_get_dentry(sb, &parent); +} + +static struct dentry *fuse_get_parent(struct dentry *child) +{ + struct inode *child_inode = child->d_inode; + struct fuse_conn *fc = get_fuse_conn(child_inode); + struct inode *inode; + struct dentry *parent; + struct fuse_entry_out outarg; + struct qstr name; + int err; + + if (!fc->export_support) + return ERR_PTR(-ESTALE); + + name.len = 2; + name.name = ".."; + err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode), + &name, &outarg, &inode); + if (err && err != -ENOENT) + return ERR_PTR(err); + if (err || !inode) + return ERR_PTR(-ESTALE); + + parent = d_alloc_anon(inode); + if (!parent) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + if (get_node_id(inode) != FUSE_ROOT_ID) { + parent->d_op = &fuse_dentry_operations; + fuse_invalidate_entry_cache(parent); + } + + return parent; +} + +static const struct export_operations fuse_export_operations = { + .fh_to_dentry = fuse_fh_to_dentry, + .fh_to_parent = fuse_fh_to_parent, + .encode_fh = fuse_encode_fh, + .get_parent = fuse_get_parent, +}; + static const struct super_operations fuse_super_operations = { .alloc_inode = fuse_alloc_inode, .destroy_inode = fuse_destroy_inode, @@ -581,6 +750,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->no_lock = 1; if (arg->flags & FUSE_ATOMIC_O_TRUNC) fc->atomic_o_trunc = 1; + if (arg->minor >= 9) { + /* LOOKUP has dependency on proto version */ + if (arg->flags & FUSE_EXPORT_SUPPORT) + fc->export_support = 1; + } if (arg->flags & FUSE_BIG_WRITES) fc->big_writes = 1; } else { @@ -607,7 +781,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) arg->minor = FUSE_KERNEL_MINOR_VERSION; arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | - FUSE_BIG_WRITES; + FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -652,6 +826,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = FUSE_SUPER_MAGIC; sb->s_op = &fuse_super_operations; sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_export_op = &fuse_export_operations; file = fget(d.fd); if (!file) @@ -781,7 +956,7 @@ static inline void unregister_fuseblk(void) } #endif -static void fuse_inode_init_once(struct kmem_cache *cachep, void *foo) +static void fuse_inode_init_once(void *foo) { struct inode * inode = foo; diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index 7f7947e3dfbb..ab2f57e3fb87 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -14,23 +14,11 @@ config GFS2_FS GFS is perfect consistency -- changes made to the filesystem on one machine show up immediately on all other machines in the cluster. - To use the GFS2 filesystem, you will need to enable one or more of - the below locking modules. Documentation and utilities for GFS2 can + To use the GFS2 filesystem in a cluster, you will need to enable + the locking module below. Documentation and utilities for GFS2 can be found here: http://sources.redhat.com/cluster -config GFS2_FS_LOCKING_NOLOCK - tristate "GFS2 \"nolock\" locking module" - depends on GFS2_FS - help - Single node locking module for GFS2. - - Use this module if you want to use GFS2 on a single node without - its clustering features. You can still take advantage of the - large file support, and upgrade to running a full cluster later on - if required. - - If you will only be using GFS2 in cluster mode, you do not need this - module. + The "nolock" lock module is now built in to GFS2 by default. config GFS2_FS_LOCKING_DLM tristate "GFS2 DLM locking module" diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index e2350df02a07..ec65851ec80a 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile @@ -5,6 +5,5 @@ gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \ ops_fstype.o ops_inode.o ops_super.o quota.o \ recovery.o rgrp.o super.o sys.o trans.o util.o -obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/ obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/ diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h index 3bb11c0f8b56..ef606e3a5cf4 100644 --- a/fs/gfs2/gfs2.h +++ b/fs/gfs2/gfs2.h @@ -16,11 +16,6 @@ enum { }; enum { - NO_WAIT = 0, - WAIT = 1, -}; - -enum { NO_FORCE = 0, FORCE = 1, }; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index d636b3e80f5d..c962283d4e7f 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -45,21 +45,19 @@ struct gfs2_gl_hash_bucket { struct hlist_head hb_list; }; -struct glock_iter { - int hash; /* hash bucket index */ - struct gfs2_sbd *sdp; /* incore superblock */ - struct gfs2_glock *gl; /* current glock struct */ - struct seq_file *seq; /* sequence file for debugfs */ - char string[512]; /* scratch space */ +struct gfs2_glock_iter { + int hash; /* hash bucket index */ + struct gfs2_sbd *sdp; /* incore superblock */ + struct gfs2_glock *gl; /* current glock struct */ + char string[512]; /* scratch space */ }; typedef void (*glock_examiner) (struct gfs2_glock * gl); static int gfs2_dump_lockstate(struct gfs2_sbd *sdp); -static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl); -static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh); -static void gfs2_glock_drop_th(struct gfs2_glock *gl); -static void run_queue(struct gfs2_glock *gl); +static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); +#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0) +static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); static DECLARE_RWSEM(gfs2_umount_flush_sem); static struct dentry *gfs2_root; @@ -123,33 +121,6 @@ static inline rwlock_t *gl_lock_addr(unsigned int x) #endif /** - * relaxed_state_ok - is a requested lock compatible with the current lock mode? - * @actual: the current state of the lock - * @requested: the lock state that was requested by the caller - * @flags: the modifier flags passed in by the caller - * - * Returns: 1 if the locks are compatible, 0 otherwise - */ - -static inline int relaxed_state_ok(unsigned int actual, unsigned requested, - int flags) -{ - if (actual == requested) - return 1; - - if (flags & GL_EXACT) - return 0; - - if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED) - return 1; - - if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY)) - return 1; - - return 0; -} - -/** * gl_hash() - Turn glock number into hash bucket number * @lock: The glock number * @@ -182,7 +153,7 @@ static void glock_free(struct gfs2_glock *gl) struct gfs2_sbd *sdp = gl->gl_sbd; struct inode *aspace = gl->gl_aspace; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + if (sdp->sd_lockstruct.ls_ops->lm_put_lock) sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock); if (aspace) @@ -211,17 +182,14 @@ static void gfs2_glock_hold(struct gfs2_glock *gl) int gfs2_glock_put(struct gfs2_glock *gl) { int rv = 0; - struct gfs2_sbd *sdp = gl->gl_sbd; write_lock(gl_lock_addr(gl->gl_hash)); if (atomic_dec_and_test(&gl->gl_ref)) { hlist_del(&gl->gl_list); write_unlock(gl_lock_addr(gl->gl_hash)); - gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED); - gfs2_assert(sdp, list_empty(&gl->gl_reclaim)); - gfs2_assert(sdp, list_empty(&gl->gl_holders)); - gfs2_assert(sdp, list_empty(&gl->gl_waiters1)); - gfs2_assert(sdp, list_empty(&gl->gl_waiters3)); + GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED); + GLOCK_BUG_ON(gl, !list_empty(&gl->gl_reclaim)); + GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); glock_free(gl); rv = 1; goto out; @@ -281,22 +249,401 @@ static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp, return gl; } +/** + * may_grant - check if its ok to grant a new lock + * @gl: The glock + * @gh: The lock request which we wish to grant + * + * Returns: true if its ok to grant the lock + */ + +static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh) +{ + const struct gfs2_holder *gh_head = list_entry(gl->gl_holders.next, const struct gfs2_holder, gh_list); + if ((gh->gh_state == LM_ST_EXCLUSIVE || + gh_head->gh_state == LM_ST_EXCLUSIVE) && gh != gh_head) + return 0; + if (gl->gl_state == gh->gh_state) + return 1; + if (gh->gh_flags & GL_EXACT) + return 0; + if (gl->gl_state == LM_ST_EXCLUSIVE) { + if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED) + return 1; + if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED) + return 1; + } + if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY)) + return 1; + return 0; +} + +static void gfs2_holder_wake(struct gfs2_holder *gh) +{ + clear_bit(HIF_WAIT, &gh->gh_iflags); + smp_mb__after_clear_bit(); + wake_up_bit(&gh->gh_iflags, HIF_WAIT); +} + +/** + * do_promote - promote as many requests as possible on the current queue + * @gl: The glock + * + * Returns: true if there is a blocked holder at the head of the list + */ + +static int do_promote(struct gfs2_glock *gl) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + struct gfs2_holder *gh, *tmp; + int ret; + +restart: + list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + continue; + if (may_grant(gl, gh)) { + if (gh->gh_list.prev == &gl->gl_holders && + glops->go_lock) { + spin_unlock(&gl->gl_spin); + /* FIXME: eliminate this eventually */ + ret = glops->go_lock(gh); + spin_lock(&gl->gl_spin); + if (ret) { + gh->gh_error = ret; + list_del_init(&gh->gh_list); + gfs2_holder_wake(gh); + goto restart; + } + set_bit(HIF_HOLDER, &gh->gh_iflags); + gfs2_holder_wake(gh); + goto restart; + } + set_bit(HIF_HOLDER, &gh->gh_iflags); + gfs2_holder_wake(gh); + continue; + } + if (gh->gh_list.prev == &gl->gl_holders) + return 1; + break; + } + return 0; +} + +/** + * do_error - Something unexpected has happened during a lock request + * + */ + +static inline void do_error(struct gfs2_glock *gl, const int ret) +{ + struct gfs2_holder *gh, *tmp; + + list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + continue; + if (ret & LM_OUT_ERROR) + gh->gh_error = -EIO; + else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) + gh->gh_error = GLR_TRYFAILED; + else + continue; + list_del_init(&gh->gh_list); + gfs2_holder_wake(gh); + } +} + +/** + * find_first_waiter - find the first gh that's waiting for the glock + * @gl: the glock + */ + +static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) + return gh; + } + return NULL; +} + +/** + * state_change - record that the glock is now in a different state + * @gl: the glock + * @new_state the new state + * + */ + +static void state_change(struct gfs2_glock *gl, unsigned int new_state) +{ + int held1, held2; + + held1 = (gl->gl_state != LM_ST_UNLOCKED); + held2 = (new_state != LM_ST_UNLOCKED); + + if (held1 != held2) { + if (held2) + gfs2_glock_hold(gl); + else + gfs2_glock_put(gl); + } + + gl->gl_state = new_state; + gl->gl_tchange = jiffies; +} + +static void gfs2_demote_wake(struct gfs2_glock *gl) +{ + gl->gl_demote_state = LM_ST_EXCLUSIVE; + clear_bit(GLF_DEMOTE, &gl->gl_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&gl->gl_flags, GLF_DEMOTE); +} + +/** + * finish_xmote - The DLM has replied to one of our lock requests + * @gl: The glock + * @ret: The status from the DLM + * + */ + +static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + struct gfs2_holder *gh; + unsigned state = ret & LM_OUT_ST_MASK; + + spin_lock(&gl->gl_spin); + state_change(gl, state); + gh = find_first_waiter(gl); + + /* Demote to UN request arrived during demote to SH or DF */ + if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) && + state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED) + gl->gl_target = LM_ST_UNLOCKED; + + /* Check for state != intended state */ + if (unlikely(state != gl->gl_target)) { + if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) { + /* move to back of queue and try next entry */ + if (ret & LM_OUT_CANCELED) { + if ((gh->gh_flags & LM_FLAG_PRIORITY) == 0) + list_move_tail(&gh->gh_list, &gl->gl_holders); + gh = find_first_waiter(gl); + gl->gl_target = gh->gh_state; + goto retry; + } + /* Some error or failed "try lock" - report it */ + if ((ret & LM_OUT_ERROR) || + (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { + gl->gl_target = gl->gl_state; + do_error(gl, ret); + goto out; + } + } + switch(state) { + /* Unlocked due to conversion deadlock, try again */ + case LM_ST_UNLOCKED: +retry: + do_xmote(gl, gh, gl->gl_target); + break; + /* Conversion fails, unlock and try again */ + case LM_ST_SHARED: + case LM_ST_DEFERRED: + do_xmote(gl, gh, LM_ST_UNLOCKED); + break; + default: /* Everything else */ + printk(KERN_ERR "GFS2: wanted %u got %u\n", gl->gl_target, state); + GLOCK_BUG_ON(gl, 1); + } + spin_unlock(&gl->gl_spin); + gfs2_glock_put(gl); + return; + } + + /* Fast path - we got what we asked for */ + if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) + gfs2_demote_wake(gl); + if (state != LM_ST_UNLOCKED) { + if (glops->go_xmote_bh) { + int rv; + spin_unlock(&gl->gl_spin); + rv = glops->go_xmote_bh(gl, gh); + if (rv == -EAGAIN) + return; + spin_lock(&gl->gl_spin); + if (rv) { + do_error(gl, rv); + goto out; + } + } + do_promote(gl); + } +out: + clear_bit(GLF_LOCK, &gl->gl_flags); + spin_unlock(&gl->gl_spin); + gfs2_glock_put(gl); +} + +static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, + unsigned int cur_state, unsigned int req_state, + unsigned int flags) +{ + int ret = LM_OUT_ERROR; + + if (!sdp->sd_lockstruct.ls_ops->lm_lock) + return req_state == LM_ST_UNLOCKED ? 0 : req_state; + + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state, + req_state, flags); + return ret; +} + +/** + * do_xmote - Calls the DLM to change the state of a lock + * @gl: The lock state + * @gh: The holder (only for promotes) + * @target: The target lock state + * + */ + +static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + struct gfs2_sbd *sdp = gl->gl_sbd; + unsigned int lck_flags = gh ? gh->gh_flags : 0; + int ret; + + lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | + LM_FLAG_PRIORITY); + BUG_ON(gl->gl_state == target); + BUG_ON(gl->gl_state == gl->gl_target); + if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && + glops->go_inval) { + set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); + do_error(gl, 0); /* Fail queued try locks */ + } + spin_unlock(&gl->gl_spin); + if (glops->go_xmote_th) + glops->go_xmote_th(gl); + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) + glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA); + clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); + + gfs2_glock_hold(gl); + if (target != LM_ST_UNLOCKED && (gl->gl_state == LM_ST_SHARED || + gl->gl_state == LM_ST_DEFERRED) && + !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) + lck_flags |= LM_FLAG_TRY_1CB; + ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, target, lck_flags); + + if (!(ret & LM_OUT_ASYNC)) { + finish_xmote(gl, ret); + gfs2_glock_hold(gl); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); + } else { + GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC); + } + spin_lock(&gl->gl_spin); +} + +/** + * find_first_holder - find the first "holder" gh + * @gl: the glock + */ + +static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + + if (!list_empty(&gl->gl_holders)) { + gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + return gh; + } + return NULL; +} + +/** + * run_queue - do all outstanding tasks related to a glock + * @gl: The glock in question + * @nonblock: True if we must not block in run_queue + * + */ + +static void run_queue(struct gfs2_glock *gl, const int nonblock) +{ + struct gfs2_holder *gh = NULL; + + if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) + return; + + GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)); + + if (test_bit(GLF_DEMOTE, &gl->gl_flags) && + gl->gl_demote_state != gl->gl_state) { + if (find_first_holder(gl)) + goto out; + if (nonblock) + goto out_sched; + set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); + GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE); + gl->gl_target = gl->gl_demote_state; + } else { + if (test_bit(GLF_DEMOTE, &gl->gl_flags)) + gfs2_demote_wake(gl); + if (do_promote(gl) == 0) + goto out; + gh = find_first_waiter(gl); + gl->gl_target = gh->gh_state; + if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) + do_error(gl, 0); /* Fail queued try locks */ + } + do_xmote(gl, gh, gl->gl_target); + return; + +out_sched: + gfs2_glock_hold(gl); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); +out: + clear_bit(GLF_LOCK, &gl->gl_flags); +} + static void glock_work_func(struct work_struct *work) { + unsigned long delay = 0; struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work); + if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) + finish_xmote(gl, gl->gl_reply); spin_lock(&gl->gl_spin); - if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags)) - set_bit(GLF_DEMOTE, &gl->gl_flags); - run_queue(gl); + if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + gl->gl_state != LM_ST_UNLOCKED && + gl->gl_demote_state != LM_ST_EXCLUSIVE) { + unsigned long holdtime, now = jiffies; + holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; + if (time_before(now, holdtime)) + delay = holdtime - now; + set_bit(delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE, &gl->gl_flags); + } + run_queue(gl, 0); spin_unlock(&gl->gl_spin); - gfs2_glock_put(gl); + if (!delay || + queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) + gfs2_glock_put(gl); } static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name, void **lockp) { int error = -EIO; + if (!sdp->sd_lockstruct.ls_ops->lm_get_lock) + return 0; if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) error = sdp->sd_lockstruct.ls_ops->lm_get_lock( sdp->sd_lockstruct.ls_lockspace, name, lockp); @@ -342,12 +689,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_name = name; atomic_set(&gl->gl_ref, 1); gl->gl_state = LM_ST_UNLOCKED; + gl->gl_target = LM_ST_UNLOCKED; gl->gl_demote_state = LM_ST_EXCLUSIVE; gl->gl_hash = hash; - gl->gl_owner_pid = NULL; - gl->gl_ip = 0; gl->gl_ops = glops; - gl->gl_req_gh = NULL; gl->gl_stamp = jiffies; gl->gl_tchange = jiffies; gl->gl_object = NULL; @@ -447,13 +792,6 @@ void gfs2_holder_uninit(struct gfs2_holder *gh) gh->gh_ip = 0; } -static void gfs2_holder_wake(struct gfs2_holder *gh) -{ - clear_bit(HIF_WAIT, &gh->gh_iflags); - smp_mb__after_clear_bit(); - wake_up_bit(&gh->gh_iflags, HIF_WAIT); -} - static int just_schedule(void *word) { schedule(); @@ -466,14 +804,6 @@ static void wait_on_holder(struct gfs2_holder *gh) wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE); } -static void gfs2_demote_wake(struct gfs2_glock *gl) -{ - gl->gl_demote_state = LM_ST_EXCLUSIVE; - clear_bit(GLF_DEMOTE, &gl->gl_flags); - smp_mb__after_clear_bit(); - wake_up_bit(&gl->gl_flags, GLF_DEMOTE); -} - static void wait_on_demote(struct gfs2_glock *gl) { might_sleep(); @@ -481,217 +811,6 @@ static void wait_on_demote(struct gfs2_glock *gl) } /** - * rq_mutex - process a mutex request in the queue - * @gh: the glock holder - * - * Returns: 1 if the queue is blocked - */ - -static int rq_mutex(struct gfs2_holder *gh) -{ - struct gfs2_glock *gl = gh->gh_gl; - - list_del_init(&gh->gh_list); - /* gh->gh_error never examined. */ - set_bit(GLF_LOCK, &gl->gl_flags); - clear_bit(HIF_WAIT, &gh->gh_iflags); - smp_mb(); - wake_up_bit(&gh->gh_iflags, HIF_WAIT); - - return 1; -} - -/** - * rq_promote - process a promote request in the queue - * @gh: the glock holder - * - * Acquire a new inter-node lock, or change a lock state to more restrictive. - * - * Returns: 1 if the queue is blocked - */ - -static int rq_promote(struct gfs2_holder *gh) -{ - struct gfs2_glock *gl = gh->gh_gl; - - if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) { - if (list_empty(&gl->gl_holders)) { - gl->gl_req_gh = gh; - set_bit(GLF_LOCK, &gl->gl_flags); - spin_unlock(&gl->gl_spin); - gfs2_glock_xmote_th(gh->gh_gl, gh); - spin_lock(&gl->gl_spin); - } - return 1; - } - - if (list_empty(&gl->gl_holders)) { - set_bit(HIF_FIRST, &gh->gh_iflags); - set_bit(GLF_LOCK, &gl->gl_flags); - } else { - struct gfs2_holder *next_gh; - if (gh->gh_state == LM_ST_EXCLUSIVE) - return 1; - next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder, - gh_list); - if (next_gh->gh_state == LM_ST_EXCLUSIVE) - return 1; - } - - list_move_tail(&gh->gh_list, &gl->gl_holders); - gh->gh_error = 0; - set_bit(HIF_HOLDER, &gh->gh_iflags); - - gfs2_holder_wake(gh); - - return 0; -} - -/** - * rq_demote - process a demote request in the queue - * @gh: the glock holder - * - * Returns: 1 if the queue is blocked - */ - -static int rq_demote(struct gfs2_glock *gl) -{ - if (!list_empty(&gl->gl_holders)) - return 1; - - if (gl->gl_state == gl->gl_demote_state || - gl->gl_state == LM_ST_UNLOCKED) { - gfs2_demote_wake(gl); - return 0; - } - - set_bit(GLF_LOCK, &gl->gl_flags); - set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); - - if (gl->gl_demote_state == LM_ST_UNLOCKED || - gl->gl_state != LM_ST_EXCLUSIVE) { - spin_unlock(&gl->gl_spin); - gfs2_glock_drop_th(gl); - } else { - spin_unlock(&gl->gl_spin); - gfs2_glock_xmote_th(gl, NULL); - } - - spin_lock(&gl->gl_spin); - clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); - - return 0; -} - -/** - * run_queue - process holder structures on a glock - * @gl: the glock - * - */ -static void run_queue(struct gfs2_glock *gl) -{ - struct gfs2_holder *gh; - int blocked = 1; - - for (;;) { - if (test_bit(GLF_LOCK, &gl->gl_flags)) - break; - - if (!list_empty(&gl->gl_waiters1)) { - gh = list_entry(gl->gl_waiters1.next, - struct gfs2_holder, gh_list); - blocked = rq_mutex(gh); - } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { - blocked = rq_demote(gl); - if (test_bit(GLF_WAITERS2, &gl->gl_flags) && - !blocked) { - set_bit(GLF_DEMOTE, &gl->gl_flags); - gl->gl_demote_state = LM_ST_UNLOCKED; - } - clear_bit(GLF_WAITERS2, &gl->gl_flags); - } else if (!list_empty(&gl->gl_waiters3)) { - gh = list_entry(gl->gl_waiters3.next, - struct gfs2_holder, gh_list); - blocked = rq_promote(gh); - } else - break; - - if (blocked) - break; - } -} - -/** - * gfs2_glmutex_lock - acquire a local lock on a glock - * @gl: the glock - * - * Gives caller exclusive access to manipulate a glock structure. - */ - -static void gfs2_glmutex_lock(struct gfs2_glock *gl) -{ - spin_lock(&gl->gl_spin); - if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { - struct gfs2_holder gh; - - gfs2_holder_init(gl, 0, 0, &gh); - set_bit(HIF_WAIT, &gh.gh_iflags); - list_add_tail(&gh.gh_list, &gl->gl_waiters1); - spin_unlock(&gl->gl_spin); - wait_on_holder(&gh); - gfs2_holder_uninit(&gh); - } else { - gl->gl_owner_pid = get_pid(task_pid(current)); - gl->gl_ip = (unsigned long)__builtin_return_address(0); - spin_unlock(&gl->gl_spin); - } -} - -/** - * gfs2_glmutex_trylock - try to acquire a local lock on a glock - * @gl: the glock - * - * Returns: 1 if the glock is acquired - */ - -static int gfs2_glmutex_trylock(struct gfs2_glock *gl) -{ - int acquired = 1; - - spin_lock(&gl->gl_spin); - if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { - acquired = 0; - } else { - gl->gl_owner_pid = get_pid(task_pid(current)); - gl->gl_ip = (unsigned long)__builtin_return_address(0); - } - spin_unlock(&gl->gl_spin); - - return acquired; -} - -/** - * gfs2_glmutex_unlock - release a local lock on a glock - * @gl: the glock - * - */ - -static void gfs2_glmutex_unlock(struct gfs2_glock *gl) -{ - struct pid *pid; - - spin_lock(&gl->gl_spin); - clear_bit(GLF_LOCK, &gl->gl_flags); - pid = gl->gl_owner_pid; - gl->gl_owner_pid = NULL; - gl->gl_ip = 0; - run_queue(gl); - spin_unlock(&gl->gl_spin); - - put_pid(pid); -} - -/** * handle_callback - process a demote request * @gl: the glock * @state: the state the caller wants us to change to @@ -705,398 +824,45 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state, { int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE; - spin_lock(&gl->gl_spin); set_bit(bit, &gl->gl_flags); if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { gl->gl_demote_state = state; gl->gl_demote_time = jiffies; if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN && - gl->gl_object) { + gl->gl_object) gfs2_glock_schedule_for_reclaim(gl); - spin_unlock(&gl->gl_spin); - return; - } } else if (gl->gl_demote_state != LM_ST_UNLOCKED && gl->gl_demote_state != state) { - if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) - set_bit(GLF_WAITERS2, &gl->gl_flags); - else - gl->gl_demote_state = LM_ST_UNLOCKED; - } - spin_unlock(&gl->gl_spin); -} - -/** - * state_change - record that the glock is now in a different state - * @gl: the glock - * @new_state the new state - * - */ - -static void state_change(struct gfs2_glock *gl, unsigned int new_state) -{ - int held1, held2; - - held1 = (gl->gl_state != LM_ST_UNLOCKED); - held2 = (new_state != LM_ST_UNLOCKED); - - if (held1 != held2) { - if (held2) - gfs2_glock_hold(gl); - else - gfs2_glock_put(gl); - } - - gl->gl_state = new_state; - gl->gl_tchange = jiffies; -} - -/** - * drop_bh - Called after a lock module unlock completes - * @gl: the glock - * @ret: the return status - * - * Doesn't wake up the process waiting on the struct gfs2_holder (if any) - * Doesn't drop the reference on the glock the top half took out - * - */ - -static void drop_bh(struct gfs2_glock *gl, unsigned int ret) -{ - struct gfs2_sbd *sdp = gl->gl_sbd; - struct gfs2_holder *gh = gl->gl_req_gh; - - gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); - gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); - gfs2_assert_warn(sdp, !ret); - - state_change(gl, LM_ST_UNLOCKED); - - if (test_and_clear_bit(GLF_CONV_DEADLK, &gl->gl_flags)) { - spin_lock(&gl->gl_spin); - gh->gh_error = 0; - spin_unlock(&gl->gl_spin); - gfs2_glock_xmote_th(gl, gl->gl_req_gh); - gfs2_glock_put(gl); - return; + gl->gl_demote_state = LM_ST_UNLOCKED; } - - spin_lock(&gl->gl_spin); - gfs2_demote_wake(gl); - clear_bit(GLF_LOCK, &gl->gl_flags); - spin_unlock(&gl->gl_spin); - gfs2_glock_put(gl); } /** - * xmote_bh - Called after the lock module is done acquiring a lock - * @gl: The glock in question - * @ret: the int returned from the lock module - * - */ - -static void xmote_bh(struct gfs2_glock *gl, unsigned int ret) -{ - struct gfs2_sbd *sdp = gl->gl_sbd; - const struct gfs2_glock_operations *glops = gl->gl_ops; - struct gfs2_holder *gh = gl->gl_req_gh; - int op_done = 1; - - if (!gh && (ret & LM_OUT_ST_MASK) == LM_ST_UNLOCKED) { - drop_bh(gl, ret); - return; - } - - gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); - gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); - gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC)); - - state_change(gl, ret & LM_OUT_ST_MASK); - - /* Deal with each possible exit condition */ - - if (!gh) { - gl->gl_stamp = jiffies; - if (ret & LM_OUT_CANCELED) { - op_done = 0; - } else { - spin_lock(&gl->gl_spin); - if (gl->gl_state != gl->gl_demote_state) { - spin_unlock(&gl->gl_spin); - gfs2_glock_drop_th(gl); - gfs2_glock_put(gl); - return; - } - gfs2_demote_wake(gl); - spin_unlock(&gl->gl_spin); - } - } else { - spin_lock(&gl->gl_spin); - if (ret & LM_OUT_CONV_DEADLK) { - gh->gh_error = 0; - set_bit(GLF_CONV_DEADLK, &gl->gl_flags); - spin_unlock(&gl->gl_spin); - gfs2_glock_drop_th(gl); - gfs2_glock_put(gl); - return; - } - list_del_init(&gh->gh_list); - gh->gh_error = -EIO; - if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - goto out; - gh->gh_error = GLR_CANCELED; - if (ret & LM_OUT_CANCELED) - goto out; - if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) { - list_add_tail(&gh->gh_list, &gl->gl_holders); - gh->gh_error = 0; - set_bit(HIF_HOLDER, &gh->gh_iflags); - set_bit(HIF_FIRST, &gh->gh_iflags); - op_done = 0; - goto out; - } - gh->gh_error = GLR_TRYFAILED; - if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) - goto out; - gh->gh_error = -EINVAL; - if (gfs2_assert_withdraw(sdp, 0) == -1) - fs_err(sdp, "ret = 0x%.8X\n", ret); -out: - spin_unlock(&gl->gl_spin); - } - - if (glops->go_xmote_bh) - glops->go_xmote_bh(gl); - - if (op_done) { - spin_lock(&gl->gl_spin); - gl->gl_req_gh = NULL; - clear_bit(GLF_LOCK, &gl->gl_flags); - spin_unlock(&gl->gl_spin); - } - - gfs2_glock_put(gl); - - if (gh) - gfs2_holder_wake(gh); -} - -static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, - unsigned int cur_state, unsigned int req_state, - unsigned int flags) -{ - int ret = 0; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state, - req_state, flags); - return ret; -} - -/** - * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock - * @gl: The glock in question - * @state: the requested state - * @flags: modifier flags to the lock call - * - */ - -static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh) -{ - struct gfs2_sbd *sdp = gl->gl_sbd; - int flags = gh ? gh->gh_flags : 0; - unsigned state = gh ? gh->gh_state : gl->gl_demote_state; - const struct gfs2_glock_operations *glops = gl->gl_ops; - int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB | - LM_FLAG_NOEXP | LM_FLAG_ANY | - LM_FLAG_PRIORITY); - unsigned int lck_ret; - - if (glops->go_xmote_th) - glops->go_xmote_th(gl); - if (state == LM_ST_DEFERRED && glops->go_inval) - glops->go_inval(gl, DIO_METADATA); - - gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); - gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); - gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED); - gfs2_assert_warn(sdp, state != gl->gl_state); - - gfs2_glock_hold(gl); - - lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags); - - if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR))) - return; - - if (lck_ret & LM_OUT_ASYNC) - gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC); - else - xmote_bh(gl, lck_ret); -} - -static unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock, - unsigned int cur_state) -{ - int ret = 0; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state); - return ret; -} - -/** - * gfs2_glock_drop_th - call into the lock module to unlock a lock - * @gl: the glock - * - */ - -static void gfs2_glock_drop_th(struct gfs2_glock *gl) -{ - struct gfs2_sbd *sdp = gl->gl_sbd; - const struct gfs2_glock_operations *glops = gl->gl_ops; - unsigned int ret; - - if (glops->go_xmote_th) - glops->go_xmote_th(gl); - if (glops->go_inval) - glops->go_inval(gl, DIO_METADATA); - - gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); - gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); - gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED); - - gfs2_glock_hold(gl); - - ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state); - - if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR))) - return; - - if (!ret) - drop_bh(gl, ret); - else - gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC); -} - -/** - * do_cancels - cancel requests for locks stuck waiting on an expire flag - * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock - * - * Don't cancel GL_NOCANCEL requests. - */ - -static void do_cancels(struct gfs2_holder *gh) -{ - struct gfs2_glock *gl = gh->gh_gl; - struct gfs2_sbd *sdp = gl->gl_sbd; - - spin_lock(&gl->gl_spin); - - while (gl->gl_req_gh != gh && - !test_bit(HIF_HOLDER, &gh->gh_iflags) && - !list_empty(&gh->gh_list)) { - if (!(gl->gl_req_gh && (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) { - spin_unlock(&gl->gl_spin); - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock); - msleep(100); - spin_lock(&gl->gl_spin); - } else { - spin_unlock(&gl->gl_spin); - msleep(100); - spin_lock(&gl->gl_spin); - } - } - - spin_unlock(&gl->gl_spin); -} - -/** - * glock_wait_internal - wait on a glock acquisition + * gfs2_glock_wait - wait on a glock acquisition * @gh: the glock holder * * Returns: 0 on success */ -static int glock_wait_internal(struct gfs2_holder *gh) +int gfs2_glock_wait(struct gfs2_holder *gh) { - struct gfs2_glock *gl = gh->gh_gl; - struct gfs2_sbd *sdp = gl->gl_sbd; - const struct gfs2_glock_operations *glops = gl->gl_ops; - - if (test_bit(HIF_ABORTED, &gh->gh_iflags)) - return -EIO; - - if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { - spin_lock(&gl->gl_spin); - if (gl->gl_req_gh != gh && - !test_bit(HIF_HOLDER, &gh->gh_iflags) && - !list_empty(&gh->gh_list)) { - list_del_init(&gh->gh_list); - gh->gh_error = GLR_TRYFAILED; - run_queue(gl); - spin_unlock(&gl->gl_spin); - return gh->gh_error; - } - spin_unlock(&gl->gl_spin); - } - - if (gh->gh_flags & LM_FLAG_PRIORITY) - do_cancels(gh); - wait_on_holder(gh); - if (gh->gh_error) - return gh->gh_error; - - gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags)); - gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state, gh->gh_state, - gh->gh_flags)); - - if (test_bit(HIF_FIRST, &gh->gh_iflags)) { - gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); - - if (glops->go_lock) { - gh->gh_error = glops->go_lock(gh); - if (gh->gh_error) { - spin_lock(&gl->gl_spin); - list_del_init(&gh->gh_list); - spin_unlock(&gl->gl_spin); - } - } - - spin_lock(&gl->gl_spin); - gl->gl_req_gh = NULL; - clear_bit(GLF_LOCK, &gl->gl_flags); - run_queue(gl); - spin_unlock(&gl->gl_spin); - } - return gh->gh_error; } -static inline struct gfs2_holder * -find_holder_by_owner(struct list_head *head, struct pid *pid) -{ - struct gfs2_holder *gh; - - list_for_each_entry(gh, head, gh_list) { - if (gh->gh_owner_pid == pid) - return gh; - } - - return NULL; -} - -static void print_dbg(struct glock_iter *gi, const char *fmt, ...) +void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) { va_list args; va_start(args, fmt); - if (gi) { + if (seq) { + struct gfs2_glock_iter *gi = seq->private; vsprintf(gi->string, fmt, args); - seq_printf(gi->seq, gi->string); - } - else + seq_printf(seq, gi->string); + } else { + printk(KERN_ERR " "); vprintk(fmt, args); + } va_end(args); } @@ -1104,50 +870,76 @@ static void print_dbg(struct glock_iter *gi, const char *fmt, ...) * add_to_queue - Add a holder to the wait queue (but look for recursion) * @gh: the holder structure to add * + * Eventually we should move the recursive locking trap to a + * debugging option or something like that. This is the fast + * path and needs to have the minimum number of distractions. + * */ -static void add_to_queue(struct gfs2_holder *gh) +static inline void add_to_queue(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; - struct gfs2_holder *existing; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct list_head *insert_pt = NULL; + struct gfs2_holder *gh2; + int try_lock = 0; BUG_ON(gh->gh_owner_pid == NULL); if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) BUG(); - if (!(gh->gh_flags & GL_FLOCK)) { - existing = find_holder_by_owner(&gl->gl_holders, - gh->gh_owner_pid); - if (existing) { - print_symbol(KERN_WARNING "original: %s\n", - existing->gh_ip); - printk(KERN_INFO "pid : %d\n", - pid_nr(existing->gh_owner_pid)); - printk(KERN_INFO "lock type : %d lock state : %d\n", - existing->gh_gl->gl_name.ln_type, - existing->gh_gl->gl_state); - print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip); - printk(KERN_INFO "pid : %d\n", - pid_nr(gh->gh_owner_pid)); - printk(KERN_INFO "lock type : %d lock state : %d\n", - gl->gl_name.ln_type, gl->gl_state); - BUG(); - } - - existing = find_holder_by_owner(&gl->gl_waiters3, - gh->gh_owner_pid); - if (existing) { - print_symbol(KERN_WARNING "original: %s\n", - existing->gh_ip); - print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip); - BUG(); + if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { + if (test_bit(GLF_LOCK, &gl->gl_flags)) + try_lock = 1; + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) + goto fail; + } + + list_for_each_entry(gh2, &gl->gl_holders, gh_list) { + if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid && + (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK))) + goto trap_recursive; + if (try_lock && + !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) && + !may_grant(gl, gh)) { +fail: + gh->gh_error = GLR_TRYFAILED; + gfs2_holder_wake(gh); + return; } + if (test_bit(HIF_HOLDER, &gh2->gh_iflags)) + continue; + if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt)) + insert_pt = &gh2->gh_list; + } + if (likely(insert_pt == NULL)) { + list_add_tail(&gh->gh_list, &gl->gl_holders); + if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) + goto do_cancel; + return; + } + list_add_tail(&gh->gh_list, insert_pt); +do_cancel: + gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); + if (!(gh->gh_flags & LM_FLAG_PRIORITY)) { + spin_unlock(&gl->gl_spin); + if (sdp->sd_lockstruct.ls_ops->lm_cancel) + sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock); + spin_lock(&gl->gl_spin); } + return; - if (gh->gh_flags & LM_FLAG_PRIORITY) - list_add(&gh->gh_list, &gl->gl_waiters3); - else - list_add_tail(&gh->gh_list, &gl->gl_waiters3); +trap_recursive: + print_symbol(KERN_ERR "original: %s\n", gh2->gh_ip); + printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid)); + printk(KERN_ERR "lock type: %d req lock state : %d\n", + gh2->gh_gl->gl_name.ln_type, gh2->gh_state); + print_symbol(KERN_ERR "new: %s\n", gh->gh_ip); + printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid)); + printk(KERN_ERR "lock type: %d req lock state : %d\n", + gh->gh_gl->gl_name.ln_type, gh->gh_state); + __dump_glock(NULL, gl); + BUG(); } /** @@ -1165,24 +957,16 @@ int gfs2_glock_nq(struct gfs2_holder *gh) struct gfs2_sbd *sdp = gl->gl_sbd; int error = 0; -restart: - if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { - set_bit(HIF_ABORTED, &gh->gh_iflags); + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) return -EIO; - } spin_lock(&gl->gl_spin); add_to_queue(gh); - run_queue(gl); + run_queue(gl, 1); spin_unlock(&gl->gl_spin); - if (!(gh->gh_flags & GL_ASYNC)) { - error = glock_wait_internal(gh); - if (error == GLR_CANCELED) { - msleep(100); - goto restart; - } - } + if (!(gh->gh_flags & GL_ASYNC)) + error = gfs2_glock_wait(gh); return error; } @@ -1196,48 +980,7 @@ restart: int gfs2_glock_poll(struct gfs2_holder *gh) { - struct gfs2_glock *gl = gh->gh_gl; - int ready = 0; - - spin_lock(&gl->gl_spin); - - if (test_bit(HIF_HOLDER, &gh->gh_iflags)) - ready = 1; - else if (list_empty(&gh->gh_list)) { - if (gh->gh_error == GLR_CANCELED) { - spin_unlock(&gl->gl_spin); - msleep(100); - if (gfs2_glock_nq(gh)) - return 1; - return 0; - } else - ready = 1; - } - - spin_unlock(&gl->gl_spin); - - return ready; -} - -/** - * gfs2_glock_wait - wait for a lock acquisition that ended in a GLR_ASYNC - * @gh: the holder structure - * - * Returns: 0, GLR_TRYFAILED, or errno on failure - */ - -int gfs2_glock_wait(struct gfs2_holder *gh) -{ - int error; - - error = glock_wait_internal(gh); - if (error == GLR_CANCELED) { - msleep(100); - gh->gh_flags &= ~GL_ASYNC; - error = gfs2_glock_nq(gh); - } - - return error; + return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1; } /** @@ -1251,26 +994,30 @@ void gfs2_glock_dq(struct gfs2_holder *gh) struct gfs2_glock *gl = gh->gh_gl; const struct gfs2_glock_operations *glops = gl->gl_ops; unsigned delay = 0; + int fast_path = 0; + spin_lock(&gl->gl_spin); if (gh->gh_flags & GL_NOCACHE) handle_callback(gl, LM_ST_UNLOCKED, 0, 0); - gfs2_glmutex_lock(gl); - - spin_lock(&gl->gl_spin); list_del_init(&gh->gh_list); - - if (list_empty(&gl->gl_holders)) { + if (find_first_holder(gl) == NULL) { if (glops->go_unlock) { + GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags)); spin_unlock(&gl->gl_spin); glops->go_unlock(gh); spin_lock(&gl->gl_spin); + clear_bit(GLF_LOCK, &gl->gl_flags); } gl->gl_stamp = jiffies; + if (list_empty(&gl->gl_holders) && + !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + !test_bit(GLF_DEMOTE, &gl->gl_flags)) + fast_path = 1; } - - clear_bit(GLF_LOCK, &gl->gl_flags); spin_unlock(&gl->gl_spin); + if (likely(fast_path)) + return; gfs2_glock_hold(gl); if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && @@ -1454,6 +1201,8 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs) static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp) { int error = -EIO; + if (!sdp->sd_lockstruct.ls_ops->lm_hold_lvb) + return 0; if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp); return error; @@ -1469,20 +1218,14 @@ int gfs2_lvb_hold(struct gfs2_glock *gl) { int error; - gfs2_glmutex_lock(gl); - if (!atomic_read(&gl->gl_lvb_count)) { error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb); - if (error) { - gfs2_glmutex_unlock(gl); + if (error) return error; - } gfs2_glock_hold(gl); } atomic_inc(&gl->gl_lvb_count); - gfs2_glmutex_unlock(gl); - return 0; } @@ -1497,17 +1240,13 @@ void gfs2_lvb_unhold(struct gfs2_glock *gl) struct gfs2_sbd *sdp = gl->gl_sbd; gfs2_glock_hold(gl); - gfs2_glmutex_lock(gl); - gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0); if (atomic_dec_and_test(&gl->gl_lvb_count)) { - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + if (sdp->sd_lockstruct.ls_ops->lm_unhold_lvb) sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb); gl->gl_lvb = NULL; gfs2_glock_put(gl); } - - gfs2_glmutex_unlock(gl); gfs2_glock_put(gl); } @@ -1526,8 +1265,12 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name, holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; if (time_before(now, holdtime)) delay = holdtime - now; + if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) + delay = gl->gl_ops->go_min_hold_time; + spin_lock(&gl->gl_spin); handle_callback(gl, state, 1, delay); + spin_unlock(&gl->gl_spin); if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) gfs2_glock_put(gl); } @@ -1568,7 +1311,8 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data) gl = gfs2_glock_find(sdp, &async->lc_name); if (gfs2_assert_warn(sdp, gl)) return; - xmote_bh(gl, async->lc_ret); + gl->gl_reply = async->lc_ret; + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gfs2_glock_put(gl); up_read(&gfs2_umount_flush_sem); @@ -1581,11 +1325,6 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data) wake_up_process(sdp->sd_recoverd_process); return; - case LM_CB_DROPLOCKS: - gfs2_gl_hash_clear(sdp, NO_WAIT); - gfs2_quota_scan(sdp); - return; - default: gfs2_assert_warn(sdp, 0); return; @@ -1646,6 +1385,7 @@ void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) void gfs2_reclaim_glock(struct gfs2_sbd *sdp) { struct gfs2_glock *gl; + int done_callback = 0; spin_lock(&sdp->sd_reclaim_lock); if (list_empty(&sdp->sd_reclaim_list)) { @@ -1660,14 +1400,16 @@ void gfs2_reclaim_glock(struct gfs2_sbd *sdp) atomic_dec(&sdp->sd_reclaim_count); atomic_inc(&sdp->sd_reclaimed); - if (gfs2_glmutex_trylock(gl)) { - if (list_empty(&gl->gl_holders) && - gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) - handle_callback(gl, LM_ST_UNLOCKED, 0, 0); - gfs2_glmutex_unlock(gl); + spin_lock(&gl->gl_spin); + if (find_first_holder(gl) == NULL && + gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) { + handle_callback(gl, LM_ST_UNLOCKED, 0, 0); + done_callback = 1; } - - gfs2_glock_put(gl); + spin_unlock(&gl->gl_spin); + if (!done_callback || + queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); } /** @@ -1724,18 +1466,14 @@ static void scan_glock(struct gfs2_glock *gl) { if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) return; + if (test_bit(GLF_LOCK, &gl->gl_flags)) + return; - if (gfs2_glmutex_trylock(gl)) { - if (list_empty(&gl->gl_holders) && - gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) - goto out_schedule; - gfs2_glmutex_unlock(gl); - } - return; - -out_schedule: - gfs2_glmutex_unlock(gl); - gfs2_glock_schedule_for_reclaim(gl); + spin_lock(&gl->gl_spin); + if (find_first_holder(gl) == NULL && + gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) + gfs2_glock_schedule_for_reclaim(gl); + spin_unlock(&gl->gl_spin); } /** @@ -1760,12 +1498,13 @@ static void clear_glock(struct gfs2_glock *gl) spin_unlock(&sdp->sd_reclaim_lock); } - if (gfs2_glmutex_trylock(gl)) { - if (list_empty(&gl->gl_holders) && - gl->gl_state != LM_ST_UNLOCKED) - handle_callback(gl, LM_ST_UNLOCKED, 0, 0); - gfs2_glmutex_unlock(gl); - } + spin_lock(&gl->gl_spin); + if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED) + handle_callback(gl, LM_ST_UNLOCKED, 0, 0); + spin_unlock(&gl->gl_spin); + gfs2_glock_hold(gl); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); } /** @@ -1773,11 +1512,10 @@ static void clear_glock(struct gfs2_glock *gl) * @sdp: the filesystem * @wait: wait until it's all gone * - * Called when unmounting the filesystem, or when inter-node lock manager - * requests DROPLOCKS because it is running out of capacity. + * Called when unmounting the filesystem. */ -void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait) +void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) { unsigned long t; unsigned int x; @@ -1792,7 +1530,7 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait) cont = 1; } - if (!wait || !cont) + if (!cont) break; if (time_after_eq(jiffies, @@ -1810,180 +1548,162 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait) } } -/* - * Diagnostic routines to help debug distributed deadlock - */ - -static void gfs2_print_symbol(struct glock_iter *gi, const char *fmt, - unsigned long address) +static const char *state2str(unsigned state) { - char buffer[KSYM_SYMBOL_LEN]; - - sprint_symbol(buffer, address); - print_dbg(gi, fmt, buffer); + switch(state) { + case LM_ST_UNLOCKED: + return "UN"; + case LM_ST_SHARED: + return "SH"; + case LM_ST_DEFERRED: + return "DF"; + case LM_ST_EXCLUSIVE: + return "EX"; + } + return "??"; +} + +static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags) +{ + char *p = buf; + if (flags & LM_FLAG_TRY) + *p++ = 't'; + if (flags & LM_FLAG_TRY_1CB) + *p++ = 'T'; + if (flags & LM_FLAG_NOEXP) + *p++ = 'e'; + if (flags & LM_FLAG_ANY) + *p++ = 'a'; + if (flags & LM_FLAG_PRIORITY) + *p++ = 'p'; + if (flags & GL_ASYNC) + *p++ = 'a'; + if (flags & GL_EXACT) + *p++ = 'E'; + if (flags & GL_NOCACHE) + *p++ = 'c'; + if (test_bit(HIF_HOLDER, &iflags)) + *p++ = 'H'; + if (test_bit(HIF_WAIT, &iflags)) + *p++ = 'W'; + if (test_bit(HIF_FIRST, &iflags)) + *p++ = 'F'; + *p = 0; + return buf; } /** * dump_holder - print information about a glock holder - * @str: a string naming the type of holder + * @seq: the seq_file struct * @gh: the glock holder * * Returns: 0 on success, -ENOBUFS when we run out of space */ -static int dump_holder(struct glock_iter *gi, char *str, - struct gfs2_holder *gh) +static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) { - unsigned int x; - struct task_struct *gh_owner; + struct task_struct *gh_owner = NULL; + char buffer[KSYM_SYMBOL_LEN]; + char flags_buf[32]; - print_dbg(gi, " %s\n", str); - if (gh->gh_owner_pid) { - print_dbg(gi, " owner = %ld ", - (long)pid_nr(gh->gh_owner_pid)); + sprint_symbol(buffer, gh->gh_ip); + if (gh->gh_owner_pid) gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); - if (gh_owner) - print_dbg(gi, "(%s)\n", gh_owner->comm); - else - print_dbg(gi, "(ended)\n"); - } else - print_dbg(gi, " owner = -1\n"); - print_dbg(gi, " gh_state = %u\n", gh->gh_state); - print_dbg(gi, " gh_flags ="); - for (x = 0; x < 32; x++) - if (gh->gh_flags & (1 << x)) - print_dbg(gi, " %u", x); - print_dbg(gi, " \n"); - print_dbg(gi, " error = %d\n", gh->gh_error); - print_dbg(gi, " gh_iflags ="); - for (x = 0; x < 32; x++) - if (test_bit(x, &gh->gh_iflags)) - print_dbg(gi, " %u", x); - print_dbg(gi, " \n"); - gfs2_print_symbol(gi, " initialized at: %s\n", gh->gh_ip); - + gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n", + state2str(gh->gh_state), + hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), + gh->gh_error, + gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, + gh_owner ? gh_owner->comm : "(ended)", buffer); return 0; } -/** - * dump_inode - print information about an inode - * @ip: the inode - * - * Returns: 0 on success, -ENOBUFS when we run out of space - */ - -static int dump_inode(struct glock_iter *gi, struct gfs2_inode *ip) -{ - unsigned int x; - - print_dbg(gi, " Inode:\n"); - print_dbg(gi, " num = %llu/%llu\n", - (unsigned long long)ip->i_no_formal_ino, - (unsigned long long)ip->i_no_addr); - print_dbg(gi, " type = %u\n", IF2DT(ip->i_inode.i_mode)); - print_dbg(gi, " i_flags ="); - for (x = 0; x < 32; x++) - if (test_bit(x, &ip->i_flags)) - print_dbg(gi, " %u", x); - print_dbg(gi, " \n"); - return 0; +static const char *gflags2str(char *buf, const unsigned long *gflags) +{ + char *p = buf; + if (test_bit(GLF_LOCK, gflags)) + *p++ = 'l'; + if (test_bit(GLF_STICKY, gflags)) + *p++ = 's'; + if (test_bit(GLF_DEMOTE, gflags)) + *p++ = 'D'; + if (test_bit(GLF_PENDING_DEMOTE, gflags)) + *p++ = 'd'; + if (test_bit(GLF_DEMOTE_IN_PROGRESS, gflags)) + *p++ = 'p'; + if (test_bit(GLF_DIRTY, gflags)) + *p++ = 'y'; + if (test_bit(GLF_LFLUSH, gflags)) + *p++ = 'f'; + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags)) + *p++ = 'i'; + if (test_bit(GLF_REPLY_PENDING, gflags)) + *p++ = 'r'; + *p = 0; + return buf; } /** - * dump_glock - print information about a glock + * __dump_glock - print information about a glock + * @seq: The seq_file struct * @gl: the glock - * @count: where we are in the buffer + * + * The file format is as follows: + * One line per object, capital letters are used to indicate objects + * G = glock, I = Inode, R = rgrp, H = holder. Glocks are not indented, + * other objects are indented by a single space and follow the glock to + * which they are related. Fields are indicated by lower case letters + * followed by a colon and the field value, except for strings which are in + * [] so that its possible to see if they are composed of spaces for + * example. The field's are n = number (id of the object), f = flags, + * t = type, s = state, r = refcount, e = error, p = pid. * * Returns: 0 on success, -ENOBUFS when we run out of space */ -static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl) +static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) { - struct gfs2_holder *gh; - unsigned int x; - int error = -ENOBUFS; - struct task_struct *gl_owner; + const struct gfs2_glock_operations *glops = gl->gl_ops; + unsigned long long dtime; + const struct gfs2_holder *gh; + char gflags_buf[32]; + int error = 0; - spin_lock(&gl->gl_spin); + dtime = jiffies - gl->gl_demote_time; + dtime *= 1000000/HZ; /* demote time in uSec */ + if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) + dtime = 0; + gfs2_print_dbg(seq, "G: s:%s n:%u/%llu f:%s t:%s d:%s/%llu l:%d a:%d r:%d\n", + state2str(gl->gl_state), + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number, + gflags2str(gflags_buf, &gl->gl_flags), + state2str(gl->gl_target), + state2str(gl->gl_demote_state), dtime, + atomic_read(&gl->gl_lvb_count), + atomic_read(&gl->gl_ail_count), + atomic_read(&gl->gl_ref)); - print_dbg(gi, "Glock 0x%p (%u, 0x%llx)\n", gl, gl->gl_name.ln_type, - (unsigned long long)gl->gl_name.ln_number); - print_dbg(gi, " gl_flags ="); - for (x = 0; x < 32; x++) { - if (test_bit(x, &gl->gl_flags)) - print_dbg(gi, " %u", x); - } - if (!test_bit(GLF_LOCK, &gl->gl_flags)) - print_dbg(gi, " (unlocked)"); - print_dbg(gi, " \n"); - print_dbg(gi, " gl_ref = %d\n", atomic_read(&gl->gl_ref)); - print_dbg(gi, " gl_state = %u\n", gl->gl_state); - if (gl->gl_owner_pid) { - gl_owner = pid_task(gl->gl_owner_pid, PIDTYPE_PID); - if (gl_owner) - print_dbg(gi, " gl_owner = pid %d (%s)\n", - pid_nr(gl->gl_owner_pid), gl_owner->comm); - else - print_dbg(gi, " gl_owner = %d (ended)\n", - pid_nr(gl->gl_owner_pid)); - } else - print_dbg(gi, " gl_owner = -1\n"); - print_dbg(gi, " gl_ip = %lu\n", gl->gl_ip); - print_dbg(gi, " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no"); - print_dbg(gi, " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count)); - print_dbg(gi, " object = %s\n", (gl->gl_object) ? "yes" : "no"); - print_dbg(gi, " reclaim = %s\n", - (list_empty(&gl->gl_reclaim)) ? "no" : "yes"); - if (gl->gl_aspace) - print_dbg(gi, " aspace = 0x%p nrpages = %lu\n", gl->gl_aspace, - gl->gl_aspace->i_mapping->nrpages); - else - print_dbg(gi, " aspace = no\n"); - print_dbg(gi, " ail = %d\n", atomic_read(&gl->gl_ail_count)); - if (gl->gl_req_gh) { - error = dump_holder(gi, "Request", gl->gl_req_gh); - if (error) - goto out; - } list_for_each_entry(gh, &gl->gl_holders, gh_list) { - error = dump_holder(gi, "Holder", gh); - if (error) - goto out; - } - list_for_each_entry(gh, &gl->gl_waiters1, gh_list) { - error = dump_holder(gi, "Waiter1", gh); + error = dump_holder(seq, gh); if (error) goto out; } - list_for_each_entry(gh, &gl->gl_waiters3, gh_list) { - error = dump_holder(gi, "Waiter3", gh); - if (error) - goto out; - } - if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { - print_dbg(gi, " Demotion req to state %u (%llu uS ago)\n", - gl->gl_demote_state, (unsigned long long) - (jiffies - gl->gl_demote_time)*(1000000/HZ)); - } - if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) { - if (!test_bit(GLF_LOCK, &gl->gl_flags) && - list_empty(&gl->gl_holders)) { - error = dump_inode(gi, gl->gl_object); - if (error) - goto out; - } else { - error = -ENOBUFS; - print_dbg(gi, " Inode: busy\n"); - } - } - - error = 0; - + if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump) + error = glops->go_dump(seq, gl); out: - spin_unlock(&gl->gl_spin); return error; } +static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl) +{ + int ret; + spin_lock(&gl->gl_spin); + ret = __dump_glock(seq, gl); + spin_unlock(&gl->gl_spin); + return ret; +} + /** * gfs2_dump_lockstate - print out the current lockstate * @sdp: the filesystem @@ -2086,7 +1806,7 @@ void gfs2_glock_exit(void) module_param(scand_secs, uint, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs"); -static int gfs2_glock_iter_next(struct glock_iter *gi) +static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) { struct gfs2_glock *gl; @@ -2096,15 +1816,17 @@ restart: if (gl) { gi->gl = hlist_entry(gl->gl_list.next, struct gfs2_glock, gl_list); - if (gi->gl) - gfs2_glock_hold(gi->gl); + } else { + gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first, + struct gfs2_glock, gl_list); } + if (gi->gl) + gfs2_glock_hold(gi->gl); read_unlock(gl_lock_addr(gi->hash)); if (gl) gfs2_glock_put(gl); - if (gl && gi->gl == NULL) + while (gi->gl == NULL) { gi->hash++; - while(gi->gl == NULL) { if (gi->hash >= GFS2_GL_HASH_SIZE) return 1; read_lock(gl_lock_addr(gi->hash)); @@ -2113,7 +1835,6 @@ restart: if (gi->gl) gfs2_glock_hold(gi->gl); read_unlock(gl_lock_addr(gi->hash)); - gi->hash++; } if (gi->sdp != gi->gl->gl_sbd) @@ -2122,58 +1843,34 @@ restart: return 0; } -static void gfs2_glock_iter_free(struct glock_iter *gi) +static void gfs2_glock_iter_free(struct gfs2_glock_iter *gi) { if (gi->gl) gfs2_glock_put(gi->gl); - kfree(gi); -} - -static struct glock_iter *gfs2_glock_iter_init(struct gfs2_sbd *sdp) -{ - struct glock_iter *gi; - - gi = kmalloc(sizeof (*gi), GFP_KERNEL); - if (!gi) - return NULL; - - gi->sdp = sdp; - gi->hash = 0; - gi->seq = NULL; gi->gl = NULL; - memset(gi->string, 0, sizeof(gi->string)); - - if (gfs2_glock_iter_next(gi)) { - gfs2_glock_iter_free(gi); - return NULL; - } - - return gi; } -static void *gfs2_glock_seq_start(struct seq_file *file, loff_t *pos) +static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) { - struct glock_iter *gi; + struct gfs2_glock_iter *gi = seq->private; loff_t n = *pos; - gi = gfs2_glock_iter_init(file->private); - if (!gi) - return NULL; + gi->hash = 0; - while(n--) { + do { if (gfs2_glock_iter_next(gi)) { gfs2_glock_iter_free(gi); return NULL; } - } + } while (n--); - return gi; + return gi->gl; } -static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr, +static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) { - struct glock_iter *gi = iter_ptr; + struct gfs2_glock_iter *gi = seq->private; (*pos)++; @@ -2182,24 +1879,18 @@ static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr, return NULL; } - return gi; + return gi->gl; } -static void gfs2_glock_seq_stop(struct seq_file *file, void *iter_ptr) +static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) { - struct glock_iter *gi = iter_ptr; - if (gi) - gfs2_glock_iter_free(gi); + struct gfs2_glock_iter *gi = seq->private; + gfs2_glock_iter_free(gi); } -static int gfs2_glock_seq_show(struct seq_file *file, void *iter_ptr) +static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) { - struct glock_iter *gi = iter_ptr; - - gi->seq = file; - dump_glock(gi, gi->gl); - - return 0; + return dump_glock(seq, iter_ptr); } static const struct seq_operations gfs2_glock_seq_ops = { @@ -2211,17 +1902,14 @@ static const struct seq_operations gfs2_glock_seq_ops = { static int gfs2_debugfs_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int ret; - - ret = seq_open(file, &gfs2_glock_seq_ops); - if (ret) - return ret; - - seq = file->private_data; - seq->private = inode->i_private; - - return 0; + int ret = seq_open_private(file, &gfs2_glock_seq_ops, + sizeof(struct gfs2_glock_iter)); + if (ret == 0) { + struct seq_file *seq = file->private_data; + struct gfs2_glock_iter *gi = seq->private; + gi->sdp = inode->i_private; + } + return ret; } static const struct file_operations gfs2_debug_fops = { @@ -2229,7 +1917,7 @@ static const struct file_operations gfs2_debug_fops = { .open = gfs2_debugfs_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release + .release = seq_release_private, }; int gfs2_create_debugfs_file(struct gfs2_sbd *sdp) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index cdad3e6f8150..695c6b193611 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -24,13 +24,9 @@ #define GL_ASYNC 0x00000040 #define GL_EXACT 0x00000080 #define GL_SKIP 0x00000100 -#define GL_ATIME 0x00000200 #define GL_NOCACHE 0x00000400 -#define GL_FLOCK 0x00000800 -#define GL_NOCANCEL 0x00001000 #define GLR_TRYFAILED 13 -#define GLR_CANCELED 14 static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) { @@ -41,6 +37,8 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock * spin_lock(&gl->gl_spin); pid = task_pid(current); list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) + break; if (gh->gh_owner_pid == pid) goto out; } @@ -70,7 +68,7 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl) { int ret; spin_lock(&gl->gl_spin); - ret = test_bit(GLF_DEMOTE, &gl->gl_flags) || !list_empty(&gl->gl_waiters3); + ret = test_bit(GLF_DEMOTE, &gl->gl_flags); spin_unlock(&gl->gl_spin); return ret; } @@ -98,6 +96,7 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp, int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); +void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); /** * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock @@ -130,10 +129,9 @@ int gfs2_lvb_hold(struct gfs2_glock *gl); void gfs2_lvb_unhold(struct gfs2_glock *gl); void gfs2_glock_cb(void *cb_data, unsigned int type, void *data); - void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl); void gfs2_reclaim_glock(struct gfs2_sbd *sdp); -void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait); +void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); int __init gfs2_glock_init(void); void gfs2_glock_exit(void); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 07d84d16cda4..c6c318c2a0f6 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -13,6 +13,7 @@ #include <linux/buffer_head.h> #include <linux/gfs2_ondisk.h> #include <linux/lm_interface.h> +#include <linux/bio.h> #include "gfs2.h" #include "incore.h" @@ -172,26 +173,6 @@ static void inode_go_sync(struct gfs2_glock *gl) } /** - * inode_go_xmote_bh - After promoting/demoting a glock - * @gl: the glock - * - */ - -static void inode_go_xmote_bh(struct gfs2_glock *gl) -{ - struct gfs2_holder *gh = gl->gl_req_gh; - struct buffer_head *bh; - int error; - - if (gl->gl_state != LM_ST_UNLOCKED && - (!gh || !(gh->gh_flags & GL_SKIP))) { - error = gfs2_meta_read(gl, gl->gl_name.ln_number, 0, &bh); - if (!error) - brelse(bh); - } -} - -/** * inode_go_inval - prepare a inode glock to be released * @gl: the glock * @flags: @@ -267,6 +248,26 @@ static int inode_go_lock(struct gfs2_holder *gh) } /** + * inode_go_dump - print information about an inode + * @seq: The iterator + * @ip: the inode + * + * Returns: 0 on success, -ENOBUFS when we run out of space + */ + +static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) +{ + const struct gfs2_inode *ip = gl->gl_object; + if (ip == NULL) + return 0; + gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%08lx\n", + (unsigned long long)ip->i_no_formal_ino, + (unsigned long long)ip->i_no_addr, + IF2DT(ip->i_inode.i_mode), ip->i_flags); + return 0; +} + +/** * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock * @gl: the glock * @@ -306,6 +307,22 @@ static void rgrp_go_unlock(struct gfs2_holder *gh) } /** + * rgrp_go_dump - print out an rgrp + * @seq: The iterator + * @gl: The glock in question + * + */ + +static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) +{ + const struct gfs2_rgrpd *rgd = gl->gl_object; + if (rgd == NULL) + return 0; + gfs2_print_dbg(seq, " R: n:%llu\n", (unsigned long long)rgd->rd_addr); + return 0; +} + +/** * trans_go_sync - promote/demote the transaction glock * @gl: the glock * @state: the requested state @@ -330,7 +347,7 @@ static void trans_go_sync(struct gfs2_glock *gl) * */ -static void trans_go_xmote_bh(struct gfs2_glock *gl) +static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh) { struct gfs2_sbd *sdp = gl->gl_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); @@ -338,8 +355,7 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl) struct gfs2_log_header_host head; int error; - if (gl->gl_state != LM_ST_UNLOCKED && - test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { + if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); error = gfs2_find_jhead(sdp->sd_jdesc, &head); @@ -354,6 +370,7 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl) gfs2_log_pointers_init(sdp, head.lh_blkno); } } + return 0; } /** @@ -375,12 +392,12 @@ const struct gfs2_glock_operations gfs2_meta_glops = { const struct gfs2_glock_operations gfs2_inode_glops = { .go_xmote_th = inode_go_sync, - .go_xmote_bh = inode_go_xmote_bh, .go_inval = inode_go_inval, .go_demote_ok = inode_go_demote_ok, .go_lock = inode_go_lock, + .go_dump = inode_go_dump, .go_type = LM_TYPE_INODE, - .go_min_hold_time = HZ / 10, + .go_min_hold_time = HZ / 5, }; const struct gfs2_glock_operations gfs2_rgrp_glops = { @@ -389,8 +406,9 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_demote_ok = rgrp_go_demote_ok, .go_lock = rgrp_go_lock, .go_unlock = rgrp_go_unlock, + .go_dump = rgrp_go_dump, .go_type = LM_TYPE_RGRP, - .go_min_hold_time = HZ / 10, + .go_min_hold_time = HZ / 5, }; const struct gfs2_glock_operations gfs2_trans_glops = { diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index eabe5eac41da..f566ec1b4e8e 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -77,7 +77,6 @@ struct gfs2_rgrp_host { struct gfs2_rgrpd { struct list_head rd_list; /* Link with superblock */ struct list_head rd_list_mru; - struct list_head rd_recent; /* Recently used rgrps */ struct gfs2_glock *rd_gl; /* Glock for this rgrp */ u64 rd_addr; /* grp block disk address */ u64 rd_data0; /* first data location */ @@ -128,20 +127,20 @@ struct gfs2_bufdata { struct gfs2_glock_operations { void (*go_xmote_th) (struct gfs2_glock *gl); - void (*go_xmote_bh) (struct gfs2_glock *gl); + int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); void (*go_inval) (struct gfs2_glock *gl, int flags); int (*go_demote_ok) (struct gfs2_glock *gl); int (*go_lock) (struct gfs2_holder *gh); void (*go_unlock) (struct gfs2_holder *gh); + int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); const int go_type; const unsigned long go_min_hold_time; }; enum { /* States */ - HIF_HOLDER = 6, + HIF_HOLDER = 6, /* Set for gh that "holds" the glock */ HIF_FIRST = 7, - HIF_ABORTED = 9, HIF_WAIT = 10, }; @@ -154,20 +153,20 @@ struct gfs2_holder { unsigned gh_flags; int gh_error; - unsigned long gh_iflags; + unsigned long gh_iflags; /* HIF_... */ unsigned long gh_ip; }; enum { - GLF_LOCK = 1, - GLF_STICKY = 2, - GLF_DEMOTE = 3, - GLF_PENDING_DEMOTE = 4, - GLF_DIRTY = 5, - GLF_DEMOTE_IN_PROGRESS = 6, - GLF_LFLUSH = 7, - GLF_WAITERS2 = 8, - GLF_CONV_DEADLK = 9, + GLF_LOCK = 1, + GLF_STICKY = 2, + GLF_DEMOTE = 3, + GLF_PENDING_DEMOTE = 4, + GLF_DEMOTE_IN_PROGRESS = 5, + GLF_DIRTY = 6, + GLF_LFLUSH = 7, + GLF_INVALIDATE_IN_PROGRESS = 8, + GLF_REPLY_PENDING = 9, }; struct gfs2_glock { @@ -179,19 +178,14 @@ struct gfs2_glock { spinlock_t gl_spin; unsigned int gl_state; + unsigned int gl_target; + unsigned int gl_reply; unsigned int gl_hash; unsigned int gl_demote_state; /* state requested by remote node */ unsigned long gl_demote_time; /* time of first demote request */ - struct pid *gl_owner_pid; - unsigned long gl_ip; struct list_head gl_holders; - struct list_head gl_waiters1; /* HIF_MUTEX */ - struct list_head gl_waiters3; /* HIF_PROMOTE */ const struct gfs2_glock_operations *gl_ops; - - struct gfs2_holder *gl_req_gh; - void *gl_lock; char *gl_lvb; atomic_t gl_lvb_count; @@ -392,20 +386,21 @@ struct gfs2_statfs_change_host { #define GFS2_DATA_ORDERED 2 struct gfs2_args { - char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ - char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ - char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ - int ar_spectator; /* Don't get a journal because we're always RO */ - int ar_ignore_local_fs; /* Don't optimize even if local_fs is 1 */ - int ar_localflocks; /* Let the VFS do flock|fcntl locks for us */ - int ar_localcaching; /* Local-style caching (dangerous on multihost) */ - int ar_debug; /* Oops on errors instead of trying to be graceful */ - int ar_upgrade; /* Upgrade ondisk/multihost format */ - unsigned int ar_num_glockd; /* Number of glockd threads */ - int ar_posix_acl; /* Enable posix acls */ - int ar_quota; /* off/account/on */ - int ar_suiddir; /* suiddir support */ - int ar_data; /* ordered/writeback */ + char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ + char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ + char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ + unsigned int ar_spectator:1; /* Don't get a journal */ + unsigned int ar_ignore_local_fs:1; /* Ignore optimisations */ + unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */ + unsigned int ar_localcaching:1; /* Local caching */ + unsigned int ar_debug:1; /* Oops on errors */ + unsigned int ar_upgrade:1; /* Upgrade ondisk format */ + unsigned int ar_posix_acl:1; /* Enable posix acls */ + unsigned int ar_quota:2; /* off/account/on */ + unsigned int ar_suiddir:1; /* suiddir support */ + unsigned int ar_data:2; /* ordered/writeback */ + unsigned int ar_meta:1; /* mount metafs */ + unsigned int ar_num_glockd; /* Number of glockd threads */ }; struct gfs2_tune { @@ -425,9 +420,7 @@ struct gfs2_tune { unsigned int gt_quota_scale_den; /* Denominator */ unsigned int gt_quota_cache_secs; unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ - unsigned int gt_atime_quantum; /* Min secs between atime updates */ unsigned int gt_new_files_jdata; - unsigned int gt_new_files_directio; unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ unsigned int gt_stall_secs; /* Detects trouble! */ unsigned int gt_complain_secs; @@ -439,7 +432,7 @@ enum { SDF_JOURNAL_CHECKED = 0, SDF_JOURNAL_LIVE = 1, SDF_SHUTDOWN = 2, - SDF_NOATIME = 3, + SDF_NOBARRIERS = 3, }; #define GFS2_FSNAME_LEN 256 @@ -468,7 +461,6 @@ struct gfs2_sb_host { struct gfs2_sbd { struct super_block *sd_vfs; - struct super_block *sd_vfs_meta; struct kobject sd_kobj; unsigned long sd_flags; /* SDF_... */ struct gfs2_sb_host sd_sb; @@ -506,7 +498,9 @@ struct gfs2_sbd { /* Inode Stuff */ - struct inode *sd_master_dir; + struct dentry *sd_master_dir; + struct dentry *sd_root_dir; + struct inode *sd_jindex; struct inode *sd_inum_inode; struct inode *sd_statfs_inode; @@ -534,7 +528,6 @@ struct gfs2_sbd { struct mutex sd_rindex_mutex; struct list_head sd_rindex_list; struct list_head sd_rindex_mru_list; - struct list_head sd_rindex_recent_list; struct gfs2_rgrpd *sd_rindex_forward; unsigned int sd_rgrps; @@ -642,7 +635,6 @@ struct gfs2_sbd { /* Debugging crud */ unsigned long sd_last_warning; - struct vfsmount *sd_gfs2mnt; struct dentry *debugfs_dir; /* debugfs directory */ struct dentry *debugfs_dentry_glocks; /* for debugfs */ }; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 09453d057e41..7cee695fa441 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -18,6 +18,7 @@ #include <linux/crc32.h> #include <linux/lm_interface.h> #include <linux/security.h> +#include <linux/time.h> #include "gfs2.h" #include "incore.h" @@ -249,6 +250,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) { struct gfs2_dinode_host *di = &ip->i_di; const struct gfs2_dinode *str = buf; + struct timespec atime; u16 height, depth; if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) @@ -275,8 +277,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) di->di_size = be64_to_cpu(str->di_size); i_size_write(&ip->i_inode, di->di_size); gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); - ip->i_inode.i_atime.tv_sec = be64_to_cpu(str->di_atime); - ip->i_inode.i_atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); + atime.tv_sec = be64_to_cpu(str->di_atime); + atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); + if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0) + ip->i_inode.i_atime = atime; ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime); ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec); ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime); @@ -448,7 +452,7 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name) struct qstr qstr; struct inode *inode; gfs2_str2qstr(&qstr, name); - inode = gfs2_lookupi(dip, &qstr, 1, NULL); + inode = gfs2_lookupi(dip, &qstr, 1); /* gfs2_lookupi has inconsistent callers: vfs * related routines expect NULL for no entry found, * gfs2_lookup_simple callers expect ENOENT @@ -477,7 +481,7 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name) */ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, - int is_root, struct nameidata *nd) + int is_root) { struct super_block *sb = dir->i_sb; struct gfs2_inode *dip = GFS2_I(dir); @@ -504,7 +508,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, } if (!is_root) { - error = permission(dir, MAY_EXEC, NULL); + error = gfs2_permission(dir, MAY_EXEC); if (error) goto out; } @@ -667,7 +671,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, { int error; - error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL); + error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); if (error) return error; @@ -789,13 +793,8 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) || gfs2_tune_get(sdp, gt_new_files_jdata)) di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); - if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) || - gfs2_tune_get(sdp, gt_new_files_directio)) - di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO); } else if (S_ISDIR(mode)) { di->di_flags |= cpu_to_be32(dip->i_di.di_flags & - GFS2_DIF_INHERIT_DIRECTIO); - di->di_flags |= cpu_to_be32(dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA); } @@ -1038,13 +1037,11 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, if (bh) brelse(bh); - if (!inode) - return ERR_PTR(-ENOMEM); return inode; fail_gunlock2: gfs2_glock_dq_uninit(ghs + 1); - if (inode) + if (inode && !IS_ERR(inode)) iput(inode); fail_gunlock: gfs2_glock_dq(ghs); @@ -1134,7 +1131,7 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, if (IS_APPEND(&dip->i_inode)) return -EPERM; - error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL); + error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); if (error) return error; @@ -1145,54 +1142,6 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, return 0; } -/* - * gfs2_ok_to_move - check if it's ok to move a directory to another directory - * @this: move this - * @to: to here - * - * Follow @to back to the root and make sure we don't encounter @this - * Assumes we already hold the rename lock. - * - * Returns: errno - */ - -int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) -{ - struct inode *dir = &to->i_inode; - struct super_block *sb = dir->i_sb; - struct inode *tmp; - struct qstr dotdot; - int error = 0; - - gfs2_str2qstr(&dotdot, ".."); - - igrab(dir); - - for (;;) { - if (dir == &this->i_inode) { - error = -EINVAL; - break; - } - if (dir == sb->s_root->d_inode) { - error = 0; - break; - } - - tmp = gfs2_lookupi(dir, &dotdot, 1, NULL); - if (IS_ERR(tmp)) { - error = PTR_ERR(tmp); - break; - } - - iput(dir); - dir = tmp; - } - - iput(dir); - - return error; -} - /** * gfs2_readlinki - return the contents of a symlink * @ip: the symlink's inode @@ -1212,8 +1161,8 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) unsigned int x; int error; - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh); - error = gfs2_glock_nq_atime(&i_gh); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); + error = gfs2_glock_nq(&i_gh); if (error) { gfs2_holder_uninit(&i_gh); return error; @@ -1248,101 +1197,6 @@ out: return error; } -/** - * gfs2_glock_nq_atime - Acquire a hold on an inode's glock, and - * conditionally update the inode's atime - * @gh: the holder to acquire - * - * Tests atime (access time) for gfs2_read, gfs2_readdir and gfs2_mmap - * Update if the difference between the current time and the inode's current - * atime is greater than an interval specified at mount. - * - * Returns: errno - */ - -int gfs2_glock_nq_atime(struct gfs2_holder *gh) -{ - struct gfs2_glock *gl = gh->gh_gl; - struct gfs2_sbd *sdp = gl->gl_sbd; - struct gfs2_inode *ip = gl->gl_object; - s64 quantum = gfs2_tune_get(sdp, gt_atime_quantum); - unsigned int state; - int flags; - int error; - struct timespec tv = CURRENT_TIME; - - if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) || - gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) || - gfs2_assert_warn(sdp, gl->gl_ops == &gfs2_inode_glops)) - return -EINVAL; - - state = gh->gh_state; - flags = gh->gh_flags; - - error = gfs2_glock_nq(gh); - if (error) - return error; - - if (test_bit(SDF_NOATIME, &sdp->sd_flags) || - (sdp->sd_vfs->s_flags & MS_RDONLY)) - return 0; - - if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) { - gfs2_glock_dq(gh); - gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY, - gh); - error = gfs2_glock_nq(gh); - if (error) - return error; - - /* Verify that atime hasn't been updated while we were - trying to get exclusive lock. */ - - tv = CURRENT_TIME; - if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) { - struct buffer_head *dibh; - struct gfs2_dinode *di; - - error = gfs2_trans_begin(sdp, RES_DINODE, 0); - if (error == -EROFS) - return 0; - if (error) - goto fail; - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto fail_end_trans; - - ip->i_inode.i_atime = tv; - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - di = (struct gfs2_dinode *)dibh->b_data; - di->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); - di->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec); - brelse(dibh); - - gfs2_trans_end(sdp); - } - - /* If someone else has asked for the glock, - unlock and let them have it. Then reacquire - in the original state. */ - if (gfs2_glock_is_blocking(gl)) { - gfs2_glock_dq(gh); - gfs2_holder_reinit(state, flags, gh); - return gfs2_glock_nq(gh); - } - } - - return 0; - -fail_end_trans: - gfs2_trans_end(sdp); -fail: - gfs2_glock_dq(gh); - return error; -} - static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) { diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 580da454b38f..2d43f69610a0 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -72,7 +72,6 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip, } -void gfs2_inode_attr_in(struct gfs2_inode *ip); void gfs2_set_iop(struct inode *inode); struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, u64 no_addr, u64 no_formal_ino, @@ -84,16 +83,15 @@ int gfs2_inode_refresh(struct gfs2_inode *ip); int gfs2_dinode_dealloc(struct gfs2_inode *inode); int gfs2_change_nlink(struct gfs2_inode *ip, int diff); struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, - int is_root, struct nameidata *nd); + int is_root); struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, unsigned int mode, dev_t dev); int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, struct gfs2_inode *ip); int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, const struct gfs2_inode *ip); -int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to); +int gfs2_permission(struct inode *inode, int mask); int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); -int gfs2_glock_nq_atime(struct gfs2_holder *gh); int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c index 663fee728783..523243a13a21 100644 --- a/fs/gfs2/locking.c +++ b/fs/gfs2/locking.c @@ -23,12 +23,54 @@ struct lmh_wrapper { const struct lm_lockops *lw_ops; }; +static int nolock_mount(char *table_name, char *host_data, + lm_callback_t cb, void *cb_data, + unsigned int min_lvb_size, int flags, + struct lm_lockstruct *lockstruct, + struct kobject *fskobj); + /* List of registered low-level locking protocols. A file system selects one of them by name at mount time, e.g. lock_nolock, lock_dlm. */ +static const struct lm_lockops nolock_ops = { + .lm_proto_name = "lock_nolock", + .lm_mount = nolock_mount, +}; + +static struct lmh_wrapper nolock_proto = { + .lw_list = LIST_HEAD_INIT(nolock_proto.lw_list), + .lw_ops = &nolock_ops, +}; + static LIST_HEAD(lmh_list); static DEFINE_MUTEX(lmh_lock); +static int nolock_mount(char *table_name, char *host_data, + lm_callback_t cb, void *cb_data, + unsigned int min_lvb_size, int flags, + struct lm_lockstruct *lockstruct, + struct kobject *fskobj) +{ + char *c; + unsigned int jid; + + c = strstr(host_data, "jid="); + if (!c) + jid = 0; + else { + c += 4; + sscanf(c, "%u", &jid); + } + + lockstruct->ls_jid = jid; + lockstruct->ls_first = 1; + lockstruct->ls_lvb_size = min_lvb_size; + lockstruct->ls_ops = &nolock_ops; + lockstruct->ls_flags = LM_LSFLAG_LOCAL; + + return 0; +} + /** * gfs2_register_lockproto - Register a low-level locking protocol * @proto: the protocol definition @@ -116,9 +158,13 @@ int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data, int try = 0; int error, found; + retry: mutex_lock(&lmh_lock); + if (list_empty(&nolock_proto.lw_list)) + list_add(&nolock_proto.lw_list, &lmh_list); + found = 0; list_for_each_entry(lw, &lmh_list, lw_list) { if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) { @@ -139,7 +185,8 @@ retry: goto out; } - if (!try_module_get(lw->lw_ops->lm_owner)) { + if (lw->lw_ops->lm_owner && + !try_module_get(lw->lw_ops->lm_owner)) { try = 0; mutex_unlock(&lmh_lock); msleep(1000); @@ -158,7 +205,8 @@ out: void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct) { mutex_lock(&lmh_lock); - lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace); + if (lockstruct->ls_ops->lm_unmount) + lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace); if (lockstruct->ls_ops->lm_owner) module_put(lockstruct->ls_ops->lm_owner); mutex_unlock(&lmh_lock); diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c index cf7ea8abec87..2482c9047505 100644 --- a/fs/gfs2/locking/dlm/lock.c +++ b/fs/gfs2/locking/dlm/lock.c @@ -11,46 +11,60 @@ static char junk_lvb[GDLM_LVB_SIZE]; -static void queue_complete(struct gdlm_lock *lp) + +/* convert dlm lock-mode to gfs lock-state */ + +static s16 gdlm_make_lmstate(s16 dlmmode) { - struct gdlm_ls *ls = lp->ls; + switch (dlmmode) { + case DLM_LOCK_IV: + case DLM_LOCK_NL: + return LM_ST_UNLOCKED; + case DLM_LOCK_EX: + return LM_ST_EXCLUSIVE; + case DLM_LOCK_CW: + return LM_ST_DEFERRED; + case DLM_LOCK_PR: + return LM_ST_SHARED; + } + gdlm_assert(0, "unknown DLM mode %d", dlmmode); + return -1; +} - clear_bit(LFL_ACTIVE, &lp->flags); +/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm + thread gets to it. */ + +static void queue_submit(struct gdlm_lock *lp) +{ + struct gdlm_ls *ls = lp->ls; spin_lock(&ls->async_lock); - list_add_tail(&lp->clist, &ls->complete); + list_add_tail(&lp->delay_list, &ls->submit); spin_unlock(&ls->async_lock); wake_up(&ls->thread_wait); } -static inline void gdlm_ast(void *astarg) +static void wake_up_ast(struct gdlm_lock *lp) { - queue_complete(astarg); + clear_bit(LFL_AST_WAIT, &lp->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&lp->flags, LFL_AST_WAIT); } -static inline void gdlm_bast(void *astarg, int mode) +static void gdlm_delete_lp(struct gdlm_lock *lp) { - struct gdlm_lock *lp = astarg; struct gdlm_ls *ls = lp->ls; - if (!mode) { - printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - return; - } - spin_lock(&ls->async_lock); - if (!lp->bast_mode) { - list_add_tail(&lp->blist, &ls->blocking); - lp->bast_mode = mode; - } else if (lp->bast_mode < mode) - lp->bast_mode = mode; + if (!list_empty(&lp->delay_list)) + list_del_init(&lp->delay_list); + ls->all_locks_count--; spin_unlock(&ls->async_lock); - wake_up(&ls->thread_wait); + + kfree(lp); } -void gdlm_queue_delayed(struct gdlm_lock *lp) +static void gdlm_queue_delayed(struct gdlm_lock *lp) { struct gdlm_ls *ls = lp->ls; @@ -59,6 +73,236 @@ void gdlm_queue_delayed(struct gdlm_lock *lp) spin_unlock(&ls->async_lock); } +static void process_complete(struct gdlm_lock *lp) +{ + struct gdlm_ls *ls = lp->ls; + struct lm_async_cb acb; + + memset(&acb, 0, sizeof(acb)); + + if (lp->lksb.sb_status == -DLM_ECANCEL) { + log_info("complete dlm cancel %x,%llx flags %lx", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, + lp->flags); + + lp->req = lp->cur; + acb.lc_ret |= LM_OUT_CANCELED; + if (lp->cur == DLM_LOCK_IV) + lp->lksb.sb_lkid = 0; + goto out; + } + + if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) { + if (lp->lksb.sb_status != -DLM_EUNLOCK) { + log_info("unlock sb_status %d %x,%llx flags %lx", + lp->lksb.sb_status, lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, + lp->flags); + return; + } + + lp->cur = DLM_LOCK_IV; + lp->req = DLM_LOCK_IV; + lp->lksb.sb_lkid = 0; + + if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) { + gdlm_delete_lp(lp); + return; + } + goto out; + } + + if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID) + memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE); + + if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) { + if (lp->req == DLM_LOCK_PR) + lp->req = DLM_LOCK_CW; + else if (lp->req == DLM_LOCK_CW) + lp->req = DLM_LOCK_PR; + } + + /* + * A canceled lock request. The lock was just taken off the delayed + * list and was never even submitted to dlm. + */ + + if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) { + log_info("complete internal cancel %x,%llx", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number); + lp->req = lp->cur; + acb.lc_ret |= LM_OUT_CANCELED; + goto out; + } + + /* + * An error occured. + */ + + if (lp->lksb.sb_status) { + /* a "normal" error */ + if ((lp->lksb.sb_status == -EAGAIN) && + (lp->lkf & DLM_LKF_NOQUEUE)) { + lp->req = lp->cur; + if (lp->cur == DLM_LOCK_IV) + lp->lksb.sb_lkid = 0; + goto out; + } + + /* this could only happen with cancels I think */ + log_info("ast sb_status %d %x,%llx flags %lx", + lp->lksb.sb_status, lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, + lp->flags); + return; + } + + /* + * This is an AST for an EX->EX conversion for sync_lvb from GFS. + */ + + if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) { + wake_up_ast(lp); + return; + } + + /* + * A lock has been demoted to NL because it initially completed during + * BLOCK_LOCKS. Now it must be requested in the originally requested + * mode. + */ + + if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) { + gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number); + gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number); + + lp->cur = DLM_LOCK_NL; + lp->req = lp->prev_req; + lp->prev_req = DLM_LOCK_IV; + lp->lkf &= ~DLM_LKF_CONVDEADLK; + + set_bit(LFL_NOCACHE, &lp->flags); + + if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && + !test_bit(LFL_NOBLOCK, &lp->flags)) + gdlm_queue_delayed(lp); + else + queue_submit(lp); + return; + } + + /* + * A request is granted during dlm recovery. It may be granted + * because the locks of a failed node were cleared. In that case, + * there may be inconsistent data beneath this lock and we must wait + * for recovery to complete to use it. When gfs recovery is done this + * granted lock will be converted to NL and then reacquired in this + * granted state. + */ + + if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && + !test_bit(LFL_NOBLOCK, &lp->flags) && + lp->req != DLM_LOCK_NL) { + + lp->cur = lp->req; + lp->prev_req = lp->req; + lp->req = DLM_LOCK_NL; + lp->lkf |= DLM_LKF_CONVERT; + lp->lkf &= ~DLM_LKF_CONVDEADLK; + + log_debug("rereq %x,%llx id %x %d,%d", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, + lp->lksb.sb_lkid, lp->cur, lp->req); + + set_bit(LFL_REREQUEST, &lp->flags); + queue_submit(lp); + return; + } + + /* + * DLM demoted the lock to NL before it was granted so GFS must be + * told it cannot cache data for this lock. + */ + + if (lp->lksb.sb_flags & DLM_SBF_DEMOTED) + set_bit(LFL_NOCACHE, &lp->flags); + +out: + /* + * This is an internal lock_dlm lock + */ + + if (test_bit(LFL_INLOCK, &lp->flags)) { + clear_bit(LFL_NOBLOCK, &lp->flags); + lp->cur = lp->req; + wake_up_ast(lp); + return; + } + + /* + * Normal completion of a lock request. Tell GFS it now has the lock. + */ + + clear_bit(LFL_NOBLOCK, &lp->flags); + lp->cur = lp->req; + + acb.lc_name = lp->lockname; + acb.lc_ret |= gdlm_make_lmstate(lp->cur); + + ls->fscb(ls->sdp, LM_CB_ASYNC, &acb); +} + +static void gdlm_ast(void *astarg) +{ + struct gdlm_lock *lp = astarg; + clear_bit(LFL_ACTIVE, &lp->flags); + process_complete(lp); +} + +static void process_blocking(struct gdlm_lock *lp, int bast_mode) +{ + struct gdlm_ls *ls = lp->ls; + unsigned int cb = 0; + + switch (gdlm_make_lmstate(bast_mode)) { + case LM_ST_EXCLUSIVE: + cb = LM_CB_NEED_E; + break; + case LM_ST_DEFERRED: + cb = LM_CB_NEED_D; + break; + case LM_ST_SHARED: + cb = LM_CB_NEED_S; + break; + default: + gdlm_assert(0, "unknown bast mode %u", bast_mode); + } + + ls->fscb(ls->sdp, cb, &lp->lockname); +} + + +static void gdlm_bast(void *astarg, int mode) +{ + struct gdlm_lock *lp = astarg; + + if (!mode) { + printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number); + return; + } + + process_blocking(lp, mode); +} + /* convert gfs lock-state to dlm lock-mode */ static s16 make_mode(s16 lmstate) @@ -77,24 +321,6 @@ static s16 make_mode(s16 lmstate) return -1; } -/* convert dlm lock-mode to gfs lock-state */ - -s16 gdlm_make_lmstate(s16 dlmmode) -{ - switch (dlmmode) { - case DLM_LOCK_IV: - case DLM_LOCK_NL: - return LM_ST_UNLOCKED; - case DLM_LOCK_EX: - return LM_ST_EXCLUSIVE; - case DLM_LOCK_CW: - return LM_ST_DEFERRED; - case DLM_LOCK_PR: - return LM_ST_SHARED; - } - gdlm_assert(0, "unknown DLM mode %d", dlmmode); - return -1; -} /* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */ @@ -134,14 +360,6 @@ static inline unsigned int make_flags(struct gdlm_lock *lp, if (lp->lksb.sb_lkid != 0) { lkf |= DLM_LKF_CONVERT; - - /* Conversion deadlock avoidance by DLM */ - - if (!(lp->ls->fsflags & LM_MFLAG_CONV_NODROP) && - !test_bit(LFL_FORCE_PROMOTE, &lp->flags) && - !(lkf & DLM_LKF_NOQUEUE) && - cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req) - lkf |= DLM_LKF_CONVDEADLK; } if (lp->lvb) @@ -173,14 +391,9 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name, make_strname(name, &lp->strname); lp->ls = ls; lp->cur = DLM_LOCK_IV; - lp->lvb = NULL; - lp->hold_null = NULL; - INIT_LIST_HEAD(&lp->clist); - INIT_LIST_HEAD(&lp->blist); INIT_LIST_HEAD(&lp->delay_list); spin_lock(&ls->async_lock); - list_add(&lp->all_list, &ls->all_locks); ls->all_locks_count++; spin_unlock(&ls->async_lock); @@ -188,26 +401,6 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name, return 0; } -void gdlm_delete_lp(struct gdlm_lock *lp) -{ - struct gdlm_ls *ls = lp->ls; - - spin_lock(&ls->async_lock); - if (!list_empty(&lp->clist)) - list_del_init(&lp->clist); - if (!list_empty(&lp->blist)) - list_del_init(&lp->blist); - if (!list_empty(&lp->delay_list)) - list_del_init(&lp->delay_list); - gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - list_del_init(&lp->all_list); - ls->all_locks_count--; - spin_unlock(&ls->async_lock); - - kfree(lp); -} - int gdlm_get_lock(void *lockspace, struct lm_lockname *name, void **lockp) { @@ -261,7 +454,7 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp) if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) { lp->lksb.sb_status = -EAGAIN; - queue_complete(lp); + gdlm_ast(lp); error = 0; } @@ -308,6 +501,12 @@ unsigned int gdlm_lock(void *lock, unsigned int cur_state, { struct gdlm_lock *lp = lock; + if (req_state == LM_ST_UNLOCKED) + return gdlm_unlock(lock, cur_state); + + if (req_state == LM_ST_UNLOCKED) + return gdlm_unlock(lock, cur_state); + clear_bit(LFL_DLM_CANCEL, &lp->flags); if (flags & LM_FLAG_NOEXP) set_bit(LFL_NOBLOCK, &lp->flags); @@ -351,7 +550,7 @@ void gdlm_cancel(void *lock) if (delay_list) { set_bit(LFL_CANCEL, &lp->flags); set_bit(LFL_ACTIVE, &lp->flags); - queue_complete(lp); + gdlm_ast(lp); return; } @@ -507,22 +706,3 @@ void gdlm_submit_delayed(struct gdlm_ls *ls) wake_up(&ls->thread_wait); } -int gdlm_release_all_locks(struct gdlm_ls *ls) -{ - struct gdlm_lock *lp, *safe; - int count = 0; - - spin_lock(&ls->async_lock); - list_for_each_entry_safe(lp, safe, &ls->all_locks, all_list) { - list_del_init(&lp->all_list); - - if (lp->lvb && lp->lvb != junk_lvb) - kfree(lp->lvb); - kfree(lp); - count++; - } - spin_unlock(&ls->async_lock); - - return count; -} - diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h index a243cf69c54e..3c98e7c6f93b 100644 --- a/fs/gfs2/locking/dlm/lock_dlm.h +++ b/fs/gfs2/locking/dlm/lock_dlm.h @@ -72,19 +72,12 @@ struct gdlm_ls { int recover_jid_done; int recover_jid_status; spinlock_t async_lock; - struct list_head complete; - struct list_head blocking; struct list_head delayed; struct list_head submit; - struct list_head all_locks; u32 all_locks_count; wait_queue_head_t wait_control; - struct task_struct *thread1; - struct task_struct *thread2; + struct task_struct *thread; wait_queue_head_t thread_wait; - unsigned long drop_time; - int drop_locks_count; - int drop_locks_period; }; enum { @@ -117,12 +110,7 @@ struct gdlm_lock { u32 lkf; /* dlm flags DLM_LKF_ */ unsigned long flags; /* lock_dlm flags LFL_ */ - int bast_mode; /* protected by async_lock */ - - struct list_head clist; /* complete */ - struct list_head blist; /* blocking */ struct list_head delay_list; /* delayed */ - struct list_head all_list; /* all locks for the fs */ struct gdlm_lock *hold_null; /* NL lock for hold_lvb */ }; @@ -159,11 +147,7 @@ void gdlm_release_threads(struct gdlm_ls *); /* lock.c */ -s16 gdlm_make_lmstate(s16); -void gdlm_queue_delayed(struct gdlm_lock *); void gdlm_submit_delayed(struct gdlm_ls *); -int gdlm_release_all_locks(struct gdlm_ls *); -void gdlm_delete_lp(struct gdlm_lock *); unsigned int gdlm_do_lock(struct gdlm_lock *); int gdlm_get_lock(void *, struct lm_lockname *, void **); diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c index 470bdf650b50..0c4cbe6c8285 100644 --- a/fs/gfs2/locking/dlm/mount.c +++ b/fs/gfs2/locking/dlm/mount.c @@ -22,22 +22,14 @@ static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp, if (!ls) return NULL; - ls->drop_locks_count = GDLM_DROP_COUNT; - ls->drop_locks_period = GDLM_DROP_PERIOD; ls->fscb = cb; ls->sdp = sdp; ls->fsflags = flags; spin_lock_init(&ls->async_lock); - INIT_LIST_HEAD(&ls->complete); - INIT_LIST_HEAD(&ls->blocking); INIT_LIST_HEAD(&ls->delayed); INIT_LIST_HEAD(&ls->submit); - INIT_LIST_HEAD(&ls->all_locks); init_waitqueue_head(&ls->thread_wait); init_waitqueue_head(&ls->wait_control); - ls->thread1 = NULL; - ls->thread2 = NULL; - ls->drop_time = jiffies; ls->jid = -1; strncpy(buf, table_name, 256); @@ -152,7 +144,8 @@ static int gdlm_mount(char *table_name, char *host_data, error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname), &ls->dlm_lockspace, - DLM_LSFL_FS | (nodir ? DLM_LSFL_NODIR : 0), + DLM_LSFL_FS | DLM_LSFL_NEWEXCL | + (nodir ? DLM_LSFL_NODIR : 0), GDLM_LVB_SIZE); if (error) { log_error("dlm_new_lockspace error %d", error); @@ -180,7 +173,6 @@ out: static void gdlm_unmount(void *lockspace) { struct gdlm_ls *ls = lockspace; - int rv; log_debug("unmount flags %lx", ls->flags); @@ -194,9 +186,7 @@ static void gdlm_unmount(void *lockspace) gdlm_kobject_release(ls); dlm_release_lockspace(ls->dlm_lockspace, 2); gdlm_release_threads(ls); - rv = gdlm_release_all_locks(ls); - if (rv) - log_info("gdlm_unmount: %d stray locks freed", rv); + BUG_ON(ls->all_locks_count); out: kfree(ls); } @@ -232,7 +222,6 @@ static void gdlm_withdraw(void *lockspace) dlm_release_lockspace(ls->dlm_lockspace, 2); gdlm_release_threads(ls); - gdlm_release_all_locks(ls); gdlm_kobject_release(ls); } diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c index a4ff271df9ee..4ec571c3d8a9 100644 --- a/fs/gfs2/locking/dlm/sysfs.c +++ b/fs/gfs2/locking/dlm/sysfs.c @@ -114,17 +114,6 @@ static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf) return sprintf(buf, "%d\n", ls->recover_jid_status); } -static ssize_t drop_count_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%d\n", ls->drop_locks_count); -} - -static ssize_t drop_count_store(struct gdlm_ls *ls, const char *buf, size_t len) -{ - ls->drop_locks_count = simple_strtol(buf, NULL, 0); - return len; -} - struct gdlm_attr { struct attribute attr; ssize_t (*show)(struct gdlm_ls *, char *); @@ -144,7 +133,6 @@ GDLM_ATTR(first_done, 0444, first_done_show, NULL); GDLM_ATTR(recover, 0644, recover_show, recover_store); GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); -GDLM_ATTR(drop_count, 0644, drop_count_show, drop_count_store); static struct attribute *gdlm_attrs[] = { &gdlm_attr_proto_name.attr, @@ -157,7 +145,6 @@ static struct attribute *gdlm_attrs[] = { &gdlm_attr_recover.attr, &gdlm_attr_recover_done.attr, &gdlm_attr_recover_status.attr, - &gdlm_attr_drop_count.attr, NULL, }; diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c index e53db6fd28ab..38823efd698c 100644 --- a/fs/gfs2/locking/dlm/thread.c +++ b/fs/gfs2/locking/dlm/thread.c @@ -9,367 +9,60 @@ #include "lock_dlm.h" -/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm - thread gets to it. */ - -static void queue_submit(struct gdlm_lock *lp) -{ - struct gdlm_ls *ls = lp->ls; - - spin_lock(&ls->async_lock); - list_add_tail(&lp->delay_list, &ls->submit); - spin_unlock(&ls->async_lock); - wake_up(&ls->thread_wait); -} - -static void process_blocking(struct gdlm_lock *lp, int bast_mode) -{ - struct gdlm_ls *ls = lp->ls; - unsigned int cb = 0; - - switch (gdlm_make_lmstate(bast_mode)) { - case LM_ST_EXCLUSIVE: - cb = LM_CB_NEED_E; - break; - case LM_ST_DEFERRED: - cb = LM_CB_NEED_D; - break; - case LM_ST_SHARED: - cb = LM_CB_NEED_S; - break; - default: - gdlm_assert(0, "unknown bast mode %u", lp->bast_mode); - } - - ls->fscb(ls->sdp, cb, &lp->lockname); -} - -static void wake_up_ast(struct gdlm_lock *lp) -{ - clear_bit(LFL_AST_WAIT, &lp->flags); - smp_mb__after_clear_bit(); - wake_up_bit(&lp->flags, LFL_AST_WAIT); -} - -static void process_complete(struct gdlm_lock *lp) -{ - struct gdlm_ls *ls = lp->ls; - struct lm_async_cb acb; - s16 prev_mode = lp->cur; - - memset(&acb, 0, sizeof(acb)); - - if (lp->lksb.sb_status == -DLM_ECANCEL) { - log_info("complete dlm cancel %x,%llx flags %lx", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, - lp->flags); - - lp->req = lp->cur; - acb.lc_ret |= LM_OUT_CANCELED; - if (lp->cur == DLM_LOCK_IV) - lp->lksb.sb_lkid = 0; - goto out; - } - - if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) { - if (lp->lksb.sb_status != -DLM_EUNLOCK) { - log_info("unlock sb_status %d %x,%llx flags %lx", - lp->lksb.sb_status, lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, - lp->flags); - return; - } - - lp->cur = DLM_LOCK_IV; - lp->req = DLM_LOCK_IV; - lp->lksb.sb_lkid = 0; - - if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) { - gdlm_delete_lp(lp); - return; - } - goto out; - } - - if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID) - memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE); - - if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) { - if (lp->req == DLM_LOCK_PR) - lp->req = DLM_LOCK_CW; - else if (lp->req == DLM_LOCK_CW) - lp->req = DLM_LOCK_PR; - } - - /* - * A canceled lock request. The lock was just taken off the delayed - * list and was never even submitted to dlm. - */ - - if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) { - log_info("complete internal cancel %x,%llx", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - lp->req = lp->cur; - acb.lc_ret |= LM_OUT_CANCELED; - goto out; - } - - /* - * An error occured. - */ - - if (lp->lksb.sb_status) { - /* a "normal" error */ - if ((lp->lksb.sb_status == -EAGAIN) && - (lp->lkf & DLM_LKF_NOQUEUE)) { - lp->req = lp->cur; - if (lp->cur == DLM_LOCK_IV) - lp->lksb.sb_lkid = 0; - goto out; - } - - /* this could only happen with cancels I think */ - log_info("ast sb_status %d %x,%llx flags %lx", - lp->lksb.sb_status, lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, - lp->flags); - if (lp->lksb.sb_status == -EDEADLOCK && - lp->ls->fsflags & LM_MFLAG_CONV_NODROP) { - lp->req = lp->cur; - acb.lc_ret |= LM_OUT_CONV_DEADLK; - if (lp->cur == DLM_LOCK_IV) - lp->lksb.sb_lkid = 0; - goto out; - } else - return; - } - - /* - * This is an AST for an EX->EX conversion for sync_lvb from GFS. - */ - - if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) { - wake_up_ast(lp); - return; - } - - /* - * A lock has been demoted to NL because it initially completed during - * BLOCK_LOCKS. Now it must be requested in the originally requested - * mode. - */ - - if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) { - gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - - lp->cur = DLM_LOCK_NL; - lp->req = lp->prev_req; - lp->prev_req = DLM_LOCK_IV; - lp->lkf &= ~DLM_LKF_CONVDEADLK; - - set_bit(LFL_NOCACHE, &lp->flags); - - if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && - !test_bit(LFL_NOBLOCK, &lp->flags)) - gdlm_queue_delayed(lp); - else - queue_submit(lp); - return; - } - - /* - * A request is granted during dlm recovery. It may be granted - * because the locks of a failed node were cleared. In that case, - * there may be inconsistent data beneath this lock and we must wait - * for recovery to complete to use it. When gfs recovery is done this - * granted lock will be converted to NL and then reacquired in this - * granted state. - */ - - if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && - !test_bit(LFL_NOBLOCK, &lp->flags) && - lp->req != DLM_LOCK_NL) { - - lp->cur = lp->req; - lp->prev_req = lp->req; - lp->req = DLM_LOCK_NL; - lp->lkf |= DLM_LKF_CONVERT; - lp->lkf &= ~DLM_LKF_CONVDEADLK; - - log_debug("rereq %x,%llx id %x %d,%d", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, - lp->lksb.sb_lkid, lp->cur, lp->req); - - set_bit(LFL_REREQUEST, &lp->flags); - queue_submit(lp); - return; - } - - /* - * DLM demoted the lock to NL before it was granted so GFS must be - * told it cannot cache data for this lock. - */ - - if (lp->lksb.sb_flags & DLM_SBF_DEMOTED) - set_bit(LFL_NOCACHE, &lp->flags); - -out: - /* - * This is an internal lock_dlm lock - */ - - if (test_bit(LFL_INLOCK, &lp->flags)) { - clear_bit(LFL_NOBLOCK, &lp->flags); - lp->cur = lp->req; - wake_up_ast(lp); - return; - } - - /* - * Normal completion of a lock request. Tell GFS it now has the lock. - */ - - clear_bit(LFL_NOBLOCK, &lp->flags); - lp->cur = lp->req; - - acb.lc_name = lp->lockname; - acb.lc_ret |= gdlm_make_lmstate(lp->cur); - - if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) && - (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL)) - acb.lc_ret |= LM_OUT_CACHEABLE; - - ls->fscb(ls->sdp, LM_CB_ASYNC, &acb); -} - -static inline int no_work(struct gdlm_ls *ls, int blocking) +static inline int no_work(struct gdlm_ls *ls) { int ret; spin_lock(&ls->async_lock); - ret = list_empty(&ls->complete) && list_empty(&ls->submit); - if (ret && blocking) - ret = list_empty(&ls->blocking); + ret = list_empty(&ls->submit); spin_unlock(&ls->async_lock); return ret; } -static inline int check_drop(struct gdlm_ls *ls) -{ - if (!ls->drop_locks_count) - return 0; - - if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) { - ls->drop_time = jiffies; - if (ls->all_locks_count >= ls->drop_locks_count) - return 1; - } - return 0; -} - -static int gdlm_thread(void *data, int blist) +static int gdlm_thread(void *data) { struct gdlm_ls *ls = (struct gdlm_ls *) data; struct gdlm_lock *lp = NULL; - uint8_t complete, blocking, submit, drop; - - /* Only thread1 is allowed to do blocking callbacks since gfs - may wait for a completion callback within a blocking cb. */ while (!kthread_should_stop()) { wait_event_interruptible(ls->thread_wait, - !no_work(ls, blist) || kthread_should_stop()); - - complete = blocking = submit = drop = 0; + !no_work(ls) || kthread_should_stop()); spin_lock(&ls->async_lock); - if (blist && !list_empty(&ls->blocking)) { - lp = list_entry(ls->blocking.next, struct gdlm_lock, - blist); - list_del_init(&lp->blist); - blocking = lp->bast_mode; - lp->bast_mode = 0; - } else if (!list_empty(&ls->complete)) { - lp = list_entry(ls->complete.next, struct gdlm_lock, - clist); - list_del_init(&lp->clist); - complete = 1; - } else if (!list_empty(&ls->submit)) { + if (!list_empty(&ls->submit)) { lp = list_entry(ls->submit.next, struct gdlm_lock, delay_list); list_del_init(&lp->delay_list); - submit = 1; + spin_unlock(&ls->async_lock); + gdlm_do_lock(lp); + spin_lock(&ls->async_lock); } - - drop = check_drop(ls); spin_unlock(&ls->async_lock); - - if (complete) - process_complete(lp); - - else if (blocking) - process_blocking(lp, blocking); - - else if (submit) - gdlm_do_lock(lp); - - if (drop) - ls->fscb(ls->sdp, LM_CB_DROPLOCKS, NULL); - - schedule(); } return 0; } -static int gdlm_thread1(void *data) -{ - return gdlm_thread(data, 1); -} - -static int gdlm_thread2(void *data) -{ - return gdlm_thread(data, 0); -} - int gdlm_init_threads(struct gdlm_ls *ls) { struct task_struct *p; int error; - p = kthread_run(gdlm_thread1, ls, "lock_dlm1"); - error = IS_ERR(p); - if (error) { - log_error("can't start lock_dlm1 thread %d", error); - return error; - } - ls->thread1 = p; - - p = kthread_run(gdlm_thread2, ls, "lock_dlm2"); + p = kthread_run(gdlm_thread, ls, "lock_dlm"); error = IS_ERR(p); if (error) { - log_error("can't start lock_dlm2 thread %d", error); - kthread_stop(ls->thread1); + log_error("can't start lock_dlm thread %d", error); return error; } - ls->thread2 = p; + ls->thread = p; return 0; } void gdlm_release_threads(struct gdlm_ls *ls) { - kthread_stop(ls->thread1); - kthread_stop(ls->thread2); + kthread_stop(ls->thread); } diff --git a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile deleted file mode 100644 index 35e9730bc3a8..000000000000 --- a/fs/gfs2/locking/nolock/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += lock_nolock.o -lock_nolock-y := main.o - diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c deleted file mode 100644 index 284a5ece8d94..000000000000 --- a/fs/gfs2/locking/nolock/main.c +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/fs.h> -#include <linux/lm_interface.h> - -struct nolock_lockspace { - unsigned int nl_lvb_size; -}; - -static const struct lm_lockops nolock_ops; - -static int nolock_mount(char *table_name, char *host_data, - lm_callback_t cb, void *cb_data, - unsigned int min_lvb_size, int flags, - struct lm_lockstruct *lockstruct, - struct kobject *fskobj) -{ - char *c; - unsigned int jid; - struct nolock_lockspace *nl; - - c = strstr(host_data, "jid="); - if (!c) - jid = 0; - else { - c += 4; - sscanf(c, "%u", &jid); - } - - nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL); - if (!nl) - return -ENOMEM; - - nl->nl_lvb_size = min_lvb_size; - - lockstruct->ls_jid = jid; - lockstruct->ls_first = 1; - lockstruct->ls_lvb_size = min_lvb_size; - lockstruct->ls_lockspace = nl; - lockstruct->ls_ops = &nolock_ops; - lockstruct->ls_flags = LM_LSFLAG_LOCAL; - - return 0; -} - -static void nolock_others_may_mount(void *lockspace) -{ -} - -static void nolock_unmount(void *lockspace) -{ - struct nolock_lockspace *nl = lockspace; - kfree(nl); -} - -static void nolock_withdraw(void *lockspace) -{ -} - -/** - * nolock_get_lock - get a lm_lock_t given a descripton of the lock - * @lockspace: the lockspace the lock lives in - * @name: the name of the lock - * @lockp: return the lm_lock_t here - * - * Returns: 0 on success, -EXXX on failure - */ - -static int nolock_get_lock(void *lockspace, struct lm_lockname *name, - void **lockp) -{ - *lockp = lockspace; - return 0; -} - -/** - * nolock_put_lock - get rid of a lock structure - * @lock: the lock to throw away - * - */ - -static void nolock_put_lock(void *lock) -{ -} - -/** - * nolock_lock - acquire a lock - * @lock: the lock to manipulate - * @cur_state: the current state - * @req_state: the requested state - * @flags: modifier flags - * - * Returns: A bitmap of LM_OUT_* - */ - -static unsigned int nolock_lock(void *lock, unsigned int cur_state, - unsigned int req_state, unsigned int flags) -{ - return req_state | LM_OUT_CACHEABLE; -} - -/** - * nolock_unlock - unlock a lock - * @lock: the lock to manipulate - * @cur_state: the current state - * - * Returns: 0 - */ - -static unsigned int nolock_unlock(void *lock, unsigned int cur_state) -{ - return 0; -} - -static void nolock_cancel(void *lock) -{ -} - -/** - * nolock_hold_lvb - hold on to a lock value block - * @lock: the lock the LVB is associated with - * @lvbp: return the lm_lvb_t here - * - * Returns: 0 on success, -EXXX on failure - */ - -static int nolock_hold_lvb(void *lock, char **lvbp) -{ - struct nolock_lockspace *nl = lock; - int error = 0; - - *lvbp = kzalloc(nl->nl_lvb_size, GFP_NOFS); - if (!*lvbp) - error = -ENOMEM; - - return error; -} - -/** - * nolock_unhold_lvb - release a LVB - * @lock: the lock the LVB is associated with - * @lvb: the lock value block - * - */ - -static void nolock_unhold_lvb(void *lock, char *lvb) -{ - kfree(lvb); -} - -static int nolock_plock_get(void *lockspace, struct lm_lockname *name, - struct file *file, struct file_lock *fl) -{ - posix_test_lock(file, fl); - - return 0; -} - -static int nolock_plock(void *lockspace, struct lm_lockname *name, - struct file *file, int cmd, struct file_lock *fl) -{ - int error; - error = posix_lock_file_wait(file, fl); - return error; -} - -static int nolock_punlock(void *lockspace, struct lm_lockname *name, - struct file *file, struct file_lock *fl) -{ - int error; - error = posix_lock_file_wait(file, fl); - return error; -} - -static void nolock_recovery_done(void *lockspace, unsigned int jid, - unsigned int message) -{ -} - -static const struct lm_lockops nolock_ops = { - .lm_proto_name = "lock_nolock", - .lm_mount = nolock_mount, - .lm_others_may_mount = nolock_others_may_mount, - .lm_unmount = nolock_unmount, - .lm_withdraw = nolock_withdraw, - .lm_get_lock = nolock_get_lock, - .lm_put_lock = nolock_put_lock, - .lm_lock = nolock_lock, - .lm_unlock = nolock_unlock, - .lm_cancel = nolock_cancel, - .lm_hold_lvb = nolock_hold_lvb, - .lm_unhold_lvb = nolock_unhold_lvb, - .lm_plock_get = nolock_plock_get, - .lm_plock = nolock_plock, - .lm_punlock = nolock_punlock, - .lm_recovery_done = nolock_recovery_done, - .lm_owner = THIS_MODULE, -}; - -static int __init init_nolock(void) -{ - int error; - - error = gfs2_register_lockproto(&nolock_ops); - if (error) { - printk(KERN_WARNING - "lock_nolock: can't register protocol: %d\n", error); - return error; - } - - printk(KERN_INFO - "Lock_Nolock (built %s %s) installed\n", __DATE__, __TIME__); - return 0; -} - -static void __exit exit_nolock(void) -{ - gfs2_unregister_lockproto(&nolock_ops); -} - -module_init(init_nolock); -module_exit(exit_nolock); - -MODULE_DESCRIPTION("GFS Nolock Locking Module"); -MODULE_AUTHOR("Red Hat, Inc."); -MODULE_LICENSE("GPL"); - diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 548264b1836d..ad305854bdc6 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -18,6 +18,7 @@ #include <linux/delay.h> #include <linux/kthread.h> #include <linux/freezer.h> +#include <linux/bio.h> #include "gfs2.h" #include "incore.h" @@ -87,6 +88,8 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd) */ static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +__releases(&sdp->sd_log_lock) +__acquires(&sdp->sd_log_lock) { struct gfs2_bufdata *bd, *s; struct buffer_head *bh; @@ -582,7 +585,6 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) memset(bh->b_data, 0, bh->b_size); set_buffer_uptodate(bh); clear_buffer_dirty(bh); - unlock_buffer(bh); gfs2_ail1_empty(sdp, 0); tail = current_tail(sdp); @@ -599,8 +601,23 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header)); lh->lh_hash = cpu_to_be32(hash); - set_buffer_dirty(bh); - if (sync_dirty_buffer(bh)) + bh->b_end_io = end_buffer_write_sync; + if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) + goto skip_barrier; + get_bh(bh); + submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh); + wait_on_buffer(bh); + if (buffer_eopnotsupp(bh)) { + clear_buffer_eopnotsupp(bh); + set_buffer_uptodate(bh); + set_bit(SDF_NOBARRIERS, &sdp->sd_flags); + lock_buffer(bh); +skip_barrier: + get_bh(bh); + submit_bh(WRITE_SYNC | (1 << BIO_RW_META), bh); + wait_on_buffer(bh); + } + if (!buffer_uptodate(bh)) gfs2_io_error_bh(sdp, bh); brelse(bh); diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 771152816508..7c64510ccfd2 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -21,6 +21,7 @@ */ static inline void gfs2_log_lock(struct gfs2_sbd *sdp) +__acquires(&sdp->sd_log_lock) { spin_lock(&sdp->sd_log_lock); } @@ -32,6 +33,7 @@ static inline void gfs2_log_lock(struct gfs2_sbd *sdp) */ static inline void gfs2_log_unlock(struct gfs2_sbd *sdp) +__releases(&sdp->sd_log_lock) { spin_unlock(&sdp->sd_log_lock); } diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 053e2ebbbd50..bb2cc303ac29 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -24,7 +24,7 @@ #include "util.h" #include "glock.h" -static void gfs2_init_inode_once(struct kmem_cache *cachep, void *foo) +static void gfs2_init_inode_once(void *foo) { struct gfs2_inode *ip = foo; @@ -33,15 +33,13 @@ static void gfs2_init_inode_once(struct kmem_cache *cachep, void *foo) ip->i_alloc = NULL; } -static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo) +static void gfs2_init_glock_once(void *foo) { struct gfs2_glock *gl = foo; INIT_HLIST_NODE(&gl->gl_list); spin_lock_init(&gl->gl_spin); INIT_LIST_HEAD(&gl->gl_holders); - INIT_LIST_HEAD(&gl->gl_waiters1); - INIT_LIST_HEAD(&gl->gl_waiters3); gl->gl_lvb = NULL; atomic_set(&gl->gl_lvb_count, 0); INIT_LIST_HEAD(&gl->gl_reclaim); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 78d75f892f82..09853620c951 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -129,7 +129,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl) } /** - * getbuf - Get a buffer with a given address space + * gfs2_getbuf - Get a buffer with a given address space * @gl: the glock * @blkno: the block number (filesystem scope) * @create: 1 if the buffer should be created @@ -137,7 +137,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl) * Returns: the buffer */ -static struct buffer_head *getbuf(struct gfs2_glock *gl, u64 blkno, int create) +struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) { struct address_space *mapping = gl->gl_aspace->i_mapping; struct gfs2_sbd *sdp = gl->gl_sbd; @@ -205,7 +205,7 @@ static void meta_prep_new(struct buffer_head *bh) struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) { struct buffer_head *bh; - bh = getbuf(gl, blkno, CREATE); + bh = gfs2_getbuf(gl, blkno, CREATE); meta_prep_new(bh); return bh; } @@ -223,7 +223,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, struct buffer_head **bhp) { - *bhp = getbuf(gl, blkno, CREATE); + *bhp = gfs2_getbuf(gl, blkno, CREATE); if (!buffer_uptodate(*bhp)) { ll_rw_block(READ_META, 1, bhp); if (flags & DIO_WAIT) { @@ -346,7 +346,7 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) struct buffer_head *bh; while (blen) { - bh = getbuf(ip->i_gl, bstart, NO_CREATE); + bh = gfs2_getbuf(ip->i_gl, bstart, NO_CREATE); if (bh) { lock_buffer(bh); gfs2_log_lock(sdp); @@ -421,7 +421,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (extlen > max_ra) extlen = max_ra; - first_bh = getbuf(gl, dblock, CREATE); + first_bh = gfs2_getbuf(gl, dblock, CREATE); if (buffer_uptodate(first_bh)) goto out; @@ -432,7 +432,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) extlen--; while (extlen) { - bh = getbuf(gl, dblock, CREATE); + bh = gfs2_getbuf(gl, dblock, CREATE); if (!buffer_uptodate(bh) && !buffer_locked(bh)) ll_rw_block(READA, 1, &bh); diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index 73e3b1c76fe1..b1a5f3674d43 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -47,6 +47,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno); int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, struct buffer_head **bhp); int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); +struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create); void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, int meta); diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c index b941f9f9f958..f96eb90a2cfa 100644 --- a/fs/gfs2/mount.c +++ b/fs/gfs2/mount.c @@ -42,10 +42,11 @@ enum { Opt_nosuiddir, Opt_data_writeback, Opt_data_ordered, + Opt_meta, Opt_err, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_lockproto, "lockproto=%s"}, {Opt_locktable, "locktable=%s"}, {Opt_hostdata, "hostdata=%s"}, @@ -66,6 +67,7 @@ static match_table_t tokens = { {Opt_nosuiddir, "nosuiddir"}, {Opt_data_writeback, "data=writeback"}, {Opt_data_ordered, "data=ordered"}, + {Opt_meta, "meta"}, {Opt_err, NULL} }; @@ -239,6 +241,11 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount) case Opt_data_ordered: args->ar_data = GFS2_DATA_ORDERED; break; + case Opt_meta: + if (remount && args->ar_meta != 1) + goto cant_remount; + args->ar_meta = 1; + break; case Opt_err: default: fs_info(sdp, "unknown option: %s\n", o); diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index f55394e57cb2..27563816e1c5 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -499,34 +499,34 @@ static int __gfs2_readpage(void *file, struct page *page) * @file: The file to read * @page: The page of the file * - * This deals with the locking required. We use a trylock in order to - * avoid the page lock / glock ordering problems returning AOP_TRUNCATED_PAGE - * in the event that we are unable to get the lock. + * This deals with the locking required. We have to unlock and + * relock the page in order to get the locking in the right + * order. */ static int gfs2_readpage(struct file *file, struct page *page) { - struct gfs2_inode *ip = GFS2_I(page->mapping->host); - struct gfs2_holder *gh; + struct address_space *mapping = page->mapping; + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_holder gh; int error; - gh = gfs2_glock_is_locked_by_me(ip->i_gl); - if (!gh) { - gh = kmalloc(sizeof(struct gfs2_holder), GFP_NOFS); - if (!gh) - return -ENOBUFS; - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, gh); + unlock_page(page); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + error = gfs2_glock_nq(&gh); + if (unlikely(error)) + goto out; + error = AOP_TRUNCATED_PAGE; + lock_page(page); + if (page->mapping == mapping && !PageUptodate(page)) + error = __gfs2_readpage(file, page); + else unlock_page(page); - error = gfs2_glock_nq_atime(gh); - if (likely(error != 0)) - goto out; - return AOP_TRUNCATED_PAGE; - } - error = __gfs2_readpage(file, page); - gfs2_glock_dq(gh); + gfs2_glock_dq(&gh); out: - gfs2_holder_uninit(gh); - kfree(gh); + gfs2_holder_uninit(&gh); + if (error && error != AOP_TRUNCATED_PAGE) + lock_page(page); return error; } @@ -594,8 +594,8 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping, struct gfs2_holder gh; int ret; - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); - ret = gfs2_glock_nq_atime(&gh); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + ret = gfs2_glock_nq(&gh); if (unlikely(ret)) goto out_uninit; if (!gfs2_is_stuffed(ip)) @@ -636,8 +636,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, unsigned to = from + len; struct page *page; - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &ip->i_gh); - error = gfs2_glock_nq_atime(&ip->i_gh); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); + error = gfs2_glock_nq(&ip->i_gh); if (unlikely(error)) goto out_uninit; @@ -975,7 +975,7 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset) if (gfs2_is_stuffed(ip)) return 0; - if (offset > i_size_read(&ip->i_inode)) + if (offset >= i_size_read(&ip->i_inode)) return 0; return 1; } @@ -1000,8 +1000,8 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, * unfortunately have the option of only flushing a range like * the VFS does. */ - gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, GL_ATIME, &gh); - rv = gfs2_glock_nq_atime(&gh); + gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh); + rv = gfs2_glock_nq(&gh); if (rv) return rv; rv = gfs2_ok_for_dio(ip, rw, offset); diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c index 990d9f4bc463..9cda8536530c 100644 --- a/fs/gfs2/ops_export.c +++ b/fs/gfs2/ops_export.c @@ -134,7 +134,7 @@ static struct dentry *gfs2_get_parent(struct dentry *child) struct dentry *dentry; gfs2_str2qstr(&dotdot, ".."); - inode = gfs2_lookupi(child->d_inode, &dotdot, 1, NULL); + inode = gfs2_lookupi(child->d_inode, &dotdot, 1); if (!inode) return ERR_PTR(-ENOENT); diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index e1b7d525a066..3a747f8e2188 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -15,6 +15,7 @@ #include <linux/uio.h> #include <linux/blkdev.h> #include <linux/mm.h> +#include <linux/mount.h> #include <linux/fs.h> #include <linux/gfs2_ondisk.h> #include <linux/ext2_fs.h> @@ -62,11 +63,11 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (!error) { - error = remote_llseek(file, offset, origin); + error = generic_file_llseek_unlocked(file, offset, origin); gfs2_glock_dq_uninit(&i_gh); } } else - error = remote_llseek(file, offset, origin); + error = generic_file_llseek_unlocked(file, offset, origin); return error; } @@ -88,8 +89,8 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir) u64 offset = file->f_pos; int error; - gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh); - error = gfs2_glock_nq_atime(&d_gh); + gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + error = gfs2_glock_nq(&d_gh); if (error) { gfs2_holder_uninit(&d_gh); return error; @@ -133,7 +134,6 @@ static const u32 fsflags_to_gfs2[32] = { [7] = GFS2_DIF_NOATIME, [12] = GFS2_DIF_EXHASH, [14] = GFS2_DIF_INHERIT_JDATA, - [20] = GFS2_DIF_INHERIT_DIRECTIO, }; static const u32 gfs2_to_fsflags[32] = { @@ -142,7 +142,6 @@ static const u32 gfs2_to_fsflags[32] = { [gfs2fl_AppendOnly] = FS_APPEND_FL, [gfs2fl_NoAtime] = FS_NOATIME_FL, [gfs2fl_ExHash] = FS_INDEX_FL, - [gfs2fl_InheritDirectio] = FS_DIRECTIO_FL, [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL, }; @@ -154,18 +153,14 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr) int error; u32 fsflags; - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); - error = gfs2_glock_nq_atime(&gh); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + error = gfs2_glock_nq(&gh); if (error) return error; fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags); - if (!S_ISDIR(inode->i_mode)) { - if (ip->i_di.di_flags & GFS2_DIF_JDATA) - fsflags |= FS_JOURNAL_DATA_FL; - if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO) - fsflags |= FS_DIRECTIO_FL; - } + if (!S_ISDIR(inode->i_mode) && ip->i_di.di_flags & GFS2_DIF_JDATA) + fsflags |= FS_JOURNAL_DATA_FL; if (put_user(fsflags, ptr)) error = -EFAULT; @@ -194,13 +189,11 @@ void gfs2_set_inode_flags(struct inode *inode) /* Flags that can be set by user space */ #define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \ - GFS2_DIF_DIRECTIO| \ GFS2_DIF_IMMUTABLE| \ GFS2_DIF_APPENDONLY| \ GFS2_DIF_NOATIME| \ GFS2_DIF_SYNC| \ GFS2_DIF_SYSTEM| \ - GFS2_DIF_INHERIT_DIRECTIO| \ GFS2_DIF_INHERIT_JDATA) /** @@ -220,10 +213,14 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) int error; u32 new_flags, flags; - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + error = mnt_want_write(filp->f_path.mnt); if (error) return error; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (error) + goto out_drop_write; + flags = ip->i_di.di_flags; new_flags = (flags & ~mask) | (reqflags & mask); if ((new_flags ^ flags) == 0) @@ -242,7 +239,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) !capable(CAP_LINUX_IMMUTABLE)) goto out; if (!IS_IMMUTABLE(inode)) { - error = permission(inode, MAY_WRITE, NULL); + error = gfs2_permission(inode, MAY_WRITE); if (error) goto out; } @@ -272,6 +269,8 @@ out_trans_end: gfs2_trans_end(sdp); out: gfs2_glock_dq_uninit(&gh); +out_drop_write: + mnt_drop_write(filp->f_path.mnt); return error; } @@ -285,8 +284,6 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr) if (!S_ISDIR(inode->i_mode)) { if (gfsflags & GFS2_DIF_INHERIT_JDATA) gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA); - if (gfsflags & GFS2_DIF_INHERIT_DIRECTIO) - gfsflags ^= (GFS2_DIF_DIRECTIO | GFS2_DIF_INHERIT_DIRECTIO); return do_gfs2_set_flags(filp, gfsflags, ~0); } return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA); @@ -354,8 +351,8 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) struct gfs2_alloc *al; int ret; - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &gh); - ret = gfs2_glock_nq_atime(&gh); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); if (ret) goto out; @@ -437,8 +434,8 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) struct gfs2_holder i_gh; int error; - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh); - error = gfs2_glock_nq_atime(&i_gh); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); + error = gfs2_glock_nq(&i_gh); if (error) { gfs2_holder_uninit(&i_gh); return error; @@ -487,11 +484,6 @@ static int gfs2_open(struct inode *inode, struct file *file) goto fail_gunlock; } - /* Listen to the Direct I/O flag */ - - if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO) - file->f_flags |= O_DIRECT; - gfs2_glock_dq_uninit(&i_gh); } @@ -669,8 +661,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) int error = 0; state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; - flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE - | GL_FLOCK; + flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE; mutex_lock(&fp->f_fl_mutex); @@ -683,9 +674,8 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) gfs2_glock_dq_wait(fl_gh); gfs2_holder_reinit(state, flags, fl_gh); } else { - error = gfs2_glock_get(GFS2_SB(&ip->i_inode), - ip->i_no_addr, &gfs2_flock_glops, - CREATE, &gl); + error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr, + &gfs2_flock_glops, CREATE, &gl); if (error) goto out; gfs2_holder_init(gl, state, flags, fl_gh); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index b2028c82e8d1..b117fcf2c4f5 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -40,6 +40,44 @@ #define DO 0 #define UNDO 1 +static const u32 gfs2_old_fs_formats[] = { + 0 +}; + +static const u32 gfs2_old_multihost_formats[] = { + 0 +}; + +/** + * gfs2_tune_init - Fill a gfs2_tune structure with default values + * @gt: tune + * + */ + +static void gfs2_tune_init(struct gfs2_tune *gt) +{ + spin_lock_init(>->gt_spin); + + gt->gt_demote_secs = 300; + gt->gt_incore_log_blocks = 1024; + gt->gt_log_flush_secs = 60; + gt->gt_recoverd_secs = 60; + gt->gt_logd_secs = 1; + gt->gt_quotad_secs = 5; + gt->gt_quota_simul_sync = 64; + gt->gt_quota_warn_period = 10; + gt->gt_quota_scale_num = 1; + gt->gt_quota_scale_den = 1; + gt->gt_quota_cache_secs = 300; + gt->gt_quota_quantum = 60; + gt->gt_new_files_jdata = 0; + gt->gt_max_readahead = 1 << 18; + gt->gt_stall_secs = 600; + gt->gt_complain_secs = 10; + gt->gt_statfs_quantum = 30; + gt->gt_statfs_slow = 0; +} + static struct gfs2_sbd *init_sbd(struct super_block *sb) { struct gfs2_sbd *sdp; @@ -64,7 +102,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) mutex_init(&sdp->sd_rindex_mutex); INIT_LIST_HEAD(&sdp->sd_rindex_list); INIT_LIST_HEAD(&sdp->sd_rindex_mru_list); - INIT_LIST_HEAD(&sdp->sd_rindex_recent_list); INIT_LIST_HEAD(&sdp->sd_jindex_list); spin_lock_init(&sdp->sd_jindex_spin); @@ -97,21 +134,271 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) return sdp; } -static void init_vfs(struct super_block *sb, unsigned noatime) + +/** + * gfs2_check_sb - Check superblock + * @sdp: the filesystem + * @sb: The superblock + * @silent: Don't print a message if the check fails + * + * Checks the version code of the FS is one that we understand how to + * read and that the sizes of the various on-disk structures have not + * changed. + */ + +static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent) { - struct gfs2_sbd *sdp = sb->s_fs_info; + unsigned int x; - sb->s_magic = GFS2_MAGIC; - sb->s_op = &gfs2_super_ops; - sb->s_export_op = &gfs2_export_ops; - sb->s_time_gran = 1; - sb->s_maxbytes = MAX_LFS_FILESIZE; + if (sb->sb_magic != GFS2_MAGIC || + sb->sb_type != GFS2_METATYPE_SB) { + if (!silent) + printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n"); + return -EINVAL; + } + + /* If format numbers match exactly, we're done. */ + + if (sb->sb_fs_format == GFS2_FORMAT_FS && + sb->sb_multihost_format == GFS2_FORMAT_MULTI) + return 0; + + if (sb->sb_fs_format != GFS2_FORMAT_FS) { + for (x = 0; gfs2_old_fs_formats[x]; x++) + if (gfs2_old_fs_formats[x] == sb->sb_fs_format) + break; + + if (!gfs2_old_fs_formats[x]) { + printk(KERN_WARNING + "GFS2: code version (%u, %u) is incompatible " + "with ondisk format (%u, %u)\n", + GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, + sb->sb_fs_format, sb->sb_multihost_format); + printk(KERN_WARNING + "GFS2: I don't know how to upgrade this FS\n"); + return -EINVAL; + } + } + + if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) { + for (x = 0; gfs2_old_multihost_formats[x]; x++) + if (gfs2_old_multihost_formats[x] == + sb->sb_multihost_format) + break; + + if (!gfs2_old_multihost_formats[x]) { + printk(KERN_WARNING + "GFS2: code version (%u, %u) is incompatible " + "with ondisk format (%u, %u)\n", + GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, + sb->sb_fs_format, sb->sb_multihost_format); + printk(KERN_WARNING + "GFS2: I don't know how to upgrade this FS\n"); + return -EINVAL; + } + } + + if (!sdp->sd_args.ar_upgrade) { + printk(KERN_WARNING + "GFS2: code version (%u, %u) is incompatible " + "with ondisk format (%u, %u)\n", + GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, + sb->sb_fs_format, sb->sb_multihost_format); + printk(KERN_INFO + "GFS2: Use the \"upgrade\" mount option to upgrade " + "the FS\n"); + printk(KERN_INFO "GFS2: See the manual for more details\n"); + return -EINVAL; + } + + return 0; +} + +static void end_bio_io_page(struct bio *bio, int error) +{ + struct page *page = bio->bi_private; - if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME)) - set_bit(noatime, &sdp->sd_flags); + if (!error) + SetPageUptodate(page); + else + printk(KERN_WARNING "gfs2: error %d reading superblock\n", error); + unlock_page(page); +} + +static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf) +{ + const struct gfs2_sb *str = buf; + + sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic); + sb->sb_type = be32_to_cpu(str->sb_header.mh_type); + sb->sb_format = be32_to_cpu(str->sb_header.mh_format); + sb->sb_fs_format = be32_to_cpu(str->sb_fs_format); + sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format); + sb->sb_bsize = be32_to_cpu(str->sb_bsize); + sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift); + sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr); + sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino); + sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr); + sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino); + + memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN); + memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN); +} + +/** + * gfs2_read_super - Read the gfs2 super block from disk + * @sdp: The GFS2 super block + * @sector: The location of the super block + * @error: The error code to return + * + * This uses the bio functions to read the super block from disk + * because we want to be 100% sure that we never read cached data. + * A super block is read twice only during each GFS2 mount and is + * never written to by the filesystem. The first time its read no + * locks are held, and the only details which are looked at are those + * relating to the locking protocol. Once locking is up and working, + * the sb is read again under the lock to establish the location of + * the master directory (contains pointers to journals etc) and the + * root directory. + * + * Returns: 0 on success or error + */ + +static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector) +{ + struct super_block *sb = sdp->sd_vfs; + struct gfs2_sb *p; + struct page *page; + struct bio *bio; + + page = alloc_page(GFP_NOFS); + if (unlikely(!page)) + return -ENOBUFS; + + ClearPageUptodate(page); + ClearPageDirty(page); + lock_page(page); + + bio = bio_alloc(GFP_NOFS, 1); + if (unlikely(!bio)) { + __free_page(page); + return -ENOBUFS; + } - /* Don't let the VFS update atimes. GFS2 handles this itself. */ - sb->s_flags |= MS_NOATIME | MS_NODIRATIME; + bio->bi_sector = sector * (sb->s_blocksize >> 9); + bio->bi_bdev = sb->s_bdev; + bio_add_page(bio, page, PAGE_SIZE, 0); + + bio->bi_end_io = end_bio_io_page; + bio->bi_private = page; + submit_bio(READ_SYNC | (1 << BIO_RW_META), bio); + wait_on_page_locked(page); + bio_put(bio); + if (!PageUptodate(page)) { + __free_page(page); + return -EIO; + } + p = kmap(page); + gfs2_sb_in(&sdp->sd_sb, p); + kunmap(page); + __free_page(page); + return 0; +} +/** + * gfs2_read_sb - Read super block + * @sdp: The GFS2 superblock + * @gl: the glock for the superblock (assumed to be held) + * @silent: Don't print message if mount fails + * + */ + +static int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent) +{ + u32 hash_blocks, ind_blocks, leaf_blocks; + u32 tmp_blocks; + unsigned int x; + int error; + + error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift); + if (error) { + if (!silent) + fs_err(sdp, "can't read superblock\n"); + return error; + } + + error = gfs2_check_sb(sdp, &sdp->sd_sb, silent); + if (error) + return error; + + sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - + GFS2_BASIC_BLOCK_SHIFT; + sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + sdp->sd_diptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode)) / sizeof(u64); + sdp->sd_inptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) / sizeof(u64); + sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header); + sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2; + sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1; + sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64); + sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) / + sizeof(struct gfs2_quota_change); + + /* Compute maximum reservation required to add a entry to a directory */ + + hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH), + sdp->sd_jbsize); + + ind_blocks = 0; + for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) { + tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs); + ind_blocks += tmp_blocks; + } + + leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH; + + sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks; + + sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode); + sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs; + for (x = 2;; x++) { + u64 space, d; + u32 m; + + space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs; + d = space; + m = do_div(d, sdp->sd_inptrs); + + if (d != sdp->sd_heightsize[x - 1] || m) + break; + sdp->sd_heightsize[x] = space; + } + sdp->sd_max_height = x; + sdp->sd_heightsize[x] = ~0; + gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT); + + sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode); + sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs; + for (x = 2;; x++) { + u64 space, d; + u32 m; + + space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs; + d = space; + m = do_div(d, sdp->sd_inptrs); + + if (d != sdp->sd_jheightsize[x - 1] || m) + break; + sdp->sd_jheightsize[x] = space; + } + sdp->sd_max_jheight = x; + sdp->sd_jheightsize[x] = ~0; + gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT); + + return 0; } static int init_names(struct gfs2_sbd *sdp, int silent) @@ -225,51 +512,59 @@ fail: return error; } -static inline struct inode *gfs2_lookup_root(struct super_block *sb, - u64 no_addr) +static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr, + u64 no_addr, const char *name) { - return gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); + struct gfs2_sbd *sdp = sb->s_fs_info; + struct dentry *dentry; + struct inode *inode; + + inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); + if (IS_ERR(inode)) { + fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode)); + return PTR_ERR(inode); + } + dentry = d_alloc_root(inode); + if (!dentry) { + fs_err(sdp, "can't alloc %s dentry\n", name); + iput(inode); + return -ENOMEM; + } + dentry->d_op = &gfs2_dops; + *dptr = dentry; + return 0; } -static int init_sb(struct gfs2_sbd *sdp, int silent, int undo) +static int init_sb(struct gfs2_sbd *sdp, int silent) { struct super_block *sb = sdp->sd_vfs; struct gfs2_holder sb_gh; u64 no_addr; - struct inode *inode; - int error = 0; + int ret; - if (undo) { - if (sb->s_root) { - dput(sb->s_root); - sb->s_root = NULL; - } - return 0; + ret = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops, + LM_ST_SHARED, 0, &sb_gh); + if (ret) { + fs_err(sdp, "can't acquire superblock glock: %d\n", ret); + return ret; } - error = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops, - LM_ST_SHARED, 0, &sb_gh); - if (error) { - fs_err(sdp, "can't acquire superblock glock: %d\n", error); - return error; - } - - error = gfs2_read_sb(sdp, sb_gh.gh_gl, silent); - if (error) { - fs_err(sdp, "can't read superblock: %d\n", error); + ret = gfs2_read_sb(sdp, sb_gh.gh_gl, silent); + if (ret) { + fs_err(sdp, "can't read superblock: %d\n", ret); goto out; } /* Set up the buffer cache and SB for real */ if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) { - error = -EINVAL; + ret = -EINVAL; fs_err(sdp, "FS block size (%u) is too small for device " "block size (%u)\n", sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev)); goto out; } if (sdp->sd_sb.sb_bsize > PAGE_SIZE) { - error = -EINVAL; + ret = -EINVAL; fs_err(sdp, "FS block size (%u) is too big for machine " "page size (%u)\n", sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE); @@ -279,26 +574,21 @@ static int init_sb(struct gfs2_sbd *sdp, int silent, int undo) /* Get the root inode */ no_addr = sdp->sd_sb.sb_root_dir.no_addr; - if (sb->s_type == &gfs2meta_fs_type) - no_addr = sdp->sd_sb.sb_master_dir.no_addr; - inode = gfs2_lookup_root(sb, no_addr); - if (IS_ERR(inode)) { - error = PTR_ERR(inode); - fs_err(sdp, "can't read in root inode: %d\n", error); + ret = gfs2_lookup_root(sb, &sdp->sd_root_dir, no_addr, "root"); + if (ret) goto out; - } - sb->s_root = d_alloc_root(inode); - if (!sb->s_root) { - fs_err(sdp, "can't get root dentry\n"); - error = -ENOMEM; - iput(inode); - } else - sb->s_root->d_op = &gfs2_dops; - + /* Get the master inode */ + no_addr = sdp->sd_sb.sb_master_dir.no_addr; + ret = gfs2_lookup_root(sb, &sdp->sd_master_dir, no_addr, "master"); + if (ret) { + dput(sdp->sd_root_dir); + goto out; + } + sb->s_root = dget(sdp->sd_args.ar_meta ? sdp->sd_master_dir : sdp->sd_root_dir); out: gfs2_glock_dq_uninit(&sb_gh); - return error; + return ret; } /** @@ -364,6 +654,8 @@ static int map_journal_extents(struct gfs2_sbd *sdp) static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp) { + if (!sdp->sd_lockstruct.ls_ops->lm_others_may_mount) + return; if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) sdp->sd_lockstruct.ls_ops->lm_others_may_mount( sdp->sd_lockstruct.ls_lockspace); @@ -371,6 +663,7 @@ static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp) static int init_journal(struct gfs2_sbd *sdp, int undo) { + struct inode *master = sdp->sd_master_dir->d_inode; struct gfs2_holder ji_gh; struct task_struct *p; struct gfs2_inode *ip; @@ -382,7 +675,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) goto fail_recoverd; } - sdp->sd_jindex = gfs2_lookup_simple(sdp->sd_master_dir, "jindex"); + sdp->sd_jindex = gfs2_lookup_simple(master, "jindex"); if (IS_ERR(sdp->sd_jindex)) { fs_err(sdp, "can't lookup journal index: %d\n", error); return PTR_ERR(sdp->sd_jindex); @@ -505,25 +798,17 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) { int error = 0; struct gfs2_inode *ip; - struct inode *inode; + struct inode *master = sdp->sd_master_dir->d_inode; if (undo) goto fail_qinode; - inode = gfs2_lookup_root(sdp->sd_vfs, sdp->sd_sb.sb_master_dir.no_addr); - if (IS_ERR(inode)) { - error = PTR_ERR(inode); - fs_err(sdp, "can't read in master directory: %d\n", error); - goto fail; - } - sdp->sd_master_dir = inode; - error = init_journal(sdp, undo); if (error) - goto fail_master; + goto fail; /* Read in the master inode number inode */ - sdp->sd_inum_inode = gfs2_lookup_simple(sdp->sd_master_dir, "inum"); + sdp->sd_inum_inode = gfs2_lookup_simple(master, "inum"); if (IS_ERR(sdp->sd_inum_inode)) { error = PTR_ERR(sdp->sd_inum_inode); fs_err(sdp, "can't read in inum inode: %d\n", error); @@ -532,7 +817,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) /* Read in the master statfs inode */ - sdp->sd_statfs_inode = gfs2_lookup_simple(sdp->sd_master_dir, "statfs"); + sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs"); if (IS_ERR(sdp->sd_statfs_inode)) { error = PTR_ERR(sdp->sd_statfs_inode); fs_err(sdp, "can't read in statfs inode: %d\n", error); @@ -540,7 +825,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) } /* Read in the resource index inode */ - sdp->sd_rindex = gfs2_lookup_simple(sdp->sd_master_dir, "rindex"); + sdp->sd_rindex = gfs2_lookup_simple(master, "rindex"); if (IS_ERR(sdp->sd_rindex)) { error = PTR_ERR(sdp->sd_rindex); fs_err(sdp, "can't get resource index inode: %d\n", error); @@ -551,7 +836,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) sdp->sd_rindex_uptodate = 0; /* Read in the quota inode */ - sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota"); + sdp->sd_quota_inode = gfs2_lookup_simple(master, "quota"); if (IS_ERR(sdp->sd_quota_inode)) { error = PTR_ERR(sdp->sd_quota_inode); fs_err(sdp, "can't get quota file inode: %d\n", error); @@ -570,8 +855,6 @@ fail_inum: iput(sdp->sd_inum_inode); fail_journal: init_journal(sdp, UNDO); -fail_master: - iput(sdp->sd_master_dir); fail: return error; } @@ -582,6 +865,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) char buf[30]; int error = 0; struct gfs2_inode *ip; + struct inode *master = sdp->sd_master_dir->d_inode; if (sdp->sd_args.ar_spectator) return 0; @@ -589,7 +873,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) if (undo) goto fail_qc_gh; - pn = gfs2_lookup_simple(sdp->sd_master_dir, "per_node"); + pn = gfs2_lookup_simple(master, "per_node"); if (IS_ERR(pn)) { error = PTR_ERR(pn); fs_err(sdp, "can't find per_node directory: %d\n", error); @@ -741,8 +1025,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) goto out; } - if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) || - gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) || + if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) || gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >= GFS2_MIN_LVB_SIZE)) { gfs2_unmount_lockproto(&sdp->sd_lockstruct); @@ -800,7 +1083,11 @@ static int fill_super(struct super_block *sb, void *data, int silent) goto fail; } - init_vfs(sb, SDF_NOATIME); + sb->s_magic = GFS2_MAGIC; + sb->s_op = &gfs2_super_ops; + sb->s_export_op = &gfs2_export_ops; + sb->s_time_gran = 1; + sb->s_maxbytes = MAX_LFS_FILESIZE; /* Set up the buffer cache and fill in some fake block size values to allow us to read-in the on-disk superblock. */ @@ -828,7 +1115,7 @@ static int fill_super(struct super_block *sb, void *data, int silent) if (error) goto fail_lm; - error = init_sb(sdp, silent, DO); + error = init_sb(sdp, silent); if (error) goto fail_locking; @@ -869,11 +1156,15 @@ fail_per_node: fail_inodes: init_inodes(sdp, UNDO); fail_sb: - init_sb(sdp, 0, UNDO); + if (sdp->sd_root_dir) + dput(sdp->sd_root_dir); + if (sdp->sd_master_dir) + dput(sdp->sd_master_dir); + sb->s_root = NULL; fail_locking: init_locking(sdp, &mount_gh, UNDO); fail_lm: - gfs2_gl_hash_clear(sdp, WAIT); + gfs2_gl_hash_clear(sdp); gfs2_lm_unmount(sdp); while (invalidate_inodes(sb)) yield(); @@ -887,151 +1178,63 @@ fail: } static int gfs2_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt) + const char *dev_name, void *data, struct vfsmount *mnt) { - struct super_block *sb; - struct gfs2_sbd *sdp; - int error = get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt); - if (error) - goto out; - sb = mnt->mnt_sb; - sdp = sb->s_fs_info; - sdp->sd_gfs2mnt = mnt; -out: - return error; + return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt); } -static int fill_super_meta(struct super_block *sb, struct super_block *new, - void *data, int silent) +static struct super_block *get_gfs2_sb(const char *dev_name) { - struct gfs2_sbd *sdp = sb->s_fs_info; - struct inode *inode; - int error = 0; - - new->s_fs_info = sdp; - sdp->sd_vfs_meta = sb; - - init_vfs(new, SDF_NOATIME); - - /* Get the master inode */ - inode = igrab(sdp->sd_master_dir); - - new->s_root = d_alloc_root(inode); - if (!new->s_root) { - fs_err(sdp, "can't get root dentry\n"); - error = -ENOMEM; - iput(inode); - } else - new->s_root->d_op = &gfs2_dops; - - return error; -} - -static int set_bdev_super(struct super_block *s, void *data) -{ - s->s_bdev = data; - s->s_dev = s->s_bdev->bd_dev; - return 0; -} - -static int test_bdev_super(struct super_block *s, void *data) -{ - return s->s_bdev == data; -} - -static struct super_block* get_gfs2_sb(const char *dev_name) -{ - struct kstat stat; + struct super_block *sb; struct nameidata nd; - struct super_block *sb = NULL, *s; int error; error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd); if (error) { - printk(KERN_WARNING "GFS2: path_lookup on %s returned error\n", - dev_name); - goto out; - } - error = vfs_getattr(nd.path.mnt, nd.path.dentry, &stat); - - list_for_each_entry(s, &gfs2_fs_type.fs_supers, s_instances) { - if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) || - (S_ISDIR(stat.mode) && - s == nd.path.dentry->d_inode->i_sb)) { - sb = s; - goto free_nd; - } + printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", + dev_name, error); + return NULL; } - - printk(KERN_WARNING "GFS2: Unrecognized block device or " - "mount point %s\n", dev_name); - -free_nd: + sb = nd.path.dentry->d_inode->i_sb; + if (sb && (sb->s_type == &gfs2_fs_type)) + atomic_inc(&sb->s_active); + else + sb = NULL; path_put(&nd.path); -out: return sb; } static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - int error = 0; - struct super_block *sb = NULL, *new; + struct super_block *sb = NULL; struct gfs2_sbd *sdp; sb = get_gfs2_sb(dev_name); if (!sb) { printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); - error = -ENOENT; - goto error; + return -ENOENT; } sdp = sb->s_fs_info; - if (sdp->sd_vfs_meta) { - printk(KERN_WARNING "GFS2: gfs2meta mount already exists\n"); - error = -EBUSY; - goto error; - } - down(&sb->s_bdev->bd_mount_sem); - new = sget(fs_type, test_bdev_super, set_bdev_super, sb->s_bdev); - up(&sb->s_bdev->bd_mount_sem); - if (IS_ERR(new)) { - error = PTR_ERR(new); - goto error; - } - new->s_flags = flags; - strlcpy(new->s_id, sb->s_id, sizeof(new->s_id)); - sb_set_blocksize(new, sb->s_blocksize); - error = fill_super_meta(sb, new, data, flags & MS_SILENT ? 1 : 0); - if (error) { - up_write(&new->s_umount); - deactivate_super(new); - goto error; - } - - new->s_flags |= MS_ACTIVE; - - /* Grab a reference to the gfs2 mount point */ - atomic_inc(&sdp->sd_gfs2mnt->mnt_count); - return simple_set_mnt(mnt, new); -error: - return error; + mnt->mnt_sb = sb; + mnt->mnt_root = dget(sdp->sd_master_dir); + return 0; } static void gfs2_kill_sb(struct super_block *sb) { - if (sb->s_fs_info) { - gfs2_delete_debugfs_file(sb->s_fs_info); - gfs2_meta_syncfs(sb->s_fs_info); + struct gfs2_sbd *sdp = sb->s_fs_info; + if (sdp) { + gfs2_meta_syncfs(sdp); + dput(sdp->sd_root_dir); + dput(sdp->sd_master_dir); + sdp->sd_root_dir = NULL; + sdp->sd_master_dir = NULL; } + shrink_dcache_sb(sb); kill_block_super(sb); -} - -static void gfs2_kill_sb_meta(struct super_block *sb) -{ - struct gfs2_sbd *sdp = sb->s_fs_info; - generic_shutdown_super(sb); - sdp->sd_vfs_meta = NULL; - atomic_dec(&sdp->sd_gfs2mnt->mnt_count); + if (sdp) + gfs2_delete_debugfs_file(sdp); } struct file_system_type gfs2_fs_type = { @@ -1046,7 +1249,6 @@ struct file_system_type gfs2meta_fs_type = { .name = "gfs2meta", .fs_flags = FS_REQUIRES_DEV, .get_sb = gfs2_get_sb_meta, - .kill_sb = gfs2_kill_sb_meta, .owner = THIS_MODULE, }; diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 2686ad4c0029..534e1e2c65ca 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -74,7 +74,7 @@ static int gfs2_create(struct inode *dir, struct dentry *dentry, return PTR_ERR(inode); } - inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd); + inode = gfs2_lookupi(dir, &dentry->d_name, 0); if (inode) { if (!IS_ERR(inode)) { gfs2_holder_uninit(ghs); @@ -109,7 +109,7 @@ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, dentry->d_op = &gfs2_dops; - inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd); + inode = gfs2_lookupi(dir, &dentry->d_name, 0); if (inode && IS_ERR(inode)) return ERR_CAST(inode); @@ -159,11 +159,15 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); - error = gfs2_glock_nq_m(2, ghs); + error = gfs2_glock_nq(ghs); /* parent */ if (error) - goto out; + goto out_parent; + + error = gfs2_glock_nq(ghs + 1); /* child */ + if (error) + goto out_child; - error = permission(dir, MAY_WRITE | MAY_EXEC, NULL); + error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC); if (error) goto out_gunlock; @@ -245,8 +249,10 @@ out_alloc: if (alloc_required) gfs2_alloc_put(dip); out_gunlock: - gfs2_glock_dq_m(2, ghs); -out: + gfs2_glock_dq(ghs + 1); +out_child: + gfs2_glock_dq(ghs); +out_parent: gfs2_holder_uninit(ghs); gfs2_holder_uninit(ghs + 1); if (!error) { @@ -302,7 +308,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) error = gfs2_unlink_ok(dip, &dentry->d_name, ip); if (error) - goto out_rgrp; + goto out_gunlock; error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0); if (error) @@ -316,6 +322,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) out_end_trans: gfs2_trans_end(sdp); +out_gunlock: gfs2_glock_dq(ghs + 2); out_rgrp: gfs2_holder_uninit(ghs + 2); @@ -485,7 +492,6 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry) struct gfs2_holder ri_gh; int error; - error = gfs2_rindex_hold(sdp, &ri_gh); if (error) return error; @@ -495,9 +501,17 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry) rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); - error = gfs2_glock_nq_m(3, ghs); + error = gfs2_glock_nq(ghs); /* parent */ if (error) - goto out; + goto out_parent; + + error = gfs2_glock_nq(ghs + 1); /* child */ + if (error) + goto out_child; + + error = gfs2_glock_nq(ghs + 2); /* rgrp */ + if (error) + goto out_rgrp; error = gfs2_unlink_ok(dip, &dentry->d_name, ip); if (error) @@ -523,11 +537,15 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry) gfs2_trans_end(sdp); out_gunlock: - gfs2_glock_dq_m(3, ghs); -out: - gfs2_holder_uninit(ghs); - gfs2_holder_uninit(ghs + 1); + gfs2_glock_dq(ghs + 2); +out_rgrp: gfs2_holder_uninit(ghs + 2); + gfs2_glock_dq(ghs + 1); +out_child: + gfs2_holder_uninit(ghs + 1); + gfs2_glock_dq(ghs); +out_parent: + gfs2_holder_uninit(ghs); gfs2_glock_dq_uninit(&ri_gh); return error; } @@ -571,6 +589,54 @@ static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode, return 0; } +/* + * gfs2_ok_to_move - check if it's ok to move a directory to another directory + * @this: move this + * @to: to here + * + * Follow @to back to the root and make sure we don't encounter @this + * Assumes we already hold the rename lock. + * + * Returns: errno + */ + +static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) +{ + struct inode *dir = &to->i_inode; + struct super_block *sb = dir->i_sb; + struct inode *tmp; + struct qstr dotdot; + int error = 0; + + gfs2_str2qstr(&dotdot, ".."); + + igrab(dir); + + for (;;) { + if (dir == &this->i_inode) { + error = -EINVAL; + break; + } + if (dir == sb->s_root->d_inode) { + error = 0; + break; + } + + tmp = gfs2_lookupi(dir, &dotdot, 1); + if (IS_ERR(tmp)) { + error = PTR_ERR(tmp); + break; + } + + iput(dir); + dir = tmp; + } + + iput(dir); + + return error; +} + /** * gfs2_rename - Rename a file * @odir: Parent directory of old file name @@ -589,7 +655,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, struct gfs2_inode *ip = GFS2_I(odentry->d_inode); struct gfs2_inode *nip = NULL; struct gfs2_sbd *sdp = GFS2_SB(odir); - struct gfs2_holder ghs[5], r_gh; + struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; struct gfs2_rgrpd *nrgd; unsigned int num_gh; int dir_rename = 0; @@ -603,19 +669,20 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, return 0; } - /* Make sure we aren't trying to move a dirctory into it's subdir */ - if (S_ISDIR(ip->i_inode.i_mode) && odip != ndip) { - dir_rename = 1; - - error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, 0, - &r_gh); + if (odip != ndip) { + error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, + 0, &r_gh); if (error) goto out; - error = gfs2_ok_to_move(ip, ndip); - if (error) - goto out_gunlock_r; + if (S_ISDIR(ip->i_inode.i_mode)) { + dir_rename = 1; + /* don't move a dirctory into it's subdir */ + error = gfs2_ok_to_move(ip, ndip); + if (error) + goto out_gunlock_r; + } } num_gh = 1; @@ -639,9 +706,11 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++); } - error = gfs2_glock_nq_m(num_gh, ghs); - if (error) - goto out_uninit; + for (x = 0; x < num_gh; x++) { + error = gfs2_glock_nq(ghs + x); + if (error) + goto out_gunlock; + } /* Check out the old directory */ @@ -669,7 +738,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, } } } else { - error = permission(ndir, MAY_WRITE | MAY_EXEC, NULL); + error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC); if (error) goto out_gunlock; @@ -704,7 +773,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, /* Check out the dir to be renamed */ if (dir_rename) { - error = permission(odentry->d_inode, MAY_WRITE, NULL); + error = gfs2_permission(odentry->d_inode, MAY_WRITE); if (error) goto out_gunlock; } @@ -804,12 +873,12 @@ out_alloc: if (alloc_required) gfs2_alloc_put(ndip); out_gunlock: - gfs2_glock_dq_m(num_gh, ghs); -out_uninit: - for (x = 0; x < num_gh; x++) + while (x--) { + gfs2_glock_dq(ghs + x); gfs2_holder_uninit(ghs + x); + } out_gunlock_r: - if (dir_rename) + if (r_gh.gh_gl) gfs2_glock_dq_uninit(&r_gh); out: return error; @@ -891,7 +960,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) * Returns: errno */ -static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd) +int gfs2_permission(struct inode *inode, int mask) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder i_gh; @@ -905,7 +974,10 @@ static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd) unlock = 1; } - error = generic_permission(inode, mask, gfs2_check_acl); + if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) + error = -EACCES; + else + error = generic_permission(inode, mask, gfs2_check_acl); if (unlock) gfs2_glock_dq_uninit(&i_gh); diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index 0b7cc920eb89..d5355d9b5926 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -20,6 +20,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> #include <linux/lm_interface.h> +#include <linux/time.h> #include "gfs2.h" #include "incore.h" @@ -38,6 +39,7 @@ #include "dir.h" #include "eattr.h" #include "bmap.h" +#include "meta_io.h" /** * gfs2_write_inode - Make sure the inode is stable on the disk @@ -50,16 +52,74 @@ static int gfs2_write_inode(struct inode *inode, int sync) { struct gfs2_inode *ip = GFS2_I(inode); - - /* Check this is a "normal" inode */ - if (test_bit(GIF_USER, &ip->i_flags)) { - if (current->flags & PF_MEMALLOC) - return 0; - if (sync) - gfs2_log_flush(GFS2_SB(inode), ip->i_gl); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_holder gh; + struct buffer_head *bh; + struct timespec atime; + struct gfs2_dinode *di; + int ret = 0; + + /* Check this is a "normal" inode, etc */ + if (!test_bit(GIF_USER, &ip->i_flags) || + (current->flags & PF_MEMALLOC)) + return 0; + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (ret) + goto do_flush; + ret = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (ret) + goto do_unlock; + ret = gfs2_meta_inode_buffer(ip, &bh); + if (ret == 0) { + di = (struct gfs2_dinode *)bh->b_data; + atime.tv_sec = be64_to_cpu(di->di_atime); + atime.tv_nsec = be32_to_cpu(di->di_atime_nsec); + if (timespec_compare(&inode->i_atime, &atime) > 0) { + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_dinode_out(ip, bh->b_data); + } + brelse(bh); } + gfs2_trans_end(sdp); +do_unlock: + gfs2_glock_dq_uninit(&gh); +do_flush: + if (sync != 0) + gfs2_log_flush(GFS2_SB(inode), ip->i_gl); + return ret; +} - return 0; +/** + * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one + * @sdp: the filesystem + * + * Returns: errno + */ + +static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) +{ + struct gfs2_holder t_gh; + int error; + + gfs2_quota_sync(sdp); + gfs2_statfs_sync(sdp); + + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, + &t_gh); + if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return error; + + gfs2_meta_syncfs(sdp); + gfs2_log_shutdown(sdp); + + clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + + if (t_gh.gh_gl) + gfs2_glock_dq_uninit(&t_gh); + + gfs2_quota_cleanup(sdp); + + return error; } /** @@ -73,12 +133,6 @@ static void gfs2_put_super(struct super_block *sb) struct gfs2_sbd *sdp = sb->s_fs_info; int error; - if (!sdp) - return; - - if (!strncmp(sb->s_type->name, "gfs2meta", 8)) - return; /* Nothing to do */ - /* Unfreeze the filesystem, if we need to */ mutex_lock(&sdp->sd_freeze_lock); @@ -101,7 +155,6 @@ static void gfs2_put_super(struct super_block *sb) /* Release stuff */ - iput(sdp->sd_master_dir); iput(sdp->sd_jindex); iput(sdp->sd_inum_inode); iput(sdp->sd_statfs_inode); @@ -126,7 +179,7 @@ static void gfs2_put_super(struct super_block *sb) gfs2_clear_rgrpd(sdp); gfs2_jindex_free(sdp); /* Take apart glock structures and buffer lists */ - gfs2_gl_hash_clear(sdp, WAIT); + gfs2_gl_hash_clear(sdp); /* Unmount the locking protocol */ gfs2_lm_unmount(sdp); @@ -152,10 +205,11 @@ static void gfs2_write_super(struct super_block *sb) * * Flushes the log to disk. */ + static int gfs2_sync_fs(struct super_block *sb, int wait) { sb->s_dirt = 0; - if (wait) + if (wait && sb->s_fs_info) gfs2_log_flush(sb->s_fs_info, NULL); return 0; } @@ -270,14 +324,6 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) } } - if (*flags & (MS_NOATIME | MS_NODIRATIME)) - set_bit(SDF_NOATIME, &sdp->sd_flags); - else - clear_bit(SDF_NOATIME, &sdp->sd_flags); - - /* Don't let the VFS update atimes. GFS2 handles this itself. */ - *flags |= MS_NOATIME | MS_NODIRATIME; - return error; } @@ -295,6 +341,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) * inode's blocks, or alternatively pass the baton on to another * node for later deallocation. */ + static void gfs2_drop_inode(struct inode *inode) { struct gfs2_inode *ip = GFS2_I(inode); @@ -333,6 +380,16 @@ static void gfs2_clear_inode(struct inode *inode) } } +static int is_ancestor(const struct dentry *d1, const struct dentry *d2) +{ + do { + if (d1 == d2) + return 1; + d1 = d1->d_parent; + } while (!IS_ROOT(d1)); + return 0; +} + /** * gfs2_show_options - Show mount options for /proc/mounts * @s: seq_file structure @@ -346,6 +403,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; struct gfs2_args *args = &sdp->sd_args; + if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir)) + seq_printf(s, ",meta"); if (args->ar_lockproto[0]) seq_printf(s, ",lockproto=%s", args->ar_lockproto); if (args->ar_locktable[0]) @@ -414,6 +473,7 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) * conversion on the iopen lock, but we can change that later. This * is safe, just less efficient. */ + static void gfs2_delete_inode(struct inode *inode) { struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; @@ -478,8 +538,6 @@ out: clear_inode(inode); } - - static struct inode *gfs2_alloc_inode(struct super_block *sb) { struct gfs2_inode *ip; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 56aaf915c59a..3e073f5144fa 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -904,7 +904,7 @@ static int need_sync(struct gfs2_quota_data *qd) do_sync = 0; else { value *= gfs2_jindex_size(sdp) * num; - do_div(value, den); + value = div_s64(value, den); value += (s64)be64_to_cpu(qd->qd_qb.qb_value); if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit)) do_sync = 0; diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 2888e4b4b1c5..d5e91f4f6a0b 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -428,6 +428,9 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, unsigned int message) { + if (!sdp->sd_lockstruct.ls_ops->lm_recovery_done) + return; + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) sdp->sd_lockstruct.ls_ops->lm_recovery_done( sdp->sd_lockstruct.ls_lockspace, jid, message); @@ -505,7 +508,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd) error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, LM_FLAG_NOEXP | LM_FLAG_PRIORITY | - GL_NOCANCEL | GL_NOCACHE, &t_gh); + GL_NOCACHE, &t_gh); if (error) goto fail_gunlock_ji; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 3401628d742b..2d90fb253505 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -371,11 +371,6 @@ static void clear_rgrpdi(struct gfs2_sbd *sdp) spin_lock(&sdp->sd_rindex_spin); sdp->sd_rindex_forward = NULL; - head = &sdp->sd_rindex_recent_list; - while (!list_empty(head)) { - rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent); - list_del(&rgd->rd_recent); - } spin_unlock(&sdp->sd_rindex_spin); head = &sdp->sd_rindex_list; @@ -945,107 +940,30 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked) } /** - * recent_rgrp_first - get first RG from "recent" list - * @sdp: The GFS2 superblock - * @rglast: address of the rgrp used last - * - * Returns: The first rgrp in the recent list - */ - -static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp, - u64 rglast) -{ - struct gfs2_rgrpd *rgd; - - spin_lock(&sdp->sd_rindex_spin); - - if (rglast) { - list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) { - if (rgrp_contains_block(rgd, rglast)) - goto out; - } - } - rgd = NULL; - if (!list_empty(&sdp->sd_rindex_recent_list)) - rgd = list_entry(sdp->sd_rindex_recent_list.next, - struct gfs2_rgrpd, rd_recent); -out: - spin_unlock(&sdp->sd_rindex_spin); - return rgd; -} - -/** * recent_rgrp_next - get next RG from "recent" list * @cur_rgd: current rgrp - * @remove: * * Returns: The next rgrp in the recent list */ -static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd, - int remove) +static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd) { struct gfs2_sbd *sdp = cur_rgd->rd_sbd; struct list_head *head; struct gfs2_rgrpd *rgd; spin_lock(&sdp->sd_rindex_spin); - - head = &sdp->sd_rindex_recent_list; - - list_for_each_entry(rgd, head, rd_recent) { - if (rgd == cur_rgd) { - if (cur_rgd->rd_recent.next != head) - rgd = list_entry(cur_rgd->rd_recent.next, - struct gfs2_rgrpd, rd_recent); - else - rgd = NULL; - - if (remove) - list_del(&cur_rgd->rd_recent); - - goto out; - } + head = &sdp->sd_rindex_mru_list; + if (unlikely(cur_rgd->rd_list_mru.next == head)) { + spin_unlock(&sdp->sd_rindex_spin); + return NULL; } - - rgd = NULL; - if (!list_empty(head)) - rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent); - -out: + rgd = list_entry(cur_rgd->rd_list_mru.next, struct gfs2_rgrpd, rd_list_mru); spin_unlock(&sdp->sd_rindex_spin); return rgd; } /** - * recent_rgrp_add - add an RG to tail of "recent" list - * @new_rgd: The rgrp to add - * - */ - -static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd) -{ - struct gfs2_sbd *sdp = new_rgd->rd_sbd; - struct gfs2_rgrpd *rgd; - unsigned int count = 0; - unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp); - - spin_lock(&sdp->sd_rindex_spin); - - list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) { - if (rgd == new_rgd) - goto out; - - if (++count >= max) - goto out; - } - list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list); - -out: - spin_unlock(&sdp->sd_rindex_spin); -} - -/** * forward_rgrp_get - get an rgrp to try next from full list * @sdp: The GFS2 superblock * @@ -1112,9 +1030,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) int loops = 0; int error, rg_locked; - /* Try recently successful rgrps */ - - rgd = recent_rgrp_first(sdp, ip->i_goal); + rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); while (rgd) { rg_locked = 0; @@ -1136,11 +1052,9 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) gfs2_glock_dq_uninit(&al->al_rgd_gh); if (inode) return inode; - rgd = recent_rgrp_next(rgd, 1); - break; - + /* fall through */ case GLR_TRYFAILED: - rgd = recent_rgrp_next(rgd, 0); + rgd = recent_rgrp_next(rgd); break; default: @@ -1199,7 +1113,9 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) out: if (begin) { - recent_rgrp_add(rgd); + spin_lock(&sdp->sd_rindex_spin); + list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); + spin_unlock(&sdp->sd_rindex_spin); rgd = gfs2_rgrpd_get_next(rgd); if (!rgd) rgd = gfs2_rgrpd_get_first(sdp); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 7aeacbc65f35..c3ba3d9d0aac 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -33,314 +33,6 @@ #include "trans.h" #include "util.h" -static const u32 gfs2_old_fs_formats[] = { - 0 -}; - -static const u32 gfs2_old_multihost_formats[] = { - 0 -}; - -/** - * gfs2_tune_init - Fill a gfs2_tune structure with default values - * @gt: tune - * - */ - -void gfs2_tune_init(struct gfs2_tune *gt) -{ - spin_lock_init(>->gt_spin); - - gt->gt_demote_secs = 300; - gt->gt_incore_log_blocks = 1024; - gt->gt_log_flush_secs = 60; - gt->gt_recoverd_secs = 60; - gt->gt_logd_secs = 1; - gt->gt_quotad_secs = 5; - gt->gt_quota_simul_sync = 64; - gt->gt_quota_warn_period = 10; - gt->gt_quota_scale_num = 1; - gt->gt_quota_scale_den = 1; - gt->gt_quota_cache_secs = 300; - gt->gt_quota_quantum = 60; - gt->gt_atime_quantum = 3600; - gt->gt_new_files_jdata = 0; - gt->gt_new_files_directio = 0; - gt->gt_max_readahead = 1 << 18; - gt->gt_stall_secs = 600; - gt->gt_complain_secs = 10; - gt->gt_statfs_quantum = 30; - gt->gt_statfs_slow = 0; -} - -/** - * gfs2_check_sb - Check superblock - * @sdp: the filesystem - * @sb: The superblock - * @silent: Don't print a message if the check fails - * - * Checks the version code of the FS is one that we understand how to - * read and that the sizes of the various on-disk structures have not - * changed. - */ - -int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent) -{ - unsigned int x; - - if (sb->sb_magic != GFS2_MAGIC || - sb->sb_type != GFS2_METATYPE_SB) { - if (!silent) - printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n"); - return -EINVAL; - } - - /* If format numbers match exactly, we're done. */ - - if (sb->sb_fs_format == GFS2_FORMAT_FS && - sb->sb_multihost_format == GFS2_FORMAT_MULTI) - return 0; - - if (sb->sb_fs_format != GFS2_FORMAT_FS) { - for (x = 0; gfs2_old_fs_formats[x]; x++) - if (gfs2_old_fs_formats[x] == sb->sb_fs_format) - break; - - if (!gfs2_old_fs_formats[x]) { - printk(KERN_WARNING - "GFS2: code version (%u, %u) is incompatible " - "with ondisk format (%u, %u)\n", - GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, - sb->sb_fs_format, sb->sb_multihost_format); - printk(KERN_WARNING - "GFS2: I don't know how to upgrade this FS\n"); - return -EINVAL; - } - } - - if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) { - for (x = 0; gfs2_old_multihost_formats[x]; x++) - if (gfs2_old_multihost_formats[x] == - sb->sb_multihost_format) - break; - - if (!gfs2_old_multihost_formats[x]) { - printk(KERN_WARNING - "GFS2: code version (%u, %u) is incompatible " - "with ondisk format (%u, %u)\n", - GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, - sb->sb_fs_format, sb->sb_multihost_format); - printk(KERN_WARNING - "GFS2: I don't know how to upgrade this FS\n"); - return -EINVAL; - } - } - - if (!sdp->sd_args.ar_upgrade) { - printk(KERN_WARNING - "GFS2: code version (%u, %u) is incompatible " - "with ondisk format (%u, %u)\n", - GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, - sb->sb_fs_format, sb->sb_multihost_format); - printk(KERN_INFO - "GFS2: Use the \"upgrade\" mount option to upgrade " - "the FS\n"); - printk(KERN_INFO "GFS2: See the manual for more details\n"); - return -EINVAL; - } - - return 0; -} - - -static void end_bio_io_page(struct bio *bio, int error) -{ - struct page *page = bio->bi_private; - - if (!error) - SetPageUptodate(page); - else - printk(KERN_WARNING "gfs2: error %d reading superblock\n", error); - unlock_page(page); -} - -static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf) -{ - const struct gfs2_sb *str = buf; - - sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic); - sb->sb_type = be32_to_cpu(str->sb_header.mh_type); - sb->sb_format = be32_to_cpu(str->sb_header.mh_format); - sb->sb_fs_format = be32_to_cpu(str->sb_fs_format); - sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format); - sb->sb_bsize = be32_to_cpu(str->sb_bsize); - sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift); - sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr); - sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino); - sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr); - sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino); - - memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN); - memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN); -} - -/** - * gfs2_read_super - Read the gfs2 super block from disk - * @sdp: The GFS2 super block - * @sector: The location of the super block - * @error: The error code to return - * - * This uses the bio functions to read the super block from disk - * because we want to be 100% sure that we never read cached data. - * A super block is read twice only during each GFS2 mount and is - * never written to by the filesystem. The first time its read no - * locks are held, and the only details which are looked at are those - * relating to the locking protocol. Once locking is up and working, - * the sb is read again under the lock to establish the location of - * the master directory (contains pointers to journals etc) and the - * root directory. - * - * Returns: 0 on success or error - */ - -int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector) -{ - struct super_block *sb = sdp->sd_vfs; - struct gfs2_sb *p; - struct page *page; - struct bio *bio; - - page = alloc_page(GFP_NOFS); - if (unlikely(!page)) - return -ENOBUFS; - - ClearPageUptodate(page); - ClearPageDirty(page); - lock_page(page); - - bio = bio_alloc(GFP_NOFS, 1); - if (unlikely(!bio)) { - __free_page(page); - return -ENOBUFS; - } - - bio->bi_sector = sector * (sb->s_blocksize >> 9); - bio->bi_bdev = sb->s_bdev; - bio_add_page(bio, page, PAGE_SIZE, 0); - - bio->bi_end_io = end_bio_io_page; - bio->bi_private = page; - submit_bio(READ_SYNC | (1 << BIO_RW_META), bio); - wait_on_page_locked(page); - bio_put(bio); - if (!PageUptodate(page)) { - __free_page(page); - return -EIO; - } - p = kmap(page); - gfs2_sb_in(&sdp->sd_sb, p); - kunmap(page); - __free_page(page); - return 0; -} - -/** - * gfs2_read_sb - Read super block - * @sdp: The GFS2 superblock - * @gl: the glock for the superblock (assumed to be held) - * @silent: Don't print message if mount fails - * - */ - -int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent) -{ - u32 hash_blocks, ind_blocks, leaf_blocks; - u32 tmp_blocks; - unsigned int x; - int error; - - error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift); - if (error) { - if (!silent) - fs_err(sdp, "can't read superblock\n"); - return error; - } - - error = gfs2_check_sb(sdp, &sdp->sd_sb, silent); - if (error) - return error; - - sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - - GFS2_BASIC_BLOCK_SHIFT; - sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; - sdp->sd_diptrs = (sdp->sd_sb.sb_bsize - - sizeof(struct gfs2_dinode)) / sizeof(u64); - sdp->sd_inptrs = (sdp->sd_sb.sb_bsize - - sizeof(struct gfs2_meta_header)) / sizeof(u64); - sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header); - sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2; - sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1; - sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64); - sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize - - sizeof(struct gfs2_meta_header)) / - sizeof(struct gfs2_quota_change); - - /* Compute maximum reservation required to add a entry to a directory */ - - hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH), - sdp->sd_jbsize); - - ind_blocks = 0; - for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) { - tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs); - ind_blocks += tmp_blocks; - } - - leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH; - - sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks; - - sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize - - sizeof(struct gfs2_dinode); - sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs; - for (x = 2;; x++) { - u64 space, d; - u32 m; - - space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs; - d = space; - m = do_div(d, sdp->sd_inptrs); - - if (d != sdp->sd_heightsize[x - 1] || m) - break; - sdp->sd_heightsize[x] = space; - } - sdp->sd_max_height = x; - sdp->sd_heightsize[x] = ~0; - gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT); - - sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize - - sizeof(struct gfs2_dinode); - sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs; - for (x = 2;; x++) { - u64 space, d; - u32 m; - - space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs; - d = space; - m = do_div(d, sdp->sd_inptrs); - - if (d != sdp->sd_jheightsize[x - 1] || m) - break; - sdp->sd_jheightsize[x] = space; - } - sdp->sd_max_jheight = x; - sdp->sd_jheightsize[x] = ~0; - gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT); - - return 0; -} - /** * gfs2_jindex_hold - Grab a lock on the jindex * @sdp: The GFS2 superblock @@ -390,7 +82,7 @@ int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) break; INIT_LIST_HEAD(&jd->extent_list); - jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL); + jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { if (!jd->jd_inode) error = -ENOENT; @@ -582,39 +274,6 @@ fail: return error; } -/** - * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one - * @sdp: the filesystem - * - * Returns: errno - */ - -int gfs2_make_fs_ro(struct gfs2_sbd *sdp) -{ - struct gfs2_holder t_gh; - int error; - - gfs2_quota_sync(sdp); - gfs2_statfs_sync(sdp); - - error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, - &t_gh); - if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) - return error; - - gfs2_meta_syncfs(sdp); - gfs2_log_shutdown(sdp); - - clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - - if (t_gh.gh_gl) - gfs2_glock_dq_uninit(&t_gh); - - gfs2_quota_cleanup(sdp); - - return error; -} - static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) { const struct gfs2_statfs_change *str = buf; @@ -941,8 +600,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, } error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED, - LM_FLAG_PRIORITY | GL_NOCACHE, - t_gh); + GL_NOCACHE, t_gh); list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { error = gfs2_jdesc_check(jd); diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 44361ecc44f7..50a4c9b1215e 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -12,11 +12,6 @@ #include "incore.h" -void gfs2_tune_init(struct gfs2_tune *gt); - -int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent); -int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent); -int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector); void gfs2_lm_unmount(struct gfs2_sbd *sdp); static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) @@ -40,7 +35,6 @@ int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, struct gfs2_inode **ipp); int gfs2_make_fs_rw(struct gfs2_sbd *sdp); -int gfs2_make_fs_ro(struct gfs2_sbd *sdp); int gfs2_statfs_init(struct gfs2_sbd *sdp); void gfs2_statfs_change(struct gfs2_sbd *sdp, diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 9ab9fc85ecd0..7e1879f1a02c 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -110,18 +110,6 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf, return len; } -static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len) -{ - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - - if (simple_strtol(buf, NULL, 0) != 1) - return -EINVAL; - - gfs2_gl_hash_clear(sdp, NO_WAIT); - return len; -} - static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { @@ -175,7 +163,6 @@ static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store) GFS2_ATTR(id, 0444, id_show, NULL); GFS2_ATTR(fsname, 0444, fsname_show, NULL); GFS2_ATTR(freeze, 0644, freeze_show, freeze_store); -GFS2_ATTR(shrink, 0200, NULL, shrink_store); GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store); GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store); GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store); @@ -186,7 +173,6 @@ static struct attribute *gfs2_attrs[] = { &gfs2_attr_id.attr, &gfs2_attr_fsname.attr, &gfs2_attr_freeze.attr, - &gfs2_attr_shrink.attr, &gfs2_attr_withdraw.attr, &gfs2_attr_statfs_sync.attr, &gfs2_attr_quota_sync.attr, @@ -283,14 +269,6 @@ ARGS_ATTR(quota, "%u\n"); ARGS_ATTR(suiddir, "%d\n"); ARGS_ATTR(data, "%d\n"); -/* one oddball doesn't fit the macro mold */ -static ssize_t noatime_show(struct gfs2_sbd *sdp, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%d\n", - !!test_bit(SDF_NOATIME, &sdp->sd_flags)); -} -static struct args_attr args_attr_noatime = __ATTR_RO(noatime); - static struct attribute *args_attrs[] = { &args_attr_lockproto.attr, &args_attr_locktable.attr, @@ -306,7 +284,6 @@ static struct attribute *args_attrs[] = { &args_attr_quota.attr, &args_attr_suiddir.attr, &args_attr_data.attr, - &args_attr_noatime.attr, NULL, }; @@ -421,12 +398,10 @@ TUNE_ATTR(incore_log_blocks, 0); TUNE_ATTR(log_flush_secs, 0); TUNE_ATTR(quota_warn_period, 0); TUNE_ATTR(quota_quantum, 0); -TUNE_ATTR(atime_quantum, 0); TUNE_ATTR(max_readahead, 0); TUNE_ATTR(complain_secs, 0); TUNE_ATTR(statfs_slow, 0); TUNE_ATTR(new_files_jdata, 0); -TUNE_ATTR(new_files_directio, 0); TUNE_ATTR(quota_simul_sync, 1); TUNE_ATTR(quota_cache_secs, 1); TUNE_ATTR(stall_secs, 1); @@ -442,7 +417,6 @@ static struct attribute *tune_attrs[] = { &tune_attr_log_flush_secs.attr, &tune_attr_quota_warn_period.attr, &tune_attr_quota_quantum.attr, - &tune_attr_atime_quantum.attr, &tune_attr_max_readahead.attr, &tune_attr_complain_secs.attr, &tune_attr_statfs_slow.attr, @@ -455,7 +429,6 @@ static struct attribute *tune_attrs[] = { &tune_attr_quotad_secs.attr, &tune_attr_quota_scale.attr, &tune_attr_new_files_jdata.attr, - &tune_attr_new_files_directio.attr, NULL, }; diff --git a/fs/hfs/bitmap.c b/fs/hfs/bitmap.c index 24e75798ddf0..c6e97366e8ac 100644 --- a/fs/hfs/bitmap.c +++ b/fs/hfs/bitmap.c @@ -145,7 +145,7 @@ u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits) if (!*num_bits) return 0; - down(&HFS_SB(sb)->bitmap_lock); + mutex_lock(&HFS_SB(sb)->bitmap_lock); bitmap = HFS_SB(sb)->bitmap; pos = hfs_find_set_zero_bits(bitmap, HFS_SB(sb)->fs_ablocks, goal, num_bits); @@ -162,7 +162,7 @@ u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits) HFS_SB(sb)->free_ablocks -= *num_bits; hfs_bitmap_dirty(sb); out: - up(&HFS_SB(sb)->bitmap_lock); + mutex_unlock(&HFS_SB(sb)->bitmap_lock); return pos; } @@ -205,7 +205,7 @@ int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count) if ((start + count) > HFS_SB(sb)->fs_ablocks) return -2; - down(&HFS_SB(sb)->bitmap_lock); + mutex_lock(&HFS_SB(sb)->bitmap_lock); /* bitmap is always on a 32-bit boundary */ curr = HFS_SB(sb)->bitmap + (start / 32); len = count; @@ -236,7 +236,7 @@ int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count) } out: HFS_SB(sb)->free_ablocks += len; - up(&HFS_SB(sb)->bitmap_lock); + mutex_unlock(&HFS_SB(sb)->bitmap_lock); hfs_bitmap_dirty(sb); return 0; diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index f6621a785202..9b9d6395bad3 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -40,7 +40,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke { struct hfs_mdb *mdb = HFS_SB(sb)->mdb; HFS_I(tree->inode)->flags = 0; - init_MUTEX(&HFS_I(tree->inode)->extents_lock); + mutex_init(&HFS_I(tree->inode)->extents_lock); switch (id) { case HFS_EXT_CNID: hfs_inode_read_fork(tree->inode, mdb->drXTExtRec, mdb->drXTFlSize, diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index c176f67ba0a5..2c16316d2917 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -343,16 +343,16 @@ int hfs_get_block(struct inode *inode, sector_t block, goto done; } - down(&HFS_I(inode)->extents_lock); + mutex_lock(&HFS_I(inode)->extents_lock); res = hfs_ext_read_extent(inode, ablock); if (!res) dblock = hfs_ext_find_block(HFS_I(inode)->cached_extents, ablock - HFS_I(inode)->cached_start); else { - up(&HFS_I(inode)->extents_lock); + mutex_unlock(&HFS_I(inode)->extents_lock); return -EIO; } - up(&HFS_I(inode)->extents_lock); + mutex_unlock(&HFS_I(inode)->extents_lock); done: map_bh(bh_result, sb, HFS_SB(sb)->fs_start + @@ -375,7 +375,7 @@ int hfs_extend_file(struct inode *inode) u32 start, len, goal; int res; - down(&HFS_I(inode)->extents_lock); + mutex_lock(&HFS_I(inode)->extents_lock); if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) goal = hfs_ext_lastblock(HFS_I(inode)->first_extents); else { @@ -425,7 +425,7 @@ int hfs_extend_file(struct inode *inode) goto insert_extent; } out: - up(&HFS_I(inode)->extents_lock); + mutex_unlock(&HFS_I(inode)->extents_lock); if (!res) { HFS_I(inode)->alloc_blocks += len; mark_inode_dirty(inode); @@ -487,7 +487,7 @@ void hfs_file_truncate(struct inode *inode) if (blk_cnt == alloc_cnt) goto out; - down(&HFS_I(inode)->extents_lock); + mutex_lock(&HFS_I(inode)->extents_lock); hfs_find_init(HFS_SB(sb)->ext_tree, &fd); while (1) { if (alloc_cnt == HFS_I(inode)->first_blocks) { @@ -514,7 +514,7 @@ void hfs_file_truncate(struct inode *inode) hfs_brec_remove(&fd); } hfs_find_exit(&fd); - up(&HFS_I(inode)->extents_lock); + mutex_unlock(&HFS_I(inode)->extents_lock); HFS_I(inode)->alloc_blocks = blk_cnt; out: diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 147374b6f675..9955232fdf8c 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -11,6 +11,7 @@ #include <linux/slab.h> #include <linux/types.h> +#include <linux/mutex.h> #include <linux/buffer_head.h> #include <linux/fs.h> @@ -53,7 +54,7 @@ struct hfs_inode_info { struct list_head open_dir_list; struct inode *rsrc_inode; - struct semaphore extents_lock; + struct mutex extents_lock; u16 alloc_blocks, clump_blocks; sector_t fs_blocks; @@ -139,7 +140,7 @@ struct hfs_sb_info { struct nls_table *nls_io, *nls_disk; - struct semaphore bitmap_lock; + struct mutex bitmap_lock; unsigned long flags; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 97f8446c4ff4..7e19835efa2e 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -150,7 +150,7 @@ struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode) if (!inode) return NULL; - init_MUTEX(&HFS_I(inode)->extents_lock); + mutex_init(&HFS_I(inode)->extents_lock); INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name); inode->i_ino = HFS_SB(sb)->next_id++; @@ -281,7 +281,7 @@ static int hfs_read_inode(struct inode *inode, void *data) HFS_I(inode)->flags = 0; HFS_I(inode)->rsrc_inode = NULL; - init_MUTEX(&HFS_I(inode)->extents_lock); + mutex_init(&HFS_I(inode)->extents_lock); INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); /* Initialize the inode */ @@ -511,8 +511,7 @@ void hfs_clear_inode(struct inode *inode) } } -static int hfs_permission(struct inode *inode, int mask, - struct nameidata *nd) +static int hfs_permission(struct inode *inode, int mask) { if (S_ISREG(inode->i_mode) && mask & MAY_EXEC) return 0; @@ -523,8 +522,6 @@ static int hfs_file_open(struct inode *inode, struct file *file) { if (HFS_IS_RSRC(inode)) inode = HFS_I(inode)->rsrc_inode; - if (atomic_read(&file->f_count) != 1) - return 0; atomic_inc(&HFS_I(inode)->opencnt); return 0; } @@ -535,8 +532,6 @@ static int hfs_file_release(struct inode *inode, struct file *file) if (HFS_IS_RSRC(inode)) inode = HFS_I(inode)->rsrc_inode; - if (atomic_read(&file->f_count) != 0) - return 0; if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) { mutex_lock(&inode->i_mutex); hfs_file_truncate(inode); diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 8cf67974adf6..3c7c7637719c 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -173,7 +173,7 @@ enum { opt_err }; -static match_table_t tokens = { +static const match_table_t tokens = { { opt_uid, "uid=%u" }, { opt_gid, "gid=%u" }, { opt_umask, "umask=%o" }, @@ -372,7 +372,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &hfs_super_operations; sb->s_flags |= MS_NODIRATIME; - init_MUTEX(&sbi->bitmap_lock); + mutex_init(&sbi->bitmap_lock); res = hfs_mdb_get(sb); if (res) { @@ -432,7 +432,7 @@ static struct file_system_type hfs_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; -static void hfs_init_once(struct kmem_cache *cachep, void *p) +static void hfs_init_once(void *p) { struct hfs_inode_info *i = p; diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 12e899cd7886..fec8f61227ff 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -199,16 +199,16 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, goto done; } - down(&HFSPLUS_I(inode).extents_lock); + mutex_lock(&HFSPLUS_I(inode).extents_lock); res = hfsplus_ext_read_extent(inode, ablock); if (!res) { dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).cached_extents, ablock - HFSPLUS_I(inode).cached_start); } else { - up(&HFSPLUS_I(inode).extents_lock); + mutex_unlock(&HFSPLUS_I(inode).extents_lock); return -EIO; } - up(&HFSPLUS_I(inode).extents_lock); + mutex_unlock(&HFSPLUS_I(inode).extents_lock); done: dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock); @@ -355,7 +355,7 @@ int hfsplus_file_extend(struct inode *inode) return -ENOSPC; } - down(&HFSPLUS_I(inode).extents_lock); + mutex_lock(&HFSPLUS_I(inode).extents_lock); if (HFSPLUS_I(inode).alloc_blocks == HFSPLUS_I(inode).first_blocks) goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).first_extents); else { @@ -408,7 +408,7 @@ int hfsplus_file_extend(struct inode *inode) goto insert_extent; } out: - up(&HFSPLUS_I(inode).extents_lock); + mutex_unlock(&HFSPLUS_I(inode).extents_lock); if (!res) { HFSPLUS_I(inode).alloc_blocks += len; mark_inode_dirty(inode); @@ -465,7 +465,7 @@ void hfsplus_file_truncate(struct inode *inode) if (blk_cnt == alloc_cnt) goto out; - down(&HFSPLUS_I(inode).extents_lock); + mutex_lock(&HFSPLUS_I(inode).extents_lock); hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd); while (1) { if (alloc_cnt == HFSPLUS_I(inode).first_blocks) { @@ -492,7 +492,7 @@ void hfsplus_file_truncate(struct inode *inode) hfs_brec_remove(&fd); } hfs_find_exit(&fd); - up(&HFSPLUS_I(inode).extents_lock); + mutex_unlock(&HFSPLUS_I(inode).extents_lock); HFSPLUS_I(inode).alloc_blocks = blk_cnt; out: diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 9e59537b43d5..f027a905225f 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -11,6 +11,7 @@ #define _LINUX_HFSPLUS_FS_H #include <linux/fs.h> +#include <linux/mutex.h> #include <linux/buffer_head.h> #include "hfsplus_raw.h" @@ -154,7 +155,7 @@ struct hfsplus_sb_info { struct hfsplus_inode_info { - struct semaphore extents_lock; + struct mutex extents_lock; u32 clump_blocks, alloc_blocks; sector_t fs_blocks; /* Allocation extents from catalog record or volume header */ diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 67e1c8b467c4..b085d64a2b67 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -163,7 +163,7 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent inode->i_ino = dir->i_ino; INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); - init_MUTEX(&HFSPLUS_I(inode).extents_lock); + mutex_init(&HFSPLUS_I(inode).extents_lock); HFSPLUS_I(inode).flags = HFSPLUS_FLG_RSRC; hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); @@ -238,7 +238,7 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms) perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev); } -static int hfsplus_permission(struct inode *inode, int mask, struct nameidata *nd) +static int hfsplus_permission(struct inode *inode, int mask) { /* MAY_EXEC is also used for lookup, if no x bit is set allow lookup, * open_exec has the same test, so it's still not executable, if a x bit @@ -254,8 +254,6 @@ static int hfsplus_file_open(struct inode *inode, struct file *file) { if (HFSPLUS_IS_RSRC(inode)) inode = HFSPLUS_I(inode).rsrc_inode; - if (atomic_read(&file->f_count) != 1) - return 0; atomic_inc(&HFSPLUS_I(inode).opencnt); return 0; } @@ -266,8 +264,6 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) if (HFSPLUS_IS_RSRC(inode)) inode = HFSPLUS_I(inode).rsrc_inode; - if (atomic_read(&file->f_count) != 0) - return 0; if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) { mutex_lock(&inode->i_mutex); hfsplus_file_truncate(inode); @@ -316,7 +312,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode) inode->i_nlink = 1; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); - init_MUTEX(&HFSPLUS_I(inode).extents_lock); + mutex_init(&HFSPLUS_I(inode).extents_lock); atomic_set(&HFSPLUS_I(inode).opencnt, 0); HFSPLUS_I(inode).flags = 0; memset(HFSPLUS_I(inode).first_extents, 0, sizeof(hfsplus_extent_rec)); diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index 9997cbf8beb5..9699c56d323f 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -25,7 +25,7 @@ enum { opt_force, opt_err }; -static match_table_t tokens = { +static const match_table_t tokens = { { opt_creator, "creator=%s" }, { opt_type, "type=%s" }, { opt_umask, "umask=%o" }, diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index ce97a54518d8..e834e578c93f 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -34,7 +34,7 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) return inode; INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); - init_MUTEX(&HFSPLUS_I(inode).extents_lock); + mutex_init(&HFSPLUS_I(inode).extents_lock); HFSPLUS_I(inode).flags = 0; HFSPLUS_I(inode).rsrc_inode = NULL; atomic_set(&HFSPLUS_I(inode).opencnt, 0); @@ -485,7 +485,7 @@ static struct file_system_type hfsplus_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; -static void hfsplus_init_once(struct kmem_cache *cachep, void *p) +static void hfsplus_init_once(void *p) { struct hfsplus_inode_info *i = p; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 5222345ddccf..d6ecabf4d231 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -822,7 +822,7 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from, return err; } -int hostfs_permission(struct inode *ino, int desired, struct nameidata *nd) +int hostfs_permission(struct inode *ino, int desired) { char *name; int r = 0, w = 0, x = 0, err; diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index d256559b4104..d9c59a775449 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -415,7 +415,7 @@ again: d_drop(dentry); spin_lock(&dentry->d_lock); if (atomic_read(&dentry->d_count) > 1 || - permission(inode, MAY_WRITE, NULL) || + generic_permission(inode, MAY_WRITE, NULL) || !S_ISREG(inode->i_mode) || get_write_access(inode)) { spin_unlock(&dentry->d_lock); diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index f63a699ec659..29ad461d568f 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -173,7 +173,7 @@ static void hpfs_destroy_inode(struct inode *inode) kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo; @@ -215,7 +215,7 @@ enum { Opt_timeshift, Opt_err, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_help, "help"}, {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 65077aa90f0a..2b3d1828db99 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -655,20 +655,13 @@ static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd) return proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd); } -int hppfs_permission(struct inode *inode, int mask, struct nameidata *nd) -{ - return generic_permission(inode, mask, NULL); -} - static const struct inode_operations hppfs_dir_iops = { .lookup = hppfs_lookup, - .permission = hppfs_permission, }; static const struct inode_operations hppfs_link_iops = { .readlink = hppfs_readlink, .follow_link = hppfs_follow_link, - .permission = hppfs_permission, }; static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index aeabf80f81a5..61edc701b0e6 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -53,15 +53,17 @@ int sysctl_hugetlb_shm_group; enum { Opt_size, Opt_nr_inodes, Opt_mode, Opt_uid, Opt_gid, + Opt_pagesize, Opt_err, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_size, "size=%s"}, {Opt_nr_inodes, "nr_inodes=%s"}, {Opt_mode, "mode=%o"}, {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, + {Opt_pagesize, "pagesize=%s"}, {Opt_err, NULL}, }; @@ -80,6 +82,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) struct inode *inode = file->f_path.dentry->d_inode; loff_t len, vma_len; int ret; + struct hstate *h = hstate_file(file); /* * vma address alignment (but not the pgoff alignment) has @@ -92,7 +95,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_flags |= VM_HUGETLB | VM_RESERVED; vma->vm_ops = &hugetlb_vm_ops; - if (vma->vm_pgoff & ~(HPAGE_MASK >> PAGE_SHIFT)) + if (vma->vm_pgoff & ~(huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; vma_len = (loff_t)(vma->vm_end - vma->vm_start); @@ -103,9 +106,9 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) ret = -ENOMEM; len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - if (vma->vm_flags & VM_MAYSHARE && - hugetlb_reserve_pages(inode, vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT), - len >> HPAGE_SHIFT)) + if (hugetlb_reserve_pages(inode, + vma->vm_pgoff >> huge_page_order(h), + len >> huge_page_shift(h), vma)) goto out; ret = 0; @@ -130,20 +133,21 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long start_addr; + struct hstate *h = hstate_file(file); - if (len & ~HPAGE_MASK) + if (len & ~huge_page_mask(h)) return -EINVAL; if (len > TASK_SIZE) return -ENOMEM; if (flags & MAP_FIXED) { - if (prepare_hugepage_range(addr, len)) + if (prepare_hugepage_range(file, addr, len)) return -EINVAL; return addr; } if (addr) { - addr = ALIGN(addr, HPAGE_SIZE); + addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && (!vma || addr + len <= vma->vm_start)) @@ -156,7 +160,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, start_addr = TASK_UNMAPPED_BASE; full_search: - addr = ALIGN(start_addr, HPAGE_SIZE); + addr = ALIGN(start_addr, huge_page_size(h)); for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { /* At this point: (!vma || addr < vma->vm_end). */ @@ -174,7 +178,7 @@ full_search: if (!vma || addr + len <= vma->vm_start) return addr; - addr = ALIGN(vma->vm_end, HPAGE_SIZE); + addr = ALIGN(vma->vm_end, huge_page_size(h)); } } #endif @@ -225,10 +229,11 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset, static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { + struct hstate *h = hstate_file(filp); struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; - unsigned long index = *ppos >> HPAGE_SHIFT; - unsigned long offset = *ppos & ~HPAGE_MASK; + unsigned long index = *ppos >> huge_page_shift(h); + unsigned long offset = *ppos & ~huge_page_mask(h); unsigned long end_index; loff_t isize; ssize_t retval = 0; @@ -243,17 +248,17 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, if (!isize) goto out; - end_index = (isize - 1) >> HPAGE_SHIFT; + end_index = (isize - 1) >> huge_page_shift(h); for (;;) { struct page *page; - int nr, ret; + unsigned long nr, ret; /* nr is the maximum number of bytes to copy from this page */ - nr = HPAGE_SIZE; + nr = huge_page_size(h); if (index >= end_index) { if (index > end_index) goto out; - nr = ((isize - 1) & ~HPAGE_MASK) + 1; + nr = ((isize - 1) & ~huge_page_mask(h)) + 1; if (nr <= offset) { goto out; } @@ -287,8 +292,8 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, offset += ret; retval += ret; len -= ret; - index += offset >> HPAGE_SHIFT; - offset &= ~HPAGE_MASK; + index += offset >> huge_page_shift(h); + offset &= ~huge_page_mask(h); if (page) page_cache_release(page); @@ -298,7 +303,7 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, break; } out: - *ppos = ((loff_t)index << HPAGE_SHIFT) + offset; + *ppos = ((loff_t)index << huge_page_shift(h)) + offset; mutex_unlock(&inode->i_mutex); return retval; } @@ -339,8 +344,9 @@ static void truncate_huge_page(struct page *page) static void truncate_hugepages(struct inode *inode, loff_t lstart) { + struct hstate *h = hstate_inode(inode); struct address_space *mapping = &inode->i_data; - const pgoff_t start = lstart >> HPAGE_SHIFT; + const pgoff_t start = lstart >> huge_page_shift(h); struct pagevec pvec; pgoff_t next; int i, freed = 0; @@ -441,7 +447,7 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff) v_offset = 0; __unmap_hugepage_range(vma, - vma->vm_start + v_offset, vma->vm_end); + vma->vm_start + v_offset, vma->vm_end, NULL); } } @@ -449,8 +455,9 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) { pgoff_t pgoff; struct address_space *mapping = inode->i_mapping; + struct hstate *h = hstate_inode(inode); - BUG_ON(offset & ~HPAGE_MASK); + BUG_ON(offset & ~huge_page_mask(h)); pgoff = offset >> PAGE_SHIFT; i_size_write(inode, offset); @@ -465,6 +472,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; + struct hstate *h = hstate_inode(inode); int error; unsigned int ia_valid = attr->ia_valid; @@ -476,7 +484,7 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) if (ia_valid & ATTR_SIZE) { error = -EINVAL; - if (!(attr->ia_size & ~HPAGE_MASK)) + if (!(attr->ia_size & ~huge_page_mask(h))) error = hugetlb_vmtruncate(inode, attr->ia_size); if (error) goto out; @@ -610,9 +618,10 @@ static int hugetlbfs_set_page_dirty(struct page *page) static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); + struct hstate *h = hstate_inode(dentry->d_inode); buf->f_type = HUGETLBFS_MAGIC; - buf->f_bsize = HPAGE_SIZE; + buf->f_bsize = huge_page_size(h); if (sbinfo) { spin_lock(&sbinfo->stat_lock); /* If no limits set, just report 0 for max/free/used @@ -696,7 +705,7 @@ static const struct address_space_operations hugetlbfs_aops = { }; -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; @@ -743,6 +752,8 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) char *p, *rest; substring_t args[MAX_OPT_ARGS]; int option; + unsigned long long size = 0; + enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE; if (!options) return 0; @@ -773,17 +784,13 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) break; case Opt_size: { - unsigned long long size; /* memparse() will accept a K/M/G without a digit */ if (!isdigit(*args[0].from)) goto bad_val; size = memparse(args[0].from, &rest); - if (*rest == '%') { - size <<= HPAGE_SHIFT; - size *= max_huge_pages; - do_div(size, 100); - } - pconfig->nr_blocks = (size >> HPAGE_SHIFT); + setsize = SIZE_STD; + if (*rest == '%') + setsize = SIZE_PERCENT; break; } @@ -794,6 +801,19 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) pconfig->nr_inodes = memparse(args[0].from, &rest); break; + case Opt_pagesize: { + unsigned long ps; + ps = memparse(args[0].from, &rest); + pconfig->hstate = size_to_hstate(ps); + if (!pconfig->hstate) { + printk(KERN_ERR + "hugetlbfs: Unsupported page size %lu MB\n", + ps >> 20); + return -EINVAL; + } + break; + } + default: printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n", p); @@ -801,6 +821,18 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) break; } } + + /* Do size after hstate is set up */ + if (setsize > NO_SIZE) { + struct hstate *h = pconfig->hstate; + if (setsize == SIZE_PERCENT) { + size <<= huge_page_shift(h); + size *= h->max_huge_pages; + do_div(size, 100); + } + pconfig->nr_blocks = (size >> huge_page_shift(h)); + } + return 0; bad_val: @@ -825,6 +857,7 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) config.uid = current->fsuid; config.gid = current->fsgid; config.mode = 0755; + config.hstate = &default_hstate; ret = hugetlbfs_parse_options(data, &config); if (ret) return ret; @@ -833,14 +866,15 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) if (!sbinfo) return -ENOMEM; sb->s_fs_info = sbinfo; + sbinfo->hstate = config.hstate; spin_lock_init(&sbinfo->stat_lock); sbinfo->max_blocks = config.nr_blocks; sbinfo->free_blocks = config.nr_blocks; sbinfo->max_inodes = config.nr_inodes; sbinfo->free_inodes = config.nr_inodes; sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_blocksize = HPAGE_SIZE; - sb->s_blocksize_bits = HPAGE_SHIFT; + sb->s_blocksize = huge_page_size(config.hstate); + sb->s_blocksize_bits = huge_page_shift(config.hstate); sb->s_magic = HUGETLBFS_MAGIC; sb->s_op = &hugetlbfs_ops; sb->s_time_gran = 1; @@ -942,7 +976,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size) goto out_dentry; error = -ENOMEM; - if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT)) + if (hugetlb_reserve_pages(inode, 0, + size >> huge_page_shift(hstate_inode(inode)), NULL)) goto out_inode; d_instantiate(dentry, inode); diff --git a/fs/inode.c b/fs/inode.c index c36d9480335c..0487ddba1397 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -166,6 +166,7 @@ static struct inode *alloc_inode(struct super_block *sb) mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE); mapping->assoc_mapping = NULL; mapping->backing_dev_info = &default_backing_dev_info; + mapping->writeback_index = 0; /* * If the block_device provides a backing_dev_info for client @@ -209,7 +210,7 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->i_dentry); INIT_LIST_HEAD(&inode->i_devices); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); - rwlock_init(&inode->i_data.tree_lock); + spin_lock_init(&inode->i_data.tree_lock); spin_lock_init(&inode->i_data.i_mmap_lock); INIT_LIST_HEAD(&inode->i_data.private_list); spin_lock_init(&inode->i_data.private_lock); @@ -224,7 +225,7 @@ void inode_init_once(struct inode *inode) EXPORT_SYMBOL(inode_init_once); -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct inode * inode = (struct inode *) foo; diff --git a/fs/inotify_user.c b/fs/inotify_user.c index 6676c06bb7c1..d85c7d931cdf 100644 --- a/fs/inotify_user.c +++ b/fs/inotify_user.c @@ -323,7 +323,7 @@ out: } /* - * remove_kevent - cleans up and ultimately frees the given kevent + * remove_kevent - cleans up the given kevent * * Caller must hold dev->ev_mutex. */ @@ -334,7 +334,13 @@ static void remove_kevent(struct inotify_device *dev, dev->event_count--; dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len; +} +/* + * free_kevent - frees the given kevent. + */ +static void free_kevent(struct inotify_kernel_event *kevent) +{ kfree(kevent->name); kmem_cache_free(event_cachep, kevent); } @@ -350,24 +356,25 @@ static void inotify_dev_event_dequeue(struct inotify_device *dev) struct inotify_kernel_event *kevent; kevent = inotify_dev_get_event(dev); remove_kevent(dev, kevent); + free_kevent(kevent); } } /* - * find_inode - resolve a user-given path to a specific inode and return a nd + * find_inode - resolve a user-given path to a specific inode */ -static int find_inode(const char __user *dirname, struct nameidata *nd, +static int find_inode(const char __user *dirname, struct path *path, unsigned flags) { int error; - error = __user_walk(dirname, flags, nd); + error = user_path_at(AT_FDCWD, dirname, flags, path); if (error) return error; /* you can only watch an inode if you have read permissions on it */ - error = vfs_permission(nd, MAY_READ); + error = inode_permission(path->dentry->d_inode, MAY_READ); if (error) - path_put(&nd->path); + path_put(path); return error; } @@ -433,17 +440,15 @@ static ssize_t inotify_read(struct file *file, char __user *buf, dev = file->private_data; while (1) { - int events; prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE); mutex_lock(&dev->ev_mutex); - events = !list_empty(&dev->events); - mutex_unlock(&dev->ev_mutex); - if (events) { + if (!list_empty(&dev->events)) { ret = 0; break; } + mutex_unlock(&dev->ev_mutex); if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; @@ -462,7 +467,6 @@ static ssize_t inotify_read(struct file *file, char __user *buf, if (ret) return ret; - mutex_lock(&dev->ev_mutex); while (1) { struct inotify_kernel_event *kevent; @@ -481,6 +485,13 @@ static ssize_t inotify_read(struct file *file, char __user *buf, } break; } + remove_kevent(dev, kevent); + + /* + * Must perform the copy_to_user outside the mutex in order + * to avoid a lock order reversal with mmap_sem. + */ + mutex_unlock(&dev->ev_mutex); if (copy_to_user(buf, &kevent->event, event_size)) { ret = -EFAULT; @@ -498,7 +509,9 @@ static ssize_t inotify_read(struct file *file, char __user *buf, count -= kevent->event.len; } - remove_kevent(dev, kevent); + free_kevent(kevent); + + mutex_lock(&dev->ev_mutex); } mutex_unlock(&dev->ev_mutex); @@ -566,7 +579,7 @@ static const struct inotify_operations inotify_user_ops = { .destroy_watch = free_inotify_user_watch, }; -asmlinkage long sys_inotify_init(void) +asmlinkage long sys_inotify_init1(int flags) { struct inotify_device *dev; struct inotify_handle *ih; @@ -574,7 +587,14 @@ asmlinkage long sys_inotify_init(void) struct file *filp; int fd, ret; - fd = get_unused_fd(); + /* Check the IN_* constants for consistency. */ + BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(IN_NONBLOCK != O_NONBLOCK); + + if (flags & ~(IN_CLOEXEC | IN_NONBLOCK)) + return -EINVAL; + + fd = get_unused_fd_flags(flags & O_CLOEXEC); if (fd < 0) return fd; @@ -610,7 +630,7 @@ asmlinkage long sys_inotify_init(void) filp->f_path.dentry = dget(inotify_mnt->mnt_root); filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; filp->f_mode = FMODE_READ; - filp->f_flags = O_RDONLY; + filp->f_flags = O_RDONLY | (flags & O_NONBLOCK); filp->private_data = dev; INIT_LIST_HEAD(&dev->events); @@ -638,11 +658,16 @@ out_put_fd: return ret; } -asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask) +asmlinkage long sys_inotify_init(void) +{ + return sys_inotify_init1(0); +} + +asmlinkage long sys_inotify_add_watch(int fd, const char __user *pathname, u32 mask) { struct inode *inode; struct inotify_device *dev; - struct nameidata nd; + struct path path; struct file *filp; int ret, fput_needed; unsigned flags = 0; @@ -662,12 +687,12 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask) if (mask & IN_ONLYDIR) flags |= LOOKUP_DIRECTORY; - ret = find_inode(path, &nd, flags); + ret = find_inode(pathname, &path, flags); if (unlikely(ret)) goto fput_and_out; - /* inode held in place by reference to nd; dev by fget on fd */ - inode = nd.path.dentry->d_inode; + /* inode held in place by reference to path; dev by fget on fd */ + inode = path.dentry->d_inode; dev = filp->private_data; mutex_lock(&dev->up_mutex); @@ -676,7 +701,7 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask) ret = create_watch(dev, inode, mask); mutex_unlock(&dev->up_mutex); - path_put(&nd.path); + path_put(&path); fput_and_out: fput_light(filp, fput_needed); return ret; diff --git a/fs/ioctl.c b/fs/ioctl.c index 7db32b3382d3..d152856c371b 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -13,9 +13,14 @@ #include <linux/security.h> #include <linux/module.h> #include <linux/uaccess.h> +#include <linux/writeback.h> +#include <linux/buffer_head.h> #include <asm/ioctls.h> +/* So that the fiemap access checks can't overflow on 32 bit machines. */ +#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) + /** * vfs_ioctl - call filesystem specific ioctl methods * @filp: open file to invoke ioctl method on @@ -71,6 +76,276 @@ static int ioctl_fibmap(struct file *filp, int __user *p) return put_user(res, p); } +/** + * fiemap_fill_next_extent - Fiemap helper function + * @fieinfo: Fiemap context passed into ->fiemap + * @logical: Extent logical start offset, in bytes + * @phys: Extent physical start offset, in bytes + * @len: Extent length, in bytes + * @flags: FIEMAP_EXTENT flags that describe this extent + * + * Called from file system ->fiemap callback. Will populate extent + * info as passed in via arguments and copy to user memory. On + * success, extent count on fieinfo is incremented. + * + * Returns 0 on success, -errno on error, 1 if this was the last + * extent that will fit in user array. + */ +#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC) +#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED) +#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE) +int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, + u64 phys, u64 len, u32 flags) +{ + struct fiemap_extent extent; + struct fiemap_extent *dest = fieinfo->fi_extents_start; + + /* only count the extents */ + if (fieinfo->fi_extents_max == 0) { + fieinfo->fi_extents_mapped++; + return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; + } + + if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max) + return 1; + + if (flags & SET_UNKNOWN_FLAGS) + flags |= FIEMAP_EXTENT_UNKNOWN; + if (flags & SET_NO_UNMOUNTED_IO_FLAGS) + flags |= FIEMAP_EXTENT_ENCODED; + if (flags & SET_NOT_ALIGNED_FLAGS) + flags |= FIEMAP_EXTENT_NOT_ALIGNED; + + memset(&extent, 0, sizeof(extent)); + extent.fe_logical = logical; + extent.fe_physical = phys; + extent.fe_length = len; + extent.fe_flags = flags; + + dest += fieinfo->fi_extents_mapped; + if (copy_to_user(dest, &extent, sizeof(extent))) + return -EFAULT; + + fieinfo->fi_extents_mapped++; + if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max) + return 1; + return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; +} +EXPORT_SYMBOL(fiemap_fill_next_extent); + +/** + * fiemap_check_flags - check validity of requested flags for fiemap + * @fieinfo: Fiemap context passed into ->fiemap + * @fs_flags: Set of fiemap flags that the file system understands + * + * Called from file system ->fiemap callback. This will compute the + * intersection of valid fiemap flags and those that the fs supports. That + * value is then compared against the user supplied flags. In case of bad user + * flags, the invalid values will be written into the fieinfo structure, and + * -EBADR is returned, which tells ioctl_fiemap() to return those values to + * userspace. For this reason, a return code of -EBADR should be preserved. + * + * Returns 0 on success, -EBADR on bad flags. + */ +int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags) +{ + u32 incompat_flags; + + incompat_flags = fieinfo->fi_flags & ~(FIEMAP_FLAGS_COMPAT & fs_flags); + if (incompat_flags) { + fieinfo->fi_flags = incompat_flags; + return -EBADR; + } + return 0; +} +EXPORT_SYMBOL(fiemap_check_flags); + +static int fiemap_check_ranges(struct super_block *sb, + u64 start, u64 len, u64 *new_len) +{ + *new_len = len; + + if (len == 0) + return -EINVAL; + + if (start > sb->s_maxbytes) + return -EFBIG; + + /* + * Shrink request scope to what the fs can actually handle. + */ + if ((len > sb->s_maxbytes) || + (sb->s_maxbytes - len) < start) + *new_len = sb->s_maxbytes - start; + + return 0; +} + +static int ioctl_fiemap(struct file *filp, unsigned long arg) +{ + struct fiemap fiemap; + struct fiemap_extent_info fieinfo = { 0, }; + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + u64 len; + int error; + + if (!inode->i_op->fiemap) + return -EOPNOTSUPP; + + if (copy_from_user(&fiemap, (struct fiemap __user *)arg, + sizeof(struct fiemap))) + return -EFAULT; + + if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) + return -EINVAL; + + error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length, + &len); + if (error) + return error; + + fieinfo.fi_flags = fiemap.fm_flags; + fieinfo.fi_extents_max = fiemap.fm_extent_count; + fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap)); + + if (fiemap.fm_extent_count != 0 && + !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start, + fieinfo.fi_extents_max * sizeof(struct fiemap_extent))) + return -EFAULT; + + if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) + filemap_write_and_wait(inode->i_mapping); + + error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len); + fiemap.fm_flags = fieinfo.fi_flags; + fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; + if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap))) + error = -EFAULT; + + return error; +} + +#ifdef CONFIG_BLOCK + +#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits) +#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits); + +/* + * @inode - the inode to map + * @arg - the pointer to userspace where we copy everything to + * @get_block - the fs's get_block function + * + * This does FIEMAP for block based inodes. Basically it will just loop + * through get_block until we hit the number of extents we want to map, or we + * go past the end of the file and hit a hole. + * + * If it is possible to have data blocks beyond a hole past @inode->i_size, then + * please do not use this function, it will stop at the first unmapped block + * beyond i_size + */ +int generic_block_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, u64 start, + u64 len, get_block_t *get_block) +{ + struct buffer_head tmp; + unsigned int start_blk; + long long length = 0, map_len = 0; + u64 logical = 0, phys = 0, size = 0; + u32 flags = FIEMAP_EXTENT_MERGED; + int ret = 0; + + if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC))) + return ret; + + start_blk = logical_to_blk(inode, start); + + /* guard against change */ + mutex_lock(&inode->i_mutex); + + length = (long long)min_t(u64, len, i_size_read(inode)); + map_len = length; + + do { + /* + * we set b_size to the total size we want so it will map as + * many contiguous blocks as possible at once + */ + memset(&tmp, 0, sizeof(struct buffer_head)); + tmp.b_size = map_len; + + ret = get_block(inode, start_blk, &tmp, 0); + if (ret) + break; + + /* HOLE */ + if (!buffer_mapped(&tmp)) { + /* + * first hole after going past the EOF, this is our + * last extent + */ + if (length <= 0) { + flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST; + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, + flags); + break; + } + + length -= blk_to_logical(inode, 1); + + /* if we have holes up to/past EOF then we're done */ + if (length <= 0) + break; + + start_blk++; + } else { + if (length <= 0 && size) { + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, + flags); + if (ret) + break; + } + + logical = blk_to_logical(inode, start_blk); + phys = blk_to_logical(inode, tmp.b_blocknr); + size = tmp.b_size; + flags = FIEMAP_EXTENT_MERGED; + + length -= tmp.b_size; + start_blk += logical_to_blk(inode, size); + + /* + * if we are past the EOF we need to loop again to see + * if there is a hole so we can mark this extent as the + * last one, and if not keep mapping things until we + * find a hole, or we run out of slots in the extent + * array + */ + if (length <= 0) + continue; + + ret = fiemap_fill_next_extent(fieinfo, logical, phys, + size, flags); + if (ret) + break; + } + cond_resched(); + } while (1); + + mutex_unlock(&inode->i_mutex); + + /* if ret is 1 then we just hit the end of the extent array */ + if (ret == 1) + ret = 0; + + return ret; +} +EXPORT_SYMBOL(generic_block_fiemap); + +#endif /* CONFIG_BLOCK */ + static int file_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -80,6 +355,8 @@ static int file_ioctl(struct file *filp, unsigned int cmd, switch (cmd) { case FIBMAP: return ioctl_fibmap(filp, p); + case FS_IOC_FIEMAP: + return ioctl_fiemap(filp, arg); case FIGETBSZ: return put_user(inode->i_sb->s_blocksize, p); case FIONREAD: diff --git a/fs/ioprio.c b/fs/ioprio.c index c4a1c3c65aac..da3cc460d4df 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -115,11 +115,11 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio) pgrp = task_pgrp(current); else pgrp = find_vpid(who); - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { ret = set_task_ioprio(p, ioprio); if (ret) break; - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case IOPRIO_WHO_USER: if (!who) @@ -204,7 +204,7 @@ asmlinkage long sys_ioprio_get(int which, int who) pgrp = task_pgrp(current); else pgrp = find_vpid(who); - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { tmpio = get_task_ioprio(p); if (tmpio < 0) continue; @@ -212,7 +212,7 @@ asmlinkage long sys_ioprio_get(int which, int who) ret = tmpio; else ret = ioprio_best(ret, tmpio); - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case IOPRIO_WHO_USER: if (!who) diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 044a254d526b..3f8af0f1505b 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -73,7 +73,7 @@ static void isofs_destroy_inode(struct inode *inode) kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct iso_inode_info *ei = foo; @@ -310,7 +310,7 @@ enum { Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_norock, "norock"}, {Opt_nojoliet, "nojoliet"}, {Opt_unhide, "unhide"}, diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 6bd48f0a7047..c2fb2dd0131f 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -209,6 +209,11 @@ repeat: while (rs.len > 2) { /* There may be one byte for padding somewhere */ rr = (struct rock_ridge *)rs.chr; + /* + * Ignore rock ridge info if rr->len is out of range, but + * don't return -EIO because that would make the file + * invisible. + */ if (rr->len < 3) goto out; /* Something got screwed up here */ sig = isonum_721(rs.chr); @@ -216,8 +221,12 @@ repeat: goto eio; rs.chr += rr->len; rs.len -= rr->len; + /* + * As above, just ignore the rock ridge info if rr->len + * is bogus. + */ if (rs.len < 0) - goto eio; /* corrupted isofs */ + goto out; /* Something got screwed up here */ switch (sig) { case SIG('R', 'R'): @@ -307,6 +316,11 @@ parse_rock_ridge_inode_internal(struct iso_directory_record *de, repeat: while (rs.len > 2) { /* There may be one byte for padding somewhere */ rr = (struct rock_ridge *)rs.chr; + /* + * Ignore rock ridge info if rr->len is out of range, but + * don't return -EIO because that would make the file + * invisible. + */ if (rr->len < 3) goto out; /* Something got screwed up here */ sig = isonum_721(rs.chr); @@ -314,8 +328,12 @@ repeat: goto eio; rs.chr += rr->len; rs.len -= rr->len; + /* + * As above, just ignore the rock ridge info if rr->len + * is bogus. + */ if (rs.len < 0) - goto eio; /* corrupted isofs */ + goto out; /* Something got screwed up here */ switch (sig) { #ifndef CONFIG_ZISOFS /* No flag for SF or ZF */ diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 5a8ca61498ca..ae08c057e751 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -36,7 +36,7 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) /* * When an ext3-ordered file is truncated, it is possible that many pages are - * not sucessfully freed, because they are attached to a committing transaction. + * not successfully freed, because they are attached to a committing transaction. * After the transaction commits, these pages are left on the LRU, with no * ->mapping, and with attached buffers. These pages are trivially reclaimable * by the VM, but their apparent absence upsets the VM accounting, and it makes @@ -45,8 +45,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) * So here, we have a buffer which has just come off the forget list. Look to * see if we can strip all buffers from the backing page. * - * Called under lock_journal(), and possibly under journal_datalist_lock. The - * caller provided us with a ref against the buffer, and we drop that here. + * Called under journal->j_list_lock. The caller provided us with a ref + * against the buffer, and we drop that here. */ static void release_buffer_page(struct buffer_head *bh) { @@ -63,7 +63,7 @@ static void release_buffer_page(struct buffer_head *bh) goto nope; /* OK, it's a truncated page */ - if (TestSetPageLocked(page)) + if (!trylock_page(page)) goto nope; page_cache_get(page); @@ -78,6 +78,19 @@ nope: } /* + * Decrement reference counter for data buffer. If it has been marked + * 'BH_Freed', release it and the page to which it belongs if possible. + */ +static void release_data_buffer(struct buffer_head *bh) +{ + if (buffer_freed(bh)) { + clear_buffer_freed(bh); + release_buffer_page(bh); + } else + put_bh(bh); +} + +/* * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is * held. For ranking reasons we must trylock. If we lose, schedule away and * return 0. j_list_lock is dropped in this case. @@ -172,7 +185,7 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) /* * Submit all the data buffers to disk */ -static void journal_submit_data_buffers(journal_t *journal, +static int journal_submit_data_buffers(journal_t *journal, transaction_t *commit_transaction) { struct journal_head *jh; @@ -180,6 +193,7 @@ static void journal_submit_data_buffers(journal_t *journal, int locked; int bufs = 0; struct buffer_head **wbuf = journal->j_wbuf; + int err = 0; /* * Whenever we unlock the journal and sleep, things can get added @@ -207,7 +221,7 @@ write_out_data: * blocking lock_buffer(). */ if (buffer_dirty(bh)) { - if (test_set_buffer_locked(bh)) { + if (!trylock_buffer(bh)) { BUFFER_TRACE(bh, "needs blocking lock"); spin_unlock(&journal->j_list_lock); /* Write out all data to prevent deadlocks */ @@ -231,7 +245,7 @@ write_out_data: if (locked) unlock_buffer(bh); BUFFER_TRACE(bh, "already cleaned up"); - put_bh(bh); + release_data_buffer(bh); continue; } if (locked && test_clear_buffer_dirty(bh)) { @@ -253,15 +267,17 @@ write_out_data: put_bh(bh); } else { BUFFER_TRACE(bh, "writeout complete: unfile"); + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; __journal_unfile_buffer(jh); jbd_unlock_bh_state(bh); if (locked) unlock_buffer(bh); journal_remove_journal_head(bh); - /* Once for our safety reference, once for + /* One for our safety reference, other for * journal_remove_journal_head() */ put_bh(bh); - put_bh(bh); + release_data_buffer(bh); } if (need_resched() || spin_needbreak(&journal->j_list_lock)) { @@ -271,6 +287,8 @@ write_out_data: } spin_unlock(&journal->j_list_lock); journal_do_submit_data(wbuf, bufs); + + return err; } /* @@ -410,8 +428,7 @@ void journal_commit_transaction(journal_t *journal) * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ - err = 0; - journal_submit_data_buffers(journal, commit_transaction); + err = journal_submit_data_buffers(journal, commit_transaction); /* * Wait for all previously submitted IO to complete. @@ -426,10 +443,21 @@ void journal_commit_transaction(journal_t *journal) if (buffer_locked(bh)) { spin_unlock(&journal->j_list_lock); wait_on_buffer(bh); - if (unlikely(!buffer_uptodate(bh))) - err = -EIO; spin_lock(&journal->j_list_lock); } + if (unlikely(!buffer_uptodate(bh))) { + if (!trylock_page(bh->b_page)) { + spin_unlock(&journal->j_list_lock); + lock_page(bh->b_page); + spin_lock(&journal->j_list_lock); + } + if (bh->b_page->mapping) + set_bit(AS_EIO, &bh->b_page->mapping->flags); + + unlock_page(bh->b_page); + SetPageError(bh->b_page); + err = -EIO; + } if (!inverted_lock(journal, bh)) { put_bh(bh); spin_lock(&journal->j_list_lock); @@ -443,17 +471,21 @@ void journal_commit_transaction(journal_t *journal) } else { jbd_unlock_bh_state(bh); } - put_bh(bh); + release_data_buffer(bh); cond_resched_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); - if (err) - journal_abort(journal, err); + if (err) { + char b[BDEVNAME_SIZE]; - journal_write_revoke_records(journal, commit_transaction); + printk(KERN_WARNING + "JBD: Detected IO errors while flushing file data " + "on %s\n", bdevname(journal->j_fs_dev, b)); + err = 0; + } - jbd_debug(3, "JBD: commit phase 2\n"); + journal_write_revoke_records(journal, commit_transaction); /* * If we found any dirty or locked buffers, then we should have diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index b99c3b3654c4..aa7143a8349b 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -68,7 +68,6 @@ EXPORT_SYMBOL(journal_set_features); EXPORT_SYMBOL(journal_create); EXPORT_SYMBOL(journal_load); EXPORT_SYMBOL(journal_destroy); -EXPORT_SYMBOL(journal_update_superblock); EXPORT_SYMBOL(journal_abort); EXPORT_SYMBOL(journal_errno); EXPORT_SYMBOL(journal_ack_err); @@ -1636,9 +1635,10 @@ static int journal_init_journal_head_cache(void) static void journal_destroy_journal_head_cache(void) { - J_ASSERT(journal_head_cache != NULL); - kmem_cache_destroy(journal_head_cache); - journal_head_cache = NULL; + if (journal_head_cache) { + kmem_cache_destroy(journal_head_cache); + journal_head_cache = NULL; + } } /* diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index 1bb43e987f4b..c7bd649bbbdc 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -166,138 +166,123 @@ static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, return NULL; } +void journal_destroy_revoke_caches(void) +{ + if (revoke_record_cache) { + kmem_cache_destroy(revoke_record_cache); + revoke_record_cache = NULL; + } + if (revoke_table_cache) { + kmem_cache_destroy(revoke_table_cache); + revoke_table_cache = NULL; + } +} + int __init journal_init_revoke_caches(void) { + J_ASSERT(!revoke_record_cache); + J_ASSERT(!revoke_table_cache); + revoke_record_cache = kmem_cache_create("revoke_record", sizeof(struct jbd_revoke_record_s), 0, SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, NULL); if (!revoke_record_cache) - return -ENOMEM; + goto record_cache_failure; revoke_table_cache = kmem_cache_create("revoke_table", sizeof(struct jbd_revoke_table_s), 0, SLAB_TEMPORARY, NULL); - if (!revoke_table_cache) { - kmem_cache_destroy(revoke_record_cache); - revoke_record_cache = NULL; - return -ENOMEM; - } + if (!revoke_table_cache) + goto table_cache_failure; + return 0; -} -void journal_destroy_revoke_caches(void) -{ - kmem_cache_destroy(revoke_record_cache); - revoke_record_cache = NULL; - kmem_cache_destroy(revoke_table_cache); - revoke_table_cache = NULL; +table_cache_failure: + journal_destroy_revoke_caches(); +record_cache_failure: + return -ENOMEM; } -/* Initialise the revoke table for a given journal to a given size. */ - -int journal_init_revoke(journal_t *journal, int hash_size) +static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size) { - int shift, tmp; + int shift = 0; + int tmp = hash_size; + struct jbd_revoke_table_s *table; - J_ASSERT (journal->j_revoke_table[0] == NULL); + table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); + if (!table) + goto out; - shift = 0; - tmp = hash_size; while((tmp >>= 1UL) != 0UL) shift++; - journal->j_revoke_table[0] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); - if (!journal->j_revoke_table[0]) - return -ENOMEM; - journal->j_revoke = journal->j_revoke_table[0]; - - /* Check that the hash_size is a power of two */ - J_ASSERT(is_power_of_2(hash_size)); - - journal->j_revoke->hash_size = hash_size; - - journal->j_revoke->hash_shift = shift; - - journal->j_revoke->hash_table = + table->hash_size = hash_size; + table->hash_shift = shift; + table->hash_table = kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); - if (!journal->j_revoke->hash_table) { - kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); - journal->j_revoke = NULL; - return -ENOMEM; + if (!table->hash_table) { + kmem_cache_free(revoke_table_cache, table); + table = NULL; + goto out; } for (tmp = 0; tmp < hash_size; tmp++) - INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]); + INIT_LIST_HEAD(&table->hash_table[tmp]); - journal->j_revoke_table[1] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); - if (!journal->j_revoke_table[1]) { - kfree(journal->j_revoke_table[0]->hash_table); - kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); - return -ENOMEM; +out: + return table; +} + +static void journal_destroy_revoke_table(struct jbd_revoke_table_s *table) +{ + int i; + struct list_head *hash_list; + + for (i = 0; i < table->hash_size; i++) { + hash_list = &table->hash_table[i]; + J_ASSERT(list_empty(hash_list)); } - journal->j_revoke = journal->j_revoke_table[1]; + kfree(table->hash_table); + kmem_cache_free(revoke_table_cache, table); +} - /* Check that the hash_size is a power of two */ +/* Initialise the revoke table for a given journal to a given size. */ +int journal_init_revoke(journal_t *journal, int hash_size) +{ + J_ASSERT(journal->j_revoke_table[0] == NULL); J_ASSERT(is_power_of_2(hash_size)); - journal->j_revoke->hash_size = hash_size; + journal->j_revoke_table[0] = journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[0]) + goto fail0; - journal->j_revoke->hash_shift = shift; + journal->j_revoke_table[1] = journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[1]) + goto fail1; - journal->j_revoke->hash_table = - kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); - if (!journal->j_revoke->hash_table) { - kfree(journal->j_revoke_table[0]->hash_table); - kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); - kmem_cache_free(revoke_table_cache, journal->j_revoke_table[1]); - journal->j_revoke = NULL; - return -ENOMEM; - } - - for (tmp = 0; tmp < hash_size; tmp++) - INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]); + journal->j_revoke = journal->j_revoke_table[1]; spin_lock_init(&journal->j_revoke_lock); return 0; -} -/* Destoy a journal's revoke table. The table must already be empty! */ +fail1: + journal_destroy_revoke_table(journal->j_revoke_table[0]); +fail0: + return -ENOMEM; +} +/* Destroy a journal's revoke table. The table must already be empty! */ void journal_destroy_revoke(journal_t *journal) { - struct jbd_revoke_table_s *table; - struct list_head *hash_list; - int i; - - table = journal->j_revoke_table[0]; - if (!table) - return; - - for (i=0; i<table->hash_size; i++) { - hash_list = &table->hash_table[i]; - J_ASSERT (list_empty(hash_list)); - } - - kfree(table->hash_table); - kmem_cache_free(revoke_table_cache, table); - journal->j_revoke = NULL; - - table = journal->j_revoke_table[1]; - if (!table) - return; - - for (i=0; i<table->hash_size; i++) { - hash_list = &table->hash_table[i]; - J_ASSERT (list_empty(hash_list)); - } - - kfree(table->hash_table); - kmem_cache_free(revoke_table_cache, table); journal->j_revoke = NULL; + if (journal->j_revoke_table[0]) + journal_destroy_revoke_table(journal->j_revoke_table[0]); + if (journal->j_revoke_table[1]) + journal_destroy_revoke_table(journal->j_revoke_table[1]); } diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 67ff2024c23c..0540ca27a446 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -291,7 +291,7 @@ handle_t *journal_start(journal_t *journal, int nblocks) goto out; } - lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_); + lock_map_acquire(&handle->h_lockdep_map); out: return handle; @@ -1448,7 +1448,7 @@ int journal_stop(handle_t *handle) spin_unlock(&journal->j_state_lock); } - lock_release(&handle->h_lockdep_map, 1, _THIS_IP_); + lock_map_release(&handle->h_lockdep_map); jbd_free_handle(handle); return err; @@ -1648,12 +1648,42 @@ out: return; } +/* + * journal_try_to_free_buffers() could race with journal_commit_transaction() + * The latter might still hold the a count on buffers when inspecting + * them on t_syncdata_list or t_locked_list. + * + * journal_try_to_free_buffers() will call this function to + * wait for the current transaction to finish syncing data buffers, before + * tryinf to free that buffer. + * + * Called with journal->j_state_lock held. + */ +static void journal_wait_for_transaction_sync_data(journal_t *journal) +{ + transaction_t *transaction = NULL; + tid_t tid; + + spin_lock(&journal->j_state_lock); + transaction = journal->j_committing_transaction; + + if (!transaction) { + spin_unlock(&journal->j_state_lock); + return; + } + + tid = transaction->t_tid; + spin_unlock(&journal->j_state_lock); + log_wait_commit(journal, tid); +} /** * int journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation * @page: to try and free - * @unused_gfp_mask: unused + * @gfp_mask: we use the mask to detect how hard should we try to release + * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to + * release the buffers. * * * For all the buffers on this page, @@ -1682,9 +1712,11 @@ out: * journal_try_to_free_buffer() is changing its state. But that * cannot happen because we never reallocate freed data as metadata * while the data is part of a transaction. Yes? + * + * Return 0 on failure, 1 on success */ int journal_try_to_free_buffers(journal_t *journal, - struct page *page, gfp_t unused_gfp_mask) + struct page *page, gfp_t gfp_mask) { struct buffer_head *head; struct buffer_head *bh; @@ -1713,7 +1745,28 @@ int journal_try_to_free_buffers(journal_t *journal, if (buffer_jbd(bh)) goto busy; } while ((bh = bh->b_this_page) != head); + ret = try_to_free_buffers(page); + + /* + * There are a number of places where journal_try_to_free_buffers() + * could race with journal_commit_transaction(), the later still + * holds the reference to the buffers to free while processing them. + * try_to_free_buffers() failed to free those buffers. Some of the + * caller of releasepage() request page buffers to be dropped, otherwise + * treat the fail-to-free as errors (such as generic_file_direct_IO()) + * + * So, if the caller of try_to_release_page() wants the synchronous + * behaviour(i.e make sure buffers are dropped upon return), + * let's wait for the current transaction to finish flush of + * dirty data buffers, then try to free those buffers again, + * with the journal locked. + */ + if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) { + journal_wait_for_transaction_sync_data(journal); + ret = try_to_free_buffers(page); + } + busy: return ret; } diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 6914598022ce..9203c3332f17 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -20,6 +20,7 @@ #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd2.h> +#include <linux/marker.h> #include <linux/errno.h> #include <linux/slab.h> @@ -93,7 +94,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh) int ret = 0; struct buffer_head *bh = jh2bh(jh); - if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { + if (jh->b_jlist == BJ_None && !buffer_locked(bh) && + !buffer_dirty(bh) && !buffer_write_io_error(bh)) { JBUFFER_TRACE(jh, "remove from checkpoint list"); ret = __jbd2_journal_remove_checkpoint(jh) + 1; jbd_unlock_bh_state(bh); @@ -126,14 +128,29 @@ void __jbd2_log_wait_for_space(journal_t *journal) /* * Test again, another process may have checkpointed while we - * were waiting for the checkpoint lock + * were waiting for the checkpoint lock. If there are no + * outstanding transactions there is nothing to checkpoint and + * we can't make progress. Abort the journal in this case. */ spin_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); nblocks = jbd_space_needed(journal); if (__jbd2_log_space_left(journal) < nblocks) { + int chkpt = journal->j_checkpoint_transactions != NULL; + + spin_unlock(&journal->j_list_lock); spin_unlock(&journal->j_state_lock); - jbd2_log_do_checkpoint(journal); + if (chkpt) { + jbd2_log_do_checkpoint(journal); + } else { + printk(KERN_ERR "%s: no transactions\n", + __func__); + jbd2_journal_abort(journal, 0); + } + spin_lock(&journal->j_state_lock); + } else { + spin_unlock(&journal->j_list_lock); } mutex_unlock(&journal->j_checkpoint_mutex); } @@ -160,21 +177,25 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) * buffers. Note that we take the buffers in the opposite ordering * from the one in which they were submitted for IO. * + * Return 0 on success, and return <0 if some buffers have failed + * to be written out. + * * Called with j_list_lock held. */ -static void __wait_cp_io(journal_t *journal, transaction_t *transaction) +static int __wait_cp_io(journal_t *journal, transaction_t *transaction) { struct journal_head *jh; struct buffer_head *bh; tid_t this_tid; int released = 0; + int ret = 0; this_tid = transaction->t_tid; restart: /* Did somebody clean up the transaction in the meanwhile? */ if (journal->j_checkpoint_transactions != transaction || transaction->t_tid != this_tid) - return; + return ret; while (!released && transaction->t_checkpoint_io_list) { jh = transaction->t_checkpoint_io_list; bh = jh2bh(jh); @@ -194,6 +215,9 @@ restart: spin_lock(&journal->j_list_lock); goto restart; } + if (unlikely(buffer_write_io_error(bh))) + ret = -EIO; + /* * Now in whatever state the buffer currently is, we know that * it has been written out and so we can drop it from the list @@ -203,6 +227,8 @@ restart: jbd2_journal_remove_journal_head(bh); __brelse(bh); } + + return ret; } #define NR_BATCH 64 @@ -226,7 +252,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) * Try to flush one buffer from the checkpoint list to disk. * * Return 1 if something happened which requires us to abort the current - * scan of the checkpoint list. + * scan of the checkpoint list. Return <0 if the buffer has failed to + * be written out. * * Called with j_list_lock held and drops it if 1 is returned * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it @@ -258,6 +285,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, jbd2_log_wait_commit(journal, tid); ret = 1; } else if (!buffer_dirty(bh)) { + ret = 1; + if (unlikely(buffer_write_io_error(bh))) + ret = -EIO; J_ASSERT_JH(jh, !buffer_jbddirty(bh)); BUFFER_TRACE(bh, "remove from checkpoint"); __jbd2_journal_remove_checkpoint(jh); @@ -265,7 +295,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, jbd_unlock_bh_state(bh); jbd2_journal_remove_journal_head(bh); __brelse(bh); - ret = 1; } else { /* * Important: we are about to write the buffer, and @@ -298,6 +327,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, * to disk. We submit larger chunks of data at once. * * The journal should be locked before calling this function. + * Called with j_checkpoint_mutex held. */ int jbd2_log_do_checkpoint(journal_t *journal) { @@ -313,6 +343,8 @@ int jbd2_log_do_checkpoint(journal_t *journal) * journal straight away. */ result = jbd2_cleanup_journal_tail(journal); + trace_mark(jbd2_checkpoint, "dev %s need_checkpoint %d", + journal->j_devname, result); jbd_debug(1, "cleanup_journal_tail returned %d\n", result); if (result <= 0) return result; @@ -321,6 +353,7 @@ int jbd2_log_do_checkpoint(journal_t *journal) * OK, we need to start writing disk blocks. Take one transaction * and write it. */ + result = 0; spin_lock(&journal->j_list_lock); if (!journal->j_checkpoint_transactions) goto out; @@ -339,7 +372,7 @@ restart: int batch_count = 0; struct buffer_head *bhs[NR_BATCH]; struct journal_head *jh; - int retry = 0; + int retry = 0, err; while (!retry && transaction->t_checkpoint_list) { struct buffer_head *bh; @@ -353,6 +386,8 @@ restart: } retry = __process_buffer(journal, jh, bhs, &batch_count, transaction); + if (retry < 0 && !result) + result = retry; if (!retry && (need_resched() || spin_needbreak(&journal->j_list_lock))) { spin_unlock(&journal->j_list_lock); @@ -377,14 +412,18 @@ restart: * Now we have cleaned up the first transaction's checkpoint * list. Let's clean up the second one */ - __wait_cp_io(journal, transaction); + err = __wait_cp_io(journal, transaction); + if (!result) + result = err; } out: spin_unlock(&journal->j_list_lock); - result = jbd2_cleanup_journal_tail(journal); if (result < 0) - return result; - return 0; + jbd2_journal_abort(journal, result); + else + result = jbd2_cleanup_journal_tail(journal); + + return (result < 0) ? result : 0; } /* @@ -400,8 +439,9 @@ out: * This is the only part of the journaling code which really needs to be * aware of transaction aborts. Checkpointing involves writing to the * main filesystem area rather than to the journal, so it can proceed - * even in abort state, but we must not update the journal superblock if - * we have an abort error outstanding. + * even in abort state, but we must not update the super block if + * checkpointing may have failed. Otherwise, we would lose some metadata + * buffers which should be written-back to the filesystem. */ int jbd2_cleanup_journal_tail(journal_t *journal) @@ -410,6 +450,9 @@ int jbd2_cleanup_journal_tail(journal_t *journal) tid_t first_tid; unsigned long blocknr, freed; + if (is_journal_aborted(journal)) + return 1; + /* OK, work out the oldest transaction remaining in the log, and * the log block it starts at. * @@ -688,7 +731,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact J_ASSERT(transaction->t_state == T_FINISHED); J_ASSERT(transaction->t_buffers == NULL); - J_ASSERT(transaction->t_sync_datalist == NULL); J_ASSERT(transaction->t_forget == NULL); J_ASSERT(transaction->t_iobuf_list == NULL); J_ASSERT(transaction->t_shadow_list == NULL); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index a2ed72f7ceee..0abe02c4242a 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -16,12 +16,15 @@ #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd2.h> +#include <linux/marker.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/jiffies.h> #include <linux/crc32.h> +#include <linux/writeback.h> +#include <linux/backing-dev.h> /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -37,8 +40,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) } /* - * When an ext3-ordered file is truncated, it is possible that many pages are - * not sucessfully freed, because they are attached to a committing transaction. + * When an ext4 file is truncated, it is possible that some pages are not + * successfully freed, because they are attached to a committing transaction. * After the transaction commits, these pages are left on the LRU, with no * ->mapping, and with attached buffers. These pages are trivially reclaimable * by the VM, but their apparent absence upsets the VM accounting, and it makes @@ -65,7 +68,7 @@ static void release_buffer_page(struct buffer_head *bh) goto nope; /* OK, it's a truncated page */ - if (TestSetPageLocked(page)) + if (!trylock_page(page)) goto nope; page_cache_get(page); @@ -80,21 +83,6 @@ nope: } /* - * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is - * held. For ranking reasons we must trylock. If we lose, schedule away and - * return 0. j_list_lock is dropped in this case. - */ -static int inverted_lock(journal_t *journal, struct buffer_head *bh) -{ - if (!jbd_trylock_bh_state(bh)) { - spin_unlock(&journal->j_list_lock); - schedule(); - return 0; - } - return 1; -} - -/* * Done it all: now submit the commit record. We should have * cleaned up our previous buffers by now, so if we are in abort * mode we can now just skip the rest of the journal write @@ -112,6 +100,7 @@ static int journal_submit_commit_record(journal_t *journal, struct buffer_head *bh; int ret; int barrier_done = 0; + struct timespec now = current_kernel_time(); if (is_journal_aborted(journal)) return 0; @@ -126,6 +115,8 @@ static int journal_submit_commit_record(journal_t *journal, tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); + tmp->h_commit_sec = cpu_to_be64(now.tv_sec); + tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { @@ -136,8 +127,7 @@ static int journal_submit_commit_record(journal_t *journal, JBUFFER_TRACE(descriptor, "submit commit block"); lock_buffer(bh); - get_bh(bh); - set_buffer_dirty(bh); + clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; @@ -157,12 +147,9 @@ static int journal_submit_commit_record(journal_t *journal, * to remember if we sent a barrier request */ if (ret == -EOPNOTSUPP && barrier_done) { - char b[BDEVNAME_SIZE]; - printk(KERN_WARNING - "JBD: barrier-based sync failed on %s - " - "disabling barriers\n", - bdevname(journal->j_dev, b)); + "JBD: barrier-based sync failed on %s - " + "disabling barriers\n", journal->j_devname); spin_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_BARRIER; spin_unlock(&journal->j_state_lock); @@ -170,7 +157,7 @@ static int journal_submit_commit_record(journal_t *journal, /* And try again, without the barrier */ lock_buffer(bh); set_buffer_uptodate(bh); - set_buffer_dirty(bh); + clear_buffer_dirty(bh); ret = submit_bh(WRITE, bh); } *cbh = bh; @@ -197,159 +184,114 @@ static int journal_wait_on_commit_record(struct buffer_head *bh) } /* - * Wait for all submitted IO to complete. + * write the filemap data using writepage() address_space_operations. + * We don't do block allocation here even for delalloc. We don't + * use writepages() because with dealyed allocation we may be doing + * block allocation in writepages(). */ -static int journal_wait_on_locked_list(journal_t *journal, - transaction_t *commit_transaction) +static int journal_submit_inode_data_buffers(struct address_space *mapping) { - int ret = 0; - struct journal_head *jh; - - while (commit_transaction->t_locked_list) { - struct buffer_head *bh; - - jh = commit_transaction->t_locked_list->b_tprev; - bh = jh2bh(jh); - get_bh(bh); - if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); - wait_on_buffer(bh); - if (unlikely(!buffer_uptodate(bh))) - ret = -EIO; - spin_lock(&journal->j_list_lock); - } - if (!inverted_lock(journal, bh)) { - put_bh(bh); - spin_lock(&journal->j_list_lock); - continue; - } - if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { - __jbd2_journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - jbd2_journal_remove_journal_head(bh); - put_bh(bh); - } else { - jbd_unlock_bh_state(bh); - } - put_bh(bh); - cond_resched_lock(&journal->j_list_lock); - } + int ret; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = mapping->nrpages * 2, + .range_start = 0, + .range_end = i_size_read(mapping->host), + .for_writepages = 1, + }; + + ret = generic_writepages(mapping, &wbc); return ret; - } +} -static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) +/* + * Submit all the data buffers of inode associated with the transaction to + * disk. + * + * We are in a committing transaction. Therefore no new inode can be added to + * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently + * operate on from being released while we write out pages. + */ +static int journal_submit_data_buffers(journal_t *journal, + transaction_t *commit_transaction) { - int i; + struct jbd2_inode *jinode; + int err, ret = 0; + struct address_space *mapping; - for (i = 0; i < bufs; i++) { - wbuf[i]->b_end_io = end_buffer_write_sync; - /* We use-up our safety reference in submit_bh() */ - submit_bh(WRITE, wbuf[i]); + spin_lock(&journal->j_list_lock); + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + mapping = jinode->i_vfs_inode->i_mapping; + jinode->i_flags |= JI_COMMIT_RUNNING; + spin_unlock(&journal->j_list_lock); + /* + * submit the inode data buffers. We use writepage + * instead of writepages. Because writepages can do + * block allocation with delalloc. We need to write + * only allocated blocks here. + */ + err = journal_submit_inode_data_buffers(mapping); + if (!ret) + ret = err; + spin_lock(&journal->j_list_lock); + J_ASSERT(jinode->i_transaction == commit_transaction); + jinode->i_flags &= ~JI_COMMIT_RUNNING; + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } + spin_unlock(&journal->j_list_lock); + return ret; } /* - * Submit all the data buffers to disk + * Wait for data submitted for writeout, refile inodes to proper + * transaction if needed. + * */ -static void journal_submit_data_buffers(journal_t *journal, - transaction_t *commit_transaction) +static int journal_finish_inode_data_buffers(journal_t *journal, + transaction_t *commit_transaction) { - struct journal_head *jh; - struct buffer_head *bh; - int locked; - int bufs = 0; - struct buffer_head **wbuf = journal->j_wbuf; + struct jbd2_inode *jinode, *next_i; + int err, ret = 0; - /* - * Whenever we unlock the journal and sleep, things can get added - * onto ->t_sync_datalist, so we have to keep looping back to - * write_out_data until we *know* that the list is empty. - * - * Cleanup any flushed data buffers from the data list. Even in - * abort mode, we want to flush this out as soon as possible. - */ -write_out_data: - cond_resched(); + /* For locking, see the comment in journal_submit_data_buffers() */ spin_lock(&journal->j_list_lock); - - while (commit_transaction->t_sync_datalist) { - jh = commit_transaction->t_sync_datalist; - bh = jh2bh(jh); - locked = 0; - - /* Get reference just to make sure buffer does not disappear - * when we are forced to drop various locks */ - get_bh(bh); - /* If the buffer is dirty, we need to submit IO and hence - * we need the buffer lock. We try to lock the buffer without - * blocking. If we fail, we need to drop j_list_lock and do - * blocking lock_buffer(). - */ - if (buffer_dirty(bh)) { - if (test_set_buffer_locked(bh)) { - BUFFER_TRACE(bh, "needs blocking lock"); - spin_unlock(&journal->j_list_lock); - /* Write out all data to prevent deadlocks */ - journal_do_submit_data(wbuf, bufs); - bufs = 0; - lock_buffer(bh); - spin_lock(&journal->j_list_lock); - } - locked = 1; - } - /* We have to get bh_state lock. Again out of order, sigh. */ - if (!inverted_lock(journal, bh)) { - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - } - /* Someone already cleaned up the buffer? */ - if (!buffer_jbd(bh) - || jh->b_transaction != commit_transaction - || jh->b_jlist != BJ_SyncData) { - jbd_unlock_bh_state(bh); - if (locked) - unlock_buffer(bh); - BUFFER_TRACE(bh, "already cleaned up"); - put_bh(bh); - continue; - } - if (locked && test_clear_buffer_dirty(bh)) { - BUFFER_TRACE(bh, "needs writeout, adding to array"); - wbuf[bufs++] = bh; - __jbd2_journal_file_buffer(jh, commit_transaction, - BJ_Locked); - jbd_unlock_bh_state(bh); - if (bufs == journal->j_wbufsize) { - spin_unlock(&journal->j_list_lock); - journal_do_submit_data(wbuf, bufs); - bufs = 0; - goto write_out_data; - } - } else if (!locked && buffer_locked(bh)) { - __jbd2_journal_file_buffer(jh, commit_transaction, - BJ_Locked); - jbd_unlock_bh_state(bh); - put_bh(bh); - } else { - BUFFER_TRACE(bh, "writeout complete: unfile"); - __jbd2_journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - if (locked) - unlock_buffer(bh); - jbd2_journal_remove_journal_head(bh); - /* Once for our safety reference, once for - * jbd2_journal_remove_journal_head() */ - put_bh(bh); - put_bh(bh); + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + jinode->i_flags |= JI_COMMIT_RUNNING; + spin_unlock(&journal->j_list_lock); + err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); + if (err) { + /* + * Because AS_EIO is cleared by + * wait_on_page_writeback_range(), set it again so + * that user process can get -EIO from fsync(). + */ + set_bit(AS_EIO, + &jinode->i_vfs_inode->i_mapping->flags); + + if (!ret) + ret = err; } + spin_lock(&journal->j_list_lock); + jinode->i_flags &= ~JI_COMMIT_RUNNING; + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); + } - if (need_resched() || spin_needbreak(&journal->j_list_lock)) { - spin_unlock(&journal->j_list_lock); - goto write_out_data; + /* Now refile inode to proper lists */ + list_for_each_entry_safe(jinode, next_i, + &commit_transaction->t_inode_list, i_list) { + list_del(&jinode->i_list); + if (jinode->i_next_transaction) { + jinode->i_transaction = jinode->i_next_transaction; + jinode->i_next_transaction = NULL; + list_add(&jinode->i_list, + &jinode->i_transaction->t_inode_list); + } else { + jinode->i_transaction = NULL; } } spin_unlock(&journal->j_list_lock); - journal_do_submit_data(wbuf, bufs); + + return ret; } static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) @@ -426,6 +368,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) commit_transaction = journal->j_running_transaction; J_ASSERT(commit_transaction->t_state == T_RUNNING); + trace_mark(jbd2_start_commit, "dev %s transaction %d", + journal->j_devname, commit_transaction->t_tid); jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); @@ -524,21 +468,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ - err = 0; - journal_submit_data_buffers(journal, commit_transaction); - - /* - * Wait for all previously submitted IO to complete if commit - * record is to be written synchronously. - */ - spin_lock(&journal->j_list_lock); - if (!JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) - err = journal_wait_on_locked_list(journal, - commit_transaction); - - spin_unlock(&journal->j_list_lock); - + err = journal_submit_data_buffers(journal, commit_transaction); if (err) jbd2_journal_abort(journal, err); @@ -547,16 +477,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug(3, "JBD: commit phase 2\n"); /* - * If we found any dirty or locked buffers, then we should have - * looped back up to the write_out_data label. If there weren't - * any then journal_clean_data_list should have wiped the list - * clean by now, so check that it is in fact empty. - */ - J_ASSERT (commit_transaction->t_sync_datalist == NULL); - - jbd_debug (3, "JBD: commit phase 3\n"); - - /* * Way to go: we have now written out all of the data for a * transaction! Now comes the tricky part: we need to write out * metadata. Loop over the transaction's entire buffer list: @@ -574,6 +494,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) J_ASSERT(commit_transaction->t_nr_buffers <= commit_transaction->t_outstanding_credits); + err = 0; descriptor = NULL; bufs = 0; while (commit_transaction->t_buffers) { @@ -583,9 +504,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) jh = commit_transaction->t_buffers; /* If we're in abort mode, we just un-journal the buffer and - release it for background writing. */ + release it. */ if (is_journal_aborted(journal)) { + clear_buffer_jbddirty(jh2bh(jh)); JBUFFER_TRACE(jh, "journal is aborting: refile"); jbd2_journal_refile_buffer(journal, jh); /* If that was the last one, we need to clean up @@ -748,13 +670,23 @@ start_journal_io: &cbh, crc32_sum); if (err) __jbd2_journal_abort_hard(journal); + } - spin_lock(&journal->j_list_lock); - err = journal_wait_on_locked_list(journal, - commit_transaction); - spin_unlock(&journal->j_list_lock); - if (err) - __jbd2_journal_abort_hard(journal); + /* + * This is the right place to wait for data buffers both for ASYNC + * and !ASYNC commit. If commit is ASYNC, we need to wait only after + * the commit block went to disk (which happens above). If commit is + * SYNC, we need to wait for data buffers before we start writing + * commit block, which happens below in such setting. + */ + err = journal_finish_inode_data_buffers(journal, commit_transaction); + if (err) { + printk(KERN_WARNING + "JBD2: Detected IO errors while flushing file data " + "on %s\n", journal->j_devname); + if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR) + jbd2_journal_abort(journal, err); + err = 0; } /* Lo and behold: we have just managed to send a transaction to @@ -768,7 +700,7 @@ start_journal_io: so we incur less scheduling load. */ - jbd_debug(3, "JBD: commit phase 4\n"); + jbd_debug(3, "JBD: commit phase 3\n"); /* * akpm: these are BJ_IO, and j_list_lock is not needed. @@ -827,7 +759,7 @@ wait_for_iobuf: J_ASSERT (commit_transaction->t_shadow_list == NULL); - jbd_debug(3, "JBD: commit phase 5\n"); + jbd_debug(3, "JBD: commit phase 4\n"); /* Here we wait for the revoke record and descriptor record buffers */ wait_for_ctlbuf: @@ -854,7 +786,10 @@ wait_for_iobuf: /* AKPM: bforget here */ } - jbd_debug(3, "JBD: commit phase 6\n"); + if (err) + jbd2_journal_abort(journal, err); + + jbd_debug(3, "JBD: commit phase 5\n"); if (!JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { @@ -874,9 +809,9 @@ wait_for_iobuf: transaction can be removed from any checkpoint list it was on before. */ - jbd_debug(3, "JBD: commit phase 7\n"); + jbd_debug(3, "JBD: commit phase 6\n"); - J_ASSERT(commit_transaction->t_sync_datalist == NULL); + J_ASSERT(list_empty(&commit_transaction->t_inode_list)); J_ASSERT(commit_transaction->t_buffers == NULL); J_ASSERT(commit_transaction->t_checkpoint_list == NULL); J_ASSERT(commit_transaction->t_iobuf_list == NULL); @@ -952,6 +887,8 @@ restart_loop: if (buffer_jbddirty(bh)) { JBUFFER_TRACE(jh, "add to new checkpointing trans"); __jbd2_journal_insert_checkpoint(jh, commit_transaction); + if (is_journal_aborted(journal)) + clear_buffer_jbddirty(bh); JBUFFER_TRACE(jh, "refile for checkpoint writeback"); __jbd2_journal_refile_buffer(jh); jbd_unlock_bh_state(bh); @@ -997,7 +934,7 @@ restart_loop: /* Done with this transaction! */ - jbd_debug(3, "JBD: commit phase 8\n"); + jbd_debug(3, "JBD: commit phase 7\n"); J_ASSERT(commit_transaction->t_state == T_COMMIT); @@ -1058,6 +995,9 @@ restart_loop: } spin_unlock(&journal->j_list_lock); + trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", + journal->j_devname, commit_transaction->t_tid, + journal->j_tail_sequence); jbd_debug(1, "JBD: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 2e24567c4a79..783de118de92 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates); EXPORT_SYMBOL(jbd2_journal_get_write_access); EXPORT_SYMBOL(jbd2_journal_get_create_access); EXPORT_SYMBOL(jbd2_journal_get_undo_access); -EXPORT_SYMBOL(jbd2_journal_dirty_data); EXPORT_SYMBOL(jbd2_journal_dirty_metadata); EXPORT_SYMBOL(jbd2_journal_release_buffer); EXPORT_SYMBOL(jbd2_journal_forget); @@ -69,7 +68,6 @@ EXPORT_SYMBOL(jbd2_journal_set_features); EXPORT_SYMBOL(jbd2_journal_create); EXPORT_SYMBOL(jbd2_journal_load); EXPORT_SYMBOL(jbd2_journal_destroy); -EXPORT_SYMBOL(jbd2_journal_update_superblock); EXPORT_SYMBOL(jbd2_journal_abort); EXPORT_SYMBOL(jbd2_journal_errno); EXPORT_SYMBOL(jbd2_journal_ack_err); @@ -82,6 +80,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page); EXPORT_SYMBOL(jbd2_journal_invalidatepage); EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); +EXPORT_SYMBOL(jbd2_journal_file_inode); +EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); +EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); +EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); @@ -595,13 +597,9 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, if (ret) *retp = ret; else { - char b[BDEVNAME_SIZE]; - printk(KERN_ALERT "%s: journal block not found " "at offset %lu on %s\n", - __func__, - blocknr, - bdevname(journal->j_dev, b)); + __func__, blocknr, journal->j_devname); err = -EIO; __journal_abort_soft(journal, err); } @@ -899,10 +897,7 @@ static struct proc_dir_entry *proc_jbd2_stats; static void jbd2_stats_proc_init(journal_t *journal) { - char name[BDEVNAME_SIZE]; - - bdevname(journal->j_dev, name); - journal->j_proc_entry = proc_mkdir(name, proc_jbd2_stats); + journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats); if (journal->j_proc_entry) { proc_create_data("history", S_IRUGO, journal->j_proc_entry, &jbd2_seq_history_fops, journal); @@ -913,12 +908,9 @@ static void jbd2_stats_proc_init(journal_t *journal) static void jbd2_stats_proc_exit(journal_t *journal) { - char name[BDEVNAME_SIZE]; - - bdevname(journal->j_dev, name); remove_proc_entry("info", journal->j_proc_entry); remove_proc_entry("history", journal->j_proc_entry); - remove_proc_entry(name, proc_jbd2_stats); + remove_proc_entry(journal->j_devname, proc_jbd2_stats); } static void journal_init_stats(journal_t *journal) @@ -1016,6 +1008,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, { journal_t *journal = journal_init_common(); struct buffer_head *bh; + char *p; int n; if (!journal) @@ -1037,6 +1030,10 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, journal->j_fs_dev = fs_dev; journal->j_blk_offset = start; journal->j_maxlen = len; + bdevname(journal->j_dev, journal->j_devname); + p = journal->j_devname; + while ((p = strchr(p, '/'))) + *p = '!'; jbd2_stats_proc_init(journal); bh = __getblk(journal->j_dev, start, journal->j_blocksize); @@ -1059,6 +1056,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) { struct buffer_head *bh; journal_t *journal = journal_init_common(); + char *p; int err; int n; unsigned long long blocknr; @@ -1068,6 +1066,12 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; journal->j_inode = inode; + bdevname(journal->j_dev, journal->j_devname); + p = journal->j_devname; + while ((p = strchr(p, '/'))) + *p = '!'; + p = journal->j_devname + strlen(journal->j_devname); + sprintf(p, ":%lu", journal->j_inode->i_ino); jbd_debug(1, "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", journal, inode->i_sb->s_id, inode->i_ino, @@ -1251,6 +1255,22 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait) goto out; } + if (buffer_write_io_error(bh)) { + /* + * Oh, dear. A previous attempt to write the journal + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + printk(KERN_ERR "JBD2: previous I/O error detected " + "for journal superblock update for %s.\n", + journal->j_devname); + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + spin_lock(&journal->j_state_lock); jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); @@ -1262,9 +1282,16 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait) BUFFER_TRACE(bh, "marking dirty"); mark_buffer_dirty(bh); - if (wait) + if (wait) { sync_dirty_buffer(bh); - else + if (buffer_write_io_error(bh)) { + printk(KERN_ERR "JBD2: I/O error detected " + "when updating journal superblock for %s.\n", + journal->j_devname); + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + } else ll_rw_block(SWRITE, 1, &bh); out: @@ -1424,9 +1451,12 @@ recovery_error: * * Release a journal_t structure once it is no longer in use by the * journaled object. + * Return <0 if we couldn't clean up the journal. */ -void jbd2_journal_destroy(journal_t *journal) +int jbd2_journal_destroy(journal_t *journal) { + int err = 0; + /* Wait for the commit thread to wake up and die. */ journal_kill_thread(journal); @@ -1449,11 +1479,16 @@ void jbd2_journal_destroy(journal_t *journal) J_ASSERT(journal->j_checkpoint_transactions == NULL); spin_unlock(&journal->j_list_lock); - /* We can now mark the journal as empty. */ - journal->j_tail = 0; - journal->j_tail_sequence = ++journal->j_transaction_sequence; if (journal->j_sb_buffer) { - jbd2_journal_update_superblock(journal, 1); + if (!is_journal_aborted(journal)) { + /* We can now mark the journal as empty. */ + journal->j_tail = 0; + journal->j_tail_sequence = + ++journal->j_transaction_sequence; + jbd2_journal_update_superblock(journal, 1); + } else { + err = -EIO; + } brelse(journal->j_sb_buffer); } @@ -1465,6 +1500,8 @@ void jbd2_journal_destroy(journal_t *journal) jbd2_journal_destroy_revoke(journal); kfree(journal->j_wbuf); kfree(journal); + + return err; } @@ -1690,10 +1727,16 @@ int jbd2_journal_flush(journal_t *journal) spin_lock(&journal->j_list_lock); while (!err && journal->j_checkpoint_transactions != NULL) { spin_unlock(&journal->j_list_lock); + mutex_lock(&journal->j_checkpoint_mutex); err = jbd2_log_do_checkpoint(journal); + mutex_unlock(&journal->j_checkpoint_mutex); spin_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); + + if (is_journal_aborted(journal)) + return -EIO; + jbd2_cleanup_journal_tail(journal); /* Finally, mark the journal as really needing no recovery. @@ -1715,7 +1758,7 @@ int jbd2_journal_flush(journal_t *journal) J_ASSERT(journal->j_head == journal->j_tail); J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); spin_unlock(&journal->j_state_lock); - return err; + return 0; } /** @@ -1759,23 +1802,6 @@ int jbd2_journal_wipe(journal_t *journal, int write) } /* - * journal_dev_name: format a character string to describe on what - * device this journal is present. - */ - -static const char *journal_dev_name(journal_t *journal, char *buffer) -{ - struct block_device *bdev; - - if (journal->j_inode) - bdev = journal->j_inode->i_sb->s_bdev; - else - bdev = journal->j_dev; - - return bdevname(bdev, buffer); -} - -/* * Journal abort has very specific semantics, which we describe * for journal abort. * @@ -1791,13 +1817,12 @@ static const char *journal_dev_name(journal_t *journal, char *buffer) void __jbd2_journal_abort_hard(journal_t *journal) { transaction_t *transaction; - char b[BDEVNAME_SIZE]; if (journal->j_flags & JBD2_ABORT) return; printk(KERN_ERR "Aborting journal on device %s.\n", - journal_dev_name(journal, b)); + journal->j_devname); spin_lock(&journal->j_state_lock); journal->j_flags |= JBD2_ABORT; @@ -2195,6 +2220,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh) } /* + * Initialize jbd inode head + */ +void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) +{ + jinode->i_transaction = NULL; + jinode->i_next_transaction = NULL; + jinode->i_vfs_inode = inode; + jinode->i_flags = 0; + INIT_LIST_HEAD(&jinode->i_list); +} + +/* + * Function to be called before we start removing inode from memory (i.e., + * clear_inode() is a fine place to be called from). It removes inode from + * transaction's lists. + */ +void jbd2_journal_release_jbd_inode(journal_t *journal, + struct jbd2_inode *jinode) +{ + int writeout = 0; + + if (!journal) + return; +restart: + spin_lock(&journal->j_list_lock); + /* Is commit writing out inode - we have to wait */ + if (jinode->i_flags & JI_COMMIT_RUNNING) { + wait_queue_head_t *wq; + DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); + wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&journal->j_list_lock); + schedule(); + finish_wait(wq, &wait.wait); + goto restart; + } + + /* Do we need to wait for data writeback? */ + if (journal->j_committing_transaction == jinode->i_transaction) + writeout = 1; + if (jinode->i_transaction) { + list_del(&jinode->i_list); + jinode->i_transaction = NULL; + } + spin_unlock(&journal->j_list_lock); +} + +/* * debugfs tunables */ #ifdef CONFIG_JBD2_DEBUG diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 058f50f65b76..73063285b13f 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -225,7 +225,7 @@ do { \ */ int jbd2_journal_recover(journal_t *journal) { - int err; + int err, err2; journal_superblock_t * sb; struct recovery_info info; @@ -263,7 +263,10 @@ int jbd2_journal_recover(journal_t *journal) journal->j_transaction_sequence = ++info.end_transaction; jbd2_journal_clear_revoke(journal); - sync_blockdev(journal->j_fs_dev); + err2 = sync_blockdev(journal->j_fs_dev); + if (!err) + err = err2; + return err; } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index d6e006e67804..e5d540588fa9 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); * new transaction and we can't block without protecting against other * processes trying to touch the journal while it is in transition. * - * Called under j_state_lock */ static transaction_t * @@ -52,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) transaction->t_tid = journal->j_transaction_sequence++; transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); + INIT_LIST_HEAD(&transaction->t_inode_list); /* Set up the commit timer for the new transaction. */ journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); @@ -301,7 +301,7 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) goto out; } - lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_); + lock_map_acquire(&handle->h_lockdep_map); out: return handle; } @@ -943,183 +943,6 @@ out: } /** - * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which - * needs to be flushed before we can commit the - * current transaction. - * @handle: transaction - * @bh: bufferhead to mark - * - * The buffer is placed on the transaction's data list and is marked as - * belonging to the transaction. - * - * Returns error number or 0 on success. - * - * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage - * by kswapd. - */ -int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh) -{ - journal_t *journal = handle->h_transaction->t_journal; - int need_brelse = 0; - struct journal_head *jh; - - if (is_handle_aborted(handle)) - return 0; - - jh = jbd2_journal_add_journal_head(bh); - JBUFFER_TRACE(jh, "entry"); - - /* - * The buffer could *already* be dirty. Writeout can start - * at any time. - */ - jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid); - - /* - * What if the buffer is already part of a running transaction? - * - * There are two cases: - * 1) It is part of the current running transaction. Refile it, - * just in case we have allocated it as metadata, deallocated - * it, then reallocated it as data. - * 2) It is part of the previous, still-committing transaction. - * If all we want to do is to guarantee that the buffer will be - * written to disk before this new transaction commits, then - * being sure that the *previous* transaction has this same - * property is sufficient for us! Just leave it on its old - * transaction. - * - * In case (2), the buffer must not already exist as metadata - * --- that would violate write ordering (a transaction is free - * to write its data at any point, even before the previous - * committing transaction has committed). The caller must - * never, ever allow this to happen: there's nothing we can do - * about it in this layer. - */ - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - - /* Now that we have bh_state locked, are we really still mapped? */ - if (!buffer_mapped(bh)) { - JBUFFER_TRACE(jh, "unmapped buffer, bailing out"); - goto no_journal; - } - - if (jh->b_transaction) { - JBUFFER_TRACE(jh, "has transaction"); - if (jh->b_transaction != handle->h_transaction) { - JBUFFER_TRACE(jh, "belongs to older transaction"); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_committing_transaction); - - /* @@@ IS THIS TRUE ? */ - /* - * Not any more. Scenario: someone does a write() - * in data=journal mode. The buffer's transaction has - * moved into commit. Then someone does another - * write() to the file. We do the frozen data copyout - * and set b_next_transaction to point to j_running_t. - * And while we're in that state, someone does a - * writepage() in an attempt to pageout the same area - * of the file via a shared mapping. At present that - * calls jbd2_journal_dirty_data(), and we get right here. - * It may be too late to journal the data. Simply - * falling through to the next test will suffice: the - * data will be dirty and wil be checkpointed. The - * ordering comments in the next comment block still - * apply. - */ - //J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - - /* - * If we're journalling data, and this buffer was - * subject to a write(), it could be metadata, forget - * or shadow against the committing transaction. Now, - * someone has dirtied the same darn page via a mapping - * and it is being writepage()'d. - * We *could* just steal the page from commit, with some - * fancy locking there. Instead, we just skip it - - * don't tie the page's buffers to the new transaction - * at all. - * Implication: if we crash before the writepage() data - * is written into the filesystem, recovery will replay - * the write() data. - */ - if (jh->b_jlist != BJ_None && - jh->b_jlist != BJ_SyncData && - jh->b_jlist != BJ_Locked) { - JBUFFER_TRACE(jh, "Not stealing"); - goto no_journal; - } - - /* - * This buffer may be undergoing writeout in commit. We - * can't return from here and let the caller dirty it - * again because that can cause the write-out loop in - * commit to never terminate. - */ - if (buffer_dirty(bh)) { - get_bh(bh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - need_brelse = 1; - sync_dirty_buffer(bh); - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - /* Since we dropped the lock... */ - if (!buffer_mapped(bh)) { - JBUFFER_TRACE(jh, "buffer got unmapped"); - goto no_journal; - } - /* The buffer may become locked again at any - time if it is redirtied */ - } - - /* journal_clean_data_list() may have got there first */ - if (jh->b_transaction != NULL) { - JBUFFER_TRACE(jh, "unfile from commit"); - __jbd2_journal_temp_unlink_buffer(jh); - /* It still points to the committing - * transaction; move it to this one so - * that the refile assert checks are - * happy. */ - jh->b_transaction = handle->h_transaction; - } - /* The buffer will be refiled below */ - - } - /* - * Special case --- the buffer might actually have been - * allocated and then immediately deallocated in the previous, - * committing transaction, so might still be left on that - * transaction's metadata lists. - */ - if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { - JBUFFER_TRACE(jh, "not on correct data list: unfile"); - J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); - __jbd2_journal_temp_unlink_buffer(jh); - jh->b_transaction = handle->h_transaction; - JBUFFER_TRACE(jh, "file as data"); - __jbd2_journal_file_buffer(jh, handle->h_transaction, - BJ_SyncData); - } - } else { - JBUFFER_TRACE(jh, "not on a transaction"); - __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); - } -no_journal: - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - if (need_brelse) { - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - } - JBUFFER_TRACE(jh, "exit"); - jbd2_journal_put_journal_head(jh); - return 0; -} - -/** * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata * @handle: transaction to add buffer to. * @bh: buffer to mark @@ -1456,7 +1279,7 @@ int jbd2_journal_stop(handle_t *handle) spin_unlock(&journal->j_state_lock); } - lock_release(&handle->h_lockdep_map, 1, _THIS_IP_); + lock_map_release(&handle->h_lockdep_map); jbd2_free_handle(handle); return err; @@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh) * Remove a buffer from the appropriate transaction list. * * Note that this function can *change* the value of - * bh->b_transaction->t_sync_datalist, t_buffers, t_forget, - * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller - * is holding onto a copy of one of thee pointers, it could go bad. - * Generally the caller needs to re-read the pointer from the transaction_t. + * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list, + * t_log_list or t_reserved_list. If the caller is holding onto a copy of one + * of these pointers, it could go bad. Generally the caller needs to re-read + * the pointer from the transaction_t. * * Called under j_list_lock. The journal may not be locked. */ @@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) switch (jh->b_jlist) { case BJ_None: return; - case BJ_SyncData: - list = &transaction->t_sync_datalist; - break; case BJ_Metadata: transaction->t_nr_buffers--; J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); @@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) case BJ_Reserved: list = &transaction->t_reserved_list; break; - case BJ_Locked: - list = &transaction->t_locked_list; - break; } __blist_del_buffer(list, jh); @@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) goto out; spin_lock(&journal->j_list_lock); - if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) { - if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { - /* A written-back ordered data buffer */ - JBUFFER_TRACE(jh, "release data"); - __jbd2_journal_unfile_buffer(jh); - jbd2_journal_remove_journal_head(bh); - __brelse(bh); - } - } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { + if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { /* written-back checkpointed metadata buffer */ if (jh->b_jlist == BJ_None) { JBUFFER_TRACE(jh, "remove from checkpoint list"); @@ -1656,12 +1465,43 @@ out: return; } +/* + * jbd2_journal_try_to_free_buffers() could race with + * jbd2_journal_commit_transaction(). The later might still hold the + * reference count to the buffers when inspecting them on + * t_syncdata_list or t_locked_list. + * + * jbd2_journal_try_to_free_buffers() will call this function to + * wait for the current transaction to finish syncing data buffers, before + * try to free that buffer. + * + * Called with journal->j_state_lock hold. + */ +static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal) +{ + transaction_t *transaction; + tid_t tid; + + spin_lock(&journal->j_state_lock); + transaction = journal->j_committing_transaction; + + if (!transaction) { + spin_unlock(&journal->j_state_lock); + return; + } + + tid = transaction->t_tid; + spin_unlock(&journal->j_state_lock); + jbd2_log_wait_commit(journal, tid); +} /** * int jbd2_journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation * @page: to try and free - * @unused_gfp_mask: unused + * @gfp_mask: we use the mask to detect how hard should we try to release + * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to + * release the buffers. * * * For all the buffers on this page, @@ -1690,9 +1530,11 @@ out: * journal_try_to_free_buffer() is changing its state. But that * cannot happen because we never reallocate freed data as metadata * while the data is part of a transaction. Yes? + * + * Return 0 on failure, 1 on success */ int jbd2_journal_try_to_free_buffers(journal_t *journal, - struct page *page, gfp_t unused_gfp_mask) + struct page *page, gfp_t gfp_mask) { struct buffer_head *head; struct buffer_head *bh; @@ -1708,7 +1550,8 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, /* * We take our own ref against the journal_head here to avoid * having to add tons of locking around each instance of - * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head(). + * jbd2_journal_remove_journal_head() and + * jbd2_journal_put_journal_head(). */ jh = jbd2_journal_grab_journal_head(bh); if (!jh) @@ -1721,7 +1564,28 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, if (buffer_jbd(bh)) goto busy; } while ((bh = bh->b_this_page) != head); + ret = try_to_free_buffers(page); + + /* + * There are a number of places where jbd2_journal_try_to_free_buffers() + * could race with jbd2_journal_commit_transaction(), the later still + * holds the reference to the buffers to free while processing them. + * try_to_free_buffers() failed to free those buffers. Some of the + * caller of releasepage() request page buffers to be dropped, otherwise + * treat the fail-to-free as errors (such as generic_file_direct_IO()) + * + * So, if the caller of try_to_release_page() wants the synchronous + * behaviour(i.e make sure buffers are dropped upon return), + * let's wait for the current transaction to finish flush of + * dirty data buffers, then try to free those buffers again, + * with the journal locked. + */ + if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) { + jbd2_journal_wait_for_transaction_sync_data(journal); + ret = try_to_free_buffers(page); + } + busy: return ret; } @@ -1823,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) if (!buffer_jbd(bh)) goto zap_buffer_unlocked; + /* OK, we have data buffer in journaled mode */ spin_lock(&journal->j_state_lock); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); @@ -1886,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) } } else if (transaction == journal->j_committing_transaction) { JBUFFER_TRACE(jh, "on committing transaction"); - if (jh->b_jlist == BJ_Locked) { - /* - * The buffer is on the committing transaction's locked - * list. We have the buffer locked, so I/O has - * completed. So we can nail the buffer now. - */ - may_free = __dispose_buffer(jh, transaction); - goto zap_buffer; - } /* * If it is committing, we simply cannot touch it. We * can remove it's next_transaction pointer from the @@ -2027,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, J_ASSERT_JH(jh, !jh->b_committed_data); J_ASSERT_JH(jh, !jh->b_frozen_data); return; - case BJ_SyncData: - list = &transaction->t_sync_datalist; - break; case BJ_Metadata: transaction->t_nr_buffers++; list = &transaction->t_buffers; @@ -2049,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, case BJ_Reserved: list = &transaction->t_reserved_list; break; - case BJ_Locked: - list = &transaction->t_locked_list; - break; } __blist_add_buffer(list, jh); @@ -2141,3 +1991,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) spin_unlock(&journal->j_list_lock); __brelse(bh); } + +/* + * File inode in the inode list of the handle's transaction + */ +int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + + if (is_handle_aborted(handle)) + return -EIO; + + jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, + transaction->t_tid); + + /* + * First check whether inode isn't already on the transaction's + * lists without taking the lock. Note that this check is safe + * without the lock as we cannot race with somebody removing inode + * from the transaction. The reason is that we remove inode from the + * transaction only in journal_release_jbd_inode() and when we commit + * the transaction. We are guarded from the first case by holding + * a reference to the inode. We are safe against the second case + * because if jinode->i_transaction == transaction, commit code + * cannot touch the transaction because we hold reference to it, + * and if jinode->i_next_transaction == transaction, commit code + * will only file the inode where we want it. + */ + if (jinode->i_transaction == transaction || + jinode->i_next_transaction == transaction) + return 0; + + spin_lock(&journal->j_list_lock); + + if (jinode->i_transaction == transaction || + jinode->i_next_transaction == transaction) + goto done; + + /* On some different transaction's list - should be + * the committing one */ + if (jinode->i_transaction) { + J_ASSERT(jinode->i_next_transaction == NULL); + J_ASSERT(jinode->i_transaction == + journal->j_committing_transaction); + jinode->i_next_transaction = transaction; + goto done; + } + /* Not on any transaction list... */ + J_ASSERT(!jinode->i_next_transaction); + jinode->i_transaction = transaction; + list_add(&jinode->i_list, &transaction->t_inode_list); +done: + spin_unlock(&journal->j_list_lock); + + return 0; +} + +/* + * This function must be called when inode is journaled in ordered mode + * before truncation happens. It starts writeout of truncated part in + * case it is in the committing transaction so that we stand to ordered + * mode consistency guarantees. + */ +int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, + loff_t new_size) +{ + journal_t *journal; + transaction_t *commit_trans; + int ret = 0; + + if (!inode->i_transaction && !inode->i_next_transaction) + goto out; + journal = inode->i_transaction->t_journal; + spin_lock(&journal->j_state_lock); + commit_trans = journal->j_committing_transaction; + spin_unlock(&journal->j_state_lock); + if (inode->i_transaction == commit_trans) { + ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping, + new_size, LLONG_MAX); + if (ret) + jbd2_journal_abort(journal, ret); + } +out: + return ret; +} diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 4c80404a9aba..d98713777a1b 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -314,7 +314,7 @@ static int jffs2_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int jffs2_permission(struct inode *inode, int mask, struct nameidata *nd) +int jffs2_permission(struct inode *inode, int mask) { return generic_permission(inode, mask, jffs2_check_acl); } diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index 0bb7f003fd80..8ca058aed384 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -28,7 +28,7 @@ struct jffs2_acl_header { #define JFFS2_ACL_NOT_CACHED ((void *)-1) -extern int jffs2_permission(struct inode *, int, struct nameidata *); +extern int jffs2_permission(struct inode *, int); extern int jffs2_acl_chmod(struct inode *); extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); extern int jffs2_init_acl_post(struct inode *); diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index c0c141f6fde1..cd219ef55254 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -38,7 +38,7 @@ const struct file_operations jffs2_dir_operations = { .read = generic_read_dir, .readdir = jffs2_readdir, - .ioctl = jffs2_ioctl, + .unlocked_ioctl=jffs2_ioctl, .fsync = jffs2_fsync }; diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 5e920343b2c5..5a98aa87c853 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -46,7 +46,7 @@ const struct file_operations jffs2_file_operations = .aio_read = generic_file_aio_read, .write = do_sync_write, .aio_write = generic_file_aio_write, - .ioctl = jffs2_ioctl, + .unlocked_ioctl=jffs2_ioctl, .mmap = generic_file_readonly_mmap, .fsync = jffs2_fsync, .splice_read = generic_file_splice_read, diff --git a/fs/jffs2/ioctl.c b/fs/jffs2/ioctl.c index e2177210f621..9d41f43e47bb 100644 --- a/fs/jffs2/ioctl.c +++ b/fs/jffs2/ioctl.c @@ -12,8 +12,7 @@ #include <linux/fs.h> #include "nodelist.h" -int jffs2_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, - unsigned long arg) +long jffs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { /* Later, this will provide for lsattr.jffs2 and chattr.jffs2, which will include compression support etc. */ diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h index 31559f45fdde..4c41db91eaa4 100644 --- a/fs/jffs2/jffs2_fs_i.h +++ b/fs/jffs2/jffs2_fs_i.h @@ -12,7 +12,6 @@ #ifndef _JFFS2_FS_I #define _JFFS2_FS_I -#include <linux/version.h> #include <linux/rbtree.h> #include <linux/posix_acl.h> #include <linux/mutex.h> diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 2cc866cf134f..5e194a5c8e29 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -167,7 +167,7 @@ int jffs2_fsync(struct file *, struct dentry *, int); int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg); /* ioctl.c */ -int jffs2_ioctl(struct inode *, struct file *, unsigned int, unsigned long); +long jffs2_ioctl(struct file *, unsigned int, unsigned long); /* symlink.c */ extern const struct inode_operations jffs2_symlink_inode_operations; diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c index 629af01e5ade..6caf1e1ee26d 100644 --- a/fs/jffs2/summary.c +++ b/fs/jffs2/summary.c @@ -23,6 +23,8 @@ int jffs2_sum_init(struct jffs2_sb_info *c) { + uint32_t sum_size = max_t(uint32_t, c->sector_size, MAX_SUMMARY_SIZE); + c->summary = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL); if (!c->summary) { @@ -30,7 +32,7 @@ int jffs2_sum_init(struct jffs2_sb_info *c) return -ENOMEM; } - c->summary->sum_buf = vmalloc(c->sector_size); + c->summary->sum_buf = kmalloc(sum_size, GFP_KERNEL); if (!c->summary->sum_buf) { JFFS2_WARNING("Can't allocate buffer for writing out summary information!\n"); @@ -49,7 +51,7 @@ void jffs2_sum_exit(struct jffs2_sb_info *c) jffs2_sum_disable_collecting(c->summary); - vfree(c->summary->sum_buf); + kfree(c->summary->sum_buf); c->summary->sum_buf = NULL; kfree(c->summary); @@ -665,7 +667,7 @@ crc_err: /* Write summary data to flash - helper function for jffs2_sum_write_sumnode() */ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, - uint32_t infosize, uint32_t datasize, int padsize) + uint32_t infosize, uint32_t datasize, int padsize) { struct jffs2_raw_summary isum; union jffs2_sum_mem *temp; @@ -676,6 +678,26 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock int ret; size_t retlen; + if (padsize + datasize > MAX_SUMMARY_SIZE) { + /* It won't fit in the buffer. Abort summary for this jeb */ + jffs2_sum_disable_collecting(c->summary); + + JFFS2_WARNING("Summary too big (%d data, %d pad) in eraseblock at %08x\n", + datasize, padsize, jeb->offset); + /* Non-fatal */ + return 0; + } + /* Is there enough space for summary? */ + if (padsize < 0) { + /* don't try to write out summary for this jeb */ + jffs2_sum_disable_collecting(c->summary); + + JFFS2_WARNING("Not enough space for summary, padsize = %d\n", + padsize); + /* Non-fatal */ + return 0; + } + memset(c->summary->sum_buf, 0xff, datasize); memset(&isum, 0, sizeof(isum)); @@ -821,7 +843,7 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c) { int datasize, infosize, padsize; struct jffs2_eraseblock *jeb; - int ret; + int ret = 0; dbg_summary("called\n"); @@ -841,16 +863,6 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c) infosize += padsize; datasize += padsize; - /* Is there enough space for summary? */ - if (padsize < 0) { - /* don't try to write out summary for this jeb */ - jffs2_sum_disable_collecting(c->summary); - - JFFS2_WARNING("Not enough space for summary, padsize = %d\n", padsize); - spin_lock(&c->erase_completion_lock); - return 0; - } - ret = jffs2_sum_write_data(c, jeb, infosize, datasize, padsize); spin_lock(&c->erase_completion_lock); return ret; diff --git a/fs/jffs2/summary.h b/fs/jffs2/summary.h index 8bf34f2fa5ce..60207a2ae952 100644 --- a/fs/jffs2/summary.h +++ b/fs/jffs2/summary.h @@ -13,6 +13,12 @@ #ifndef JFFS2_SUMMARY_H #define JFFS2_SUMMARY_H +/* Limit summary size to 64KiB so that we can kmalloc it. If the summary + is larger than that, we have to just ditch it and avoid using summary + for the eraseblock in question... and it probably doesn't hurt us much + anyway. */ +#define MAX_SUMMARY_SIZE 65536 + #include <linux/uio.h> #include <linux/jffs2.h> diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 7da69eae49e4..efd401257ed9 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -44,7 +44,7 @@ static void jffs2_destroy_inode(struct inode *inode) kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode)); } -static void jffs2_i_init_once(struct kmem_cache *cachep, void *foo) +static void jffs2_i_init_once(void *foo) { struct jffs2_inode_info *f = foo; diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 4d84bdc88299..d3e5c33665de 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -140,7 +140,7 @@ static int jfs_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int jfs_permission(struct inode *inode, int mask, struct nameidata *nd) +int jfs_permission(struct inode *inode, int mask) { return generic_permission(inode, mask, jfs_check_acl); } diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index 455fa4292045..88475f10a389 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -20,7 +20,7 @@ #ifdef CONFIG_JFS_POSIX_ACL -int jfs_permission(struct inode *, int, struct nameidata *); +int jfs_permission(struct inode *, int); int jfs_init_acl(tid_t, struct inode *, struct inode *); int jfs_setattr(struct dentry *, struct iattr *); diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c index bf6ab19b86ee..6a73de84bcef 100644 --- a/fs/jfs/jfs_debug.c +++ b/fs/jfs/jfs_debug.c @@ -21,6 +21,7 @@ #include <linux/ctype.h> #include <linux/module.h> #include <linux/proc_fs.h> +#include <linux/seq_file.h> #include <asm/uaccess.h> #include "jfs_incore.h" #include "jfs_filsys.h" @@ -30,29 +31,19 @@ static struct proc_dir_entry *base; #ifdef CONFIG_JFS_DEBUG -static int loglevel_read(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int jfs_loglevel_proc_show(struct seq_file *m, void *v) { - int len; - - len = sprintf(page, "%d\n", jfsloglevel); - - len -= off; - *start = page + off; - - if (len > count) - len = count; - else - *eof = 1; - - if (len < 0) - len = 0; + seq_printf(m, "%d\n", jfsloglevel); + return 0; +} - return len; +static int jfs_loglevel_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_loglevel_proc_show, NULL); } -static int loglevel_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) +static ssize_t jfs_loglevel_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) { char c; @@ -65,22 +56,30 @@ static int loglevel_write(struct file *file, const char __user *buffer, jfsloglevel = c - '0'; return count; } + +static const struct file_operations jfs_loglevel_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_loglevel_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = jfs_loglevel_proc_write, +}; #endif static struct { const char *name; - read_proc_t *read_fn; - write_proc_t *write_fn; + const struct file_operations *proc_fops; } Entries[] = { #ifdef CONFIG_JFS_STATISTICS - { "lmstats", jfs_lmstats_read, }, - { "txstats", jfs_txstats_read, }, - { "xtstat", jfs_xtstat_read, }, - { "mpstat", jfs_mpstat_read, }, + { "lmstats", &jfs_lmstats_proc_fops, }, + { "txstats", &jfs_txstats_proc_fops, }, + { "xtstat", &jfs_xtstat_proc_fops, }, + { "mpstat", &jfs_mpstat_proc_fops, }, #endif #ifdef CONFIG_JFS_DEBUG - { "TxAnchor", jfs_txanchor_read, }, - { "loglevel", loglevel_read, loglevel_write } + { "TxAnchor", &jfs_txanchor_proc_fops, }, + { "loglevel", &jfs_loglevel_proc_fops } #endif }; #define NPROCENT ARRAY_SIZE(Entries) @@ -93,13 +92,8 @@ void jfs_proc_init(void) return; base->owner = THIS_MODULE; - for (i = 0; i < NPROCENT; i++) { - struct proc_dir_entry *p; - if ((p = create_proc_entry(Entries[i].name, 0, base))) { - p->read_proc = Entries[i].read_fn; - p->write_proc = Entries[i].write_fn; - } - } + for (i = 0; i < NPROCENT; i++) + proc_create(Entries[i].name, 0, base, Entries[i].proc_fops); } void jfs_proc_clean(void) diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h index 044c1e654cc0..eafd1300a00b 100644 --- a/fs/jfs/jfs_debug.h +++ b/fs/jfs/jfs_debug.h @@ -62,7 +62,7 @@ extern void jfs_proc_clean(void); extern int jfsloglevel; -extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *); +extern const struct file_operations jfs_txanchor_proc_fops; /* information message: e.g., configuration, major event */ #define jfs_info(fmt, arg...) do { \ @@ -105,10 +105,10 @@ extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *); * ---------- */ #ifdef CONFIG_JFS_STATISTICS -extern int jfs_lmstats_read(char *, char **, off_t, int, int *, void *); -extern int jfs_txstats_read(char *, char **, off_t, int, int *, void *); -extern int jfs_mpstat_read(char *, char **, off_t, int, int *, void *); -extern int jfs_xtstat_read(char *, char **, off_t, int, int *, void *); +extern const struct file_operations jfs_lmstats_proc_fops; +extern const struct file_operations jfs_txstats_proc_fops; +extern const struct file_operations jfs_mpstat_proc_fops; +extern const struct file_operations jfs_xtstat_proc_fops; #define INCREMENT(x) ((x)++) #define DECREMENT(x) ((x)--) diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h index cdac2d5bafeb..2545bb317235 100644 --- a/fs/jfs/jfs_dtree.h +++ b/fs/jfs/jfs_dtree.h @@ -243,9 +243,6 @@ typedef union { #define JFS_REMOVE 3 #define JFS_RENAME 4 -#define DIRENTSIZ(namlen) \ - ( (sizeof(struct dirent) - 2*(JFS_NAME_MAX+1) + 2*((namlen)+1) + 3) &~ 3 ) - /* * Maximum file offset for directories. */ diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 734ec916beaf..d6363d8309d0 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -1520,7 +1520,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) jfs_error(ip->i_sb, "diAlloc: can't find free bit " "in wmap"); - return EIO; + return -EIO; } /* determine the inode number within the diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 325a9679b95a..cd2ec2988b59 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -69,6 +69,7 @@ #include <linux/freezer.h> #include <linux/delay.h> #include <linux/mutex.h> +#include <linux/seq_file.h> #include "jfs_incore.h" #include "jfs_filsys.h" #include "jfs_metapage.h" @@ -2503,13 +2504,9 @@ exit: } #ifdef CONFIG_JFS_STATISTICS -int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length, - int *eof, void *data) +static int jfs_lmstats_proc_show(struct seq_file *m, void *v) { - int len = 0; - off_t begin; - - len += sprintf(buffer, + seq_printf(m, "JFS Logmgr stats\n" "================\n" "commits = %d\n" @@ -2522,19 +2519,19 @@ int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length, lmStat.pagedone, lmStat.full_page, lmStat.partial_page); + return 0; +} - begin = offset; - *start = buffer + begin; - len -= begin; - - if (len > length) - len = length; - else - *eof = 1; - - if (len < 0) - len = 0; - - return len; +static int jfs_lmstats_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_lmstats_proc_show, NULL); } + +const struct file_operations jfs_lmstats_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_lmstats_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif /* CONFIG_JFS_STATISTICS */ diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index d1e64f2f2fcd..c350057087dd 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -19,10 +19,12 @@ #include <linux/fs.h> #include <linux/mm.h> +#include <linux/module.h> #include <linux/bio.h> #include <linux/init.h> #include <linux/buffer_head.h> #include <linux/mempool.h> +#include <linux/seq_file.h> #include "jfs_incore.h" #include "jfs_superblock.h" #include "jfs_filsys.h" @@ -180,7 +182,7 @@ static inline void remove_metapage(struct page *page, struct metapage *mp) #endif -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct metapage *mp = (struct metapage *)foo; @@ -804,13 +806,9 @@ void __invalidate_metapages(struct inode *ip, s64 addr, int len) } #ifdef CONFIG_JFS_STATISTICS -int jfs_mpstat_read(char *buffer, char **start, off_t offset, int length, - int *eof, void *data) +static int jfs_mpstat_proc_show(struct seq_file *m, void *v) { - int len = 0; - off_t begin; - - len += sprintf(buffer, + seq_printf(m, "JFS Metapage statistics\n" "=======================\n" "page allocations = %d\n" @@ -819,19 +817,19 @@ int jfs_mpstat_read(char *buffer, char **start, off_t offset, int length, mpStat.pagealloc, mpStat.pagefree, mpStat.lockwait); + return 0; +} - begin = offset; - *start = buffer + begin; - len -= begin; - - if (len > length) - len = length; - else - *eof = 1; - - if (len < 0) - len = 0; - - return len; +static int jfs_mpstat_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_mpstat_proc_show, NULL); } + +const struct file_operations jfs_mpstat_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_mpstat_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index e7c60ae6b5b2..f26e4d03ada5 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -49,6 +49,7 @@ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/kthread.h> +#include <linux/seq_file.h> #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_filsys.h" @@ -3009,11 +3010,8 @@ int jfs_sync(void *arg) } #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG) -int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length, - int *eof, void *data) +static int jfs_txanchor_proc_show(struct seq_file *m, void *v) { - int len = 0; - off_t begin; char *freewait; char *freelockwait; char *lowlockwait; @@ -3025,7 +3023,7 @@ int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length, lowlockwait = waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty"; - len += sprintf(buffer, + seq_printf(m, "JFS TxAnchor\n" "============\n" "freetid = %d\n" @@ -3044,31 +3042,27 @@ int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length, TxAnchor.tlocksInUse, jfs_tlocks_low, list_empty(&TxAnchor.unlock_queue) ? "" : "not "); + return 0; +} - begin = offset; - *start = buffer + begin; - len -= begin; - - if (len > length) - len = length; - else - *eof = 1; - - if (len < 0) - len = 0; - - return len; +static int jfs_txanchor_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_txanchor_proc_show, NULL); } + +const struct file_operations jfs_txanchor_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_txanchor_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS) -int jfs_txstats_read(char *buffer, char **start, off_t offset, int length, - int *eof, void *data) +static int jfs_txstats_proc_show(struct seq_file *m, void *v) { - int len = 0; - off_t begin; - - len += sprintf(buffer, + seq_printf(m, "JFS TxStats\n" "===========\n" "calls to txBegin = %d\n" @@ -3089,19 +3083,19 @@ int jfs_txstats_read(char *buffer, char **start, off_t offset, int length, TxStat.txBeginAnon_lockslow, TxStat.txLockAlloc, TxStat.txLockAlloc_freelock); + return 0; +} - begin = offset; - *start = buffer + begin; - len -= begin; - - if (len > length) - len = length; - else - *eof = 1; - - if (len < 0) - len = 0; - - return len; +static int jfs_txstats_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_txstats_proc_show, NULL); } + +const struct file_operations jfs_txstats_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_txstats_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index 5a61ebf2cbcc..ae3acafb447b 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -20,7 +20,9 @@ */ #include <linux/fs.h> +#include <linux/module.h> #include <linux/quotaops.h> +#include <linux/seq_file.h> #include "jfs_incore.h" #include "jfs_filsys.h" #include "jfs_metapage.h" @@ -4134,13 +4136,9 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) } #ifdef CONFIG_JFS_STATISTICS -int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length, - int *eof, void *data) +static int jfs_xtstat_proc_show(struct seq_file *m, void *v) { - int len = 0; - off_t begin; - - len += sprintf(buffer, + seq_printf(m, "JFS Xtree statistics\n" "====================\n" "searches = %d\n" @@ -4149,19 +4147,19 @@ int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length, xtStat.search, xtStat.fastSearch, xtStat.split); + return 0; +} - begin = offset; - *start = buffer + begin; - len -= begin; - - if (len > length) - len = length; - else - *eof = 1; - - if (len < 0) - len = 0; - - return len; +static int jfs_xtstat_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_xtstat_proc_show, NULL); } + +const struct file_operations jfs_xtstat_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_xtstat_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 0ba6778edaa2..2aba82386810 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1455,7 +1455,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc free_UCSname(&key); if (rc == -ENOENT) { d_add(dentry, NULL); - return ERR_PTR(0); + return NULL; } else if (rc) { jfs_err("jfs_lookup: dtSearch returned %d", rc); return ERR_PTR(rc); diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 50ea65451732..0dae345e481b 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -22,6 +22,7 @@ #include <linux/parser.h> #include <linux/completion.h> #include <linux/vfs.h> +#include <linux/quotaops.h> #include <linux/mount.h> #include <linux/moduleparam.h> #include <linux/kthread.h> @@ -198,7 +199,7 @@ enum { Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_integrity, "integrity"}, {Opt_nointegrity, "nointegrity"}, {Opt_iocharset, "iocharset=%s"}, @@ -499,7 +500,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) inode = jfs_iget(sb, ROOT_I); if (IS_ERR(inode)) { ret = PTR_ERR(inode); - goto out_no_root; + goto out_no_rw; } sb->s_root = d_alloc_root(inode); if (!sb->s_root) @@ -521,9 +522,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) return 0; out_no_root: - jfs_err("jfs_read_super: get root inode failed"); - if (inode) - iput(inode); + jfs_err("jfs_read_super: get root dentry failed"); + iput(inode); out_no_rw: rc = jfs_umount(sb); @@ -760,7 +760,7 @@ static struct file_system_type jfs_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo; diff --git a/fs/libfs.c b/fs/libfs.c index 892d41cb3382..1add676a19df 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -216,8 +216,8 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name, s->s_flags = MS_NOUSER; s->s_maxbytes = ~0ULL; - s->s_blocksize = 1024; - s->s_blocksize_bits = 10; + s->s_blocksize = PAGE_SIZE; + s->s_blocksize_bits = PAGE_SHIFT; s->s_magic = magic; s->s_op = ops ? ops : &simple_super_operations; s->s_time_gran = 1; @@ -512,6 +512,20 @@ void simple_release_fs(struct vfsmount **mount, int *count) mntput(mnt); } +/** + * simple_read_from_buffer - copy data from the buffer to user space + * @to: the user space buffer to read to + * @count: the maximum number of bytes to read + * @ppos: the current position in the buffer + * @from: the buffer to read from + * @available: the size of the buffer + * + * The simple_read_from_buffer() function reads up to @count bytes from the + * buffer @from at offset @ppos into the user space address starting at @to. + * + * On success, the number of bytes read is returned and the offset @ppos is + * advanced by this number, or negative value is returned on error. + **/ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available) { @@ -528,6 +542,20 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, return count; } +/** + * memory_read_from_buffer - copy data from the buffer + * @to: the kernel space buffer to read to + * @count: the maximum number of bytes to read + * @ppos: the current position in the buffer + * @from: the buffer to read from + * @available: the size of the buffer + * + * The memory_read_from_buffer() function reads up to @count bytes from the + * buffer @from at offset @ppos into the kernel space address starting at @to. + * + * On success, the number of bytes read is returned and the offset @ppos is + * advanced by this number, or negative value is returned on error. + **/ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, const void *from, size_t available) { diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile index 7725a0a9a555..97f6073ab339 100644 --- a/fs/lockd/Makefile +++ b/fs/lockd/Makefile @@ -5,6 +5,6 @@ obj-$(CONFIG_LOCKD) += lockd.o lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \ - svcproc.o svcsubs.o mon.o xdr.o + svcproc.o svcsubs.o mon.o xdr.o grace.o lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o lockd-objs := $(lockd-objs-y) diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 0b45fd3a4bfd..8307dd64bf46 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -54,14 +54,13 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init) u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4; int status; - status = lockd_up(nlm_init->protocol); + status = lockd_up(); if (status < 0) return ERR_PTR(status); - host = nlmclnt_lookup_host((struct sockaddr_in *)nlm_init->address, + host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen, nlm_init->protocol, nlm_version, - nlm_init->hostname, - strlen(nlm_init->hostname)); + nlm_init->hostname); if (host == NULL) { lockd_down(); return ERR_PTR(-ENOLCK); @@ -142,7 +141,7 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) /* * The server lockd has called us back to tell us the lock was granted */ -__be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock) +__be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) { const struct file_lock *fl = &lock->fl; const struct nfs_fh *fh = &lock->fh; @@ -166,7 +165,7 @@ __be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock */ if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) continue; - if (!nlm_cmp_addr(&block->b_host->h_addr, addr)) + if (!nlm_cmp_addr(nlm_addr(block->b_host), addr)) continue; if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) continue; @@ -216,7 +215,7 @@ reclaimer(void *ptr) /* This one ensures that our parent doesn't terminate while the * reclaim is in progress */ lock_kernel(); - lockd_up(0); /* note: this cannot fail as lockd is already running */ + lockd_up(); /* note: this cannot fail as lockd is already running */ dprintk("lockd: reclaiming locks for host %s\n", host->h_name); diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 5df517b81f3f..31668b690e03 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -224,7 +224,9 @@ void nlm_release_call(struct nlm_rqst *call) static void nlmclnt_rpc_release(void *data) { + lock_kernel(); nlm_release_call(data); + unlock_kernel(); } static int nlm_wait_on_grace(wait_queue_head_t *queue) @@ -430,7 +432,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl) * Report the conflicting lock back to the application. */ fl->fl_start = req->a_res.lock.fl.fl_start; - fl->fl_end = req->a_res.lock.fl.fl_start; + fl->fl_end = req->a_res.lock.fl.fl_end; fl->fl_type = req->a_res.lock.fl.fl_type; fl->fl_pid = 0; break; @@ -580,7 +582,15 @@ again: } if (status < 0) goto out_unlock; - status = nlm_stat_to_errno(resp->status); + /* + * EAGAIN doesn't make sense for sleeping locks, and in some + * cases NLM_LCK_DENIED is returned for a permanent error. So + * turn it into an ENOLCK. + */ + if (resp->status == nlm_lck_denied && (fl_flags & FL_SLEEP)) + status = -ENOLCK; + else + status = nlm_stat_to_errno(resp->status); out_unblock: nlmclnt_finish_block(block); out: @@ -710,7 +720,9 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data) die: return; retry_rebind: + lock_kernel(); nlm_rebind_host(req->a_host); + unlock_kernel(); retry_unlock: rpc_restart_call(task); } @@ -788,7 +800,9 @@ retry_cancel: /* Don't ever retry more than 3 times */ if (req->a_retries++ >= NLMCLNT_MAX_RETRIES) goto die; + lock_kernel(); nlm_rebind_host(req->a_host); + unlock_kernel(); rpc_restart_call(task); rpc_delay(task, 30 * HZ); } diff --git a/fs/lockd/grace.c b/fs/lockd/grace.c new file mode 100644 index 000000000000..183cc1f0af1c --- /dev/null +++ b/fs/lockd/grace.c @@ -0,0 +1,59 @@ +/* + * Common code for control of lockd and nfsv4 grace periods. + */ + +#include <linux/module.h> +#include <linux/lockd/bind.h> + +static LIST_HEAD(grace_list); +static DEFINE_SPINLOCK(grace_lock); + +/** + * locks_start_grace + * @lm: who this grace period is for + * + * A grace period is a period during which locks should not be given + * out. Currently grace periods are only enforced by the two lock + * managers (lockd and nfsd), using the locks_in_grace() function to + * check when they are in a grace period. + * + * This function is called to start a grace period. + */ +void locks_start_grace(struct lock_manager *lm) +{ + spin_lock(&grace_lock); + list_add(&lm->list, &grace_list); + spin_unlock(&grace_lock); +} +EXPORT_SYMBOL_GPL(locks_start_grace); + +/** + * locks_end_grace + * @lm: who this grace period is for + * + * Call this function to state that the given lock manager is ready to + * resume regular locking. The grace period will not end until all lock + * managers that called locks_start_grace() also call locks_end_grace(). + * Note that callers count on it being safe to call this more than once, + * and the second call should be a no-op. + */ +void locks_end_grace(struct lock_manager *lm) +{ + spin_lock(&grace_lock); + list_del_init(&lm->list); + spin_unlock(&grace_lock); +} +EXPORT_SYMBOL_GPL(locks_end_grace); + +/** + * locks_in_grace + * + * Lock managers call this function to determine when it is OK for them + * to answer ordinary lock requests, and when they should accept only + * lock reclaims. + */ +int locks_in_grace(void) +{ + return !list_empty(&grace_list); +} +EXPORT_SYMBOL_GPL(locks_in_grace); diff --git a/fs/lockd/host.c b/fs/lockd/host.c index a17664c7eacc..9fd8889097b7 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -11,16 +11,17 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/in.h> +#include <linux/in6.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/lockd/lockd.h> #include <linux/lockd/sm_inter.h> #include <linux/mutex.h> +#include <net/ipv6.h> #define NLMDBG_FACILITY NLMDBG_HOSTCACHE #define NLM_HOST_NRHASH 32 -#define NLM_ADDRHASH(addr) (ntohl(addr) & (NLM_HOST_NRHASH-1)) #define NLM_HOST_REBIND (60 * HZ) #define NLM_HOST_EXPIRE (300 * HZ) #define NLM_HOST_COLLECT (120 * HZ) @@ -30,42 +31,115 @@ static unsigned long next_gc; static int nrhosts; static DEFINE_MUTEX(nlm_host_mutex); - static void nlm_gc_hosts(void); -static struct nsm_handle * __nsm_find(const struct sockaddr_in *, - const char *, unsigned int, int); -static struct nsm_handle * nsm_find(const struct sockaddr_in *sin, - const char *hostname, - unsigned int hostname_len); +static struct nsm_handle *nsm_find(const struct sockaddr *sap, + const size_t salen, + const char *hostname, + const size_t hostname_len, + const int create); + +struct nlm_lookup_host_info { + const int server; /* search for server|client */ + const struct sockaddr *sap; /* address to search for */ + const size_t salen; /* it's length */ + const unsigned short protocol; /* transport to search for*/ + const u32 version; /* NLM version to search for */ + const char *hostname; /* remote's hostname */ + const size_t hostname_len; /* it's length */ + const struct sockaddr *src_sap; /* our address (optional) */ + const size_t src_len; /* it's length */ +}; + +/* + * Hash function must work well on big- and little-endian platforms + */ +static unsigned int __nlm_hash32(const __be32 n) +{ + unsigned int hash = (__force u32)n ^ ((__force u32)n >> 16); + return hash ^ (hash >> 8); +} + +static unsigned int __nlm_hash_addr4(const struct sockaddr *sap) +{ + const struct sockaddr_in *sin = (struct sockaddr_in *)sap; + return __nlm_hash32(sin->sin_addr.s_addr); +} + +static unsigned int __nlm_hash_addr6(const struct sockaddr *sap) +{ + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + const struct in6_addr addr = sin6->sin6_addr; + return __nlm_hash32(addr.s6_addr32[0]) ^ + __nlm_hash32(addr.s6_addr32[1]) ^ + __nlm_hash32(addr.s6_addr32[2]) ^ + __nlm_hash32(addr.s6_addr32[3]); +} + +static unsigned int nlm_hash_address(const struct sockaddr *sap) +{ + unsigned int hash; + + switch (sap->sa_family) { + case AF_INET: + hash = __nlm_hash_addr4(sap); + break; + case AF_INET6: + hash = __nlm_hash_addr6(sap); + break; + default: + hash = 0; + } + return hash & (NLM_HOST_NRHASH - 1); +} + +static void nlm_clear_port(struct sockaddr *sap) +{ + switch (sap->sa_family) { + case AF_INET: + ((struct sockaddr_in *)sap)->sin_port = 0; + break; + case AF_INET6: + ((struct sockaddr_in6 *)sap)->sin6_port = 0; + break; + } +} + +static void nlm_display_address(const struct sockaddr *sap, + char *buf, const size_t len) +{ + const struct sockaddr_in *sin = (struct sockaddr_in *)sap; + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + + switch (sap->sa_family) { + case AF_UNSPEC: + snprintf(buf, len, "unspecified"); + break; + case AF_INET: + snprintf(buf, len, NIPQUAD_FMT, NIPQUAD(sin->sin_addr.s_addr)); + break; + case AF_INET6: + if (ipv6_addr_v4mapped(&sin6->sin6_addr)) + snprintf(buf, len, NIPQUAD_FMT, + NIPQUAD(sin6->sin6_addr.s6_addr32[3])); + else + snprintf(buf, len, NIP6_FMT, NIP6(sin6->sin6_addr)); + break; + default: + snprintf(buf, len, "unsupported address family"); + break; + } +} /* * Common host lookup routine for server & client */ -static struct nlm_host *nlm_lookup_host(int server, - const struct sockaddr_in *sin, - int proto, u32 version, - const char *hostname, - unsigned int hostname_len, - const struct sockaddr_in *ssin) +static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) { struct hlist_head *chain; struct hlist_node *pos; struct nlm_host *host; struct nsm_handle *nsm = NULL; - int hash; - - dprintk("lockd: nlm_lookup_host("NIPQUAD_FMT"->"NIPQUAD_FMT - ", p=%d, v=%u, my role=%s, name=%.*s)\n", - NIPQUAD(ssin->sin_addr.s_addr), - NIPQUAD(sin->sin_addr.s_addr), proto, version, - server? "server" : "client", - hostname_len, - hostname? hostname : "<none>"); - - hash = NLM_ADDRHASH(sin->sin_addr.s_addr); - - /* Lock hash table */ mutex_lock(&nlm_host_mutex); if (time_after_eq(jiffies, next_gc)) @@ -78,22 +152,22 @@ static struct nlm_host *nlm_lookup_host(int server, * different NLM rpc_clients into one single nlm_host object. * This would allow us to have one nlm_host per address. */ - chain = &nlm_hosts[hash]; + chain = &nlm_hosts[nlm_hash_address(ni->sap)]; hlist_for_each_entry(host, pos, chain, h_hash) { - if (!nlm_cmp_addr(&host->h_addr, sin)) + if (!nlm_cmp_addr(nlm_addr(host), ni->sap)) continue; /* See if we have an NSM handle for this client */ if (!nsm) nsm = host->h_nsmhandle; - if (host->h_proto != proto) + if (host->h_proto != ni->protocol) continue; - if (host->h_version != version) + if (host->h_version != ni->version) continue; - if (host->h_server != server) + if (host->h_server != ni->server) continue; - if (!nlm_cmp_addr(&host->h_saddr, ssin)) + if (!nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap)) continue; /* Move to head of hash chain. */ @@ -101,30 +175,41 @@ static struct nlm_host *nlm_lookup_host(int server, hlist_add_head(&host->h_hash, chain); nlm_get_host(host); + dprintk("lockd: nlm_lookup_host found host %s (%s)\n", + host->h_name, host->h_addrbuf); goto out; } - if (nsm) - atomic_inc(&nsm->sm_count); - - host = NULL; - /* Sadly, the host isn't in our hash table yet. See if - * we have an NSM handle for it. If not, create one. + /* + * The host wasn't in our hash table. If we don't + * have an NSM handle for it yet, create one. */ - if (!nsm && !(nsm = nsm_find(sin, hostname, hostname_len))) - goto out; + if (nsm) + atomic_inc(&nsm->sm_count); + else { + host = NULL; + nsm = nsm_find(ni->sap, ni->salen, + ni->hostname, ni->hostname_len, 1); + if (!nsm) { + dprintk("lockd: nlm_lookup_host failed; " + "no nsm handle\n"); + goto out; + } + } host = kzalloc(sizeof(*host), GFP_KERNEL); if (!host) { nsm_release(nsm); + dprintk("lockd: nlm_lookup_host failed; no memory\n"); goto out; } host->h_name = nsm->sm_name; - host->h_addr = *sin; - host->h_addr.sin_port = 0; /* ouch! */ - host->h_saddr = *ssin; - host->h_version = version; - host->h_proto = proto; + memcpy(nlm_addr(host), ni->sap, ni->salen); + host->h_addrlen = ni->salen; + nlm_clear_port(nlm_addr(host)); + memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); + host->h_version = ni->version; + host->h_proto = ni->protocol; host->h_rpcclnt = NULL; mutex_init(&host->h_mutex); host->h_nextrebind = jiffies + NLM_HOST_REBIND; @@ -135,7 +220,7 @@ static struct nlm_host *nlm_lookup_host(int server, host->h_state = 0; /* pseudo NSM state */ host->h_nsmstate = 0; /* real NSM state */ host->h_nsmhandle = nsm; - host->h_server = server; + host->h_server = ni->server; hlist_add_head(&host->h_hash, chain); INIT_LIST_HEAD(&host->h_lockowners); spin_lock_init(&host->h_lock); @@ -143,6 +228,15 @@ static struct nlm_host *nlm_lookup_host(int server, INIT_LIST_HEAD(&host->h_reclaim); nrhosts++; + + nlm_display_address((struct sockaddr *)&host->h_addr, + host->h_addrbuf, sizeof(host->h_addrbuf)); + nlm_display_address((struct sockaddr *)&host->h_srcaddr, + host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf)); + + dprintk("lockd: nlm_lookup_host created host %s\n", + host->h_name); + out: mutex_unlock(&nlm_host_mutex); return host; @@ -170,33 +264,103 @@ nlm_destroy_host(struct nlm_host *host) kfree(host); } -/* - * Find an NLM server handle in the cache. If there is none, create it. +/** + * nlmclnt_lookup_host - Find an NLM host handle matching a remote server + * @sap: network address of server + * @salen: length of server address + * @protocol: transport protocol to use + * @version: NLM protocol version + * @hostname: '\0'-terminated hostname of server + * + * Returns an nlm_host structure that matches the passed-in + * [server address, transport protocol, NLM version, server hostname]. + * If one doesn't already exist in the host cache, a new handle is + * created and returned. */ -struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *sin, - int proto, u32 version, - const char *hostname, - unsigned int hostname_len) +struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, + const size_t salen, + const unsigned short protocol, + const u32 version, const char *hostname) { - struct sockaddr_in ssin = {0}; - - return nlm_lookup_host(0, sin, proto, version, - hostname, hostname_len, &ssin); + const struct sockaddr source = { + .sa_family = AF_UNSPEC, + }; + struct nlm_lookup_host_info ni = { + .server = 0, + .sap = sap, + .salen = salen, + .protocol = protocol, + .version = version, + .hostname = hostname, + .hostname_len = strlen(hostname), + .src_sap = &source, + .src_len = sizeof(source), + }; + + dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__, + (hostname ? hostname : "<none>"), version, + (protocol == IPPROTO_UDP ? "udp" : "tcp")); + + return nlm_lookup_host(&ni); } -/* - * Find an NLM client handle in the cache. If there is none, create it. +/** + * nlmsvc_lookup_host - Find an NLM host handle matching a remote client + * @rqstp: incoming NLM request + * @hostname: name of client host + * @hostname_len: length of client hostname + * + * Returns an nlm_host structure that matches the [client address, + * transport protocol, NLM version, client hostname] of the passed-in + * NLM request. If one doesn't already exist in the host cache, a + * new handle is created and returned. + * + * Before possibly creating a new nlm_host, construct a sockaddr + * for a specific source address in case the local system has + * multiple network addresses. The family of the address in + * rq_daddr is guaranteed to be the same as the family of the + * address in rq_addr, so it's safe to use the same family for + * the source address. */ -struct nlm_host * -nlmsvc_lookup_host(struct svc_rqst *rqstp, - const char *hostname, unsigned int hostname_len) +struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, + const char *hostname, + const size_t hostname_len) { - struct sockaddr_in ssin = {0}; + struct sockaddr_in sin = { + .sin_family = AF_INET, + }; + struct sockaddr_in6 sin6 = { + .sin6_family = AF_INET6, + }; + struct nlm_lookup_host_info ni = { + .server = 1, + .sap = svc_addr(rqstp), + .salen = rqstp->rq_addrlen, + .protocol = rqstp->rq_prot, + .version = rqstp->rq_vers, + .hostname = hostname, + .hostname_len = hostname_len, + .src_len = rqstp->rq_addrlen, + }; + + dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, + (int)hostname_len, hostname, rqstp->rq_vers, + (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp")); + + switch (ni.sap->sa_family) { + case AF_INET: + sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr; + ni.src_sap = (struct sockaddr *)&sin; + break; + case AF_INET6: + ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6); + ni.src_sap = (struct sockaddr *)&sin6; + break; + default: + return NULL; + } - ssin.sin_addr = rqstp->rq_daddr.addr; - return nlm_lookup_host(1, svc_addr_in(rqstp), - rqstp->rq_prot, rqstp->rq_vers, - hostname, hostname_len, &ssin); + return nlm_lookup_host(&ni); } /* @@ -207,9 +371,8 @@ nlm_bind_host(struct nlm_host *host) { struct rpc_clnt *clnt; - dprintk("lockd: nlm_bind_host("NIPQUAD_FMT"->"NIPQUAD_FMT")\n", - NIPQUAD(host->h_saddr.sin_addr), - NIPQUAD(host->h_addr.sin_addr)); + dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n", + host->h_name, host->h_addrbuf, host->h_srcaddrbuf); /* Lock host handle */ mutex_lock(&host->h_mutex); @@ -221,7 +384,7 @@ nlm_bind_host(struct nlm_host *host) if (time_after_eq(jiffies, host->h_nextrebind)) { rpc_force_rebind(clnt); host->h_nextrebind = jiffies + NLM_HOST_REBIND; - dprintk("lockd: next rebind in %ld jiffies\n", + dprintk("lockd: next rebind in %lu jiffies\n", host->h_nextrebind - jiffies); } } else { @@ -234,9 +397,9 @@ nlm_bind_host(struct nlm_host *host) }; struct rpc_create_args args = { .protocol = host->h_proto, - .address = (struct sockaddr *)&host->h_addr, - .addrsize = sizeof(host->h_addr), - .saddress = (struct sockaddr *)&host->h_saddr, + .address = nlm_addr(host), + .addrsize = host->h_addrlen, + .saddress = nlm_srcaddr(host), .timeout = &timeparms, .servername = host->h_name, .program = &nlm_program, @@ -324,12 +487,16 @@ void nlm_host_rebooted(const struct sockaddr_in *sin, struct nsm_handle *nsm; struct nlm_host *host; - dprintk("lockd: nlm_host_rebooted(%s, %u.%u.%u.%u)\n", - hostname, NIPQUAD(sin->sin_addr)); - - /* Find the NSM handle for this peer */ - if (!(nsm = __nsm_find(sin, hostname, hostname_len, 0))) + nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin), + hostname, hostname_len, 0); + if (nsm == NULL) { + dprintk("lockd: never saw rebooted peer '%.*s' before\n", + hostname_len, hostname); return; + } + + dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n", + hostname_len, hostname, nsm->sm_addrbuf); /* When reclaiming locks on this peer, make sure that * we set up a new notification */ @@ -461,22 +628,23 @@ nlm_gc_hosts(void) static LIST_HEAD(nsm_handles); static DEFINE_SPINLOCK(nsm_lock); -static struct nsm_handle * -__nsm_find(const struct sockaddr_in *sin, - const char *hostname, unsigned int hostname_len, - int create) +static struct nsm_handle *nsm_find(const struct sockaddr *sap, + const size_t salen, + const char *hostname, + const size_t hostname_len, + const int create) { struct nsm_handle *nsm = NULL; struct nsm_handle *pos; - if (!sin) + if (!sap) return NULL; if (hostname && memchr(hostname, '/', hostname_len) != NULL) { if (printk_ratelimit()) { printk(KERN_WARNING "Invalid hostname \"%.*s\" " "in NFS lock request\n", - hostname_len, hostname); + (int)hostname_len, hostname); } return NULL; } @@ -489,7 +657,7 @@ retry: if (strlen(pos->sm_name) != hostname_len || memcmp(pos->sm_name, hostname, hostname_len)) continue; - } else if (!nlm_cmp_addr(&pos->sm_addr, sin)) + } else if (!nlm_cmp_addr(nsm_addr(pos), sap)) continue; atomic_inc(&pos->sm_count); kfree(nsm); @@ -509,10 +677,13 @@ retry: if (nsm == NULL) return NULL; - nsm->sm_addr = *sin; + memcpy(nsm_addr(nsm), sap, salen); + nsm->sm_addrlen = salen; nsm->sm_name = (char *) (nsm + 1); memcpy(nsm->sm_name, hostname, hostname_len); nsm->sm_name[hostname_len] = '\0'; + nlm_display_address((struct sockaddr *)&nsm->sm_addr, + nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf)); atomic_set(&nsm->sm_count, 1); goto retry; @@ -521,13 +692,6 @@ found: return nsm; } -static struct nsm_handle * -nsm_find(const struct sockaddr_in *sin, const char *hostname, - unsigned int hostname_len) -{ - return __nsm_find(sin, hostname, hostname_len, 1); -} - /* * Release an NSM handle */ diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index e4d563543b11..4e7e958e8f67 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -51,7 +51,7 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) memset(&args, 0, sizeof(args)); args.mon_name = nsm->sm_name; - args.addr = nsm->sm_addr.sin_addr.s_addr; + args.addr = nsm_addr_in(nsm)->sin_addr.s_addr; args.prog = NLM_PROGRAM; args.vers = 3; args.proc = NLMPROC_NSM_NOTIFY; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 2169af4d5455..c631a83931ce 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -50,8 +50,7 @@ EXPORT_SYMBOL(nlmsvc_ops); static DEFINE_MUTEX(nlmsvc_mutex); static unsigned int nlmsvc_users; static struct task_struct *nlmsvc_task; -static struct svc_serv *nlmsvc_serv; -int nlmsvc_grace_period; +static struct svc_rqst *nlmsvc_rqst; unsigned long nlmsvc_timeout; /* @@ -85,27 +84,23 @@ static unsigned long get_lockd_grace_period(void) return nlm_timeout * 5 * HZ; } -unsigned long get_nfs_grace_period(void) -{ - unsigned long lockdgrace = get_lockd_grace_period(); - unsigned long nfsdgrace = 0; - - if (nlmsvc_ops) - nfsdgrace = nlmsvc_ops->get_grace_period(); - - return max(lockdgrace, nfsdgrace); -} -EXPORT_SYMBOL(get_nfs_grace_period); +static struct lock_manager lockd_manager = { +}; -static unsigned long set_grace_period(void) +static void grace_ender(struct work_struct *not_used) { - nlmsvc_grace_period = 1; - return get_nfs_grace_period() + jiffies; + locks_end_grace(&lockd_manager); } -static inline void clear_grace_period(void) +static DECLARE_DELAYED_WORK(grace_period_end, grace_ender); + +static void set_grace_period(void) { - nlmsvc_grace_period = 0; + unsigned long grace_period = get_lockd_grace_period(); + + locks_start_grace(&lockd_manager); + cancel_delayed_work_sync(&grace_period_end); + schedule_delayed_work(&grace_period_end, grace_period); } /* @@ -116,7 +111,6 @@ lockd(void *vrqstp) { int err = 0, preverr = 0; struct svc_rqst *rqstp = vrqstp; - unsigned long grace_period_expire; /* try_to_freeze() is called from svc_recv() */ set_freezable(); @@ -139,7 +133,7 @@ lockd(void *vrqstp) nlm_timeout = LOCKD_DFLT_TIMEO; nlmsvc_timeout = nlm_timeout * HZ; - grace_period_expire = set_grace_period(); + set_grace_period(); /* * The main request loop. We don't terminate until the last @@ -153,21 +147,12 @@ lockd(void *vrqstp) flush_signals(current); if (nlmsvc_ops) { nlmsvc_invalidate_all(); - grace_period_expire = set_grace_period(); + set_grace_period(); } continue; } - /* - * Retry any blocked locks that have been notified by - * the VFS. Don't do this during grace period. - * (Theoretically, there shouldn't even be blocked locks - * during grace period). - */ - if (!nlmsvc_grace_period) { - timeout = nlmsvc_retry_blocked(); - } else if (time_before(grace_period_expire, jiffies)) - clear_grace_period(); + timeout = nlmsvc_retry_blocked(); /* * Find a socket with data available and call its @@ -194,43 +179,38 @@ lockd(void *vrqstp) svc_process(rqstp); } - flush_signals(current); + cancel_delayed_work_sync(&grace_period_end); if (nlmsvc_ops) nlmsvc_invalidate_all(); nlm_shutdown_hosts(); - unlock_kernel(); - - nlmsvc_task = NULL; - nlmsvc_serv = NULL; - - /* Exit the RPC thread */ - svc_exit_thread(rqstp); - return 0; } /* - * Make any sockets that are needed but not present. - * If nlm_udpport or nlm_tcpport were set as module - * options, make those sockets unconditionally + * Ensure there are active UDP and TCP listeners for lockd. + * + * Even if we have only TCP NFS mounts and/or TCP NFSDs, some + * local services (such as rpc.statd) still require UDP, and + * some NFS servers do not yet support NLM over TCP. + * + * Returns zero if all listeners are available; otherwise a + * negative errno value is returned. */ -static int make_socks(struct svc_serv *serv, int proto) +static int make_socks(struct svc_serv *serv) { static int warned; struct svc_xprt *xprt; int err = 0; - if (proto == IPPROTO_UDP || nlm_udpport) { - xprt = svc_find_xprt(serv, "udp", 0, 0); - if (!xprt) - err = svc_create_xprt(serv, "udp", nlm_udpport, - SVC_SOCK_DEFAULTS); - else - svc_xprt_put(xprt); - } - if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) { + xprt = svc_find_xprt(serv, "udp", 0, 0); + if (!xprt) + err = svc_create_xprt(serv, "udp", nlm_udpport, + SVC_SOCK_DEFAULTS); + else + svc_xprt_put(xprt); + if (err >= 0) { xprt = svc_find_xprt(serv, "tcp", 0, 0); if (!xprt) err = svc_create_xprt(serv, "tcp", nlm_tcpport, @@ -250,22 +230,17 @@ static int make_socks(struct svc_serv *serv, int proto) /* * Bring up the lockd process if it's not already up. */ -int -lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ +int lockd_up(void) { struct svc_serv *serv; - struct svc_rqst *rqstp; int error = 0; mutex_lock(&nlmsvc_mutex); /* * Check whether we're already up and running. */ - if (nlmsvc_serv) { - if (proto) - error = make_socks(nlmsvc_serv, proto); + if (nlmsvc_rqst) goto out; - } /* * Sanity check: if there's no pid, @@ -276,21 +251,23 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ "lockd_up: no pid, %d users??\n", nlmsvc_users); error = -ENOMEM; - serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL); + serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); goto out; } - if ((error = make_socks(serv, proto)) < 0) + error = make_socks(serv); + if (error < 0) goto destroy_and_out; /* * Create the kernel thread and wait for it to start. */ - rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); - if (IS_ERR(rqstp)) { - error = PTR_ERR(rqstp); + nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); + if (IS_ERR(nlmsvc_rqst)) { + error = PTR_ERR(nlmsvc_rqst); + nlmsvc_rqst = NULL; printk(KERN_WARNING "lockd_up: svc_rqst allocation failed, error=%d\n", error); @@ -298,16 +275,15 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ } svc_sock_update_bufs(serv); - nlmsvc_serv = rqstp->rq_server; - nlmsvc_task = kthread_run(lockd, rqstp, serv->sv_name); + nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name); if (IS_ERR(nlmsvc_task)) { error = PTR_ERR(nlmsvc_task); + svc_exit_thread(nlmsvc_rqst); nlmsvc_task = NULL; - nlmsvc_serv = NULL; + nlmsvc_rqst = NULL; printk(KERN_WARNING "lockd_up: kthread_run failed, error=%d\n", error); - svc_exit_thread(rqstp); goto destroy_and_out; } @@ -346,6 +322,9 @@ lockd_down(void) BUG(); } kthread_stop(nlmsvc_task); + svc_exit_thread(nlmsvc_rqst); + nlmsvc_task = NULL; + nlmsvc_rqst = NULL; out: mutex_unlock(&nlmsvc_mutex); } diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 385437e3387d..014f6ce48172 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -58,8 +58,7 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, return 0; no_locks: - if (host) - nlm_release_host(host); + nlm_release_host(host); if (error) return error; return nlm_lck_denied_nolocks; @@ -84,23 +83,17 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, { struct nlm_host *host; struct nlm_file *file; - int rc = rpc_success; + __be32 rc = rpc_success; dprintk("lockd: TEST4 called\n"); resp->cookie = argp->cookie; - /* Don't accept test requests during grace period */ - if (nlmsvc_grace_period) { - resp->status = nlm_lck_denied_grace_period; - return rc; - } - /* Obtain client and file */ if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; /* Now check for conflicting locks */ - resp->status = nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie); + resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie); if (resp->status == nlm_drop_reply) rc = rpc_drop_reply; else @@ -117,18 +110,12 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, { struct nlm_host *host; struct nlm_file *file; - int rc = rpc_success; + __be32 rc = rpc_success; dprintk("lockd: LOCK called\n"); resp->cookie = argp->cookie; - /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period && !argp->reclaim) { - resp->status = nlm_lck_denied_grace_period; - return rc; - } - /* Obtain client and file */ if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; @@ -146,8 +133,9 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, #endif /* Now try to lock the file */ - resp->status = nlmsvc_lock(rqstp, file, &argp->lock, - argp->block, &argp->cookie); + resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock, + argp->block, &argp->cookie, + argp->reclaim); if (resp->status == nlm_drop_reply) rc = rpc_drop_reply; else @@ -170,7 +158,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -203,7 +191,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -232,7 +220,7 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; dprintk("lockd: GRANTED called\n"); - resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock); + resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock); dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); return rpc_success; } @@ -248,7 +236,9 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data) static void nlm4svc_callback_release(void *data) { + lock_kernel(); nlm_release_call(data); + unlock_kernel(); } static const struct rpc_call_ops nlm4svc_callback_ops = { @@ -340,7 +330,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period && !argp->reclaim) { + if (locks_in_grace() && !argp->reclaim) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -373,7 +363,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -431,11 +421,9 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, { struct sockaddr_in saddr; - memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr)); - dprintk("lockd: SM_NOTIFY called\n"); - if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK) - || ntohs(saddr.sin_port) >= 1024) { + + if (!nlm_privileged_requester(rqstp)) { char buf[RPC_MAX_ADDRBUFLEN]; printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", svc_print_addr(rqstp, buf, sizeof(buf))); diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 81aca859bfde..6063a8e4b9f3 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -129,9 +129,9 @@ nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock) static inline int nlm_cookie_match(struct nlm_cookie *a, struct nlm_cookie *b) { - if(a->len != b->len) + if (a->len != b->len) return 0; - if(memcmp(a->data,b->data,a->len)) + if (memcmp(a->data, b->data, a->len)) return 0; return 1; } @@ -180,6 +180,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host, struct nlm_block *block; struct nlm_rqst *call = NULL; + nlm_get_host(host); call = nlm_alloc_call(host); if (call == NULL) return NULL; @@ -358,10 +359,10 @@ nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block) */ __be32 nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, - struct nlm_lock *lock, int wait, struct nlm_cookie *cookie) + struct nlm_host *host, struct nlm_lock *lock, int wait, + struct nlm_cookie *cookie, int reclaim) { struct nlm_block *block = NULL; - struct nlm_host *host; int error; __be32 ret; @@ -373,11 +374,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, (long long)lock->fl.fl_end, wait); - /* Create host handle for callback */ - host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len); - if (host == NULL) - return nlm_lck_denied_nolocks; - /* Lock file against concurrent access */ mutex_lock(&file->f_mutex); /* Get existing block (in case client is busy-waiting) @@ -385,8 +381,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, */ block = nlmsvc_lookup_block(file, lock); if (block == NULL) { - block = nlmsvc_create_block(rqstp, nlm_get_host(host), file, - lock, cookie); + block = nlmsvc_create_block(rqstp, host, file, lock, cookie); ret = nlm_lck_denied_nolocks; if (block == NULL) goto out; @@ -411,20 +406,29 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } + if (locks_in_grace() && !reclaim) { + ret = nlm_lck_denied_grace_period; + goto out; + } + if (reclaim && !locks_in_grace()) { + ret = nlm_lck_denied_grace_period; + goto out; + } + if (!wait) lock->fl.fl_flags &= ~FL_SLEEP; error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); lock->fl.fl_flags &= ~FL_SLEEP; dprintk("lockd: vfs_lock_file returned %d\n", error); - switch(error) { + switch (error) { case 0: ret = nlm_granted; goto out; case -EAGAIN: ret = nlm_lck_denied; - break; - case -EINPROGRESS: + goto out; + case FILE_LOCK_DEFERRED: if (wait) break; /* Filesystem lock operation is in progress @@ -439,10 +443,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } - ret = nlm_lck_denied; - if (!wait) - goto out; - ret = nlm_lck_blocked; /* Append to list of blocked */ @@ -450,7 +450,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, out: mutex_unlock(&file->f_mutex); nlmsvc_release_block(block); - nlm_release_host(host); dprintk("lockd: nlmsvc_lock returned %u\n", ret); return ret; } @@ -460,8 +459,8 @@ out: */ __be32 nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, - struct nlm_lock *lock, struct nlm_lock *conflock, - struct nlm_cookie *cookie) + struct nlm_host *host, struct nlm_lock *lock, + struct nlm_lock *conflock, struct nlm_cookie *cookie) { struct nlm_block *block = NULL; int error; @@ -479,16 +478,9 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, if (block == NULL) { struct file_lock *conf = kzalloc(sizeof(*conf), GFP_KERNEL); - struct nlm_host *host; if (conf == NULL) return nlm_granted; - /* Create host handle for callback */ - host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len); - if (host == NULL) { - kfree(conf); - return nlm_lck_denied_nolocks; - } block = nlmsvc_create_block(rqstp, host, file, lock, cookie); if (block == NULL) { kfree(conf); @@ -519,8 +511,12 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } + if (locks_in_grace()) { + ret = nlm_lck_denied_grace_period; + goto out; + } error = vfs_test_lock(file->f_file, &lock->fl); - if (error == -EINPROGRESS) { + if (error == FILE_LOCK_DEFERRED) { ret = nlmsvc_defer_lock_rqst(rqstp, block); goto out; } @@ -599,6 +595,9 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock) (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); + if (locks_in_grace()) + return nlm_lck_denied_grace_period; + mutex_lock(&file->f_mutex); block = nlmsvc_lookup_block(file, lock); mutex_unlock(&file->f_mutex); @@ -744,8 +743,7 @@ nlmsvc_grant_blocked(struct nlm_block *block) switch (error) { case 0: break; - case -EAGAIN: - case -EINPROGRESS: + case FILE_LOCK_DEFERRED: dprintk("lockd: lock still blocked error %d\n", error); nlmsvc_insert_block(block, NLM_NEVER); nlmsvc_release_block(block); @@ -795,6 +793,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data) dprintk("lockd: GRANT_MSG RPC callback\n"); + lock_kernel(); /* if the block is not on a list at this point then it has * been invalidated. Don't try to requeue it. * @@ -804,7 +803,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data) * for nlm_blocked? */ if (list_empty(&block->b_list)) - return; + goto out; /* Technically, we should down the file semaphore here. Since we * move the block towards the head of the queue only, no harm @@ -818,13 +817,17 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data) } nlmsvc_insert_block(block, timeout); svc_wake_up(block->b_daemon); +out: + unlock_kernel(); } static void nlmsvc_grant_release(void *data) { struct nlm_rqst *call = data; + lock_kernel(); nlmsvc_release_block(call->a_block); + unlock_kernel(); } static const struct rpc_call_ops nlmsvc_grant_ops = { @@ -892,7 +895,7 @@ nlmsvc_retry_blocked(void) if (block->b_when == NLM_NEVER) break; - if (time_after(block->b_when,jiffies)) { + if (time_after(block->b_when, jiffies)) { timeout = block->b_when - jiffies; break; } diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 88379cc6e0b1..548b0bb2b84d 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -87,8 +87,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, return 0; no_locks: - if (host) - nlm_release_host(host); + nlm_release_host(host); if (error) return error; return nlm_lck_denied_nolocks; @@ -113,23 +112,17 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, { struct nlm_host *host; struct nlm_file *file; - int rc = rpc_success; + __be32 rc = rpc_success; dprintk("lockd: TEST called\n"); resp->cookie = argp->cookie; - /* Don't accept test requests during grace period */ - if (nlmsvc_grace_period) { - resp->status = nlm_lck_denied_grace_period; - return rc; - } - /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; /* Now check for conflicting locks */ - resp->status = cast_status(nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie)); + resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie)); if (resp->status == nlm_drop_reply) rc = rpc_drop_reply; else @@ -147,18 +140,12 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, { struct nlm_host *host; struct nlm_file *file; - int rc = rpc_success; + __be32 rc = rpc_success; dprintk("lockd: LOCK called\n"); resp->cookie = argp->cookie; - /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period && !argp->reclaim) { - resp->status = nlm_lck_denied_grace_period; - return rc; - } - /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; @@ -176,8 +163,9 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, #endif /* Now try to lock the file */ - resp->status = cast_status(nlmsvc_lock(rqstp, file, &argp->lock, - argp->block, &argp->cookie)); + resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock, + argp->block, &argp->cookie, + argp->reclaim)); if (resp->status == nlm_drop_reply) rc = rpc_drop_reply; else @@ -200,7 +188,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -233,7 +221,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -262,7 +250,7 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; dprintk("lockd: GRANTED called\n"); - resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock); + resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock); dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); return rpc_success; } @@ -278,7 +266,9 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data) static void nlmsvc_callback_release(void *data) { + lock_kernel(); nlm_release_call(data); + unlock_kernel(); } static const struct rpc_call_ops nlmsvc_callback_ops = { @@ -372,7 +362,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period && !argp->reclaim) { + if (locks_in_grace() && !argp->reclaim) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -405,7 +395,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -463,11 +453,9 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, { struct sockaddr_in saddr; - memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr)); - dprintk("lockd: SM_NOTIFY called\n"); - if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK) - || ntohs(saddr.sin_port) >= 1024) { + + if (!nlm_privileged_requester(rqstp)) { char buf[RPC_MAX_ADDRBUFLEN]; printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", svc_print_addr(rqstp, buf, sizeof(buf))); diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index d1c48b539df8..34c2766e27c7 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -373,13 +373,16 @@ nlmsvc_free_host_resources(struct nlm_host *host) } } -/* - * Remove all locks held for clients +/** + * nlmsvc_invalidate_all - remove all locks held for clients + * + * Release all locks held by NFS clients. + * */ void nlmsvc_invalidate_all(void) { - /* Release all locks held by NFS clients. + /* * Previously, the code would call * nlmsvc_free_host_resources for each client in * turn, which is about as inefficient as it gets. @@ -396,6 +399,12 @@ nlmsvc_match_sb(void *datap, struct nlm_file *file) return sb == file->f_file->f_path.mnt->mnt_sb; } +/** + * nlmsvc_unlock_all_by_sb - release locks held on this file system + * @sb: super block + * + * Release all locks held by clients accessing this file system. + */ int nlmsvc_unlock_all_by_sb(struct super_block *sb) { @@ -409,17 +418,22 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb); static int nlmsvc_match_ip(void *datap, struct nlm_host *host) { - __be32 *server_addr = datap; - - return host->h_saddr.sin_addr.s_addr == *server_addr; + return nlm_cmp_addr(nlm_srcaddr(host), datap); } +/** + * nlmsvc_unlock_all_by_ip - release local locks by IP address + * @server_addr: server's IP address as seen by clients + * + * Release all locks held by clients accessing this host + * via the passed in IP address. + */ int -nlmsvc_unlock_all_by_ip(__be32 server_addr) +nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr) { int ret; - ret = nlm_traverse_files(&server_addr, nlmsvc_match_ip, NULL); - return ret ? -EIO : 0; + ret = nlm_traverse_files(server_addr, nlmsvc_match_ip, NULL); + return ret ? -EIO : 0; } EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_ip); diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 3e459e18cc31..1f226290c67c 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -351,8 +351,6 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp) argp->state = ntohl(*p++); /* Preserve the address in network byte order */ argp->addr = *p++; - argp->vers = *p++; - argp->proto = *p++; return xdr_argsize_check(rqstp, p); } diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index 43ff9397e6c6..50c493a8ad8e 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -358,8 +358,6 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp argp->state = ntohl(*p++); /* Preserve the address in network byte order */ argp->addr = *p++; - argp->vers = *p++; - argp->proto = *p++; return xdr_argsize_check(rqstp, p); } diff --git a/fs/locks.c b/fs/locks.c index 11dbf08651b7..5eb259e3cd38 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -201,7 +201,7 @@ EXPORT_SYMBOL(locks_init_lock); * Initialises the fields of the file lock which are invariant for * free file_locks. */ -static void init_once(struct kmem_cache *cache, void *foo) +static void init_once(void *foo) { struct file_lock *lock = (struct file_lock *) foo; @@ -561,9 +561,6 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl) /* insert into file's list */ fl->fl_next = *pos; *pos = fl; - - if (fl->fl_ops && fl->fl_ops->fl_insert) - fl->fl_ops->fl_insert(fl); } /* @@ -586,9 +583,6 @@ static void locks_delete_lock(struct file_lock **thisfl_p) fl->fl_fasync = NULL; } - if (fl->fl_ops && fl->fl_ops->fl_remove) - fl->fl_ops->fl_remove(fl); - if (fl->fl_nspid) { put_pid(fl->fl_nspid); fl->fl_nspid = NULL; @@ -785,8 +779,10 @@ find_conflict: if (!flock_locks_conflict(request, fl)) continue; error = -EAGAIN; - if (request->fl_flags & FL_SLEEP) - locks_insert_block(fl, request); + if (!(request->fl_flags & FL_SLEEP)) + goto out; + error = FILE_LOCK_DEFERRED; + locks_insert_block(fl, request); goto out; } if (request->fl_flags & FL_ACCESS) @@ -842,7 +838,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str error = -EDEADLK; if (posix_locks_deadlock(request, fl)) goto out; - error = -EAGAIN; + error = FILE_LOCK_DEFERRED; locks_insert_block(fl, request); goto out; } @@ -1041,7 +1037,7 @@ int posix_lock_file_wait(struct file *filp, struct file_lock *fl) might_sleep (); for (;;) { error = posix_lock_file(filp, fl, NULL); - if ((error != -EAGAIN) || !(fl->fl_flags & FL_SLEEP)) + if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); if (!error) @@ -1113,9 +1109,7 @@ int locks_mandatory_area(int read_write, struct inode *inode, for (;;) { error = __posix_lock_file(inode, &fl, NULL); - if (error != -EAGAIN) - break; - if (!(fl.fl_flags & FL_SLEEP)) + if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl.fl_wait, !fl.fl_next); if (!error) { @@ -1537,7 +1531,7 @@ int flock_lock_file_wait(struct file *filp, struct file_lock *fl) might_sleep(); for (;;) { error = flock_lock_file(filp, fl); - if ((error != -EAGAIN) || !(fl->fl_flags & FL_SLEEP)) + if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); if (!error) @@ -1722,17 +1716,17 @@ out: * fl_grant is set. Callers expecting ->lock() to return asynchronously * will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if) * the request is for a blocking lock. When ->lock() does return asynchronously, - * it must return -EINPROGRESS, and call ->fl_grant() when the lock + * it must return FILE_LOCK_DEFERRED, and call ->fl_grant() when the lock * request completes. * If the request is for non-blocking lock the file system should return - * -EINPROGRESS then try to get the lock and call the callback routine with - * the result. If the request timed out the callback routine will return a + * FILE_LOCK_DEFERRED then try to get the lock and call the callback routine + * with the result. If the request timed out the callback routine will return a * nonzero return code and the file system should release the lock. The file * system is also responsible to keep a corresponding posix lock when it * grants a lock so the VFS can find out which locks are locally held and do * the correct lock cleanup when required. * The underlying filesystem must not drop the kernel lock or call - * ->fl_grant() before returning to the caller with a -EINPROGRESS + * ->fl_grant() before returning to the caller with a FILE_LOCK_DEFERRED * return code. */ int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) @@ -1744,6 +1738,30 @@ int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, str } EXPORT_SYMBOL_GPL(vfs_lock_file); +static int do_lock_file_wait(struct file *filp, unsigned int cmd, + struct file_lock *fl) +{ + int error; + + error = security_file_lock(filp, fl->fl_type); + if (error) + return error; + + for (;;) { + error = vfs_lock_file(filp, cmd, fl, NULL); + if (error != FILE_LOCK_DEFERRED) + break; + error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); + if (!error) + continue; + + locks_delete_block(fl); + break; + } + + return error; +} + /* Apply the lock described by l to an open file descriptor. * This implements both the F_SETLK and F_SETLKW commands of fcntl(). */ @@ -1801,26 +1819,7 @@ again: goto out; } - error = security_file_lock(filp, file_lock->fl_type); - if (error) - goto out; - - if (filp->f_op && filp->f_op->lock != NULL) - error = filp->f_op->lock(filp, cmd, file_lock); - else { - for (;;) { - error = posix_lock_file(filp, file_lock, NULL); - if (error != -EAGAIN || cmd == F_SETLK) - break; - error = wait_event_interruptible(file_lock->fl_wait, - !file_lock->fl_next); - if (!error) - continue; - - locks_delete_block(file_lock); - break; - } - } + error = do_lock_file_wait(filp, cmd, file_lock); /* * Attempt to detect a close/fcntl race and recover by @@ -1938,26 +1937,7 @@ again: goto out; } - error = security_file_lock(filp, file_lock->fl_type); - if (error) - goto out; - - if (filp->f_op && filp->f_op->lock != NULL) - error = filp->f_op->lock(filp, cmd, file_lock); - else { - for (;;) { - error = posix_lock_file(filp, file_lock, NULL); - if (error != -EAGAIN || cmd == F_SETLK64) - break; - error = wait_event_interruptible(file_lock->fl_wait, - !file_lock->fl_next); - if (!error) - continue; - - locks_delete_block(file_lock); - break; - } - } + error = do_lock_file_wait(filp, cmd, file_lock); /* * Attempt to detect a close/fcntl race and recover by diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 84f6242ba6fc..d1d1eb84679d 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -68,7 +68,7 @@ static void minix_destroy_inode(struct inode *inode) kmem_cache_free(minix_inode_cachep, minix_i(inode)); } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct minix_inode_info *ei = (struct minix_inode_info *) foo; @@ -256,9 +256,6 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) if (!s->s_root) goto out_iput; - if (!NO_TRUNCATE) - s->s_root->d_op = &minix_dentry_operations; - if (!(s->s_flags & MS_RDONLY)) { if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */ ms->s_state &= ~MINIX_VALID_FS; diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 326edfe96108..e6a0b193bea4 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -2,11 +2,6 @@ #include <linux/pagemap.h> #include <linux/minix_fs.h> -/* - * change the define below to 0 if you want names > info->s_namelen chars to be - * truncated. Else they will be disallowed (ENAMETOOLONG). - */ -#define NO_TRUNCATE 1 #define INODE_VERSION(inode) minix_sb(inode->i_sb)->s_version #define MINIX_V1 0x0001 /* original minix fs */ #define MINIX_V2 0x0002 /* minix V2 fs */ @@ -83,7 +78,6 @@ extern const struct inode_operations minix_file_inode_operations; extern const struct inode_operations minix_dir_inode_operations; extern const struct file_operations minix_file_operations; extern const struct file_operations minix_dir_operations; -extern struct dentry_operations minix_dentry_operations; static inline struct minix_sb_info *minix_sb(struct super_block *sb) { diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 102241bc9c79..32b131cd6121 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -18,30 +18,6 @@ static int add_nondir(struct dentry *dentry, struct inode *inode) return err; } -static int minix_hash(struct dentry *dentry, struct qstr *qstr) -{ - unsigned long hash; - int i; - const unsigned char *name; - - i = minix_sb(dentry->d_inode->i_sb)->s_namelen; - if (i >= qstr->len) - return 0; - /* Truncate the name in place, avoids having to define a compare - function. */ - qstr->len = i; - name = qstr->name; - hash = init_name_hash(); - while (i--) - hash = partial_name_hash(*name++, hash); - qstr->hash = end_name_hash(hash); - return 0; -} - -struct dentry_operations minix_dentry_operations = { - .d_hash = minix_hash, -}; - static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) { struct inode * inode = NULL; diff --git a/fs/mpage.c b/fs/mpage.c index 235e4d3873a8..dbcc7af76a15 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err) bio_put(bio); } -static struct bio *mpage_bio_submit(int rw, struct bio *bio) +struct bio *mpage_bio_submit(int rw, struct bio *bio) { bio->bi_end_io = mpage_end_io_read; if (rw == WRITE) @@ -90,6 +90,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio) submit_bio(rw, bio); return NULL; } +EXPORT_SYMBOL(mpage_bio_submit); static struct bio * mpage_alloc(struct block_device *bdev, @@ -435,15 +436,9 @@ EXPORT_SYMBOL(mpage_readpage); * written, so it can intelligently allocate a suitably-sized BIO. For now, * just allocate full-size (16-page) BIOs. */ -struct mpage_data { - struct bio *bio; - sector_t last_block_in_bio; - get_block_t *get_block; - unsigned use_writepage; -}; -static int __mpage_writepage(struct page *page, struct writeback_control *wbc, - void *data) +int __mpage_writepage(struct page *page, struct writeback_control *wbc, + void *data) { struct mpage_data *mpd = data; struct bio *bio = mpd->bio; @@ -651,6 +646,7 @@ out: mpd->bio = bio; return ret; } +EXPORT_SYMBOL(__mpage_writepage); /** * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c index 05ff4f1d7026..e844b9809d27 100644 --- a/fs/msdos/namei.c +++ b/fs/msdos/namei.c @@ -14,12 +14,7 @@ /* Characters that are undesirable in an MS-DOS file name */ static unsigned char bad_chars[] = "*?<>|\""; -static unsigned char bad_if_strict_pc[] = "+=,; "; -/* GEMDOS is less restrictive */ -static unsigned char bad_if_strict_atari[] = " "; - -#define bad_if_strict(opts) \ - ((opts)->atari ? bad_if_strict_atari : bad_if_strict_pc) +static unsigned char bad_if_strict[] = "+=,; "; /***** Formats an MS-DOS file name. Rejects invalid names. */ static int msdos_format_name(const unsigned char *name, int len, @@ -40,21 +35,20 @@ static int msdos_format_name(const unsigned char *name, int len, /* Get rid of dot - test for it elsewhere */ name++; len--; - } else if (!opts->atari) + } else return -EINVAL; } /* - * disallow names that _really_ start with a dot for MS-DOS, - * GEMDOS does not care + * disallow names that _really_ start with a dot */ - space = !opts->atari; + space = 1; c = 0; for (walk = res; len && walk - res < 8; walk++) { c = *name++; len--; if (opts->name_check != 'r' && strchr(bad_chars, c)) return -EINVAL; - if (opts->name_check == 's' && strchr(bad_if_strict(opts), c)) + if (opts->name_check == 's' && strchr(bad_if_strict, c)) return -EINVAL; if (c >= 'A' && c <= 'Z' && opts->name_check == 's') return -EINVAL; @@ -94,7 +88,7 @@ static int msdos_format_name(const unsigned char *name, int len, if (opts->name_check != 'r' && strchr(bad_chars, c)) return -EINVAL; if (opts->name_check == 's' && - strchr(bad_if_strict(opts), c)) + strchr(bad_if_strict, c)) return -EINVAL; if (c < ' ' || c == ':' || c == '\\') return -EINVAL; @@ -214,7 +208,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry, dentry->d_op = &msdos_dentry_operations; - lock_kernel(); + lock_super(sb); res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); if (res == -ENOENT) goto add; @@ -232,7 +226,7 @@ add: if (dentry) dentry->d_op = &msdos_dentry_operations; out: - unlock_kernel(); + unlock_super(sb); if (!res) return dentry; return ERR_PTR(res); @@ -243,6 +237,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name, int is_dir, int is_hid, int cluster, struct timespec *ts, struct fat_slot_info *sinfo) { + struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb); struct msdos_dir_entry de; __le16 time, date; int err; @@ -252,7 +247,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name, if (is_hid) de.attr |= ATTR_HIDDEN; de.lcase = 0; - fat_date_unix2dos(ts->tv_sec, &time, &date); + fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc); de.cdate = de.adate = 0; de.ctime = 0; de.ctime_cs = 0; @@ -286,7 +281,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode, unsigned char msdos_name[MSDOS_NAME]; int err, is_hid; - lock_kernel(); + lock_super(sb); err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, msdos_name, &MSDOS_SB(sb)->options); @@ -315,7 +310,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode, d_instantiate(dentry, inode); out: - unlock_kernel(); + unlock_super(sb); if (!err) err = fat_flush_inodes(sb, dir, inode); return err; @@ -324,11 +319,12 @@ out: /***** Remove a directory */ static int msdos_rmdir(struct inode *dir, struct dentry *dentry) { + struct super_block *sb = dir->i_sb; struct inode *inode = dentry->d_inode; struct fat_slot_info sinfo; int err; - lock_kernel(); + lock_super(sb); /* * Check whether the directory is not in use, then check * whether it is empty. @@ -349,9 +345,9 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry) inode->i_ctime = CURRENT_TIME_SEC; fat_detach(inode); out: - unlock_kernel(); + unlock_super(sb); if (!err) - err = fat_flush_inodes(inode->i_sb, dir, inode); + err = fat_flush_inodes(sb, dir, inode); return err; } @@ -366,7 +362,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct timespec ts; int err, is_hid, cluster; - lock_kernel(); + lock_super(sb); err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, msdos_name, &MSDOS_SB(sb)->options); @@ -404,14 +400,14 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode) d_instantiate(dentry, inode); - unlock_kernel(); + unlock_super(sb); fat_flush_inodes(sb, dir, inode); return 0; out_free: fat_free_clusters(dir, cluster); out: - unlock_kernel(); + unlock_super(sb); return err; } @@ -419,10 +415,11 @@ out: static int msdos_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = dentry->d_inode; + struct super_block *sb= inode->i_sb; struct fat_slot_info sinfo; int err; - lock_kernel(); + lock_super(sb); err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); if (err) goto out; @@ -434,9 +431,9 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry) inode->i_ctime = CURRENT_TIME_SEC; fat_detach(inode); out: - unlock_kernel(); + unlock_super(sb); if (!err) - err = fat_flush_inodes(inode->i_sb, dir, inode); + err = fat_flush_inodes(sb, dir, inode); return err; } @@ -618,10 +615,11 @@ error_inode: static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { + struct super_block *sb = old_dir->i_sb; unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME]; int err, is_hid; - lock_kernel(); + lock_super(sb); err = msdos_format_name(old_dentry->d_name.name, old_dentry->d_name.len, old_msdos_name, @@ -640,9 +638,9 @@ static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry, err = do_msdos_rename(old_dir, old_msdos_name, old_dentry, new_dir, new_msdos_name, new_dentry, is_hid); out: - unlock_kernel(); + unlock_super(sb); if (!err) - err = fat_flush_inodes(old_dir->i_sb, old_dir, new_dir); + err = fat_flush_inodes(sb, old_dir, new_dir); return err; } diff --git a/fs/namei.c b/fs/namei.c index c7e43536c49a..4ea63ed5e791 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -31,7 +31,6 @@ #include <linux/file.h> #include <linux/fcntl.h> #include <linux/device_cgroup.h> -#include <asm/namei.h> #include <asm/uaccess.h> #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) @@ -185,6 +184,8 @@ int generic_permission(struct inode *inode, int mask, { umode_t mode = inode->i_mode; + mask &= MAY_READ | MAY_WRITE | MAY_EXEC; + if (current->fsuid == inode->i_uid) mode >>= 6; else { @@ -203,7 +204,7 @@ int generic_permission(struct inode *inode, int mask, /* * If the DACs are ok we don't need any capability check. */ - if (((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask)) + if ((mask & ~mode) == 0) return 0; check_capabilities: @@ -226,13 +227,9 @@ int generic_permission(struct inode *inode, int mask, return -EACCES; } -int permission(struct inode *inode, int mask, struct nameidata *nd) +int inode_permission(struct inode *inode, int mask) { - int retval, submask; - struct vfsmount *mnt = NULL; - - if (nd) - mnt = nd->path.mnt; + int retval; if (mask & MAY_WRITE) { umode_t mode = inode->i_mode; @@ -251,19 +248,9 @@ int permission(struct inode *inode, int mask, struct nameidata *nd) return -EACCES; } - if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { - /* - * MAY_EXEC on regular files is denied if the fs is mounted - * with the "noexec" flag. - */ - if (mnt && (mnt->mnt_flags & MNT_NOEXEC)) - return -EACCES; - } - /* Ordinary permission routines do not understand MAY_APPEND. */ - submask = mask & ~MAY_APPEND; if (inode->i_op && inode->i_op->permission) { - retval = inode->i_op->permission(inode, submask, nd); + retval = inode->i_op->permission(inode, mask); if (!retval) { /* * Exec permission on a regular file is denied if none @@ -277,7 +264,7 @@ int permission(struct inode *inode, int mask, struct nameidata *nd) return -EACCES; } } else { - retval = generic_permission(inode, submask, NULL); + retval = generic_permission(inode, mask, NULL); } if (retval) return retval; @@ -286,7 +273,8 @@ int permission(struct inode *inode, int mask, struct nameidata *nd) if (retval) return retval; - return security_inode_permission(inode, mask, nd); + return security_inode_permission(inode, + mask & (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND)); } /** @@ -301,7 +289,7 @@ int permission(struct inode *inode, int mask, struct nameidata *nd) */ int vfs_permission(struct nameidata *nd, int mask) { - return permission(nd->path.dentry->d_inode, mask, nd); + return inode_permission(nd->path.dentry->d_inode, mask); } /** @@ -318,7 +306,7 @@ int vfs_permission(struct nameidata *nd, int mask) */ int file_permission(struct file *file, int mask) { - return permission(file->f_path.dentry->d_inode, mask, NULL); + return inode_permission(file->f_path.dentry->d_inode, mask); } /* @@ -459,8 +447,7 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, * short-cut DAC fails, then call permission() to do more * complete permission check. */ -static int exec_permission_lite(struct inode *inode, - struct nameidata *nd) +static int exec_permission_lite(struct inode *inode) { umode_t mode = inode->i_mode; @@ -486,7 +473,7 @@ static int exec_permission_lite(struct inode *inode, return -EACCES; ok: - return security_inode_permission(inode, MAY_EXEC, nd); + return security_inode_permission(inode, MAY_EXEC); } /* @@ -519,7 +506,14 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s */ result = d_lookup(parent, name); if (!result) { - struct dentry * dentry = d_alloc(parent, name); + struct dentry *dentry; + + /* Don't create child dentry for a dead directory. */ + result = ERR_PTR(-ENOENT); + if (IS_DEADDIR(dir)) + goto out_unlock; + + dentry = d_alloc(parent, name); result = ERR_PTR(-ENOMEM); if (dentry) { result = dir->i_op->lookup(dir, dentry, nd); @@ -528,6 +522,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s else result = dentry; } +out_unlock: mutex_unlock(&dir->i_mutex); return result; } @@ -545,27 +540,16 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s return result; } -static int __emul_lookup_dentry(const char *, struct nameidata *); - /* SMP-safe */ -static __always_inline int +static __always_inline void walk_init_root(const char *name, struct nameidata *nd) { struct fs_struct *fs = current->fs; read_lock(&fs->lock); - if (fs->altroot.dentry && !(nd->flags & LOOKUP_NOALT)) { - nd->path = fs->altroot; - path_get(&fs->altroot); - read_unlock(&fs->lock); - if (__emul_lookup_dentry(name,nd)) - return 0; - read_lock(&fs->lock); - } nd->path = fs->root; path_get(&fs->root); read_unlock(&fs->lock); - return 1; } /* @@ -581,15 +565,13 @@ static __always_inline int link_path_walk(const char *name, struct nameidata *nd int result; /* make sure the stuff we saved doesn't go away */ - dget(save.dentry); - mntget(save.mnt); + path_get(&save); result = __link_path_walk(name, nd); if (result == -ESTALE) { /* nd->path had been dropped */ nd->path = save; - dget(nd->path.dentry); - mntget(nd->path.mnt); + path_get(&nd->path); nd->flags |= LOOKUP_REVAL; result = __link_path_walk(name, nd); } @@ -608,12 +590,9 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l if (*link == '/') { path_put(&nd->path); - if (!walk_init_root(link, nd)) - /* weird __emul_prefix() stuff did it */ - goto out; + walk_init_root(link, nd); } res = link_path_walk(link, nd); -out: if (nd->depth || res || nd->last_type!=LAST_NORM) return res; /* @@ -891,7 +870,7 @@ static int __link_path_walk(const char *name, struct nameidata *nd) unsigned int c; nd->flags |= LOOKUP_CONTINUE; - err = exec_permission_lite(inode, nd); + err = exec_permission_lite(inode); if (err == -EAGAIN) err = vfs_permission(nd, MAY_EXEC); if (err) @@ -1062,67 +1041,6 @@ static int path_walk(const char *name, struct nameidata *nd) return link_path_walk(name, nd); } -/* - * SMP-safe: Returns 1 and nd will have valid dentry and mnt, if - * everything is done. Returns 0 and drops input nd, if lookup failed; - */ -static int __emul_lookup_dentry(const char *name, struct nameidata *nd) -{ - if (path_walk(name, nd)) - return 0; /* something went wrong... */ - - if (!nd->path.dentry->d_inode || - S_ISDIR(nd->path.dentry->d_inode->i_mode)) { - struct path old_path = nd->path; - struct qstr last = nd->last; - int last_type = nd->last_type; - struct fs_struct *fs = current->fs; - - /* - * NAME was not found in alternate root or it's a directory. - * Try to find it in the normal root: - */ - nd->last_type = LAST_ROOT; - read_lock(&fs->lock); - nd->path = fs->root; - path_get(&fs->root); - read_unlock(&fs->lock); - if (path_walk(name, nd) == 0) { - if (nd->path.dentry->d_inode) { - path_put(&old_path); - return 1; - } - path_put(&nd->path); - } - nd->path = old_path; - nd->last = last; - nd->last_type = last_type; - } - return 1; -} - -void set_fs_altroot(void) -{ - char *emul = __emul_prefix(); - struct nameidata nd; - struct path path = {}, old_path; - int err; - struct fs_struct *fs = current->fs; - - if (!emul) - goto set_it; - err = path_lookup(emul, LOOKUP_FOLLOW|LOOKUP_DIRECTORY|LOOKUP_NOALT, &nd); - if (!err) - path = nd.path; -set_it: - write_lock(&fs->lock); - old_path = fs->altroot; - fs->altroot = path; - write_unlock(&fs->lock); - if (old_path.dentry) - path_put(&old_path); -} - /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ static int do_path_lookup(int dfd, const char *name, unsigned int flags, struct nameidata *nd) @@ -1138,14 +1056,6 @@ static int do_path_lookup(int dfd, const char *name, if (*name=='/') { read_lock(&fs->lock); - if (fs->altroot.dentry && !(nd->flags & LOOKUP_NOALT)) { - nd->path = fs->altroot; - path_get(&fs->altroot); - read_unlock(&fs->lock); - if (__emul_lookup_dentry(name,nd)) - goto out; /* found in altroot */ - read_lock(&fs->lock); - } nd->path = fs->root; path_get(&fs->root); read_unlock(&fs->lock); @@ -1179,7 +1089,6 @@ static int do_path_lookup(int dfd, const char *name, } retval = path_walk(name, nd); -out: if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && nd->path.dentry->d_inode)) audit_inode(name, nd->path.dentry); @@ -1216,8 +1125,9 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, nd->flags = flags; nd->depth = 0; - nd->path.mnt = mntget(mnt); - nd->path.dentry = dget(dentry); + nd->path.dentry = dentry; + nd->path.mnt = mnt; + path_get(&nd->path); retval = path_walk(name, nd); if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && @@ -1283,19 +1193,6 @@ static int path_lookup_create(int dfd, const char *name, nd, open_flags, create_mode); } -int __user_path_lookup_open(const char __user *name, unsigned int lookup_flags, - struct nameidata *nd, int open_flags) -{ - char *tmp = getname(name); - int err = PTR_ERR(tmp); - - if (!IS_ERR(tmp)) { - err = __path_lookup_intent_open(AT_FDCWD, tmp, lookup_flags, nd, open_flags, 0); - putname(tmp); - } - return err; -} - static struct dentry *__lookup_hash(struct qstr *name, struct dentry *base, struct nameidata *nd) { @@ -1318,7 +1215,14 @@ static struct dentry *__lookup_hash(struct qstr *name, dentry = cached_lookup(base, name, nd); if (!dentry) { - struct dentry *new = d_alloc(base, name); + struct dentry *new; + + /* Don't create child dentry for a dead directory. */ + dentry = ERR_PTR(-ENOENT); + if (IS_DEADDIR(inode)) + goto out; + + new = d_alloc(base, name); dentry = ERR_PTR(-ENOMEM); if (!new) goto out; @@ -1341,7 +1245,7 @@ static struct dentry *lookup_hash(struct nameidata *nd) { int err; - err = permission(nd->path.dentry->d_inode, MAY_EXEC, nd); + err = inode_permission(nd->path.dentry->d_inode, MAY_EXEC); if (err) return ERR_PTR(err); return __lookup_hash(&nd->last, nd->path.dentry, nd); @@ -1389,7 +1293,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) if (err) return ERR_PTR(err); - err = permission(base->d_inode, MAY_EXEC, NULL); + err = inode_permission(base->d_inode, MAY_EXEC); if (err) return ERR_PTR(err); return __lookup_hash(&this, base, NULL); @@ -1417,22 +1321,40 @@ struct dentry *lookup_one_noperm(const char *name, struct dentry *base) return __lookup_hash(&this, base, NULL); } -int __user_walk_fd(int dfd, const char __user *name, unsigned flags, - struct nameidata *nd) +int user_path_at(int dfd, const char __user *name, unsigned flags, + struct path *path) { + struct nameidata nd; char *tmp = getname(name); int err = PTR_ERR(tmp); - if (!IS_ERR(tmp)) { - err = do_path_lookup(dfd, tmp, flags, nd); + + BUG_ON(flags & LOOKUP_PARENT); + + err = do_path_lookup(dfd, tmp, flags, &nd); putname(tmp); + if (!err) + *path = nd.path; } return err; } -int __user_walk(const char __user *name, unsigned flags, struct nameidata *nd) +static int user_path_parent(int dfd, const char __user *path, + struct nameidata *nd, char **name) { - return __user_walk_fd(AT_FDCWD, name, flags, nd); + char *s = getname(path); + int error; + + if (IS_ERR(s)) + return PTR_ERR(s); + + error = do_path_lookup(dfd, s, LOOKUP_PARENT, nd); + if (error) + putname(s); + else + *name = s; + + return error; } /* @@ -1479,7 +1401,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir) BUG_ON(victim->d_parent->d_inode != dir); audit_inode_child(victim->d_name.name, victim, dir); - error = permission(dir,MAY_WRITE | MAY_EXEC, NULL); + error = inode_permission(dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) @@ -1509,14 +1431,13 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir) * 3. We should have write and exec permissions on dir * 4. We can't do it if dir is immutable (done in permission()) */ -static inline int may_create(struct inode *dir, struct dentry *child, - struct nameidata *nd) +static inline int may_create(struct inode *dir, struct dentry *child) { if (child->d_inode) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - return permission(dir,MAY_WRITE | MAY_EXEC, nd); + return inode_permission(dir, MAY_WRITE | MAY_EXEC); } /* @@ -1582,7 +1503,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) int vfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) { - int error = may_create(dir, dentry, nd); + int error = may_create(dir, dentry); if (error) return error; @@ -1756,7 +1677,7 @@ struct file *do_filp_open(int dfd, const char *pathname, int will_write; int flag = open_to_namei_flags(open_flag); - acc_mode = ACC_MODE(flag); + acc_mode = MAY_OPEN | ACC_MODE(flag); /* O_TRUNC implies we need access checks for write permissions */ if (flag & O_TRUNC) @@ -2026,7 +1947,7 @@ EXPORT_SYMBOL_GPL(lookup_create); int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) { - int error = may_create(dir, dentry, NULL); + int error = may_create(dir, dentry); if (error) return error; @@ -2072,20 +1993,18 @@ static int may_mknod(mode_t mode) asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode, unsigned dev) { - int error = 0; - char * tmp; - struct dentry * dentry; + int error; + char *tmp; + struct dentry *dentry; struct nameidata nd; if (S_ISDIR(mode)) return -EPERM; - tmp = getname(filename); - if (IS_ERR(tmp)) - return PTR_ERR(tmp); - error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd); + error = user_path_parent(dfd, filename, &nd, &tmp); if (error) - goto out; + return error; + dentry = lookup_create(&nd, 0); if (IS_ERR(dentry)) { error = PTR_ERR(dentry); @@ -2117,7 +2036,6 @@ out_dput: out_unlock: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); path_put(&nd.path); -out: putname(tmp); return error; @@ -2130,7 +2048,7 @@ asmlinkage long sys_mknod(const char __user *filename, int mode, unsigned dev) int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) { - int error = may_create(dir, dentry, NULL); + int error = may_create(dir, dentry); if (error) return error; @@ -2157,14 +2075,10 @@ asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode) struct dentry *dentry; struct nameidata nd; - tmp = getname(pathname); - error = PTR_ERR(tmp); - if (IS_ERR(tmp)) + error = user_path_parent(dfd, pathname, &nd, &tmp); + if (error) goto out_err; - error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd); - if (error) - goto out; dentry = lookup_create(&nd, 1); error = PTR_ERR(dentry); if (IS_ERR(dentry)) @@ -2182,7 +2096,6 @@ out_dput: out_unlock: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); path_put(&nd.path); -out: putname(tmp); out_err: return error; @@ -2260,13 +2173,9 @@ static long do_rmdir(int dfd, const char __user *pathname) struct dentry *dentry; struct nameidata nd; - name = getname(pathname); - if(IS_ERR(name)) - return PTR_ERR(name); - - error = do_path_lookup(dfd, name, LOOKUP_PARENT, &nd); + error = user_path_parent(dfd, pathname, &nd, &name); if (error) - goto exit; + return error; switch(nd.last_type) { case LAST_DOTDOT: @@ -2295,7 +2204,6 @@ exit2: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); exit1: path_put(&nd.path); -exit: putname(name); return error; } @@ -2344,19 +2252,16 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) */ static long do_unlinkat(int dfd, const char __user *pathname) { - int error = 0; - char * name; + int error; + char *name; struct dentry *dentry; struct nameidata nd; struct inode *inode = NULL; - name = getname(pathname); - if(IS_ERR(name)) - return PTR_ERR(name); - - error = do_path_lookup(dfd, name, LOOKUP_PARENT, &nd); + error = user_path_parent(dfd, pathname, &nd, &name); if (error) - goto exit; + return error; + error = -EISDIR; if (nd.last_type != LAST_NORM) goto exit1; @@ -2383,7 +2288,6 @@ static long do_unlinkat(int dfd, const char __user *pathname) iput(inode); /* truncate the inode here */ exit1: path_put(&nd.path); -exit: putname(name); return error; @@ -2409,9 +2313,9 @@ asmlinkage long sys_unlink(const char __user *pathname) return do_unlinkat(AT_FDCWD, pathname); } -int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, int mode) +int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) { - int error = may_create(dir, dentry, NULL); + int error = may_create(dir, dentry); if (error) return error; @@ -2433,23 +2337,20 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, i asmlinkage long sys_symlinkat(const char __user *oldname, int newdfd, const char __user *newname) { - int error = 0; - char * from; - char * to; + int error; + char *from; + char *to; struct dentry *dentry; struct nameidata nd; from = getname(oldname); - if(IS_ERR(from)) + if (IS_ERR(from)) return PTR_ERR(from); - to = getname(newname); - error = PTR_ERR(to); - if (IS_ERR(to)) - goto out_putname; - error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd); + error = user_path_parent(newdfd, newname, &nd, &to); if (error) - goto out; + goto out_putname; + dentry = lookup_create(&nd, 0); error = PTR_ERR(dentry); if (IS_ERR(dentry)) @@ -2458,14 +2359,13 @@ asmlinkage long sys_symlinkat(const char __user *oldname, error = mnt_want_write(nd.path.mnt); if (error) goto out_dput; - error = vfs_symlink(nd.path.dentry->d_inode, dentry, from, S_IALLUGO); + error = vfs_symlink(nd.path.dentry->d_inode, dentry, from); mnt_drop_write(nd.path.mnt); out_dput: dput(dentry); out_unlock: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); path_put(&nd.path); -out: putname(to); out_putname: putname(from); @@ -2485,7 +2385,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de if (!inode) return -ENOENT; - error = may_create(dir, new_dentry, NULL); + error = may_create(dir, new_dentry); if (error) return error; @@ -2499,19 +2399,19 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de return -EPERM; if (!dir->i_op || !dir->i_op->link) return -EPERM; - if (S_ISDIR(old_dentry->d_inode->i_mode)) + if (S_ISDIR(inode->i_mode)) return -EPERM; error = security_inode_link(old_dentry, dir, new_dentry); if (error) return error; - mutex_lock(&old_dentry->d_inode->i_mutex); + mutex_lock(&inode->i_mutex); DQUOT_INIT(dir); error = dir->i_op->link(old_dentry, dir, new_dentry); - mutex_unlock(&old_dentry->d_inode->i_mutex); + mutex_unlock(&inode->i_mutex); if (!error) - fsnotify_link(dir, old_dentry->d_inode, new_dentry); + fsnotify_link(dir, inode, new_dentry); return error; } @@ -2529,27 +2429,25 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname, int flags) { struct dentry *new_dentry; - struct nameidata nd, old_nd; + struct nameidata nd; + struct path old_path; int error; - char * to; + char *to; if ((flags & ~AT_SYMLINK_FOLLOW) != 0) return -EINVAL; - to = getname(newname); - if (IS_ERR(to)) - return PTR_ERR(to); - - error = __user_walk_fd(olddfd, oldname, - flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0, - &old_nd); + error = user_path_at(olddfd, oldname, + flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0, + &old_path); if (error) - goto exit; - error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd); + return error; + + error = user_path_parent(newdfd, newname, &nd, &to); if (error) goto out; error = -EXDEV; - if (old_nd.path.mnt != nd.path.mnt) + if (old_path.mnt != nd.path.mnt) goto out_release; new_dentry = lookup_create(&nd, 0); error = PTR_ERR(new_dentry); @@ -2558,7 +2456,7 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname, error = mnt_want_write(nd.path.mnt); if (error) goto out_dput; - error = vfs_link(old_nd.path.dentry, nd.path.dentry->d_inode, new_dentry); + error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry); mnt_drop_write(nd.path.mnt); out_dput: dput(new_dentry); @@ -2566,10 +2464,9 @@ out_unlock: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); out_release: path_put(&nd.path); -out: - path_put(&old_nd.path); -exit: putname(to); +out: + path_put(&old_path); return error; } @@ -2622,7 +2519,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, * we'll need to flip '..'. */ if (new_dir != old_dir) { - error = permission(old_dentry->d_inode, MAY_WRITE, NULL); + error = inode_permission(old_dentry->d_inode, MAY_WRITE); if (error) return error; } @@ -2697,7 +2594,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, return error; if (!new_dentry->d_inode) - error = may_create(new_dir, new_dentry, NULL); + error = may_create(new_dir, new_dentry); else error = may_delete(new_dir, new_dentry, is_dir); if (error) @@ -2725,20 +2622,22 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, return error; } -static int do_rename(int olddfd, const char *oldname, - int newdfd, const char *newname) +asmlinkage long sys_renameat(int olddfd, const char __user *oldname, + int newdfd, const char __user *newname) { - int error = 0; - struct dentry * old_dir, * new_dir; - struct dentry * old_dentry, *new_dentry; - struct dentry * trap; + struct dentry *old_dir, *new_dir; + struct dentry *old_dentry, *new_dentry; + struct dentry *trap; struct nameidata oldnd, newnd; + char *from; + char *to; + int error; - error = do_path_lookup(olddfd, oldname, LOOKUP_PARENT, &oldnd); + error = user_path_parent(olddfd, oldname, &oldnd, &from); if (error) goto exit; - error = do_path_lookup(newdfd, newname, LOOKUP_PARENT, &newnd); + error = user_path_parent(newdfd, newname, &newnd, &to); if (error) goto exit1; @@ -2800,29 +2699,11 @@ exit3: unlock_rename(new_dir, old_dir); exit2: path_put(&newnd.path); + putname(to); exit1: path_put(&oldnd.path); -exit: - return error; -} - -asmlinkage long sys_renameat(int olddfd, const char __user *oldname, - int newdfd, const char __user *newname) -{ - int error; - char * from; - char * to; - - from = getname(oldname); - if(IS_ERR(from)) - return PTR_ERR(from); - to = getname(newname); - error = PTR_ERR(to); - if (!IS_ERR(to)) { - error = do_rename(olddfd, from, newdfd, to); - putname(to); - } putname(from); +exit: return error; } @@ -2857,16 +2738,17 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct nameidata nd; void *cookie; + int res; nd.depth = 0; cookie = dentry->d_inode->i_op->follow_link(dentry, &nd); - if (!IS_ERR(cookie)) { - int res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd)); - if (dentry->d_inode->i_op->put_link) - dentry->d_inode->i_op->put_link(dentry, &nd, cookie); - cookie = ERR_PTR(res); - } - return PTR_ERR(cookie); + if (IS_ERR(cookie)) + return PTR_ERR(cookie); + + res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd)); + if (dentry->d_inode->i_op->put_link) + dentry->d_inode->i_op->put_link(dentry, &nd, cookie); + return res; } int vfs_follow_link(struct nameidata *nd, const char *link) @@ -2959,8 +2841,7 @@ const struct inode_operations page_symlink_inode_operations = { .put_link = page_put_link, }; -EXPORT_SYMBOL(__user_walk); -EXPORT_SYMBOL(__user_walk_fd); +EXPORT_SYMBOL(user_path_at); EXPORT_SYMBOL(follow_down); EXPORT_SYMBOL(follow_up); EXPORT_SYMBOL(get_write_access); /* binfmt_aout */ @@ -2975,7 +2856,7 @@ EXPORT_SYMBOL(page_symlink); EXPORT_SYMBOL(page_symlink_inode_operations); EXPORT_SYMBOL(path_lookup); EXPORT_SYMBOL(vfs_path_lookup); -EXPORT_SYMBOL(permission); +EXPORT_SYMBOL(inode_permission); EXPORT_SYMBOL(vfs_permission); EXPORT_SYMBOL(file_permission); EXPORT_SYMBOL(unlock_rename); diff --git a/fs/namespace.c b/fs/namespace.c index 4fc302c2a0e0..6e283c93b50d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -112,9 +112,13 @@ struct vfsmount *alloc_vfsmnt(const char *name) int err; err = mnt_alloc_id(mnt); - if (err) { - kmem_cache_free(mnt_cache, mnt); - return NULL; + if (err) + goto out_free_cache; + + if (name) { + mnt->mnt_devname = kstrdup(name, GFP_KERNEL); + if (!mnt->mnt_devname) + goto out_free_id; } atomic_set(&mnt->mnt_count, 1); @@ -127,16 +131,14 @@ struct vfsmount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); atomic_set(&mnt->__mnt_writers, 0); - if (name) { - int size = strlen(name) + 1; - char *newname = kmalloc(size, GFP_KERNEL); - if (newname) { - memcpy(newname, name, size); - mnt->mnt_devname = newname; - } - } } return mnt; + +out_free_id: + mnt_free_id(mnt); +out_free_cache: + kmem_cache_free(mnt_cache, mnt); + return NULL; } /* @@ -309,10 +311,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt) */ if ((atomic_read(&mnt->__mnt_writers) < 0) && !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) { - printk(KERN_DEBUG "leak detected on mount(%p) writers " + WARN(1, KERN_DEBUG "leak detected on mount(%p) writers " "count: %d\n", mnt, atomic_read(&mnt->__mnt_writers)); - WARN_ON(1); /* use the flag to keep the dmesg spam down */ mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT; } @@ -750,7 +751,7 @@ struct proc_fs_info { const char *str; }; -static void show_sb_opts(struct seq_file *m, struct super_block *sb) +static int show_sb_opts(struct seq_file *m, struct super_block *sb) { static const struct proc_fs_info fs_info[] = { { MS_SYNCHRONOUS, ",sync" }, @@ -764,6 +765,8 @@ static void show_sb_opts(struct seq_file *m, struct super_block *sb) if (sb->s_flags & fs_infop->flag) seq_puts(m, fs_infop->str); } + + return security_sb_show_options(m, sb); } static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) @@ -806,11 +809,14 @@ static int show_vfsmnt(struct seq_file *m, void *v) seq_putc(m, ' '); show_type(m, mnt->mnt_sb); seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); - show_sb_opts(m, mnt->mnt_sb); + err = show_sb_opts(m, mnt->mnt_sb); + if (err) + goto out; show_mnt_opts(m, mnt); if (mnt->mnt_sb->s_op->show_options) err = mnt->mnt_sb->s_op->show_options(m, mnt); seq_puts(m, " 0 0\n"); +out: return err; } @@ -865,10 +871,13 @@ static int show_mountinfo(struct seq_file *m, void *v) seq_putc(m, ' '); mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); - show_sb_opts(m, sb); + err = show_sb_opts(m, sb); + if (err) + goto out; if (sb->s_op->show_options) err = sb->s_op->show_options(m, mnt); seq_putc(m, '\n'); +out: return err; } @@ -1121,27 +1130,27 @@ static int do_umount(struct vfsmount *mnt, int flags) asmlinkage long sys_umount(char __user * name, int flags) { - struct nameidata nd; + struct path path; int retval; - retval = __user_walk(name, LOOKUP_FOLLOW, &nd); + retval = user_path(name, &path); if (retval) goto out; retval = -EINVAL; - if (nd.path.dentry != nd.path.mnt->mnt_root) + if (path.dentry != path.mnt->mnt_root) goto dput_and_out; - if (!check_mnt(nd.path.mnt)) + if (!check_mnt(path.mnt)) goto dput_and_out; retval = -EPERM; if (!capable(CAP_SYS_ADMIN)) goto dput_and_out; - retval = do_umount(nd.path.mnt, flags); + retval = do_umount(path.mnt, flags); dput_and_out: /* we mustn't call path_put() as that would clear mnt_expiry_mark */ - dput(nd.path.dentry); - mntput_no_expire(nd.path.mnt); + dput(path.dentry); + mntput_no_expire(path.mnt); out: return retval; } @@ -1658,31 +1667,31 @@ static noinline int do_new_mount(struct nameidata *nd, char *type, int flags, if (IS_ERR(mnt)) return PTR_ERR(mnt); - return do_add_mount(mnt, nd, mnt_flags, NULL); + return do_add_mount(mnt, &nd->path, mnt_flags, NULL); } /* * add a mount into a namespace's mount tree * - provide the option of adding the new mount to an expiration list */ -int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd, +int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags, struct list_head *fslist) { int err; down_write(&namespace_sem); /* Something was mounted here while we slept */ - while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path.mnt, &nd->path.dentry)) + while (d_mountpoint(path->dentry) && + follow_down(&path->mnt, &path->dentry)) ; err = -EINVAL; - if (!check_mnt(nd->path.mnt)) + if (!check_mnt(path->mnt)) goto unlock; /* Refuse the same filesystem on the same mount point */ err = -EBUSY; - if (nd->path.mnt->mnt_sb == newmnt->mnt_sb && - nd->path.mnt->mnt_root == nd->path.dentry) + if (path->mnt->mnt_sb == newmnt->mnt_sb && + path->mnt->mnt_root == path->dentry) goto unlock; err = -EINVAL; @@ -1690,7 +1699,7 @@ int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd, goto unlock; newmnt->mnt_flags = mnt_flags; - if ((err = graft_tree(newmnt, &nd->path))) + if ((err = graft_tree(newmnt, path))) goto unlock; if (fslist) /* add to the specified expiration list */ @@ -1965,7 +1974,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, struct fs_struct *fs) { struct mnt_namespace *new_ns; - struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL; + struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; struct vfsmount *p, *q; new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); @@ -2008,10 +2017,6 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, pwdmnt = p; fs->pwd.mnt = mntget(q); } - if (p == fs->altroot.mnt) { - altrootmnt = p; - fs->altroot.mnt = mntget(q); - } } p = next_mnt(p, mnt_ns->root); q = next_mnt(q, new_ns->root); @@ -2022,8 +2027,6 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, mntput(rootmnt); if (pwdmnt) mntput(pwdmnt); - if (altrootmnt) - mntput(altrootmnt); return new_ns; } @@ -2176,28 +2179,26 @@ asmlinkage long sys_pivot_root(const char __user * new_root, const char __user * put_old) { struct vfsmount *tmp; - struct nameidata new_nd, old_nd; - struct path parent_path, root_parent, root; + struct path new, old, parent_path, root_parent, root; int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - error = __user_walk(new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, - &new_nd); + error = user_path_dir(new_root, &new); if (error) goto out0; error = -EINVAL; - if (!check_mnt(new_nd.path.mnt)) + if (!check_mnt(new.mnt)) goto out1; - error = __user_walk(put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old_nd); + error = user_path_dir(put_old, &old); if (error) goto out1; - error = security_sb_pivotroot(&old_nd.path, &new_nd.path); + error = security_sb_pivotroot(&old, &new); if (error) { - path_put(&old_nd.path); + path_put(&old); goto out1; } @@ -2206,69 +2207,69 @@ asmlinkage long sys_pivot_root(const char __user * new_root, path_get(¤t->fs->root); read_unlock(¤t->fs->lock); down_write(&namespace_sem); - mutex_lock(&old_nd.path.dentry->d_inode->i_mutex); + mutex_lock(&old.dentry->d_inode->i_mutex); error = -EINVAL; - if (IS_MNT_SHARED(old_nd.path.mnt) || - IS_MNT_SHARED(new_nd.path.mnt->mnt_parent) || + if (IS_MNT_SHARED(old.mnt) || + IS_MNT_SHARED(new.mnt->mnt_parent) || IS_MNT_SHARED(root.mnt->mnt_parent)) goto out2; if (!check_mnt(root.mnt)) goto out2; error = -ENOENT; - if (IS_DEADDIR(new_nd.path.dentry->d_inode)) + if (IS_DEADDIR(new.dentry->d_inode)) goto out2; - if (d_unhashed(new_nd.path.dentry) && !IS_ROOT(new_nd.path.dentry)) + if (d_unhashed(new.dentry) && !IS_ROOT(new.dentry)) goto out2; - if (d_unhashed(old_nd.path.dentry) && !IS_ROOT(old_nd.path.dentry)) + if (d_unhashed(old.dentry) && !IS_ROOT(old.dentry)) goto out2; error = -EBUSY; - if (new_nd.path.mnt == root.mnt || - old_nd.path.mnt == root.mnt) + if (new.mnt == root.mnt || + old.mnt == root.mnt) goto out2; /* loop, on the same file system */ error = -EINVAL; if (root.mnt->mnt_root != root.dentry) goto out2; /* not a mountpoint */ if (root.mnt->mnt_parent == root.mnt) goto out2; /* not attached */ - if (new_nd.path.mnt->mnt_root != new_nd.path.dentry) + if (new.mnt->mnt_root != new.dentry) goto out2; /* not a mountpoint */ - if (new_nd.path.mnt->mnt_parent == new_nd.path.mnt) + if (new.mnt->mnt_parent == new.mnt) goto out2; /* not attached */ /* make sure we can reach put_old from new_root */ - tmp = old_nd.path.mnt; + tmp = old.mnt; spin_lock(&vfsmount_lock); - if (tmp != new_nd.path.mnt) { + if (tmp != new.mnt) { for (;;) { if (tmp->mnt_parent == tmp) goto out3; /* already mounted on put_old */ - if (tmp->mnt_parent == new_nd.path.mnt) + if (tmp->mnt_parent == new.mnt) break; tmp = tmp->mnt_parent; } - if (!is_subdir(tmp->mnt_mountpoint, new_nd.path.dentry)) + if (!is_subdir(tmp->mnt_mountpoint, new.dentry)) goto out3; - } else if (!is_subdir(old_nd.path.dentry, new_nd.path.dentry)) + } else if (!is_subdir(old.dentry, new.dentry)) goto out3; - detach_mnt(new_nd.path.mnt, &parent_path); + detach_mnt(new.mnt, &parent_path); detach_mnt(root.mnt, &root_parent); /* mount old root on put_old */ - attach_mnt(root.mnt, &old_nd.path); + attach_mnt(root.mnt, &old); /* mount new_root on / */ - attach_mnt(new_nd.path.mnt, &root_parent); + attach_mnt(new.mnt, &root_parent); touch_mnt_namespace(current->nsproxy->mnt_ns); spin_unlock(&vfsmount_lock); - chroot_fs_refs(&root, &new_nd.path); - security_sb_post_pivotroot(&root, &new_nd.path); + chroot_fs_refs(&root, &new); + security_sb_post_pivotroot(&root, &new); error = 0; path_put(&root_parent); path_put(&parent_path); out2: - mutex_unlock(&old_nd.path.dentry->d_inode->i_mutex); + mutex_unlock(&old.dentry->d_inode->i_mutex); up_write(&namespace_sem); path_put(&root); - path_put(&old_nd.path); + path_put(&old); out1: - path_put(&new_nd.path); + path_put(&new); out0: return error; out3: diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 011ef0b6d2d4..07e9715b8658 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -266,7 +266,7 @@ leave_me:; static int -__ncp_lookup_validate(struct dentry * dentry, struct nameidata *nd) +__ncp_lookup_validate(struct dentry *dentry) { struct ncp_server *server; struct dentry *parent; @@ -340,7 +340,7 @@ ncp_lookup_validate(struct dentry * dentry, struct nameidata *nd) { int res; lock_kernel(); - res = __ncp_lookup_validate(dentry, nd); + res = __ncp_lookup_validate(dentry); unlock_kernel(); return res; } diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 2b145de45b39..6a7d901f1936 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -18,6 +18,7 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/sched.h> +#include <linux/smp_lock.h> #include <linux/ncp_fs.h> #include "ncplib_kernel.h" @@ -281,9 +282,18 @@ static int ncp_release(struct inode *inode, struct file *file) { return 0; } +static loff_t ncp_remote_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t ret; + lock_kernel(); + ret = generic_file_llseek_unlocked(file, offset, origin); + unlock_kernel(); + return ret; +} + const struct file_operations ncp_file_operations = { - .llseek = remote_llseek, + .llseek = ncp_remote_llseek, .read = ncp_file_read, .write = ncp_file_write, .ioctl = ncp_ioctl, diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 2e5ab1204dec..d642f0e5b365 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -64,7 +64,7 @@ static void ncp_destroy_inode(struct inode *inode) kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct ncp_inode_info *ei = (struct ncp_inode_info *) foo; diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index c1e7c8300629..6a09760c5960 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -27,7 +27,7 @@ struct nfs_callback_data { unsigned int users; - struct svc_serv *serv; + struct svc_rqst *rqst; struct task_struct *task; }; @@ -91,25 +91,22 @@ nfs_callback_svc(void *vrqstp) svc_process(rqstp); } unlock_kernel(); - nfs_callback_info.task = NULL; - svc_exit_thread(rqstp); return 0; } /* - * Bring up the server process if it is not already up. + * Bring up the callback thread if it is not already up. */ int nfs_callback_up(void) { struct svc_serv *serv = NULL; - struct svc_rqst *rqstp; int ret = 0; - lock_kernel(); mutex_lock(&nfs_callback_mutex); if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) goto out; - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, + AF_INET, NULL); ret = -ENOMEM; if (!serv) goto out_err; @@ -121,22 +118,23 @@ int nfs_callback_up(void) nfs_callback_tcpport = ret; dprintk("Callback port = 0x%x\n", nfs_callback_tcpport); - rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); - if (IS_ERR(rqstp)) { - ret = PTR_ERR(rqstp); + nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); + if (IS_ERR(nfs_callback_info.rqst)) { + ret = PTR_ERR(nfs_callback_info.rqst); + nfs_callback_info.rqst = NULL; goto out_err; } svc_sock_update_bufs(serv); - nfs_callback_info.serv = serv; - nfs_callback_info.task = kthread_run(nfs_callback_svc, rqstp, + nfs_callback_info.task = kthread_run(nfs_callback_svc, + nfs_callback_info.rqst, "nfsv4-svc"); if (IS_ERR(nfs_callback_info.task)) { ret = PTR_ERR(nfs_callback_info.task); - nfs_callback_info.serv = NULL; + svc_exit_thread(nfs_callback_info.rqst); + nfs_callback_info.rqst = NULL; nfs_callback_info.task = NULL; - svc_exit_thread(rqstp); goto out_err; } out: @@ -149,7 +147,6 @@ out: if (serv) svc_destroy(serv); mutex_unlock(&nfs_callback_mutex); - unlock_kernel(); return ret; out_err: dprintk("Couldn't create callback socket or server thread; err = %d\n", @@ -159,17 +156,19 @@ out_err: } /* - * Kill the server process if it is not already down. + * Kill the callback thread if it's no longer being used. */ void nfs_callback_down(void) { - lock_kernel(); mutex_lock(&nfs_callback_mutex); nfs_callback_info.users--; - if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL) + if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL) { kthread_stop(nfs_callback_info.task); + svc_exit_thread(nfs_callback_info.rqst); + nfs_callback_info.rqst = NULL; + nfs_callback_info.task = NULL; + } mutex_unlock(&nfs_callback_mutex); - unlock_kernel(); } static int nfs_callback_authenticate(struct svc_rqst *rqstp) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index f2a092ca69b5..5ee23e7058b3 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -431,14 +431,14 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, { to->to_initval = timeo * HZ / 10; to->to_retries = retrans; - if (!to->to_retries) - to->to_retries = 2; switch (proto) { case XPRT_TRANSPORT_TCP: case XPRT_TRANSPORT_RDMA: + if (to->to_retries == 0) + to->to_retries = NFS_DEF_TCP_RETRANS; if (to->to_initval == 0) - to->to_initval = 60 * HZ; + to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_TCP_TIMEOUT) to->to_initval = NFS_MAX_TCP_TIMEOUT; to->to_increment = to->to_initval; @@ -450,14 +450,17 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, to->to_exponential = 0; break; case XPRT_TRANSPORT_UDP: - default: + if (to->to_retries == 0) + to->to_retries = NFS_DEF_UDP_RETRANS; if (!to->to_initval) - to->to_initval = 11 * HZ / 10; + to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_UDP_TIMEOUT) to->to_initval = NFS_MAX_UDP_TIMEOUT; to->to_maxval = NFS_MAX_UDP_TIMEOUT; to->to_exponential = 1; break; + default: + BUG(); } } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 58d43daec084..74f92b717f78 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -133,13 +133,14 @@ nfs_opendir(struct inode *inode, struct file *filp) { int res; - dfprintk(VFS, "NFS: opendir(%s/%ld)\n", - inode->i_sb->s_id, inode->i_ino); + dfprintk(FILE, "NFS: open dir(%s/%s)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name); + + nfs_inc_stats(inode, NFSIOS_VFSOPEN); - lock_kernel(); /* Call generic open code in order to cache credentials */ res = nfs_open(inode, filp); - unlock_kernel(); return res; } @@ -204,7 +205,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) * Note: assumes we have exclusive access to this mapping either * through inode->i_mutex or some other mechanism. */ - if (page->index == 0 && invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1) < 0) { + if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) { /* Should never happen */ nfs_zap_mapping(inode, inode->i_mapping); } @@ -528,13 +529,11 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) struct nfs_fattr fattr; long res; - dfprintk(VFS, "NFS: readdir(%s/%s) starting at cookie %Lu\n", + dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", dentry->d_parent->d_name.name, dentry->d_name.name, (long long)filp->f_pos); nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); - lock_kernel(); - /* * filp->f_pos points to the dirent entry number. * *desc->dir_cookie has the cookie for the next entry. We have @@ -592,10 +591,9 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) } out: nfs_unblock_sillyrename(dentry); - unlock_kernel(); if (res > 0) res = 0; - dfprintk(VFS, "NFS: readdir(%s/%s) returns %ld\n", + dfprintk(FILE, "NFS: readdir(%s/%s) returns %ld\n", dentry->d_parent->d_name.name, dentry->d_name.name, res); return res; @@ -603,7 +601,15 @@ out: static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) { - mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + + dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n", + dentry->d_parent->d_name.name, + dentry->d_name.name, + offset, origin); + + mutex_lock(&inode->i_mutex); switch (origin) { case 1: offset += filp->f_pos; @@ -619,7 +625,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) nfs_file_open_context(filp)->dir_cookie = 0; } out: - mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); + mutex_unlock(&inode->i_mutex); return offset; } @@ -629,10 +635,11 @@ out: */ static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) { - dfprintk(VFS, "NFS: fsync_dir(%s/%s) datasync %d\n", + dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n", dentry->d_parent->d_name.name, dentry->d_name.name, datasync); + nfs_inc_stats(dentry->d_inode, NFSIOS_VFSFSYNC); return 0; } @@ -767,7 +774,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) struct nfs_fattr fattr; parent = dget_parent(dentry); - lock_kernel(); dir = parent->d_inode; nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); inode = dentry->d_inode; @@ -805,7 +811,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_valid: - unlock_kernel(); dput(parent); dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n", __func__, dentry->d_parent->d_name.name, @@ -824,7 +829,6 @@ out_zap_parent: shrink_dcache_parent(dentry); } d_drop(dentry); - unlock_kernel(); dput(parent); dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", __func__, dentry->d_parent->d_name.name, @@ -858,6 +862,14 @@ static int nfs_dentry_delete(struct dentry *dentry) } +static void nfs_drop_nlink(struct inode *inode) +{ + spin_lock(&inode->i_lock); + if (inode->i_nlink > 0) + drop_nlink(inode); + spin_unlock(&inode->i_lock); +} + /* * Called when the dentry loses inode. * We use it to clean up silly-renamed files. @@ -869,10 +881,8 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { - lock_kernel(); drop_nlink(inode); nfs_complete_unlink(dentry, inode); - unlock_kernel(); } iput(inode); } @@ -903,8 +913,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru res = ERR_PTR(-ENOMEM); dentry->d_op = NFS_PROTO(dir)->dentry_ops; - lock_kernel(); - /* * If we're doing an exclusive create, optimize away the lookup * but don't hash the dentry. @@ -912,7 +920,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru if (nfs_is_exclusive_create(dir, nd)) { d_instantiate(dentry, NULL); res = NULL; - goto out_unlock; + goto out; } parent = dentry->d_parent; @@ -940,8 +948,6 @@ no_entry: nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_unblock_sillyrename: nfs_unblock_sillyrename(parent); -out_unlock: - unlock_kernel(); out: return res; } @@ -999,9 +1005,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry } /* Open the file on the server */ - lock_kernel(); res = nfs4_atomic_open(dir, dentry, nd); - unlock_kernel(); if (IS_ERR(res)) { error = PTR_ERR(res); switch (error) { @@ -1063,9 +1067,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) * operations that change the directory. We therefore save the * change attribute *before* we do the RPC call. */ - lock_kernel(); ret = nfs4_open_revalidate(dir, dentry, openflags, nd); - unlock_kernel(); out: dput(parent); if (!ret) @@ -1218,14 +1220,11 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, if ((nd->flags & LOOKUP_CREATE) != 0) open_flags = nd->intent.open.flags; - lock_kernel(); error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd); if (error != 0) goto out_err; - unlock_kernel(); return 0; out_err: - unlock_kernel(); d_drop(dentry); return error; } @@ -1248,14 +1247,11 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; - lock_kernel(); status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev); if (status != 0) goto out_err; - unlock_kernel(); return 0; out_err: - unlock_kernel(); d_drop(dentry); return status; } @@ -1274,15 +1270,12 @@ static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) attr.ia_valid = ATTR_MODE; attr.ia_mode = mode | S_IFDIR; - lock_kernel(); error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); if (error != 0) goto out_err; - unlock_kernel(); return 0; out_err: d_drop(dentry); - unlock_kernel(); return error; } @@ -1299,14 +1292,12 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry) dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); - lock_kernel(); error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); /* Ensure the VFS deletes this inode */ if (error == 0 && dentry->d_inode != NULL) clear_nlink(dentry->d_inode); else if (error == -ENOENT) nfs_dentry_handle_enoent(dentry); - unlock_kernel(); return error; } @@ -1408,7 +1399,7 @@ static int nfs_safe_remove(struct dentry *dentry) error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); /* The VFS may want to delete this inode */ if (error == 0) - drop_nlink(inode); + nfs_drop_nlink(inode); nfs_mark_for_revalidate(inode); } else error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); @@ -1431,7 +1422,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry) dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); - lock_kernel(); spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); if (atomic_read(&dentry->d_count) > 1) { @@ -1440,7 +1430,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry) /* Start asynchronous writeout of the inode */ write_inode_now(dentry->d_inode, 0); error = nfs_sillyrename(dir, dentry); - unlock_kernel(); return error; } if (!d_unhashed(dentry)) { @@ -1454,7 +1443,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry) nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); } else if (need_rehash) d_rehash(dentry); - unlock_kernel(); return error; } @@ -1491,13 +1479,9 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym attr.ia_mode = S_IFLNK | S_IRWXUGO; attr.ia_valid = ATTR_MODE; - lock_kernel(); - page = alloc_page(GFP_HIGHUSER); - if (!page) { - unlock_kernel(); + if (!page) return -ENOMEM; - } kaddr = kmap_atomic(page, KM_USER0); memcpy(kaddr, symname, pathlen); @@ -1512,7 +1496,6 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym dentry->d_name.name, symname, error); d_drop(dentry); __free_page(page); - unlock_kernel(); return error; } @@ -1530,7 +1513,6 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym } else __free_page(page); - unlock_kernel(); return 0; } @@ -1544,14 +1526,12 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) old_dentry->d_parent->d_name.name, old_dentry->d_name.name, dentry->d_parent->d_name.name, dentry->d_name.name); - lock_kernel(); d_drop(dentry); error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); if (error == 0) { atomic_inc(&inode->i_count); d_add(dentry, inode); } - unlock_kernel(); return error; } @@ -1591,7 +1571,6 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, * To prevent any new references to the target during the rename, * we unhash the dentry and free the inode in advance. */ - lock_kernel(); if (!d_unhashed(new_dentry)) { d_drop(new_dentry); rehash = new_dentry; @@ -1635,7 +1614,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* dentry still busy? */ goto out; } else - drop_nlink(new_inode); + nfs_drop_nlink(new_inode); go_ahead: /* @@ -1669,7 +1648,6 @@ out: /* new dentry created? */ if (dentry) dput(dentry); - unlock_kernel(); return error; } @@ -1906,7 +1884,7 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) return status; nfs_access_add_cache(inode, &cache); out: - if ((cache.mask & mask) == mask) + if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) return 0; return -EACCES; } @@ -1929,17 +1907,17 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags) return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags)); } -int nfs_permission(struct inode *inode, int mask, struct nameidata *nd) +int nfs_permission(struct inode *inode, int mask) { struct rpc_cred *cred; int res = 0; nfs_inc_stats(inode, NFSIOS_VFSACCESS); - if (mask == 0) + if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) goto out; /* Is this sys_access() ? */ - if (nd != NULL && (nd->flags & LOOKUP_ACCESS)) + if (mask & MAY_ACCESS) goto force_lookup; switch (inode->i_mode & S_IFMT) { @@ -1948,8 +1926,7 @@ int nfs_permission(struct inode *inode, int mask, struct nameidata *nd) case S_IFREG: /* NFSv4 has atomic_open... */ if (nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN) - && nd != NULL - && (nd->flags & LOOKUP_OPEN)) + && (mask & MAY_OPEN)) goto out; break; case S_IFDIR: @@ -1962,8 +1939,6 @@ int nfs_permission(struct inode *inode, int mask, struct nameidata *nd) } force_lookup: - lock_kernel(); - if (!NFS_PROTO(inode)->access) goto out_notsup; @@ -1973,7 +1948,6 @@ force_lookup: put_rpccred(cred); } else res = PTR_ERR(cred); - unlock_kernel(); out: dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n", inode->i_sb->s_id, inode->i_ino, mask, res); @@ -1982,7 +1956,6 @@ out_notsup: res = nfs_revalidate_inode(NFS_SERVER(inode), inode); if (res == 0) res = generic_permission(inode, mask, NULL); - unlock_kernel(); goto out; } diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 4757a2b326a1..08f6b040d289 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -890,7 +890,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, count = iov_length(iov, nr_segs); nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); - dprintk("nfs: direct read(%s/%s, %zd@%Ld)\n", + dfprintk(FILE, "NFS: direct read(%s/%s, %zd@%Ld)\n", file->f_path.dentry->d_parent->d_name.name, file->f_path.dentry->d_name.name, count, (long long) pos); @@ -947,7 +947,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, count = iov_length(iov, nr_segs); nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); - dfprintk(VFS, "nfs: direct write(%s/%s, %zd@%Ld)\n", + dfprintk(FILE, "NFS: direct write(%s/%s, %zd@%Ld)\n", file->f_path.dentry->d_parent->d_name.name, file->f_path.dentry->d_name.name, count, (long long) pos); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index d84a3d8f32af..78460657f5cb 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -50,7 +50,7 @@ static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov, static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, unsigned long nr_segs, loff_t pos); static int nfs_file_flush(struct file *, fl_owner_t id); -static int nfs_fsync(struct file *, struct dentry *dentry, int datasync); +static int nfs_file_fsync(struct file *, struct dentry *dentry, int datasync); static int nfs_check_flags(int flags); static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); @@ -72,7 +72,7 @@ const struct file_operations nfs_file_operations = { .open = nfs_file_open, .flush = nfs_file_flush, .release = nfs_file_release, - .fsync = nfs_fsync, + .fsync = nfs_file_fsync, .lock = nfs_lock, .flock = nfs_flock, .splice_read = nfs_file_splice_read, @@ -119,25 +119,33 @@ nfs_file_open(struct inode *inode, struct file *filp) { int res; + dprintk("NFS: open file(%s/%s)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name); + res = nfs_check_flags(filp->f_flags); if (res) return res; nfs_inc_stats(inode, NFSIOS_VFSOPEN); - lock_kernel(); - res = NFS_PROTO(inode)->file_open(inode, filp); - unlock_kernel(); + res = nfs_open(inode, filp); return res; } static int nfs_file_release(struct inode *inode, struct file *filp) { + struct dentry *dentry = filp->f_path.dentry; + + dprintk("NFS: release(%s/%s)\n", + dentry->d_parent->d_name.name, + dentry->d_name.name); + /* Ensure that dirty pages are flushed out with the right creds */ if (filp->f_mode & FMODE_WRITE) - nfs_wb_all(filp->f_path.dentry->d_inode); + nfs_wb_all(dentry->d_inode); nfs_inc_stats(inode, NFSIOS_VFSRELEASE); - return NFS_PROTO(inode)->file_release(inode, filp); + return nfs_release(inode, filp); } /** @@ -170,6 +178,13 @@ force_reval: static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) { + loff_t loff; + + dprintk("NFS: llseek file(%s/%s, %lld, %d)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name, + offset, origin); + /* origin == SEEK_END => we must revalidate the cached file length */ if (origin == SEEK_END) { struct inode *inode = filp->f_mapping->host; @@ -177,11 +192,14 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) if (retval < 0) return (loff_t)retval; } - return remote_llseek(filp, offset, origin); + lock_kernel(); /* BKL needed? */ + loff = generic_file_llseek_unlocked(filp, offset, origin); + unlock_kernel(); + return loff; } /* - * Helper for nfs_file_flush() and nfs_fsync() + * Helper for nfs_file_flush() and nfs_file_fsync() * * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to * disk, but it retrieves and clears ctx->error after synching, despite @@ -207,16 +225,18 @@ static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode) /* * Flush all dirty pages, and check for write errors. - * */ static int nfs_file_flush(struct file *file, fl_owner_t id) { struct nfs_open_context *ctx = nfs_file_open_context(file); - struct inode *inode = file->f_path.dentry->d_inode; + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; int status; - dfprintk(VFS, "nfs: flush(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); + dprintk("NFS: flush(%s/%s)\n", + dentry->d_parent->d_name.name, + dentry->d_name.name); if ((file->f_mode & FMODE_WRITE) == 0) return 0; @@ -241,7 +261,7 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov, if (iocb->ki_filp->f_flags & O_DIRECT) return nfs_file_direct_read(iocb, iov, nr_segs, pos); - dfprintk(VFS, "nfs: read(%s/%s, %lu@%lu)\n", + dprintk("NFS: read(%s/%s, %lu@%lu)\n", dentry->d_parent->d_name.name, dentry->d_name.name, (unsigned long) count, (unsigned long) pos); @@ -261,7 +281,7 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos, struct inode *inode = dentry->d_inode; ssize_t res; - dfprintk(VFS, "nfs: splice_read(%s/%s, %lu@%Lu)\n", + dprintk("NFS: splice_read(%s/%s, %lu@%Lu)\n", dentry->d_parent->d_name.name, dentry->d_name.name, (unsigned long) count, (unsigned long long) *ppos); @@ -278,7 +298,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma) struct inode *inode = dentry->d_inode; int status; - dfprintk(VFS, "nfs: mmap(%s/%s)\n", + dprintk("NFS: mmap(%s/%s)\n", dentry->d_parent->d_name.name, dentry->d_name.name); status = nfs_revalidate_mapping(inode, file->f_mapping); @@ -296,12 +316,14 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma) * whether any write errors occurred for this process. */ static int -nfs_fsync(struct file *file, struct dentry *dentry, int datasync) +nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) { struct nfs_open_context *ctx = nfs_file_open_context(file); struct inode *inode = dentry->d_inode; - dfprintk(VFS, "nfs: fsync(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); + dprintk("NFS: fsync file(%s/%s) datasync %d\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + datasync); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); return nfs_do_fsync(ctx, inode); @@ -324,6 +346,11 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, struct page *page; index = pos >> PAGE_CACHE_SHIFT; + dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + mapping->host->i_ino, len, (long long) pos); + page = __grab_cache_page(mapping, index); if (!page) return -ENOMEM; @@ -344,9 +371,32 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, unsigned offset = pos & (PAGE_CACHE_SIZE - 1); int status; - lock_kernel(); + dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + mapping->host->i_ino, len, (long long) pos); + + /* + * Zero any uninitialised parts of the page, and then mark the page + * as up to date if it turns out that we're extending the file. + */ + if (!PageUptodate(page)) { + unsigned pglen = nfs_page_length(page); + unsigned end = offset + len; + + if (pglen == 0) { + zero_user_segments(page, 0, offset, + end, PAGE_CACHE_SIZE); + SetPageUptodate(page); + } else if (end >= pglen) { + zero_user_segment(page, end, PAGE_CACHE_SIZE); + if (offset == 0) + SetPageUptodate(page); + } else + zero_user_segment(page, pglen, PAGE_CACHE_SIZE); + } + status = nfs_updatepage(file, page, offset, copied); - unlock_kernel(); unlock_page(page); page_cache_release(page); @@ -358,6 +408,8 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, static void nfs_invalidate_page(struct page *page, unsigned long offset) { + dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset); + if (offset != 0) return; /* Cancel any unstarted writes on this page */ @@ -366,13 +418,20 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset) static int nfs_release_page(struct page *page, gfp_t gfp) { + dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); + /* If PagePrivate() is set, then the page is not freeable */ return 0; } static int nfs_launder_page(struct page *page) { - return nfs_wb_page(page->mapping->host, page); + struct inode *inode = page->mapping->host; + + dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", + inode->i_ino, (long long)page_offset(page)); + + return nfs_wb_page(inode, page); } const struct address_space_operations nfs_file_aops = { @@ -392,13 +451,19 @@ const struct address_space_operations nfs_file_aops = { static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) { struct file *filp = vma->vm_file; + struct dentry *dentry = filp->f_path.dentry; unsigned pagelen; int ret = -EINVAL; struct address_space *mapping; + dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + filp->f_mapping->host->i_ino, + (long long)page_offset(page)); + lock_page(page); mapping = page->mapping; - if (mapping != vma->vm_file->f_path.dentry->d_inode->i_mapping) + if (mapping != dentry->d_inode->i_mapping) goto out_unlock; ret = 0; @@ -446,9 +511,9 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, if (iocb->ki_filp->f_flags & O_DIRECT) return nfs_file_direct_write(iocb, iov, nr_segs, pos); - dfprintk(VFS, "nfs: write(%s/%s(%ld), %lu@%Ld)\n", + dprintk("NFS: write(%s/%s, %lu@%Ld)\n", dentry->d_parent->d_name.name, dentry->d_name.name, - inode->i_ino, (unsigned long) count, (long long) pos); + (unsigned long) count, (long long) pos); result = -EBUSY; if (IS_SWAPFILE(inode)) @@ -582,7 +647,8 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl) * This makes locking act as a cache coherency point. */ nfs_sync_mapping(filp->f_mapping); - nfs_zap_caches(inode); + if (!nfs_have_delegation(inode, FMODE_READ)) + nfs_zap_caches(inode); out: return status; } @@ -592,23 +658,35 @@ out: */ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) { - struct inode * inode = filp->f_mapping->host; + struct inode *inode = filp->f_mapping->host; + int ret = -ENOLCK; - dprintk("NFS: nfs_lock(f=%s/%ld, t=%x, fl=%x, r=%Ld:%Ld)\n", - inode->i_sb->s_id, inode->i_ino, + dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name, fl->fl_type, fl->fl_flags, (long long)fl->fl_start, (long long)fl->fl_end); + nfs_inc_stats(inode, NFSIOS_VFSLOCK); /* No mandatory locks over NFS */ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) - return -ENOLCK; + goto out_err; + + if (NFS_PROTO(inode)->lock_check_bounds != NULL) { + ret = NFS_PROTO(inode)->lock_check_bounds(fl); + if (ret < 0) + goto out_err; + } if (IS_GETLK(cmd)) - return do_getlk(filp, cmd, fl); - if (fl->fl_type == F_UNLCK) - return do_unlk(filp, cmd, fl); - return do_setlk(filp, cmd, fl); + ret = do_getlk(filp, cmd, fl); + else if (fl->fl_type == F_UNLCK) + ret = do_unlk(filp, cmd, fl); + else + ret = do_setlk(filp, cmd, fl); +out_err: + return ret; } /* @@ -616,9 +694,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) */ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) { - dprintk("NFS: nfs_flock(f=%s/%ld, t=%x, fl=%x)\n", - filp->f_path.dentry->d_inode->i_sb->s_id, - filp->f_path.dentry->d_inode->i_ino, + dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name, fl->fl_type, fl->fl_flags); /* @@ -641,12 +719,15 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) return do_setlk(filp, cmd, fl); } +/* + * There is no protocol support for leases, so we have no way to implement + * them correctly in the face of opens by other clients. + */ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl) { - /* - * There is no protocol support for leases, so we have no way - * to implement them correctly in the face of opens by other - * clients. - */ + dprintk("NFS: setlease(%s/%s, arg=%ld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, arg); + return -EINVAL; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 596c5d8e86f4..52daefa2f521 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -57,8 +57,6 @@ static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; static void nfs_invalidate_inode(struct inode *); static int nfs_update_inode(struct inode *, struct nfs_fattr *); -static void nfs_zap_acl_cache(struct inode *); - static struct kmem_cache * nfs_inode_cachep; static inline unsigned long @@ -167,7 +165,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping) } } -static void nfs_zap_acl_cache(struct inode *inode) +void nfs_zap_acl_cache(struct inode *inode) { void (*clear_acl_cache)(struct inode *); @@ -347,7 +345,7 @@ out_no_inode: goto out; } -#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET) +#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE) int nfs_setattr(struct dentry *dentry, struct iattr *attr) @@ -369,10 +367,9 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) /* Optimization: if the end result is no change, don't RPC */ attr->ia_valid &= NFS_VALID_ATTRS; - if (attr->ia_valid == 0) + if ((attr->ia_valid & ~ATTR_FILE) == 0) return 0; - lock_kernel(); /* Write all dirty data */ if (S_ISREG(inode->i_mode)) { filemap_write_and_wait(inode->i_mapping); @@ -386,11 +383,66 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr); if (error == 0) nfs_refresh_inode(inode, &fattr); - unlock_kernel(); return error; } /** + * nfs_vmtruncate - unmap mappings "freed" by truncate() syscall + * @inode: inode of the file used + * @offset: file offset to start truncating + * + * This is a copy of the common vmtruncate, but with the locking + * corrected to take into account the fact that NFS requires + * inode->i_size to be updated under the inode->i_lock. + */ +static int nfs_vmtruncate(struct inode * inode, loff_t offset) +{ + if (i_size_read(inode) < offset) { + unsigned long limit; + + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY && offset > limit) + goto out_sig; + if (offset > inode->i_sb->s_maxbytes) + goto out_big; + spin_lock(&inode->i_lock); + i_size_write(inode, offset); + spin_unlock(&inode->i_lock); + } else { + struct address_space *mapping = inode->i_mapping; + + /* + * truncation of in-use swapfiles is disallowed - it would + * cause subsequent swapout to scribble on the now-freed + * blocks. + */ + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + spin_lock(&inode->i_lock); + i_size_write(inode, offset); + spin_unlock(&inode->i_lock); + + /* + * unmap_mapping_range is called twice, first simply for + * efficiency so that truncate_inode_pages does fewer + * single-page unmaps. However after this first call, and + * before truncate_inode_pages finishes, it is possible for + * private pages to be COWed, which remain after + * truncate_inode_pages finishes, hence the second + * unmap_mapping_range call must be made for correctness. + */ + unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(mapping, offset); + unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); + } + return 0; +out_sig: + send_sig(SIGXFSZ, current, 0); +out_big: + return -EFBIG; +} + +/** * nfs_setattr_update_inode - Update inode metadata after a setattr call. * @inode: pointer to struct inode * @attr: pointer to struct iattr @@ -416,8 +468,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) } if ((attr->ia_valid & ATTR_SIZE) != 0) { nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); - inode->i_size = attr->ia_size; - vmtruncate(inode, attr->ia_size); + nfs_vmtruncate(inode, attr->ia_size); } } @@ -647,7 +698,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) inode->i_sb->s_id, (long long)NFS_FILEID(inode)); nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); - lock_kernel(); if (is_bad_inode(inode)) goto out_nowait; if (NFS_STALE(inode)) @@ -696,7 +746,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) nfs_wake_up_inode(inode); out_nowait: - unlock_kernel(); return status; } @@ -831,9 +880,9 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; } - if (inode->i_size == nfs_size_to_loff_t(fattr->pre_size) && + if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) && nfsi->npages == 0) - inode->i_size = nfs_size_to_loff_t(fattr->size); + i_size_write(inode, nfs_size_to_loff_t(fattr->size)); } } @@ -974,7 +1023,7 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa (fattr->valid & NFS_ATTR_WCC) == 0) { memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); - fattr->pre_size = inode->i_size; + fattr->pre_size = i_size_read(inode); fattr->valid |= NFS_ATTR_WCC; } return nfs_post_op_update_inode(inode, fattr); @@ -1059,7 +1108,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* Do we perhaps have any outstanding writes, or has * the file grown beyond our last write? */ if (nfsi->npages == 0 || new_isize > cur_isize) { - inode->i_size = new_isize; + i_size_write(inode, new_isize); invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; } dprintk("NFS: isize change on server for file %s/%ld\n", @@ -1193,7 +1242,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi) #endif } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct nfs_inode *nfsi = (struct nfs_inode *) foo; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 04ae867dddba..24241fcbb98d 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -150,6 +150,7 @@ extern void nfs_clear_inode(struct inode *); #ifdef CONFIG_NFS_V4 extern void nfs4_clear_inode(struct inode *); #endif +void nfs_zap_acl_cache(struct inode *inode); /* super.c */ extern struct file_system_type nfs_xdev_fs_type; diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index 6350ecbde589..a36952810032 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -5,135 +5,41 @@ * * Copyright (C) 2005, 2006 Chuck Lever <cel@netapp.com> * - * NFS client per-mount statistics provide information about the health of - * the NFS client and the health of each NFS mount point. Generally these - * are not for detailed problem diagnosis, but simply to indicate that there - * is a problem. - * - * These counters are not meant to be human-readable, but are meant to be - * integrated into system monitoring tools such as "sar" and "iostat". As - * such, the counters are sampled by the tools over time, and are never - * zeroed after a file system is mounted. Moving averages can be computed - * by the tools by taking the difference between two instantaneous samples - * and dividing that by the time between the samples. */ #ifndef _NFS_IOSTAT #define _NFS_IOSTAT -#define NFS_IOSTAT_VERS "1.0" - -/* - * NFS byte counters - * - * 1. SERVER - the number of payload bytes read from or written to the - * server by the NFS client via an NFS READ or WRITE request. - * - * 2. NORMAL - the number of bytes read or written by applications via - * the read(2) and write(2) system call interfaces. - * - * 3. DIRECT - the number of bytes read or written from files opened - * with the O_DIRECT flag. - * - * These counters give a view of the data throughput into and out of the NFS - * client. Comparing the number of bytes requested by an application with the - * number of bytes the client requests from the server can provide an - * indication of client efficiency (per-op, cache hits, etc). - * - * These counters can also help characterize which access methods are in - * use. DIRECT by itself shows whether there is any O_DIRECT traffic. - * NORMAL + DIRECT shows how much data is going through the system call - * interface. A large amount of SERVER traffic without much NORMAL or - * DIRECT traffic shows that applications are using mapped files. - * - * NFS page counters - * - * These count the number of pages read or written via nfs_readpage(), - * nfs_readpages(), or their write equivalents. - */ -enum nfs_stat_bytecounters { - NFSIOS_NORMALREADBYTES = 0, - NFSIOS_NORMALWRITTENBYTES, - NFSIOS_DIRECTREADBYTES, - NFSIOS_DIRECTWRITTENBYTES, - NFSIOS_SERVERREADBYTES, - NFSIOS_SERVERWRITTENBYTES, - NFSIOS_READPAGES, - NFSIOS_WRITEPAGES, - __NFSIOS_BYTESMAX, -}; - -/* - * NFS event counters - * - * These counters provide a low-overhead way of monitoring client activity - * without enabling NFS trace debugging. The counters show the rate at - * which VFS requests are made, and how often the client invalidates its - * data and attribute caches. This allows system administrators to monitor - * such things as how close-to-open is working, and answer questions such - * as "why are there so many GETATTR requests on the wire?" - * - * They also count anamolous events such as short reads and writes, silly - * renames due to close-after-delete, and operations that change the size - * of a file (such operations can often be the source of data corruption - * if applications aren't using file locking properly). - */ -enum nfs_stat_eventcounters { - NFSIOS_INODEREVALIDATE = 0, - NFSIOS_DENTRYREVALIDATE, - NFSIOS_DATAINVALIDATE, - NFSIOS_ATTRINVALIDATE, - NFSIOS_VFSOPEN, - NFSIOS_VFSLOOKUP, - NFSIOS_VFSACCESS, - NFSIOS_VFSUPDATEPAGE, - NFSIOS_VFSREADPAGE, - NFSIOS_VFSREADPAGES, - NFSIOS_VFSWRITEPAGE, - NFSIOS_VFSWRITEPAGES, - NFSIOS_VFSGETDENTS, - NFSIOS_VFSSETATTR, - NFSIOS_VFSFLUSH, - NFSIOS_VFSFSYNC, - NFSIOS_VFSLOCK, - NFSIOS_VFSRELEASE, - NFSIOS_CONGESTIONWAIT, - NFSIOS_SETATTRTRUNC, - NFSIOS_EXTENDWRITE, - NFSIOS_SILLYRENAME, - NFSIOS_SHORTREAD, - NFSIOS_SHORTWRITE, - NFSIOS_DELAY, - __NFSIOS_COUNTSMAX, -}; - -#ifdef __KERNEL__ - #include <linux/percpu.h> #include <linux/cache.h> +#include <linux/nfs_iostat.h> struct nfs_iostats { unsigned long long bytes[__NFSIOS_BYTESMAX]; unsigned long events[__NFSIOS_COUNTSMAX]; } ____cacheline_aligned; -static inline void nfs_inc_server_stats(struct nfs_server *server, enum nfs_stat_eventcounters stat) +static inline void nfs_inc_server_stats(const struct nfs_server *server, + enum nfs_stat_eventcounters stat) { struct nfs_iostats *iostats; int cpu; cpu = get_cpu(); iostats = per_cpu_ptr(server->io_stats, cpu); - iostats->events[stat] ++; + iostats->events[stat]++; put_cpu_no_resched(); } -static inline void nfs_inc_stats(struct inode *inode, enum nfs_stat_eventcounters stat) +static inline void nfs_inc_stats(const struct inode *inode, + enum nfs_stat_eventcounters stat) { nfs_inc_server_stats(NFS_SERVER(inode), stat); } -static inline void nfs_add_server_stats(struct nfs_server *server, enum nfs_stat_bytecounters stat, unsigned long addend) +static inline void nfs_add_server_stats(const struct nfs_server *server, + enum nfs_stat_bytecounters stat, + unsigned long addend) { struct nfs_iostats *iostats; int cpu; @@ -144,7 +50,9 @@ static inline void nfs_add_server_stats(struct nfs_server *server, enum nfs_stat put_cpu_no_resched(); } -static inline void nfs_add_stats(struct inode *inode, enum nfs_stat_bytecounters stat, unsigned long addend) +static inline void nfs_add_stats(const struct inode *inode, + enum nfs_stat_bytecounters stat, + unsigned long addend) { nfs_add_server_stats(NFS_SERVER(inode), stat, addend); } @@ -160,5 +68,4 @@ static inline void nfs_free_iostats(struct nfs_iostats *stats) free_percpu(stats); } -#endif -#endif +#endif /* _NFS_IOSTAT */ diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 2f285ef76399..66df08dd1caf 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -129,7 +129,7 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) goto out_err; mntget(mnt); - err = do_add_mount(mnt, nd, nd->path.mnt->mnt_flags|MNT_SHRINKABLE, + err = do_add_mount(mnt, &nd->path, nd->path.mnt->mnt_flags|MNT_SHRINKABLE, &nfs_automount_list); if (err < 0) { mntput(mnt); diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 9b7362565c0c..423842f51ac9 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -5,6 +5,8 @@ #include <linux/posix_acl_xattr.h> #include <linux/nfsacl.h> +#include "internal.h" + #define NFSDBG_FACILITY NFSDBG_PROC ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size) @@ -205,6 +207,8 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) status = nfs_revalidate_inode(server, inode); if (status < 0) return ERR_PTR(status); + if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) + nfs_zap_acl_cache(inode); acl = nfs3_get_cached_acl(inode, type); if (acl != ERR_PTR(-EAGAIN)) return acl; @@ -319,9 +323,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, dprintk("NFS call setacl\n"); msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; status = rpc_call_sync(server->client_acl, &msg, 0); - spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS; - spin_unlock(&inode->i_lock); + nfs_access_zap_cache(inode); + nfs_zap_acl_cache(inode); dprintk("NFS reply setacl: %d\n", status); /* pages may have been allocated at the xdr layer. */ diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index c3523ad03ed1..1e750e4574a9 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -129,6 +129,8 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, int status; dprintk("NFS call setattr\n"); + if (sattr->ia_valid & ATTR_FILE) + msg.rpc_cred = nfs_file_cred(sattr->ia_file); nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (status == 0) @@ -248,6 +250,53 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page, return status; } +struct nfs3_createdata { + struct rpc_message msg; + union { + struct nfs3_createargs create; + struct nfs3_mkdirargs mkdir; + struct nfs3_symlinkargs symlink; + struct nfs3_mknodargs mknod; + } arg; + struct nfs3_diropres res; + struct nfs_fh fh; + struct nfs_fattr fattr; + struct nfs_fattr dir_attr; +}; + +static struct nfs3_createdata *nfs3_alloc_createdata(void) +{ + struct nfs3_createdata *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data != NULL) { + data->msg.rpc_argp = &data->arg; + data->msg.rpc_resp = &data->res; + data->res.fh = &data->fh; + data->res.fattr = &data->fattr; + data->res.dir_attr = &data->dir_attr; + nfs_fattr_init(data->res.fattr); + nfs_fattr_init(data->res.dir_attr); + } + return data; +} + +static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data) +{ + int status; + + status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); + nfs_post_op_update_inode(dir, data->res.dir_attr); + if (status == 0) + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + return status; +} + +static void nfs3_free_createdata(struct nfs3_createdata *data) +{ + kfree(data); +} + /* * Create a regular file. * For now, we don't implement O_EXCL. @@ -256,70 +305,60 @@ static int nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags, struct nameidata *nd) { - struct nfs_fh fhandle; - struct nfs_fattr fattr; - struct nfs_fattr dir_attr; - struct nfs3_createargs arg = { - .fh = NFS_FH(dir), - .name = dentry->d_name.name, - .len = dentry->d_name.len, - .sattr = sattr, - }; - struct nfs3_diropres res = { - .dir_attr = &dir_attr, - .fh = &fhandle, - .fattr = &fattr - }; - struct rpc_message msg = { - .rpc_proc = &nfs3_procedures[NFS3PROC_CREATE], - .rpc_argp = &arg, - .rpc_resp = &res, - }; + struct nfs3_createdata *data; mode_t mode = sattr->ia_mode; - int status; + int status = -ENOMEM; dprintk("NFS call create %s\n", dentry->d_name.name); - arg.createmode = NFS3_CREATE_UNCHECKED; + + data = nfs3_alloc_createdata(); + if (data == NULL) + goto out; + + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_CREATE]; + data->arg.create.fh = NFS_FH(dir); + data->arg.create.name = dentry->d_name.name; + data->arg.create.len = dentry->d_name.len; + data->arg.create.sattr = sattr; + + data->arg.create.createmode = NFS3_CREATE_UNCHECKED; if (flags & O_EXCL) { - arg.createmode = NFS3_CREATE_EXCLUSIVE; - arg.verifier[0] = jiffies; - arg.verifier[1] = current->pid; + data->arg.create.createmode = NFS3_CREATE_EXCLUSIVE; + data->arg.create.verifier[0] = jiffies; + data->arg.create.verifier[1] = current->pid; } sattr->ia_mode &= ~current->fs->umask; -again: - nfs_fattr_init(&dir_attr); - nfs_fattr_init(&fattr); - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - nfs_refresh_inode(dir, &dir_attr); + for (;;) { + status = nfs3_do_create(dir, dentry, data); - /* If the server doesn't support the exclusive creation semantics, - * try again with simple 'guarded' mode. */ - if (status == -ENOTSUPP) { - switch (arg.createmode) { + if (status != -ENOTSUPP) + break; + /* If the server doesn't support the exclusive creation + * semantics, try again with simple 'guarded' mode. */ + switch (data->arg.create.createmode) { case NFS3_CREATE_EXCLUSIVE: - arg.createmode = NFS3_CREATE_GUARDED; + data->arg.create.createmode = NFS3_CREATE_GUARDED; break; case NFS3_CREATE_GUARDED: - arg.createmode = NFS3_CREATE_UNCHECKED; + data->arg.create.createmode = NFS3_CREATE_UNCHECKED; break; case NFS3_CREATE_UNCHECKED: goto out; } - goto again; + nfs_fattr_init(data->res.dir_attr); + nfs_fattr_init(data->res.fattr); } - if (status == 0) - status = nfs_instantiate(dentry, &fhandle, &fattr); if (status != 0) goto out; /* When we created the file with exclusive semantics, make * sure we set the attributes afterwards. */ - if (arg.createmode == NFS3_CREATE_EXCLUSIVE) { + if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) { dprintk("NFS call setattr (post-create)\n"); if (!(sattr->ia_valid & ATTR_ATIME_SET)) @@ -330,14 +369,15 @@ again: /* Note: we could use a guarded setattr here, but I'm * not sure this buys us anything (and I'd have * to revamp the NFSv3 XDR code) */ - status = nfs3_proc_setattr(dentry, &fattr, sattr); - nfs_post_op_update_inode(dentry->d_inode, &fattr); + status = nfs3_proc_setattr(dentry, data->res.fattr, sattr); + nfs_post_op_update_inode(dentry->d_inode, data->res.fattr); dprintk("NFS reply setattr (post-create): %d\n", status); + if (status != 0) + goto out; } - if (status != 0) - goto out; status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); out: + nfs3_free_createdata(data); dprintk("NFS reply create: %d\n", status); return status; } @@ -452,40 +492,28 @@ static int nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, unsigned int len, struct iattr *sattr) { - struct nfs_fh fhandle; - struct nfs_fattr fattr, dir_attr; - struct nfs3_symlinkargs arg = { - .fromfh = NFS_FH(dir), - .fromname = dentry->d_name.name, - .fromlen = dentry->d_name.len, - .pages = &page, - .pathlen = len, - .sattr = sattr - }; - struct nfs3_diropres res = { - .dir_attr = &dir_attr, - .fh = &fhandle, - .fattr = &fattr - }; - struct rpc_message msg = { - .rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - int status; + struct nfs3_createdata *data; + int status = -ENOMEM; if (len > NFS3_MAXPATHLEN) return -ENAMETOOLONG; dprintk("NFS call symlink %s\n", dentry->d_name.name); - nfs_fattr_init(&dir_attr); - nfs_fattr_init(&fattr); - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - nfs_post_op_update_inode(dir, &dir_attr); - if (status != 0) + data = nfs3_alloc_createdata(); + if (data == NULL) goto out; - status = nfs_instantiate(dentry, &fhandle, &fattr); + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK]; + data->arg.symlink.fromfh = NFS_FH(dir); + data->arg.symlink.fromname = dentry->d_name.name; + data->arg.symlink.fromlen = dentry->d_name.len; + data->arg.symlink.pages = &page; + data->arg.symlink.pathlen = len; + data->arg.symlink.sattr = sattr; + + status = nfs3_do_create(dir, dentry, data); + + nfs3_free_createdata(data); out: dprintk("NFS reply symlink: %d\n", status); return status; @@ -494,42 +522,31 @@ out: static int nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { - struct nfs_fh fhandle; - struct nfs_fattr fattr, dir_attr; - struct nfs3_mkdirargs arg = { - .fh = NFS_FH(dir), - .name = dentry->d_name.name, - .len = dentry->d_name.len, - .sattr = sattr - }; - struct nfs3_diropres res = { - .dir_attr = &dir_attr, - .fh = &fhandle, - .fattr = &fattr - }; - struct rpc_message msg = { - .rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR], - .rpc_argp = &arg, - .rpc_resp = &res, - }; + struct nfs3_createdata *data; int mode = sattr->ia_mode; - int status; + int status = -ENOMEM; dprintk("NFS call mkdir %s\n", dentry->d_name.name); sattr->ia_mode &= ~current->fs->umask; - nfs_fattr_init(&dir_attr); - nfs_fattr_init(&fattr); - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - nfs_post_op_update_inode(dir, &dir_attr); - if (status != 0) + data = nfs3_alloc_createdata(); + if (data == NULL) goto out; - status = nfs_instantiate(dentry, &fhandle, &fattr); + + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR]; + data->arg.mkdir.fh = NFS_FH(dir); + data->arg.mkdir.name = dentry->d_name.name; + data->arg.mkdir.len = dentry->d_name.len; + data->arg.mkdir.sattr = sattr; + + status = nfs3_do_create(dir, dentry, data); if (status != 0) goto out; + status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); out: + nfs3_free_createdata(data); dprintk("NFS reply mkdir: %d\n", status); return status; } @@ -615,52 +632,50 @@ static int nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dev_t rdev) { - struct nfs_fh fh; - struct nfs_fattr fattr, dir_attr; - struct nfs3_mknodargs arg = { - .fh = NFS_FH(dir), - .name = dentry->d_name.name, - .len = dentry->d_name.len, - .sattr = sattr, - .rdev = rdev - }; - struct nfs3_diropres res = { - .dir_attr = &dir_attr, - .fh = &fh, - .fattr = &fattr - }; - struct rpc_message msg = { - .rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD], - .rpc_argp = &arg, - .rpc_resp = &res, - }; + struct nfs3_createdata *data; mode_t mode = sattr->ia_mode; - int status; - - switch (sattr->ia_mode & S_IFMT) { - case S_IFBLK: arg.type = NF3BLK; break; - case S_IFCHR: arg.type = NF3CHR; break; - case S_IFIFO: arg.type = NF3FIFO; break; - case S_IFSOCK: arg.type = NF3SOCK; break; - default: return -EINVAL; - } + int status = -ENOMEM; dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, MAJOR(rdev), MINOR(rdev)); sattr->ia_mode &= ~current->fs->umask; - nfs_fattr_init(&dir_attr); - nfs_fattr_init(&fattr); - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - nfs_post_op_update_inode(dir, &dir_attr); - if (status != 0) + data = nfs3_alloc_createdata(); + if (data == NULL) goto out; - status = nfs_instantiate(dentry, &fh, &fattr); + + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD]; + data->arg.mknod.fh = NFS_FH(dir); + data->arg.mknod.name = dentry->d_name.name; + data->arg.mknod.len = dentry->d_name.len; + data->arg.mknod.sattr = sattr; + data->arg.mknod.rdev = rdev; + + switch (sattr->ia_mode & S_IFMT) { + case S_IFBLK: + data->arg.mknod.type = NF3BLK; + break; + case S_IFCHR: + data->arg.mknod.type = NF3CHR; + break; + case S_IFIFO: + data->arg.mknod.type = NF3FIFO; + break; + case S_IFSOCK: + data->arg.mknod.type = NF3SOCK; + break; + default: + status = -EINVAL; + goto out; + } + + status = nfs3_do_create(dir, dentry, data); if (status != 0) goto out; status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); out: + nfs3_free_createdata(data); dprintk("NFS reply mknod: %d\n", status); return status; } @@ -801,8 +816,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .write_done = nfs3_write_done, .commit_setup = nfs3_proc_commit_setup, .commit_done = nfs3_commit_done, - .file_open = nfs_open, - .file_release = nfs_release, .lock = nfs3_proc_lock, .clear_acl_cache = nfs3_forget_cached_acls, }; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1293e0acd82b..c910413eaeca 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -451,9 +451,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) /* Save the delegation */ memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); rcu_read_unlock(); - lock_kernel(); ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); - unlock_kernel(); if (ret != 0) goto out; ret = -EAGAIN; @@ -1139,8 +1137,9 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, int return res; } -static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, - struct iattr *sattr, struct nfs4_state *state) +static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, + struct nfs_fattr *fattr, struct iattr *sattr, + struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(inode); struct nfs_setattrargs arg = { @@ -1154,9 +1153,10 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, .server = server, }; struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], - .rpc_argp = &arg, - .rpc_resp = &res, + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], + .rpc_argp = &arg, + .rpc_resp = &res, + .rpc_cred = cred, }; unsigned long timestamp = jiffies; int status; @@ -1166,7 +1166,6 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { /* Use that stateid */ } else if (state != NULL) { - msg.rpc_cred = state->owner->so_cred; nfs4_copy_stateid(&arg.stateid, state, current->files); } else memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); @@ -1177,15 +1176,16 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, return status; } -static int nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, - struct iattr *sattr, struct nfs4_state *state) +static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, + struct nfs_fattr *fattr, struct iattr *sattr, + struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(inode); struct nfs4_exception exception = { }; int err; do { err = nfs4_handle_exception(server, - _nfs4_do_setattr(inode, fattr, sattr, state), + _nfs4_do_setattr(inode, cred, fattr, sattr, state), &exception); } while (exception.retry); return err; @@ -1647,29 +1647,25 @@ static int nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, struct iattr *sattr) { - struct rpc_cred *cred; struct inode *inode = dentry->d_inode; - struct nfs_open_context *ctx; + struct rpc_cred *cred = NULL; struct nfs4_state *state = NULL; int status; nfs_fattr_init(fattr); - cred = rpc_lookup_cred(); - if (IS_ERR(cred)) - return PTR_ERR(cred); - /* Search for an existing open(O_WRITE) file */ - ctx = nfs_find_open_context(inode, cred, FMODE_WRITE); - if (ctx != NULL) + if (sattr->ia_valid & ATTR_FILE) { + struct nfs_open_context *ctx; + + ctx = nfs_file_open_context(sattr->ia_file); + cred = ctx->cred; state = ctx->state; + } - status = nfs4_do_setattr(inode, fattr, sattr, state); + status = nfs4_do_setattr(inode, cred, fattr, sattr, state); if (status == 0) nfs_setattr_update_inode(inode, sattr); - if (ctx != NULL) - put_nfs_open_context(ctx); - put_rpccred(cred); return status; } @@ -1897,17 +1893,16 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, goto out; } state = nfs4_do_open(dir, &path, flags, sattr, cred); - put_rpccred(cred); d_drop(dentry); if (IS_ERR(state)) { status = PTR_ERR(state); - goto out; + goto out_putcred; } d_add(dentry, igrab(state->inode)); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); if (flags & O_EXCL) { struct nfs_fattr fattr; - status = nfs4_do_setattr(state->inode, &fattr, sattr, state); + status = nfs4_do_setattr(state->inode, cred, &fattr, sattr, state); if (status == 0) nfs_setattr_update_inode(state->inode, sattr); nfs_post_op_update_inode(state->inode, &fattr); @@ -1916,6 +1911,8 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, status = nfs4_intent_set_file(nd, &path, state); else nfs4_close_sync(&path, state, flags); +out_putcred: + put_rpccred(cred); out: return status; } @@ -2079,47 +2076,81 @@ static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *n return err; } +struct nfs4_createdata { + struct rpc_message msg; + struct nfs4_create_arg arg; + struct nfs4_create_res res; + struct nfs_fh fh; + struct nfs_fattr fattr; + struct nfs_fattr dir_fattr; +}; + +static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, + struct qstr *name, struct iattr *sattr, u32 ftype) +{ + struct nfs4_createdata *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data != NULL) { + struct nfs_server *server = NFS_SERVER(dir); + + data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE]; + data->msg.rpc_argp = &data->arg; + data->msg.rpc_resp = &data->res; + data->arg.dir_fh = NFS_FH(dir); + data->arg.server = server; + data->arg.name = name; + data->arg.attrs = sattr; + data->arg.ftype = ftype; + data->arg.bitmask = server->attr_bitmask; + data->res.server = server; + data->res.fh = &data->fh; + data->res.fattr = &data->fattr; + data->res.dir_fattr = &data->dir_fattr; + nfs_fattr_init(data->res.fattr); + nfs_fattr_init(data->res.dir_fattr); + } + return data; +} + +static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data) +{ + int status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); + if (status == 0) { + update_changeattr(dir, &data->res.dir_cinfo); + nfs_post_op_update_inode(dir, data->res.dir_fattr); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + } + return status; +} + +static void nfs4_free_createdata(struct nfs4_createdata *data) +{ + kfree(data); +} + static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, unsigned int len, struct iattr *sattr) { - struct nfs_server *server = NFS_SERVER(dir); - struct nfs_fh fhandle; - struct nfs_fattr fattr, dir_fattr; - struct nfs4_create_arg arg = { - .dir_fh = NFS_FH(dir), - .server = server, - .name = &dentry->d_name, - .attrs = sattr, - .ftype = NF4LNK, - .bitmask = server->attr_bitmask, - }; - struct nfs4_create_res res = { - .server = server, - .fh = &fhandle, - .fattr = &fattr, - .dir_fattr = &dir_fattr, - }; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - int status; + struct nfs4_createdata *data; + int status = -ENAMETOOLONG; if (len > NFS4_MAXPATHLEN) - return -ENAMETOOLONG; + goto out; - arg.u.symlink.pages = &page; - arg.u.symlink.len = len; - nfs_fattr_init(&fattr); - nfs_fattr_init(&dir_fattr); + status = -ENOMEM; + data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4LNK); + if (data == NULL) + goto out; + + data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK]; + data->arg.u.symlink.pages = &page; + data->arg.u.symlink.len = len; - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - if (!status) { - update_changeattr(dir, &res.dir_cinfo); - nfs_post_op_update_inode(dir, res.dir_fattr); - status = nfs_instantiate(dentry, &fhandle, &fattr); - } + status = nfs4_do_create(dir, dentry, data); + + nfs4_free_createdata(data); +out: return status; } @@ -2140,39 +2171,17 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { - struct nfs_server *server = NFS_SERVER(dir); - struct nfs_fh fhandle; - struct nfs_fattr fattr, dir_fattr; - struct nfs4_create_arg arg = { - .dir_fh = NFS_FH(dir), - .server = server, - .name = &dentry->d_name, - .attrs = sattr, - .ftype = NF4DIR, - .bitmask = server->attr_bitmask, - }; - struct nfs4_create_res res = { - .server = server, - .fh = &fhandle, - .fattr = &fattr, - .dir_fattr = &dir_fattr, - }; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - int status; + struct nfs4_createdata *data; + int status = -ENOMEM; - nfs_fattr_init(&fattr); - nfs_fattr_init(&dir_fattr); - - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - if (!status) { - update_changeattr(dir, &res.dir_cinfo); - nfs_post_op_update_inode(dir, res.dir_fattr); - status = nfs_instantiate(dentry, &fhandle, &fattr); - } + data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR); + if (data == NULL) + goto out; + + status = nfs4_do_create(dir, dentry, data); + + nfs4_free_createdata(data); +out: return status; } @@ -2242,56 +2251,34 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dev_t rdev) { - struct nfs_server *server = NFS_SERVER(dir); - struct nfs_fh fh; - struct nfs_fattr fattr, dir_fattr; - struct nfs4_create_arg arg = { - .dir_fh = NFS_FH(dir), - .server = server, - .name = &dentry->d_name, - .attrs = sattr, - .bitmask = server->attr_bitmask, - }; - struct nfs4_create_res res = { - .server = server, - .fh = &fh, - .fattr = &fattr, - .dir_fattr = &dir_fattr, - }; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - int status; - int mode = sattr->ia_mode; - - nfs_fattr_init(&fattr); - nfs_fattr_init(&dir_fattr); + struct nfs4_createdata *data; + int mode = sattr->ia_mode; + int status = -ENOMEM; BUG_ON(!(sattr->ia_valid & ATTR_MODE)); BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode)); + + data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4SOCK); + if (data == NULL) + goto out; + if (S_ISFIFO(mode)) - arg.ftype = NF4FIFO; + data->arg.ftype = NF4FIFO; else if (S_ISBLK(mode)) { - arg.ftype = NF4BLK; - arg.u.device.specdata1 = MAJOR(rdev); - arg.u.device.specdata2 = MINOR(rdev); + data->arg.ftype = NF4BLK; + data->arg.u.device.specdata1 = MAJOR(rdev); + data->arg.u.device.specdata2 = MINOR(rdev); } else if (S_ISCHR(mode)) { - arg.ftype = NF4CHR; - arg.u.device.specdata1 = MAJOR(rdev); - arg.u.device.specdata2 = MINOR(rdev); + data->arg.ftype = NF4CHR; + data->arg.u.device.specdata1 = MAJOR(rdev); + data->arg.u.device.specdata2 = MINOR(rdev); } - else - arg.ftype = NF4SOCK; - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - if (status == 0) { - update_changeattr(dir, &res.dir_cinfo); - nfs_post_op_update_inode(dir, res.dir_fattr); - status = nfs_instantiate(dentry, &fh, &fattr); - } + status = nfs4_do_create(dir, dentry, data); + + nfs4_free_createdata(data); +out: return status; } @@ -2706,6 +2693,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) ret = nfs_revalidate_inode(server, inode); if (ret < 0) return ret; + if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) + nfs_zap_acl_cache(inode); ret = nfs4_read_cached_acl(inode, buf, buflen); if (ret != -ENOENT) return ret; @@ -2733,7 +2722,8 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl nfs_inode_return_delegation(inode); buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); - nfs_zap_caches(inode); + nfs_access_zap_cache(inode); + nfs_zap_acl_cache(inode); return ret; } @@ -2767,8 +2757,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server) task->tk_status = 0; return -EAGAIN; case -NFS4ERR_DELAY: - nfs_inc_server_stats((struct nfs_server *) server, - NFSIOS_DELAY); + nfs_inc_server_stats(server, NFSIOS_DELAY); case -NFS4ERR_GRACE: rpc_delay(task, NFS4_POLL_RETRY_MAX); task->tk_status = 0; @@ -2933,7 +2922,7 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) { - long timeout; + long timeout = 0; int err; do { err = _nfs4_proc_setclientid_confirm(clp, cred); @@ -3725,8 +3714,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .write_done = nfs4_write_done, .commit_setup = nfs4_proc_commit_setup, .commit_done = nfs4_commit_done, - .file_open = nfs_open, - .file_release = nfs_release, .lock = nfs4_proc_lock, .clear_acl_cache = nfs4_zap_acl_attr, }; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 856a8934f610..401ef8b28f97 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -940,7 +940,6 @@ static int reclaimer(void *ptr) allow_signal(SIGKILL); /* Ensure exclusive access to NFSv4 state */ - lock_kernel(); down_write(&clp->cl_sem); /* Are there any NFS mounts out there? */ if (list_empty(&clp->cl_superblocks)) @@ -1000,7 +999,6 @@ restart_loop: nfs_delegation_reap_unclaimed(clp); out: up_write(&clp->cl_sem); - unlock_kernel(); if (status == -NFS4ERR_CB_PATH_DOWN) nfs_handle_cb_pathdown(clp); nfs4_clear_recover_bit(clp); diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 531379d36823..8478fc25daee 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -1,6 +1,4 @@ /* - * $Id: nfsroot.c,v 1.45 1998/03/07 10:44:46 mj Exp $ - * * Copyright (C) 1995, 1996 Gero Kuhlmann <gero@gkminix.han.de> * * Allow an NFS filesystem to be mounted as root. The way this works is: @@ -129,7 +127,7 @@ enum { Opt_err }; -static match_table_t __initdata tokens = { +static match_table_t __initconst tokens = { {Opt_port, "port=%u"}, {Opt_rsize, "rsize=%u"}, {Opt_wsize, "wsize=%u"}, @@ -297,10 +295,10 @@ static int __init root_nfs_name(char *name) nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */ nfs_data.rsize = NFS_DEF_FILE_IO_SIZE; nfs_data.wsize = NFS_DEF_FILE_IO_SIZE; - nfs_data.acregmin = 3; - nfs_data.acregmax = 60; - nfs_data.acdirmin = 30; - nfs_data.acdirmax = 60; + nfs_data.acregmin = NFS_DEF_ACREGMIN; + nfs_data.acregmax = NFS_DEF_ACREGMAX; + nfs_data.acdirmin = NFS_DEF_ACDIRMIN; + nfs_data.acdirmax = NFS_DEF_ACDIRMAX; strcpy(buf, NFS_ROOT); /* Process options received from the remote server */ diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 03599bfe81cf..4dbb84df1b68 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -129,6 +129,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, sattr->ia_mode &= S_IALLUGO; dprintk("NFS call setattr\n"); + if (sattr->ia_valid & ATTR_FILE) + msg.rpc_cred = nfs_file_cred(sattr->ia_file); nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (status == 0) @@ -598,6 +600,29 @@ nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl) return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); } +/* Helper functions for NFS lock bounds checking */ +#define NFS_LOCK32_OFFSET_MAX ((__s32)0x7fffffffUL) +static int nfs_lock_check_bounds(const struct file_lock *fl) +{ + __s32 start, end; + + start = (__s32)fl->fl_start; + if ((loff_t)start != fl->fl_start) + goto out_einval; + + if (fl->fl_end != OFFSET_MAX) { + end = (__s32)fl->fl_end; + if ((loff_t)end != fl->fl_end) + goto out_einval; + } else + end = NFS_LOCK32_OFFSET_MAX; + + if (start < 0 || start > end) + goto out_einval; + return 0; +out_einval: + return -EINVAL; +} const struct nfs_rpc_ops nfs_v2_clientops = { .version = 2, /* protocol version */ @@ -630,7 +655,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .write_setup = nfs_proc_write_setup, .write_done = nfs_write_done, .commit_setup = nfs_proc_commit_setup, - .file_open = nfs_open, - .file_release = nfs_release, .lock = nfs_proc_lock, + .lock_check_bounds = nfs_lock_check_bounds, }; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 614efeed5437..ffb697416cb1 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -47,6 +47,7 @@ #include <linux/inet.h> #include <linux/in6.h> #include <net/ipv6.h> +#include <linux/netdevice.h> #include <linux/nfs_xdr.h> #include <linux/magic.h> #include <linux/parser.h> @@ -65,7 +66,6 @@ enum { /* Mount options that take no arguments */ Opt_soft, Opt_hard, - Opt_intr, Opt_nointr, Opt_posix, Opt_noposix, Opt_cto, Opt_nocto, Opt_ac, Opt_noac, @@ -92,19 +92,23 @@ enum { Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, Opt_addr, Opt_mountaddr, Opt_clientaddr, - /* Mount options that are ignored */ - Opt_userspace, Opt_deprecated, + /* Special mount options */ + Opt_userspace, Opt_deprecated, Opt_sloppy, Opt_err }; -static match_table_t nfs_mount_option_tokens = { +static const match_table_t nfs_mount_option_tokens = { { Opt_userspace, "bg" }, { Opt_userspace, "fg" }, + { Opt_userspace, "retry=%s" }, + + { Opt_sloppy, "sloppy" }, + { Opt_soft, "soft" }, { Opt_hard, "hard" }, - { Opt_intr, "intr" }, - { Opt_nointr, "nointr" }, + { Opt_deprecated, "intr" }, + { Opt_deprecated, "nointr" }, { Opt_posix, "posix" }, { Opt_noposix, "noposix" }, { Opt_cto, "cto" }, @@ -136,7 +140,6 @@ static match_table_t nfs_mount_option_tokens = { { Opt_acdirmin, "acdirmin=%u" }, { Opt_acdirmax, "acdirmax=%u" }, { Opt_actimeo, "actimeo=%u" }, - { Opt_userspace, "retry=%u" }, { Opt_namelen, "namlen=%u" }, { Opt_mountport, "mountport=%u" }, { Opt_mountvers, "mountvers=%u" }, @@ -160,7 +163,7 @@ enum { Opt_xprt_err }; -static match_table_t nfs_xprt_protocol_tokens = { +static const match_table_t nfs_xprt_protocol_tokens = { { Opt_xprt_udp, "udp" }, { Opt_xprt_tcp, "tcp" }, { Opt_xprt_rdma, "rdma" }, @@ -177,7 +180,7 @@ enum { Opt_sec_err }; -static match_table_t nfs_secflavor_tokens = { +static const match_table_t nfs_secflavor_tokens = { { Opt_sec_none, "none" }, { Opt_sec_none, "null" }, { Opt_sec_sys, "sys" }, @@ -207,6 +210,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); static void nfs_kill_super(struct super_block *); static void nfs_put_super(struct super_block *); +static int nfs_remount(struct super_block *sb, int *flags, char *raw_data); static struct file_system_type nfs_fs_type = { .owner = THIS_MODULE, @@ -234,6 +238,7 @@ static const struct super_operations nfs_sops = { .umount_begin = nfs_umount_begin, .show_options = nfs_show_options, .show_stats = nfs_show_stats, + .remount_fs = nfs_remount, }; #ifdef CONFIG_NFS_V4 @@ -278,6 +283,7 @@ static const struct super_operations nfs4_sops = { .umount_begin = nfs_umount_begin, .show_options = nfs_show_options, .show_stats = nfs_show_stats, + .remount_fs = nfs_remount, }; #endif @@ -368,8 +374,6 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) }; int error; - lock_kernel(); - error = server->nfs_client->rpc_ops->statfs(server, fh, &res); if (error < 0) goto out_err; @@ -401,12 +405,10 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = server->namelen; - unlock_kernel(); return 0; out_err: dprintk("%s: statfs error = %d\n", __func__, -error); - unlock_kernel(); return error; } @@ -514,13 +516,13 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, if (nfss->bsize != 0) seq_printf(m, ",bsize=%u", nfss->bsize); seq_printf(m, ",namlen=%u", nfss->namelen); - if (nfss->acregmin != 3*HZ || showdefaults) + if (nfss->acregmin != NFS_DEF_ACREGMIN*HZ || showdefaults) seq_printf(m, ",acregmin=%u", nfss->acregmin/HZ); - if (nfss->acregmax != 60*HZ || showdefaults) + if (nfss->acregmax != NFS_DEF_ACREGMAX*HZ || showdefaults) seq_printf(m, ",acregmax=%u", nfss->acregmax/HZ); - if (nfss->acdirmin != 30*HZ || showdefaults) + if (nfss->acdirmin != NFS_DEF_ACDIRMIN*HZ || showdefaults) seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ); - if (nfss->acdirmax != 60*HZ || showdefaults) + if (nfss->acdirmax != NFS_DEF_ACDIRMAX*HZ || showdefaults) seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ); for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) { if (nfss->flags & nfs_infop->flag) @@ -702,49 +704,233 @@ static int nfs_verify_server_address(struct sockaddr *addr) return 0; } +static void nfs_parse_ipv4_address(char *string, size_t str_len, + struct sockaddr *sap, size_t *addr_len) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)sap; + u8 *addr = (u8 *)&sin->sin_addr.s_addr; + + if (str_len <= INET_ADDRSTRLEN) { + dfprintk(MOUNT, "NFS: parsing IPv4 address %*s\n", + (int)str_len, string); + + sin->sin_family = AF_INET; + *addr_len = sizeof(*sin); + if (in4_pton(string, str_len, addr, '\0', NULL)) + return; + } + + sap->sa_family = AF_UNSPEC; + *addr_len = 0; +} + +#define IPV6_SCOPE_DELIMITER '%' + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, + const char *delim, + struct sockaddr_in6 *sin6) +{ + char *p; + size_t len; + + if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) + return ; + if (*delim != IPV6_SCOPE_DELIMITER) + return; + + len = (string + str_len) - delim - 1; + p = kstrndup(delim + 1, len, GFP_KERNEL); + if (p) { + unsigned long scope_id = 0; + struct net_device *dev; + + dev = dev_get_by_name(&init_net, p); + if (dev != NULL) { + scope_id = dev->ifindex; + dev_put(dev); + } else { + /* scope_id is set to zero on error */ + strict_strtoul(p, 10, &scope_id); + } + + kfree(p); + sin6->sin6_scope_id = scope_id; + dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id); + } +} + +static void nfs_parse_ipv6_address(char *string, size_t str_len, + struct sockaddr *sap, size_t *addr_len) +{ + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + u8 *addr = (u8 *)&sin6->sin6_addr.in6_u; + const char *delim; + + if (str_len <= INET6_ADDRSTRLEN) { + dfprintk(MOUNT, "NFS: parsing IPv6 address %*s\n", + (int)str_len, string); + + sin6->sin6_family = AF_INET6; + *addr_len = sizeof(*sin6); + if (in6_pton(string, str_len, addr, IPV6_SCOPE_DELIMITER, &delim)) { + nfs_parse_ipv6_scope_id(string, str_len, delim, sin6); + return; + } + } + + sap->sa_family = AF_UNSPEC; + *addr_len = 0; +} +#else +static void nfs_parse_ipv6_address(char *string, size_t str_len, + struct sockaddr *sap, size_t *addr_len) +{ + sap->sa_family = AF_UNSPEC; + *addr_len = 0; +} +#endif + /* - * Parse string addresses passed in via a mount option, - * and construct a sockaddr based on the result. + * Construct a sockaddr based on the contents of a string that contains + * an IP address in presentation format. * - * If address parsing fails, set the sockaddr's address - * family to AF_UNSPEC to force nfs_verify_server_address() - * to punt the mount. + * If there is a problem constructing the new sockaddr, set the address + * family to AF_UNSPEC. */ -static void nfs_parse_server_address(char *value, - struct sockaddr *sap, - size_t *len) +static void nfs_parse_ip_address(char *string, size_t str_len, + struct sockaddr *sap, size_t *addr_len) { - if (strchr(value, ':')) { - struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap; - u8 *addr = (u8 *)&ap->sin6_addr.in6_u; + unsigned int i, colons; - ap->sin6_family = AF_INET6; - *len = sizeof(*ap); - if (in6_pton(value, -1, addr, '\0', NULL)) - return; - } else { - struct sockaddr_in *ap = (struct sockaddr_in *)sap; - u8 *addr = (u8 *)&ap->sin_addr.s_addr; + colons = 0; + for (i = 0; i < str_len; i++) + if (string[i] == ':') + colons++; + + if (colons >= 2) + nfs_parse_ipv6_address(string, str_len, sap, addr_len); + else + nfs_parse_ipv4_address(string, str_len, sap, addr_len); +} + +/* + * Sanity check the NFS transport protocol. + * + */ +static void nfs_validate_transport_protocol(struct nfs_parsed_mount_data *mnt) +{ + switch (mnt->nfs_server.protocol) { + case XPRT_TRANSPORT_UDP: + case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_RDMA: + break; + default: + mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; + } +} + +/* + * For text based NFSv2/v3 mounts, the mount protocol transport default + * settings should depend upon the specified NFS transport. + */ +static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt) +{ + nfs_validate_transport_protocol(mnt); - ap->sin_family = AF_INET; - *len = sizeof(*ap); - if (in4_pton(value, -1, addr, '\0', NULL)) + if (mnt->mount_server.protocol == XPRT_TRANSPORT_UDP || + mnt->mount_server.protocol == XPRT_TRANSPORT_TCP) return; + switch (mnt->nfs_server.protocol) { + case XPRT_TRANSPORT_UDP: + mnt->mount_server.protocol = XPRT_TRANSPORT_UDP; + break; + case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_RDMA: + mnt->mount_server.protocol = XPRT_TRANSPORT_TCP; } +} - sap->sa_family = AF_UNSPEC; - *len = 0; +/* + * Parse the value of the 'sec=' option. + * + * The flavor_len setting is for v4 mounts. + */ +static int nfs_parse_security_flavors(char *value, + struct nfs_parsed_mount_data *mnt) +{ + substring_t args[MAX_OPT_ARGS]; + + dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value); + + switch (match_token(value, nfs_secflavor_tokens, args)) { + case Opt_sec_none: + mnt->auth_flavor_len = 0; + mnt->auth_flavors[0] = RPC_AUTH_NULL; + break; + case Opt_sec_sys: + mnt->auth_flavor_len = 0; + mnt->auth_flavors[0] = RPC_AUTH_UNIX; + break; + case Opt_sec_krb5: + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; + break; + case Opt_sec_krb5i: + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; + break; + case Opt_sec_krb5p: + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; + break; + case Opt_sec_lkey: + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; + break; + case Opt_sec_lkeyi: + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; + break; + case Opt_sec_lkeyp: + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; + break; + case Opt_sec_spkm: + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; + break; + case Opt_sec_spkmi: + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; + break; + case Opt_sec_spkmp: + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; + break; + default: + return 0; + } + + return 1; +} + +static void nfs_parse_invalid_value(const char *option) +{ + dfprintk(MOUNT, "NFS: bad value specified for %s option\n", option); } /* * Error-check and convert a string of mount options from user space into - * a data structure + * a data structure. The whole mount string is processed; bad options are + * skipped as they are encountered. If there were no errors, return 1; + * otherwise return 0 (zero). */ static int nfs_parse_mount_options(char *raw, struct nfs_parsed_mount_data *mnt) { char *p, *string, *secdata; - int rc; + int rc, sloppy = 0, errors = 0; if (!raw) { dfprintk(MOUNT, "NFS: mount options string was NULL.\n"); @@ -777,15 +963,16 @@ static int nfs_parse_mount_options(char *raw, token = match_token(p, nfs_mount_option_tokens, args); switch (token) { + + /* + * boolean options: foo/nofoo + */ case Opt_soft: mnt->flags |= NFS_MOUNT_SOFT; break; case Opt_hard: mnt->flags &= ~NFS_MOUNT_SOFT; break; - case Opt_intr: - case Opt_nointr: - break; case Opt_posix: mnt->flags |= NFS_MOUNT_POSIX; break; @@ -819,20 +1006,14 @@ static int nfs_parse_mount_options(char *raw, case Opt_udp: mnt->flags &= ~NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; - mnt->timeo = 7; - mnt->retrans = 5; break; case Opt_tcp: mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; - mnt->timeo = 600; - mnt->retrans = 2; break; case Opt_rdma: mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */ mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; - mnt->timeo = 600; - mnt->retrans = 2; break; case Opt_acl: mnt->flags &= ~NFS_MOUNT_NOACL; @@ -853,165 +1034,144 @@ static int nfs_parse_mount_options(char *raw, mnt->flags |= NFS_MOUNT_UNSHARED; break; + /* + * options that take numeric values + */ case Opt_port: - if (match_int(args, &option)) - return 0; - if (option < 0 || option > 65535) - return 0; - mnt->nfs_server.port = option; + if (match_int(args, &option) || + option < 0 || option > USHORT_MAX) { + errors++; + nfs_parse_invalid_value("port"); + } else + mnt->nfs_server.port = option; break; case Opt_rsize: - if (match_int(args, &mnt->rsize)) - return 0; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("rsize"); + } else + mnt->rsize = option; break; case Opt_wsize: - if (match_int(args, &mnt->wsize)) - return 0; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("wsize"); + } else + mnt->wsize = option; break; case Opt_bsize: - if (match_int(args, &option)) - return 0; - if (option < 0) - return 0; - mnt->bsize = option; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("bsize"); + } else + mnt->bsize = option; break; case Opt_timeo: - if (match_int(args, &mnt->timeo)) - return 0; + if (match_int(args, &option) || option <= 0) { + errors++; + nfs_parse_invalid_value("timeo"); + } else + mnt->timeo = option; break; case Opt_retrans: - if (match_int(args, &mnt->retrans)) - return 0; + if (match_int(args, &option) || option <= 0) { + errors++; + nfs_parse_invalid_value("retrans"); + } else + mnt->retrans = option; break; case Opt_acregmin: - if (match_int(args, &mnt->acregmin)) - return 0; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("acregmin"); + } else + mnt->acregmin = option; break; case Opt_acregmax: - if (match_int(args, &mnt->acregmax)) - return 0; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("acregmax"); + } else + mnt->acregmax = option; break; case Opt_acdirmin: - if (match_int(args, &mnt->acdirmin)) - return 0; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("acdirmin"); + } else + mnt->acdirmin = option; break; case Opt_acdirmax: - if (match_int(args, &mnt->acdirmax)) - return 0; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("acdirmax"); + } else + mnt->acdirmax = option; break; case Opt_actimeo: - if (match_int(args, &option)) - return 0; - if (option < 0) - return 0; - mnt->acregmin = - mnt->acregmax = - mnt->acdirmin = - mnt->acdirmax = option; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("actimeo"); + } else + mnt->acregmin = mnt->acregmax = + mnt->acdirmin = mnt->acdirmax = option; break; case Opt_namelen: - if (match_int(args, &mnt->namlen)) - return 0; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("namlen"); + } else + mnt->namlen = option; break; case Opt_mountport: - if (match_int(args, &option)) - return 0; - if (option < 0 || option > 65535) - return 0; - mnt->mount_server.port = option; + if (match_int(args, &option) || + option < 0 || option > USHORT_MAX) { + errors++; + nfs_parse_invalid_value("mountport"); + } else + mnt->mount_server.port = option; break; case Opt_mountvers: - if (match_int(args, &option)) - return 0; - if (option < 0) - return 0; - mnt->mount_server.version = option; + if (match_int(args, &option) || + option < NFS_MNT_VERSION || + option > NFS_MNT3_VERSION) { + errors++; + nfs_parse_invalid_value("mountvers"); + } else + mnt->mount_server.version = option; break; case Opt_nfsvers: - if (match_int(args, &option)) - return 0; + if (match_int(args, &option)) { + errors++; + nfs_parse_invalid_value("nfsvers"); + break; + } switch (option) { - case 2: + case NFS2_VERSION: mnt->flags &= ~NFS_MOUNT_VER3; break; - case 3: + case NFS3_VERSION: mnt->flags |= NFS_MOUNT_VER3; break; default: - goto out_unrec_vers; + errors++; + nfs_parse_invalid_value("nfsvers"); } break; + /* + * options that take text values + */ case Opt_sec: string = match_strdup(args); if (string == NULL) goto out_nomem; - token = match_token(string, nfs_secflavor_tokens, args); + rc = nfs_parse_security_flavors(string, mnt); kfree(string); - - /* - * The flags setting is for v2/v3. The flavor_len - * setting is for v4. v2/v3 also need to know the - * difference between NULL and UNIX. - */ - switch (token) { - case Opt_sec_none: - mnt->flags &= ~NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 0; - mnt->auth_flavors[0] = RPC_AUTH_NULL; - break; - case Opt_sec_sys: - mnt->flags &= ~NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 0; - mnt->auth_flavors[0] = RPC_AUTH_UNIX; - break; - case Opt_sec_krb5: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; - break; - case Opt_sec_krb5i: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; - break; - case Opt_sec_krb5p: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; - break; - case Opt_sec_lkey: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; - break; - case Opt_sec_lkeyi: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; - break; - case Opt_sec_lkeyp: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; - break; - case Opt_sec_spkm: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; - break; - case Opt_sec_spkmi: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; - break; - case Opt_sec_spkmp: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; - break; - default: - goto out_unrec_sec; + if (!rc) { + errors++; + dfprintk(MOUNT, "NFS: unrecognized " + "security flavor\n"); } break; case Opt_proto: @@ -1026,24 +1186,20 @@ static int nfs_parse_mount_options(char *raw, case Opt_xprt_udp: mnt->flags &= ~NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; - mnt->timeo = 7; - mnt->retrans = 5; break; case Opt_xprt_tcp: mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; - mnt->timeo = 600; - mnt->retrans = 2; break; case Opt_xprt_rdma: /* vector side protocols to TCP */ mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; - mnt->timeo = 600; - mnt->retrans = 2; break; default: - goto out_unrec_xprt; + errors++; + dfprintk(MOUNT, "NFS: unrecognized " + "transport protocol\n"); } break; case Opt_mountproto: @@ -1063,16 +1219,19 @@ static int nfs_parse_mount_options(char *raw, break; case Opt_xprt_rdma: /* not used for side protocols */ default: - goto out_unrec_xprt; + errors++; + dfprintk(MOUNT, "NFS: unrecognized " + "transport protocol\n"); } break; case Opt_addr: string = match_strdup(args); if (string == NULL) goto out_nomem; - nfs_parse_server_address(string, (struct sockaddr *) - &mnt->nfs_server.address, - &mnt->nfs_server.addrlen); + nfs_parse_ip_address(string, strlen(string), + (struct sockaddr *) + &mnt->nfs_server.address, + &mnt->nfs_server.addrlen); kfree(string); break; case Opt_clientaddr: @@ -1093,24 +1252,39 @@ static int nfs_parse_mount_options(char *raw, string = match_strdup(args); if (string == NULL) goto out_nomem; - nfs_parse_server_address(string, (struct sockaddr *) - &mnt->mount_server.address, - &mnt->mount_server.addrlen); + nfs_parse_ip_address(string, strlen(string), + (struct sockaddr *) + &mnt->mount_server.address, + &mnt->mount_server.addrlen); kfree(string); break; + /* + * Special options + */ + case Opt_sloppy: + sloppy = 1; + dfprintk(MOUNT, "NFS: relaxing parsing rules\n"); + break; case Opt_userspace: case Opt_deprecated: + dfprintk(MOUNT, "NFS: ignoring mount option " + "'%s'\n", p); break; default: - goto out_unknown; + errors++; + dfprintk(MOUNT, "NFS: unrecognized mount option " + "'%s'\n", p); } } - nfs_set_port((struct sockaddr *)&mnt->nfs_server.address, - mnt->nfs_server.port); - + if (errors > 0) { + dfprintk(MOUNT, "NFS: parsing encountered %d error%s\n", + errors, (errors == 1 ? "" : "s")); + if (!sloppy) + return 0; + } return 1; out_nomem: @@ -1120,21 +1294,6 @@ out_security_failure: free_secdata(secdata); printk(KERN_INFO "NFS: security options invalid: %d\n", rc); return 0; -out_unrec_vers: - printk(KERN_INFO "NFS: unrecognized NFS version number\n"); - return 0; - -out_unrec_xprt: - printk(KERN_INFO "NFS: unrecognized transport protocol\n"); - return 0; - -out_unrec_sec: - printk(KERN_INFO "NFS: unrecognized security flavor\n"); - return 0; - -out_unknown: - printk(KERN_INFO "NFS: unknown mount option: %s\n", p); - return 0; } /* @@ -1188,11 +1347,146 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, if (status == 0) return 0; - dfprintk(MOUNT, "NFS: unable to mount server %s, error %d", + dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", hostname, status); return status; } +static int nfs_parse_simple_hostname(const char *dev_name, + char **hostname, size_t maxnamlen, + char **export_path, size_t maxpathlen) +{ + size_t len; + char *colon, *comma; + + colon = strchr(dev_name, ':'); + if (colon == NULL) + goto out_bad_devname; + + len = colon - dev_name; + if (len > maxnamlen) + goto out_hostname; + + /* N.B. caller will free nfs_server.hostname in all cases */ + *hostname = kstrndup(dev_name, len, GFP_KERNEL); + if (!*hostname) + goto out_nomem; + + /* kill possible hostname list: not supported */ + comma = strchr(*hostname, ','); + if (comma != NULL) { + if (comma == *hostname) + goto out_bad_devname; + *comma = '\0'; + } + + colon++; + len = strlen(colon); + if (len > maxpathlen) + goto out_path; + *export_path = kstrndup(colon, len, GFP_KERNEL); + if (!*export_path) + goto out_nomem; + + dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path); + return 0; + +out_bad_devname: + dfprintk(MOUNT, "NFS: device name not in host:path format\n"); + return -EINVAL; + +out_nomem: + dfprintk(MOUNT, "NFS: not enough memory to parse device name\n"); + return -ENOMEM; + +out_hostname: + dfprintk(MOUNT, "NFS: server hostname too long\n"); + return -ENAMETOOLONG; + +out_path: + dfprintk(MOUNT, "NFS: export pathname too long\n"); + return -ENAMETOOLONG; +} + +/* + * Hostname has square brackets around it because it contains one or + * more colons. We look for the first closing square bracket, and a + * colon must follow it. + */ +static int nfs_parse_protected_hostname(const char *dev_name, + char **hostname, size_t maxnamlen, + char **export_path, size_t maxpathlen) +{ + size_t len; + char *start, *end; + + start = (char *)(dev_name + 1); + + end = strchr(start, ']'); + if (end == NULL) + goto out_bad_devname; + if (*(end + 1) != ':') + goto out_bad_devname; + + len = end - start; + if (len > maxnamlen) + goto out_hostname; + + /* N.B. caller will free nfs_server.hostname in all cases */ + *hostname = kstrndup(start, len, GFP_KERNEL); + if (*hostname == NULL) + goto out_nomem; + + end += 2; + len = strlen(end); + if (len > maxpathlen) + goto out_path; + *export_path = kstrndup(end, len, GFP_KERNEL); + if (!*export_path) + goto out_nomem; + + return 0; + +out_bad_devname: + dfprintk(MOUNT, "NFS: device name not in host:path format\n"); + return -EINVAL; + +out_nomem: + dfprintk(MOUNT, "NFS: not enough memory to parse device name\n"); + return -ENOMEM; + +out_hostname: + dfprintk(MOUNT, "NFS: server hostname too long\n"); + return -ENAMETOOLONG; + +out_path: + dfprintk(MOUNT, "NFS: export pathname too long\n"); + return -ENAMETOOLONG; +} + +/* + * Split "dev_name" into "hostname:export_path". + * + * The leftmost colon demarks the split between the server's hostname + * and the export path. If the hostname starts with a left square + * bracket, then it may contain colons. + * + * Note: caller frees hostname and export path, even on error. + */ +static int nfs_parse_devname(const char *dev_name, + char **hostname, size_t maxnamlen, + char **export_path, size_t maxpathlen) +{ + if (*dev_name == '[') + return nfs_parse_protected_hostname(dev_name, + hostname, maxnamlen, + export_path, maxpathlen); + + return nfs_parse_simple_hostname(dev_name, + hostname, maxnamlen, + export_path, maxpathlen); +} + /* * Validate the NFS2/NFS3 mount data * - fills in the mount root filehandle @@ -1222,16 +1516,14 @@ static int nfs_validate_mount_data(void *options, args->flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP); args->rsize = NFS_MAX_FILE_IO_SIZE; args->wsize = NFS_MAX_FILE_IO_SIZE; - args->timeo = 600; - args->retrans = 2; - args->acregmin = 3; - args->acregmax = 60; - args->acdirmin = 30; - args->acdirmax = 60; + args->acregmin = NFS_DEF_ACREGMIN; + args->acregmax = NFS_DEF_ACREGMAX; + args->acdirmin = NFS_DEF_ACDIRMIN; + args->acdirmax = NFS_DEF_ACDIRMAX; args->mount_server.port = 0; /* autobind unless user sets port */ - args->mount_server.protocol = XPRT_TRANSPORT_UDP; args->nfs_server.port = 0; /* autobind unless user sets port */ args->nfs_server.protocol = XPRT_TRANSPORT_TCP; + args->auth_flavors[0] = RPC_AUTH_UNIX; switch (data->version) { case 1: @@ -1289,7 +1581,9 @@ static int nfs_validate_mount_data(void *options, args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL); args->namlen = data->namlen; args->bsize = data->bsize; - args->auth_flavors[0] = data->pseudoflavor; + + if (data->flags & NFS_MOUNT_SECFLAVOUR) + args->auth_flavors[0] = data->pseudoflavor; if (!args->nfs_server.hostname) goto out_nomem; @@ -1321,8 +1615,6 @@ static int nfs_validate_mount_data(void *options, break; default: { - unsigned int len; - char *c; int status; if (nfs_parse_mount_options((char *)options, args) == 0) @@ -1332,21 +1624,22 @@ static int nfs_validate_mount_data(void *options, &args->nfs_server.address)) goto out_no_address; - c = strchr(dev_name, ':'); - if (c == NULL) - return -EINVAL; - len = c - dev_name; - /* N.B. caller will free nfs_server.hostname in all cases */ - args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL); - if (!args->nfs_server.hostname) - goto out_nomem; + nfs_set_port((struct sockaddr *)&args->nfs_server.address, + args->nfs_server.port); - c++; - if (strlen(c) > NFS_MAXPATHLEN) - return -ENAMETOOLONG; - args->nfs_server.export_path = c; + nfs_set_mount_transport_protocol(args); + + status = nfs_parse_devname(dev_name, + &args->nfs_server.hostname, + PAGE_SIZE, + &args->nfs_server.export_path, + NFS_MAXPATHLEN); + if (!status) + status = nfs_try_mount(args, mntfh); + + kfree(args->nfs_server.export_path); + args->nfs_server.export_path = NULL; - status = nfs_try_mount(args, mntfh); if (status) return status; @@ -1354,9 +1647,6 @@ static int nfs_validate_mount_data(void *options, } } - if (!(args->flags & NFS_MOUNT_SECFLAVOUR)) - args->auth_flavors[0] = RPC_AUTH_UNIX; - #ifndef CONFIG_NFS_V3 if (args->flags & NFS_MOUNT_VER3) goto out_v3_not_compiled; @@ -1396,6 +1686,80 @@ out_invalid_fh: return -EINVAL; } +static int +nfs_compare_remount_data(struct nfs_server *nfss, + struct nfs_parsed_mount_data *data) +{ + if (data->flags != nfss->flags || + data->rsize != nfss->rsize || + data->wsize != nfss->wsize || + data->retrans != nfss->client->cl_timeout->to_retries || + data->auth_flavors[0] != nfss->client->cl_auth->au_flavor || + data->acregmin != nfss->acregmin / HZ || + data->acregmax != nfss->acregmax / HZ || + data->acdirmin != nfss->acdirmin / HZ || + data->acdirmax != nfss->acdirmax / HZ || + data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) || + data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen || + memcmp(&data->nfs_server.address, &nfss->nfs_client->cl_addr, + data->nfs_server.addrlen) != 0) + return -EINVAL; + + return 0; +} + +static int +nfs_remount(struct super_block *sb, int *flags, char *raw_data) +{ + int error; + struct nfs_server *nfss = sb->s_fs_info; + struct nfs_parsed_mount_data *data; + struct nfs_mount_data *options = (struct nfs_mount_data *)raw_data; + struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data; + u32 nfsvers = nfss->nfs_client->rpc_ops->version; + + /* + * Userspace mount programs that send binary options generally send + * them populated with default values. We have no way to know which + * ones were explicitly specified. Fall back to legacy behavior and + * just return success. + */ + if ((nfsvers == 4 && (!options4 || options4->version == 1)) || + (nfsvers <= 3 && (!options || (options->version >= 1 && + options->version <= 6)))) + return 0; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + return -ENOMEM; + + /* fill out struct with values from existing mount */ + data->flags = nfss->flags; + data->rsize = nfss->rsize; + data->wsize = nfss->wsize; + data->retrans = nfss->client->cl_timeout->to_retries; + data->auth_flavors[0] = nfss->client->cl_auth->au_flavor; + data->acregmin = nfss->acregmin / HZ; + data->acregmax = nfss->acregmax / HZ; + data->acdirmin = nfss->acdirmin / HZ; + data->acdirmax = nfss->acdirmax / HZ; + data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ; + data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; + memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr, + data->nfs_server.addrlen); + + /* overwrite those values with any that were specified */ + error = nfs_parse_mount_options((char *)options, data); + if (error < 0) + goto out; + + /* compare new mount options with old ones */ + error = nfs_compare_remount_data(nfss, data); +out: + kfree(data); + return error; +} + /* * Initialise the common bits of the superblock */ @@ -1811,14 +2175,13 @@ static int nfs4_validate_mount_data(void *options, args->rsize = NFS_MAX_FILE_IO_SIZE; args->wsize = NFS_MAX_FILE_IO_SIZE; - args->timeo = 600; - args->retrans = 2; - args->acregmin = 3; - args->acregmax = 60; - args->acdirmin = 30; - args->acdirmax = 60; + args->acregmin = NFS_DEF_ACREGMIN; + args->acregmax = NFS_DEF_ACREGMAX; + args->acdirmin = NFS_DEF_ACDIRMIN; + args->acdirmax = NFS_DEF_ACDIRMAX; args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */ - args->nfs_server.protocol = XPRT_TRANSPORT_TCP; + args->auth_flavors[0] = RPC_AUTH_UNIX; + args->auth_flavor_len = 0; switch (data->version) { case 1: @@ -1834,18 +2197,13 @@ static int nfs4_validate_mount_data(void *options, &args->nfs_server.address)) goto out_no_address; - switch (data->auth_flavourlen) { - case 0: - args->auth_flavors[0] = RPC_AUTH_UNIX; - break; - case 1: + if (data->auth_flavourlen) { + if (data->auth_flavourlen > 1) + goto out_inval_auth; if (copy_from_user(&args->auth_flavors[0], data->auth_flavours, sizeof(args->auth_flavors[0]))) return -EFAULT; - break; - default: - goto out_inval_auth; } c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); @@ -1879,10 +2237,11 @@ static int nfs4_validate_mount_data(void *options, args->acdirmin = data->acdirmin; args->acdirmax = data->acdirmax; args->nfs_server.protocol = data->proto; + nfs_validate_transport_protocol(args); break; default: { - unsigned int len; + int status; if (nfs_parse_mount_options((char *)options, args) == 0) return -EINVAL; @@ -1891,44 +2250,25 @@ static int nfs4_validate_mount_data(void *options, &args->nfs_server.address)) return -EINVAL; - switch (args->auth_flavor_len) { - case 0: - args->auth_flavors[0] = RPC_AUTH_UNIX; - break; - case 1: - break; - default: - goto out_inval_auth; - } + nfs_set_port((struct sockaddr *)&args->nfs_server.address, + args->nfs_server.port); - /* - * Split "dev_name" into "hostname:mntpath". - */ - c = strchr(dev_name, ':'); - if (c == NULL) - return -EINVAL; - /* while calculating len, pretend ':' is '\0' */ - len = c - dev_name; - if (len > NFS4_MAXNAMLEN) - return -ENAMETOOLONG; - /* N.B. caller will free nfs_server.hostname in all cases */ - args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL); - if (!args->nfs_server.hostname) - goto out_nomem; - - c++; /* step over the ':' */ - len = strlen(c); - if (len > NFS4_MAXPATHLEN) - return -ENAMETOOLONG; - args->nfs_server.export_path = kstrndup(c, len, GFP_KERNEL); - if (!args->nfs_server.export_path) - goto out_nomem; + nfs_validate_transport_protocol(args); - dprintk("NFS: MNTPATH: '%s'\n", args->nfs_server.export_path); + if (args->auth_flavor_len > 1) + goto out_inval_auth; if (args->client_address == NULL) goto out_no_client_address; + status = nfs_parse_devname(dev_name, + &args->nfs_server.hostname, + NFS4_MAXNAMLEN, + &args->nfs_server.export_path, + NFS4_MAXPATHLEN); + if (status < 0) + return status; + break; } } @@ -1944,10 +2284,6 @@ out_inval_auth: data->auth_flavourlen); return -EINVAL; -out_nomem: - dfprintk(MOUNT, "NFS4: not enough memory to handle mount options\n"); - return -ENOMEM; - out_no_address: dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); return -EINVAL; diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 3adf8b266461..f089e5839d7d 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -95,10 +95,11 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) static void nfs_async_unlink_release(void *calldata) { struct nfs_unlinkdata *data = calldata; + struct super_block *sb = data->dir->i_sb; nfs_dec_sillycount(data->dir); - nfs_sb_deactive(NFS_SERVER(data->dir)); nfs_free_unlinkdata(data); + nfs_sb_deactive(NFS_SB(sb)); } static const struct rpc_call_ops nfs_unlink_ops = { diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f333848fd3be..3229e217c773 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -34,9 +34,6 @@ /* * Local function declarations */ -static struct nfs_page * nfs_update_request(struct nfs_open_context*, - struct page *, - unsigned int, unsigned int); static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc, struct inode *inode, int ioflags); static void nfs_redirty_request(struct nfs_page *req); @@ -136,16 +133,21 @@ static struct nfs_page *nfs_page_find_request(struct page *page) static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) { struct inode *inode = page->mapping->host; - loff_t end, i_size = i_size_read(inode); - pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; + loff_t end, i_size; + pgoff_t end_index; + spin_lock(&inode->i_lock); + i_size = i_size_read(inode); + end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; if (i_size > 0 && page->index < end_index) - return; + goto out; end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count); if (i_size >= end) - return; - nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); + goto out; i_size_write(inode, end); + nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); +out: + spin_unlock(&inode->i_lock); } /* A writeback failed: mark the page as bad, and invalidate the page cache */ @@ -169,29 +171,6 @@ static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int SetPageUptodate(page); } -static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, - unsigned int offset, unsigned int count) -{ - struct nfs_page *req; - int ret; - - for (;;) { - req = nfs_update_request(ctx, page, offset, count); - if (!IS_ERR(req)) - break; - ret = PTR_ERR(req); - if (ret != -EBUSY) - return ret; - ret = nfs_wb_page(page->mapping->host, page); - if (ret != 0) - return ret; - } - /* Update file length */ - nfs_grow_file(page, offset, count); - nfs_clear_page_tag_locked(req); - return 0; -} - static int wb_priority(struct writeback_control *wbc) { if (wbc->for_reclaim) @@ -268,12 +247,9 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, return ret; spin_lock(&inode->i_lock); } - if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) { - /* This request is marked for commit */ + if (test_bit(PG_CLEAN, &req->wb_flags)) { spin_unlock(&inode->i_lock); - nfs_clear_page_tag_locked(req); - nfs_pageio_complete(pgio); - return 0; + BUG(); } if (nfs_set_page_writeback(page) != 0) { spin_unlock(&inode->i_lock); @@ -355,11 +331,19 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) /* * Insert a write request into an inode */ -static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) +static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) { struct nfs_inode *nfsi = NFS_I(inode); int error; + error = radix_tree_preload(GFP_NOFS); + if (error != 0) + goto out; + + /* Lock the request! */ + nfs_lock_request_dontget(req); + + spin_lock(&inode->i_lock); error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); BUG_ON(error); if (!nfsi->npages) { @@ -373,6 +357,10 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) kref_get(&req->wb_kref); radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); + spin_unlock(&inode->i_lock); + radix_tree_preload_end(); +out: + return error; } /* @@ -405,19 +393,6 @@ nfs_mark_request_dirty(struct nfs_page *req) __set_page_dirty_nobuffers(req->wb_page); } -/* - * Check if a request is dirty - */ -static inline int -nfs_dirty_request(struct nfs_page *req) -{ - struct page *page = req->wb_page; - - if (page == NULL || test_bit(PG_NEED_COMMIT, &req->wb_flags)) - return 0; - return !PageWriteback(page); -} - #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) /* * Add a request to the inode's commit list. @@ -430,7 +405,7 @@ nfs_mark_request_commit(struct nfs_page *req) spin_lock(&inode->i_lock); nfsi->ncommit++; - set_bit(PG_NEED_COMMIT, &(req)->wb_flags); + set_bit(PG_CLEAN, &(req)->wb_flags); radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_COMMIT); @@ -440,6 +415,19 @@ nfs_mark_request_commit(struct nfs_page *req) __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } +static int +nfs_clear_request_commit(struct nfs_page *req) +{ + struct page *page = req->wb_page; + + if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) { + dec_zone_page_state(page, NR_UNSTABLE_NFS); + dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE); + return 1; + } + return 0; +} + static inline int nfs_write_need_commit(struct nfs_write_data *data) { @@ -449,7 +437,7 @@ int nfs_write_need_commit(struct nfs_write_data *data) static inline int nfs_reschedule_unstable_write(struct nfs_page *req) { - if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) { + if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) { nfs_mark_request_commit(req); return 1; } @@ -465,6 +453,12 @@ nfs_mark_request_commit(struct nfs_page *req) { } +static inline int +nfs_clear_request_commit(struct nfs_page *req) +{ + return 0; +} + static inline int nfs_write_need_commit(struct nfs_write_data *data) { @@ -522,11 +516,8 @@ static void nfs_cancel_commit_list(struct list_head *head) while(!list_empty(head)) { req = nfs_list_entry(head->next); - dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - dec_bdi_stat(req->wb_page->mapping->backing_dev_info, - BDI_RECLAIMABLE); nfs_list_remove_request(req); - clear_bit(PG_NEED_COMMIT, &(req)->wb_flags); + nfs_clear_request_commit(req); nfs_inode_remove_request(req); nfs_unlock_request(req); } @@ -564,110 +555,124 @@ static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pg #endif /* - * Try to update any existing write request, or create one if there is none. - * In order to match, the request's credentials must match those of - * the calling process. + * Search for an existing write request, and attempt to update + * it to reflect a new dirty region on a given page. * - * Note: Should always be called with the Page Lock held! + * If the attempt fails, then the existing request is flushed out + * to disk. */ -static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, - struct page *page, unsigned int offset, unsigned int bytes) +static struct nfs_page *nfs_try_to_update_request(struct inode *inode, + struct page *page, + unsigned int offset, + unsigned int bytes) { - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; - struct nfs_page *req, *new = NULL; - pgoff_t rqend, end; + struct nfs_page *req; + unsigned int rqend; + unsigned int end; + int error; + + if (!PagePrivate(page)) + return NULL; end = offset + bytes; + spin_lock(&inode->i_lock); for (;;) { - /* Loop over all inode entries and see if we find - * A request for the page we wish to update + req = nfs_page_find_request_locked(page); + if (req == NULL) + goto out_unlock; + + rqend = req->wb_offset + req->wb_bytes; + /* + * Tell the caller to flush out the request if + * the offsets are non-contiguous. + * Note: nfs_flush_incompatible() will already + * have flushed out requests having wrong owners. */ - if (new) { - if (radix_tree_preload(GFP_NOFS)) { - nfs_release_request(new); - return ERR_PTR(-ENOMEM); - } - } + if (offset > rqend + || end < req->wb_offset) + goto out_flushme; - spin_lock(&inode->i_lock); - req = nfs_page_find_request_locked(page); - if (req) { - if (!nfs_set_page_tag_locked(req)) { - int error; - - spin_unlock(&inode->i_lock); - error = nfs_wait_on_request(req); - nfs_release_request(req); - if (error < 0) { - if (new) { - radix_tree_preload_end(); - nfs_release_request(new); - } - return ERR_PTR(error); - } - continue; - } - spin_unlock(&inode->i_lock); - if (new) { - radix_tree_preload_end(); - nfs_release_request(new); - } + if (nfs_set_page_tag_locked(req)) break; - } - if (new) { - nfs_lock_request_dontget(new); - nfs_inode_add_request(inode, new); - spin_unlock(&inode->i_lock); - radix_tree_preload_end(); - req = new; - goto zero_page; - } + /* The request is locked, so wait and then retry */ spin_unlock(&inode->i_lock); - - new = nfs_create_request(ctx, inode, page, offset, bytes); - if (IS_ERR(new)) - return new; + error = nfs_wait_on_request(req); + nfs_release_request(req); + if (error != 0) + goto out_err; + spin_lock(&inode->i_lock); } - /* We have a request for our page. - * If the creds don't match, or the - * page addresses don't match, - * tell the caller to wait on the conflicting - * request. - */ - rqend = req->wb_offset + req->wb_bytes; - if (req->wb_context != ctx - || req->wb_page != page - || !nfs_dirty_request(req) - || offset > rqend || end < req->wb_offset) { - nfs_clear_page_tag_locked(req); - return ERR_PTR(-EBUSY); - } + if (nfs_clear_request_commit(req)) + radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, + req->wb_index, NFS_PAGE_TAG_COMMIT); /* Okay, the request matches. Update the region */ if (offset < req->wb_offset) { req->wb_offset = offset; req->wb_pgbase = offset; - req->wb_bytes = max(end, rqend) - req->wb_offset; - goto zero_page; } - if (end > rqend) req->wb_bytes = end - req->wb_offset; - + else + req->wb_bytes = rqend - req->wb_offset; +out_unlock: + spin_unlock(&inode->i_lock); return req; -zero_page: - /* If this page might potentially be marked as up to date, - * then we need to zero any uninitalised data. */ - if (req->wb_pgbase == 0 && req->wb_bytes != PAGE_CACHE_SIZE - && !PageUptodate(req->wb_page)) - zero_user_segment(req->wb_page, req->wb_bytes, PAGE_CACHE_SIZE); +out_flushme: + spin_unlock(&inode->i_lock); + nfs_release_request(req); + error = nfs_wb_page(inode, page); +out_err: + return ERR_PTR(error); +} + +/* + * Try to update an existing write request, or create one if there is none. + * + * Note: Should always be called with the Page Lock held to prevent races + * if we have to add a new request. Also assumes that the caller has + * already called nfs_flush_incompatible() if necessary. + */ +static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, + struct page *page, unsigned int offset, unsigned int bytes) +{ + struct inode *inode = page->mapping->host; + struct nfs_page *req; + int error; + + req = nfs_try_to_update_request(inode, page, offset, bytes); + if (req != NULL) + goto out; + req = nfs_create_request(ctx, inode, page, offset, bytes); + if (IS_ERR(req)) + goto out; + error = nfs_inode_add_request(inode, req); + if (error != 0) { + nfs_release_request(req); + req = ERR_PTR(error); + } +out: return req; } +static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, + unsigned int offset, unsigned int count) +{ + struct nfs_page *req; + + req = nfs_setup_write_request(ctx, page, offset, count); + if (IS_ERR(req)) + return PTR_ERR(req); + /* Update file length */ + nfs_grow_file(page, offset, count); + nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); + nfs_clear_page_tag_locked(req); + return 0; +} + int nfs_flush_incompatible(struct file *file, struct page *page) { struct nfs_open_context *ctx = nfs_file_open_context(file); @@ -685,8 +690,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) req = nfs_page_find_request(page); if (req == NULL) return 0; - do_flush = req->wb_page != page || req->wb_context != ctx - || !nfs_dirty_request(req); + do_flush = req->wb_page != page || req->wb_context != ctx; nfs_release_request(req); if (!do_flush) return 0; @@ -721,10 +725,10 @@ int nfs_updatepage(struct file *file, struct page *page, nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); - dprintk("NFS: nfs_updatepage(%s/%s %d@%Ld)\n", + dprintk("NFS: nfs_updatepage(%s/%s %d@%lld)\n", file->f_path.dentry->d_parent->d_name.name, file->f_path.dentry->d_name.name, count, - (long long)(page_offset(page) +offset)); + (long long)(page_offset(page) + offset)); /* If we're not using byte range locks, and we know the page * is up to date, it may be more efficient to extend the write @@ -744,7 +748,7 @@ int nfs_updatepage(struct file *file, struct page *page, else __set_page_dirty_nobuffers(page); - dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n", + dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", status, (long long)i_size_read(inode)); return status; } @@ -752,12 +756,7 @@ int nfs_updatepage(struct file *file, struct page *page, static void nfs_writepage_release(struct nfs_page *req) { - if (PageError(req->wb_page)) { - nfs_end_page_writeback(req->wb_page); - nfs_inode_remove_request(req); - } else if (!nfs_reschedule_unstable_write(req)) { - /* Set the PG_uptodate flag */ - nfs_mark_uptodate(req->wb_page, req->wb_pgbase, req->wb_bytes); + if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) { nfs_end_page_writeback(req->wb_page); nfs_inode_remove_request(req); } else @@ -834,7 +833,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req, NFS_PROTO(inode)->write_setup(data, &msg); dprintk("NFS: %5u initiated write call " - "(req %s/%Ld, %u bytes @ offset %Lu)\n", + "(req %s/%lld, %u bytes @ offset %llu)\n", data->task.tk_pid, inode->i_sb->s_id, (long long)NFS_FILEID(inode), @@ -978,13 +977,13 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; - struct nfs_page *req = data->req; - dprintk("NFS: write (%s/%Ld %d@%Ld)", - req->wb_context->path.dentry->d_inode->i_sb->s_id, - (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), - req->wb_bytes, - (long long)req_offset(req)); + dprintk("NFS: %5u write(%s/%lld %d@%lld)", + task->tk_pid, + data->req->wb_context->path.dentry->d_inode->i_sb->s_id, + (long long) + NFS_FILEID(data->req->wb_context->path.dentry->d_inode), + data->req->wb_bytes, (long long)req_offset(data->req)); nfs_writeback_done(task, data); } @@ -1058,7 +1057,8 @@ static void nfs_writeback_release_full(void *calldata) nfs_list_remove_request(req); - dprintk("NFS: write (%s/%Ld %d@%Ld)", + dprintk("NFS: %5u write (%s/%lld %d@%lld)", + data->task.tk_pid, req->wb_context->path.dentry->d_inode->i_sb->s_id, (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), req->wb_bytes, @@ -1078,8 +1078,6 @@ static void nfs_writeback_release_full(void *calldata) dprintk(" marked for commit\n"); goto next; } - /* Set the PG_uptodate flag? */ - nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); dprintk(" OK\n"); remove_request: nfs_end_page_writeback(page); @@ -1133,7 +1131,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) static unsigned long complain; if (time_before(complain, jiffies)) { - dprintk("NFS: faulty NFS server %s:" + dprintk("NFS: faulty NFS server %s:" " (committed = %d) != (stable = %d)\n", NFS_SERVER(data->inode)->nfs_client->cl_hostname, resp->verf->committed, argp->stable); @@ -1297,12 +1295,9 @@ static void nfs_commit_release(void *calldata) while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - clear_bit(PG_NEED_COMMIT, &(req)->wb_flags); - dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - dec_bdi_stat(req->wb_page->mapping->backing_dev_info, - BDI_RECLAIMABLE); + nfs_clear_request_commit(req); - dprintk("NFS: commit (%s/%Ld %d@%Ld)", + dprintk("NFS: commit (%s/%lld %d@%lld)", req->wb_context->path.dentry->d_inode->i_sb->s_id, (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), req->wb_bytes, @@ -1318,9 +1313,6 @@ static void nfs_commit_release(void *calldata) * returned by the server against all stored verfs. */ if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) { /* We have a match */ - /* Set the PG_uptodate flag */ - nfs_mark_uptodate(req->wb_page, req->wb_pgbase, - req->wb_bytes); nfs_inode_remove_request(req); dprintk(" OK\n"); goto next; @@ -1479,7 +1471,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) req = nfs_page_find_request(page); if (req == NULL) goto out; - if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) { + if (test_bit(PG_CLEAN, &req->wb_flags)) { nfs_release_request(req); break; } diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 33bfcf09db46..9dc036f18356 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1023,7 +1023,7 @@ exp_export(struct nfsctl_export *nxp) /* Look up the dentry */ err = path_lookup(nxp->ex_path, 0, &nd); if (err) - goto out_unlock; + goto out_put_clp; err = -EINVAL; exp = exp_get_by_name(clp, nd.path.mnt, nd.path.dentry, NULL); @@ -1090,9 +1090,9 @@ finish: exp_put(exp); if (fsid_key && !IS_ERR(fsid_key)) cache_put(&fsid_key->h, &svc_expkey_cache); - if (clp) - auth_domain_put(clp); path_put(&nd.path); +out_put_clp: + auth_domain_put(clp); out_unlock: exp_writeunlock(); out: diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 9e4a568a5013..b2786a5f9afe 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -19,6 +19,13 @@ #define NFSDDBG_FACILITY NFSDDBG_LOCKD +#ifdef CONFIG_LOCKD_V4 +#define nlm_stale_fh nlm4_stale_fh +#define nlm_failed nlm4_failed +#else +#define nlm_stale_fh nlm_lck_denied_nolocks +#define nlm_failed nlm_lck_denied_nolocks +#endif /* * Note: we hold the dentry use count while the file is open. */ @@ -35,7 +42,7 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp) fh.fh_export = NULL; exp_readlock(); - nfserr = nfsd_open(rqstp, &fh, S_IFREG, MAY_LOCK, filp); + nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp); fh_put(&fh); rqstp->rq_client = NULL; exp_readunlock(); @@ -47,12 +54,10 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp) return 0; case nfserr_dropit: return nlm_drop_reply; -#ifdef CONFIG_LOCKD_V4 case nfserr_stale: - return nlm4_stale_fh; -#endif + return nlm_stale_fh; default: - return nlm_lck_denied; + return nlm_failed; } } @@ -65,7 +70,6 @@ nlm_fclose(struct file *filp) static struct nlmsvc_binding nfsd_nlm_ops = { .fopen = nlm_fopen, /* open file for locking */ .fclose = nlm_fclose, /* close file */ - .get_grace_period = get_nfs4_grace_period, }; void diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 1c3b7654e966..4e3219e84116 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -40,7 +40,8 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, dprintk("nfsd: GETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); fh = fh_copy(&resp->fh, &argp->fh); - if ((nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP))) + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + if (nfserr) RETURN_STATUS(nfserr); if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) @@ -107,7 +108,7 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp, dprintk("nfsd: SETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); fh = fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_SATTR); + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); if (!nfserr) { nfserr = nfserrno( nfsd_set_posix_acl( @@ -134,7 +135,7 @@ static __be32 nfsacld_proc_getattr(struct svc_rqst * rqstp, dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - return fh_verify(rqstp, &resp->fh, 0, MAY_NOP); + return fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); } /* diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index b647f2f872dc..9981dbb377a3 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -36,7 +36,8 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, __be32 nfserr = 0; fh = fh_copy(&resp->fh, &argp->fh); - if ((nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP))) + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + if (nfserr) RETURN_STATUS(nfserr); if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) @@ -101,7 +102,7 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp, __be32 nfserr = 0; fh = fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_SATTR); + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); if (!nfserr) { nfserr = nfserrno( nfsd_set_posix_acl( diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index c721a1e6e9dd..9dbd2eb91281 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -63,7 +63,8 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP); + nfserr = fh_verify(rqstp, &resp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); if (nfserr) RETURN_STATUS(nfserr); @@ -242,7 +243,7 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp, attr = &argp->attrs; /* Get the directory inode */ - nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, MAY_CREATE); + nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_CREATE); if (nfserr) RETURN_STATUS(nfserr); @@ -530,7 +531,7 @@ nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, dprintk("nfsd: FSSTAT(3) %s\n", SVCFH_fmt(&argp->fh)); - nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats); + nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0); fh_put(&argp->fh); RETURN_STATUS(nfserr); } @@ -558,7 +559,8 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, resp->f_maxfilesize = ~(u32) 0; resp->f_properties = NFS3_FSF_DEFAULT; - nfserr = fh_verify(rqstp, &argp->fh, 0, MAY_NOP); + nfserr = fh_verify(rqstp, &argp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); /* Check special features of the file system. May request * different read/write sizes for file systems known to have @@ -597,7 +599,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, resp->p_case_insensitive = 0; resp->p_case_preserving = 1; - nfserr = fh_verify(rqstp, &argp->fh, 0, MAY_NOP); + nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP); if (nfserr == 0) { struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb; diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index b6ed38380ab8..54b8b4140c8f 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -443,7 +443,7 @@ init_state(struct posix_acl_state *state, int cnt) * enough space for either: */ alloc = sizeof(struct posix_ace_state_array) - + cnt*sizeof(struct posix_ace_state); + + cnt*sizeof(struct posix_user_ace_state); state->users = kzalloc(alloc, GFP_KERNEL); if (!state->users) return -ENOMEM; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 4d4760e687c3..094747a1227c 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -225,7 +225,8 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len); WRITE32(OP_CB_RECALL); - WRITEMEM(&cb_rec->cbr_stateid, sizeof(stateid_t)); + WRITE32(cb_rec->cbr_stateid.si_generation); + WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t)); WRITE32(cb_rec->cbr_trunc); WRITE32(len); WRITEMEM(cb_rec->cbr_fhval, len); @@ -379,9 +380,10 @@ static int do_probe_callback(void *data) .addrsize = sizeof(addr), .timeout = &timeparms, .program = &cb_program, + .prognumber = cb->cb_prog, .version = nfs_cb_version[1]->number, .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ - .flags = (RPC_CLNT_CREATE_NOPING), + .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), }; struct rpc_message msg = { .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], @@ -396,9 +398,6 @@ static int do_probe_callback(void *data) addr.sin_port = htons(cb->cb_port); addr.sin_addr.s_addr = htonl(cb->cb_addr); - /* Initialize rpc_stat */ - memset(args.program->stats, 0, sizeof(struct rpc_stat)); - /* Create RPC client */ client = rpc_create(&args); if (IS_ERR(client)) { diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index c309c881bd4e..669461e291ae 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -71,11 +71,11 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs return nfserr_inval; if (open->op_share_access & NFS4_SHARE_ACCESS_READ) - accmode |= MAY_READ; + accmode |= NFSD_MAY_READ; if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) - accmode |= (MAY_WRITE | MAY_TRUNC); + accmode |= (NFSD_MAY_WRITE | NFSD_MAY_TRUNC); if (open->op_share_deny & NFS4_SHARE_DENY_WRITE) - accmode |= MAY_WRITE; + accmode |= NFSD_MAY_WRITE; status = fh_verify(rqstp, current_fh, S_IFREG, accmode); @@ -126,7 +126,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o &resfh.fh_handle.fh_base, resfh.fh_handle.fh_size); if (!created) - status = do_open_permission(rqstp, current_fh, open, MAY_NOP); + status = do_open_permission(rqstp, current_fh, open, + NFSD_MAY_NOP); out: fh_put(&resfh); @@ -157,7 +158,8 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && (open->op_iattr.ia_size == 0); - status = do_open_permission(rqstp, current_fh, open, MAY_OWNER_OVERRIDE); + status = do_open_permission(rqstp, current_fh, open, + NFSD_MAY_OWNER_OVERRIDE); return status; } @@ -186,7 +188,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, cstate->current_fh.fh_handle.fh_size = rp->rp_openfh_len; memcpy(&cstate->current_fh.fh_handle.fh_base, rp->rp_openfh, rp->rp_openfh_len); - status = fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP); + status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); if (status) dprintk("nfsd4_open: replay failed" " restoring previous filehandle\n"); @@ -199,10 +201,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* Openowner is now set, so sequence id will get bumped. Now we need * these checks before we do any creates: */ status = nfserr_grace; - if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) + if (locks_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) goto out; status = nfserr_no_grace; - if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) + if (!locks_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) goto out; switch (open->op_claim_type) { @@ -285,7 +287,7 @@ nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen; memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval, putfh->pf_fhlen); - return fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP); + return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); } static __be32 @@ -363,7 +365,8 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fh_init(&resfh, NFS4_FHSIZE); - status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, MAY_CREATE); + status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, + NFSD_MAY_CREATE); if (status == nfserr_symlink) status = nfserr_notdir; if (status) @@ -445,7 +448,7 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { __be32 status; - status = fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP); + status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); if (status) return status; @@ -572,7 +575,7 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { __be32 status; - if (nfs4_in_grace()) + if (locks_in_grace()) return nfserr_grace; status = nfsd_unlink(rqstp, &cstate->current_fh, 0, remove->rm_name, remove->rm_namelen); @@ -593,7 +596,7 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!cstate->save_fh.fh_dentry) return status; - if (nfs4_in_grace() && !(cstate->save_fh.fh_export->ex_flags + if (locks_in_grace() && !(cstate->save_fh.fh_export->ex_flags & NFSEXP_NOSUBTREECHECK)) return nfserr_grace; status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname, @@ -730,7 +733,7 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, int count; __be32 status; - status = fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP); + status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); if (status) return status; @@ -843,10 +846,13 @@ struct nfsd4_operation { #define ALLOWED_WITHOUT_FH 1 /* GETATTR and ops not listed as returning NFS4ERR_MOVED: */ #define ALLOWED_ON_ABSENT_FS 2 + char *op_name; }; static struct nfsd4_operation nfsd4_ops[]; +static const char *nfsd4_op_name(unsigned opnum); + /* * COMPOUND call. */ @@ -861,11 +867,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, int slack_bytes; __be32 status; - status = nfserr_resource; - cstate = cstate_alloc(); - if (cstate == NULL) - goto out; - resp->xbuf = &rqstp->rq_res; resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len; resp->tagp = resp->p; @@ -884,11 +885,18 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION) goto out; + status = nfserr_resource; + cstate = cstate_alloc(); + if (cstate == NULL) + goto out; + status = nfs_ok; while (!status && resp->opcnt < args->opcnt) { op = &args->ops[resp->opcnt++]; - dprintk("nfsv4 compound op #%d: %d\n", resp->opcnt, op->opnum); + dprintk("nfsv4 compound op #%d/%d: %d (%s)\n", + resp->opcnt, args->opcnt, op->opnum, + nfsd4_op_name(op->opnum)); /* * The XDR decode routines may have pre-set op->status; @@ -949,129 +957,172 @@ encode_op: nfsd4_increment_op_stats(op->opnum); } + cstate_free(cstate); out: nfsd4_release_compoundargs(args); - cstate_free(cstate); + dprintk("nfsv4 compound returned %d\n", ntohl(status)); return status; } static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = { [OP_ACCESS] = { .op_func = (nfsd4op_func)nfsd4_access, + .op_name = "OP_ACCESS", }, [OP_CLOSE] = { .op_func = (nfsd4op_func)nfsd4_close, + .op_name = "OP_CLOSE", }, [OP_COMMIT] = { .op_func = (nfsd4op_func)nfsd4_commit, + .op_name = "OP_COMMIT", }, [OP_CREATE] = { .op_func = (nfsd4op_func)nfsd4_create, + .op_name = "OP_CREATE", }, [OP_DELEGRETURN] = { .op_func = (nfsd4op_func)nfsd4_delegreturn, + .op_name = "OP_DELEGRETURN", }, [OP_GETATTR] = { .op_func = (nfsd4op_func)nfsd4_getattr, .op_flags = ALLOWED_ON_ABSENT_FS, + .op_name = "OP_GETATTR", }, [OP_GETFH] = { .op_func = (nfsd4op_func)nfsd4_getfh, + .op_name = "OP_GETFH", }, [OP_LINK] = { .op_func = (nfsd4op_func)nfsd4_link, + .op_name = "OP_LINK", }, [OP_LOCK] = { .op_func = (nfsd4op_func)nfsd4_lock, + .op_name = "OP_LOCK", }, [OP_LOCKT] = { .op_func = (nfsd4op_func)nfsd4_lockt, + .op_name = "OP_LOCKT", }, [OP_LOCKU] = { .op_func = (nfsd4op_func)nfsd4_locku, + .op_name = "OP_LOCKU", }, [OP_LOOKUP] = { .op_func = (nfsd4op_func)nfsd4_lookup, + .op_name = "OP_LOOKUP", }, [OP_LOOKUPP] = { .op_func = (nfsd4op_func)nfsd4_lookupp, + .op_name = "OP_LOOKUPP", }, [OP_NVERIFY] = { .op_func = (nfsd4op_func)nfsd4_nverify, + .op_name = "OP_NVERIFY", }, [OP_OPEN] = { .op_func = (nfsd4op_func)nfsd4_open, + .op_name = "OP_OPEN", }, [OP_OPEN_CONFIRM] = { .op_func = (nfsd4op_func)nfsd4_open_confirm, + .op_name = "OP_OPEN_CONFIRM", }, [OP_OPEN_DOWNGRADE] = { .op_func = (nfsd4op_func)nfsd4_open_downgrade, + .op_name = "OP_OPEN_DOWNGRADE", }, [OP_PUTFH] = { .op_func = (nfsd4op_func)nfsd4_putfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_PUTFH", }, [OP_PUTPUBFH] = { - /* unsupported; just for future reference: */ + /* unsupported, just for future reference: */ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_PUTPUBFH", }, [OP_PUTROOTFH] = { .op_func = (nfsd4op_func)nfsd4_putrootfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_PUTROOTFH", }, [OP_READ] = { .op_func = (nfsd4op_func)nfsd4_read, + .op_name = "OP_READ", }, [OP_READDIR] = { .op_func = (nfsd4op_func)nfsd4_readdir, + .op_name = "OP_READDIR", }, [OP_READLINK] = { .op_func = (nfsd4op_func)nfsd4_readlink, + .op_name = "OP_READLINK", }, [OP_REMOVE] = { .op_func = (nfsd4op_func)nfsd4_remove, + .op_name = "OP_REMOVE", }, [OP_RENAME] = { + .op_name = "OP_RENAME", .op_func = (nfsd4op_func)nfsd4_rename, }, [OP_RENEW] = { .op_func = (nfsd4op_func)nfsd4_renew, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_RENEW", }, [OP_RESTOREFH] = { .op_func = (nfsd4op_func)nfsd4_restorefh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_RESTOREFH", }, [OP_SAVEFH] = { .op_func = (nfsd4op_func)nfsd4_savefh, + .op_name = "OP_SAVEFH", }, [OP_SECINFO] = { .op_func = (nfsd4op_func)nfsd4_secinfo, + .op_name = "OP_SECINFO", }, [OP_SETATTR] = { .op_func = (nfsd4op_func)nfsd4_setattr, + .op_name = "OP_SETATTR", }, [OP_SETCLIENTID] = { .op_func = (nfsd4op_func)nfsd4_setclientid, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_SETCLIENTID", }, [OP_SETCLIENTID_CONFIRM] = { .op_func = (nfsd4op_func)nfsd4_setclientid_confirm, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_SETCLIENTID_CONFIRM", }, [OP_VERIFY] = { .op_func = (nfsd4op_func)nfsd4_verify, + .op_name = "OP_VERIFY", }, [OP_WRITE] = { .op_func = (nfsd4op_func)nfsd4_write, + .op_name = "OP_WRITE", }, [OP_RELEASE_LOCKOWNER] = { .op_func = (nfsd4op_func)nfsd4_release_lockowner, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_RELEASE_LOCKOWNER", }, }; +static const char *nfsd4_op_name(unsigned opnum) +{ + if (opnum < ARRAY_SIZE(nfsd4_ops)) + return nfsd4_ops[opnum].op_name; + return "unknown_operation"; +} + #define nfs4svc_decode_voidargs NULL #define nfs4svc_release_void NULL #define nfsd4_voidres nfsd4_voidargs diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8799b8708188..0cc7ff5d5ab5 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -61,7 +61,6 @@ static time_t lease_time = 90; /* default lease time */ static time_t user_lease_time = 90; static time_t boot_time; -static int in_grace = 1; static u32 current_ownerid = 1; static u32 current_fileid = 1; static u32 current_delegid = 1; @@ -1173,6 +1172,24 @@ static inline int deny_valid(u32 x) return x <= NFS4_SHARE_DENY_BOTH; } +/* + * We store the NONE, READ, WRITE, and BOTH bits separately in the + * st_{access,deny}_bmap field of the stateid, in order to track not + * only what share bits are currently in force, but also what + * combinations of share bits previous opens have used. This allows us + * to enforce the recommendation of rfc 3530 14.2.19 that the server + * return an error if the client attempt to downgrade to a combination + * of share bits not explicable by closing some of its previous opens. + * + * XXX: This enforcement is actually incomplete, since we don't keep + * track of access/deny bit combinations; so, e.g., we allow: + * + * OPEN allow read, deny write + * OPEN allow both, deny none + * DOWNGRADE allow read, deny none + * + * which we should reject. + */ static void set_access(unsigned int *access, unsigned long bmap) { int i; @@ -1570,6 +1587,10 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_sta int err = get_write_access(inode); if (err) return nfserrno(err); + err = mnt_want_write(cur_fh->fh_export->ex_path.mnt); + if (err) + return nfserrno(err); + file_take_write(filp); } status = nfsd4_truncate(rqstp, cur_fh, open); if (status) { @@ -1579,8 +1600,8 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_sta } /* remember the open */ filp->f_mode |= open->op_share_access; - set_bit(open->op_share_access, &stp->st_access_bmap); - set_bit(open->op_share_deny, &stp->st_deny_bmap); + __set_bit(open->op_share_access, &stp->st_access_bmap); + __set_bit(open->op_share_deny, &stp->st_deny_bmap); return nfs_ok; } @@ -1618,7 +1639,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta case NFS4_OPEN_CLAIM_NULL: /* Let's not give out any delegations till everyone's * had the chance to reclaim theirs.... */ - if (nfs4_in_grace()) + if (locks_in_grace()) goto out; if (!atomic_read(&cb->cb_set) || !sop->so_confirmed) goto out; @@ -1722,9 +1743,9 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf /* Stateid was not found, this is a new OPEN */ int flags = 0; if (open->op_share_access & NFS4_SHARE_ACCESS_READ) - flags |= MAY_READ; + flags |= NFSD_MAY_READ; if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) - flags |= MAY_WRITE; + flags |= NFSD_MAY_WRITE; status = nfs4_new_open(rqstp, &stp, dp, current_fh, flags); if (status) goto out; @@ -1794,12 +1815,15 @@ out: return status; } +struct lock_manager nfsd4_manager = { +}; + static void -end_grace(void) +nfsd4_end_grace(void) { dprintk("NFSD: end of grace period\n"); nfsd4_recdir_purge_old(); - in_grace = 0; + locks_end_grace(&nfsd4_manager); } static time_t @@ -1816,8 +1840,8 @@ nfs4_laundromat(void) nfs4_lock_state(); dprintk("NFSD: laundromat service - starting\n"); - if (in_grace) - end_grace(); + if (locks_in_grace()) + nfsd4_end_grace(); list_for_each_safe(pos, next, &client_lru) { clp = list_entry(pos, struct nfs4_client, cl_lru); if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { @@ -1952,7 +1976,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) return nfserr_bad_stateid; else if (ONE_STATEID(stateid) && (flags & RD_STATE)) return nfs_ok; - else if (nfs4_in_grace()) { + else if (locks_in_grace()) { /* Answer in remaining cases depends on existance of * conflicting state; so we must wait out the grace period. */ return nfserr_grace; @@ -1971,7 +1995,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) static inline int io_during_grace_disallowed(struct inode *inode, int flags) { - return nfs4_in_grace() && (flags & (RD_STATE | WR_STATE)) + return locks_in_grace() && (flags & (RD_STATE | WR_STATE)) && mandatory_lock(inode); } @@ -2610,7 +2634,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_inval; if ((status = fh_verify(rqstp, &cstate->current_fh, - S_IFREG, MAY_LOCK))) { + S_IFREG, NFSD_MAY_LOCK))) { dprintk("NFSD: nfsd4_lock: permission denied!\n"); return status; } @@ -2671,10 +2695,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, filp = lock_stp->st_vfs_file; status = nfserr_grace; - if (nfs4_in_grace() && !lock->lk_reclaim) + if (locks_in_grace() && !lock->lk_reclaim) goto out; status = nfserr_no_grace; - if (!nfs4_in_grace() && lock->lk_reclaim) + if (!locks_in_grace() && lock->lk_reclaim) goto out; locks_init_lock(&file_lock); @@ -2757,7 +2781,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, int error; __be32 status; - if (nfs4_in_grace()) + if (locks_in_grace()) return nfserr_grace; if (check_lock_length(lockt->lt_offset, lockt->lt_length)) @@ -3170,9 +3194,9 @@ __nfs4_state_start(void) unsigned long grace_time; boot_time = get_seconds(); - grace_time = get_nfs_grace_period(); + grace_time = get_nfs4_grace_period(); lease_time = user_lease_time; - in_grace = 1; + locks_start_grace(&nfsd4_manager); printk(KERN_INFO "NFSD: starting %ld-second grace period\n", grace_time/HZ); laundry_wq = create_singlethread_workqueue("nfsd4"); @@ -3191,12 +3215,6 @@ nfs4_state_start(void) return; } -int -nfs4_in_grace(void) -{ - return in_grace; -} - time_t nfs4_lease_time(void) { @@ -3249,12 +3267,14 @@ nfs4_state_shutdown(void) nfs4_unlock_state(); } +/* + * user_recovery_dirname is protected by the nfsd_mutex since it's only + * accessed when nfsd is starting. + */ static void nfs4_set_recdir(char *recdir) { - nfs4_lock_state(); strcpy(user_recovery_dirname, recdir); - nfs4_unlock_state(); } /* @@ -3278,6 +3298,12 @@ nfs4_reset_recoverydir(char *recdir) return status; } +char * +nfs4_recoverydir(void) +{ + return user_recovery_dirname; +} + /* * Called when leasetime is changed. * @@ -3286,11 +3312,12 @@ nfs4_reset_recoverydir(char *recdir) * we start to register any changes in lease time. If the administrator * really wants to change the lease time *now*, they can go ahead and bring * nfsd down and then back up again after changing the lease time. + * + * user_lease_time is protected by nfsd_mutex since it's only really accessed + * when nfsd is starting */ void nfs4_reset_lease(time_t leasetime) { - lock_kernel(); user_lease_time = leasetime; - unlock_kernel(); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index c513bbdf2d36..afcdf4b76843 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -413,6 +413,18 @@ out_nfserr: } static __be32 +nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid) +{ + DECODE_HEAD; + + READ_BUF(sizeof(stateid_t)); + READ32(sid->si_generation); + COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); + + DECODE_TAIL; +} + +static __be32 nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access) { DECODE_HEAD; @@ -429,10 +441,9 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) DECODE_HEAD; close->cl_stateowner = NULL; - READ_BUF(4 + sizeof(stateid_t)); + READ_BUF(4); READ32(close->cl_seqid); - READ32(close->cl_stateid.si_generation); - COPYMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t)); + return nfsd4_decode_stateid(argp, &close->cl_stateid); DECODE_TAIL; } @@ -493,13 +504,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create static inline __be32 nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr) { - DECODE_HEAD; - - READ_BUF(sizeof(stateid_t)); - READ32(dr->dr_stateid.si_generation); - COPYMEM(&dr->dr_stateid.si_opaque, sizeof(stateid_opaque_t)); - - DECODE_TAIL; + return nfsd4_decode_stateid(argp, &dr->dr_stateid); } static inline __be32 @@ -542,20 +547,22 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) READ32(lock->lk_is_new); if (lock->lk_is_new) { - READ_BUF(36); + READ_BUF(4); READ32(lock->lk_new_open_seqid); - READ32(lock->lk_new_open_stateid.si_generation); - - COPYMEM(&lock->lk_new_open_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid); + if (status) + return status; + READ_BUF(8 + sizeof(clientid_t)); READ32(lock->lk_new_lock_seqid); COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t)); READ32(lock->lk_new_owner.len); READ_BUF(lock->lk_new_owner.len); READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len); } else { - READ_BUF(20); - READ32(lock->lk_old_lock_stateid.si_generation); - COPYMEM(&lock->lk_old_lock_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid); + if (status) + return status; + READ_BUF(4); READ32(lock->lk_old_lock_seqid); } @@ -587,13 +594,15 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) DECODE_HEAD; locku->lu_stateowner = NULL; - READ_BUF(24 + sizeof(stateid_t)); + READ_BUF(8); READ32(locku->lu_type); if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) goto xdr_error; READ32(locku->lu_seqid); - READ32(locku->lu_stateid.si_generation); - COPYMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &locku->lu_stateid); + if (status) + return status; + READ_BUF(16); READ64(locku->lu_offset); READ64(locku->lu_length); @@ -678,8 +687,10 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) READ32(open->op_delegate_type); break; case NFS4_OPEN_CLAIM_DELEGATE_CUR: - READ_BUF(sizeof(stateid_t) + 4); - COPYMEM(&open->op_delegate_stateid, sizeof(stateid_t)); + status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); + if (status) + return status; + READ_BUF(4); READ32(open->op_fname.len); READ_BUF(open->op_fname.len); SAVEMEM(open->op_fname.data, open->op_fname.len); @@ -699,9 +710,10 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con DECODE_HEAD; open_conf->oc_stateowner = NULL; - READ_BUF(4 + sizeof(stateid_t)); - READ32(open_conf->oc_req_stateid.si_generation); - COPYMEM(&open_conf->oc_req_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid); + if (status) + return status; + READ_BUF(4); READ32(open_conf->oc_seqid); DECODE_TAIL; @@ -713,9 +725,10 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d DECODE_HEAD; open_down->od_stateowner = NULL; - READ_BUF(12 + sizeof(stateid_t)); - READ32(open_down->od_stateid.si_generation); - COPYMEM(&open_down->od_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &open_down->od_stateid); + if (status) + return status; + READ_BUF(12); READ32(open_down->od_seqid); READ32(open_down->od_share_access); READ32(open_down->od_share_deny); @@ -743,9 +756,10 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) { DECODE_HEAD; - READ_BUF(sizeof(stateid_t) + 12); - READ32(read->rd_stateid.si_generation); - COPYMEM(&read->rd_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &read->rd_stateid); + if (status) + return status; + READ_BUF(12); READ64(read->rd_offset); READ32(read->rd_length); @@ -834,15 +848,13 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) { - DECODE_HEAD; - - READ_BUF(sizeof(stateid_t)); - READ32(setattr->sa_stateid.si_generation); - COPYMEM(&setattr->sa_stateid.si_opaque, sizeof(stateid_opaque_t)); - if ((status = nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, &setattr->sa_acl))) - goto out; + __be32 status; - DECODE_TAIL; + status = nfsd4_decode_stateid(argp, &setattr->sa_stateid); + if (status) + return status; + return nfsd4_decode_fattr(argp, setattr->sa_bmval, + &setattr->sa_iattr, &setattr->sa_acl); } static __be32 @@ -927,9 +939,10 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) int len; DECODE_HEAD; - READ_BUF(sizeof(stateid_opaque_t) + 20); - READ32(write->wr_stateid.si_generation); - COPYMEM(&write->wr_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &write->wr_stateid); + if (status) + return status; + READ_BUF(16); READ64(write->wr_offset); READ32(write->wr_stable_how); if (write->wr_stable_how > 2) @@ -986,10 +999,74 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel } static __be32 +nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) +{ + return nfs_ok; +} + +static __be32 +nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) +{ + return nfserr_opnotsupp; +} + +typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); + +static nfsd4_dec nfsd4_dec_ops[] = { + [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access, + [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close, + [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit, + [OP_CREATE] = (nfsd4_dec)nfsd4_decode_create, + [OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DELEGRETURN] = (nfsd4_dec)nfsd4_decode_delegreturn, + [OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr, + [OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_LINK] = (nfsd4_dec)nfsd4_decode_link, + [OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock, + [OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt, + [OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku, + [OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup, + [OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop, + [OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify, + [OP_OPEN] = (nfsd4_dec)nfsd4_decode_open, + [OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm, + [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, + [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, + [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_READ] = (nfsd4_dec)nfsd4_decode_read, + [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, + [OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop, + [OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove, + [OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename, + [OP_RENEW] = (nfsd4_dec)nfsd4_decode_renew, + [OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo, + [OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr, + [OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_setclientid, + [OP_SETCLIENTID_CONFIRM] = (nfsd4_dec)nfsd4_decode_setclientid_confirm, + [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify, + [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write, + [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner, +}; + +struct nfsd4_minorversion_ops { + nfsd4_dec *decoders; + int nops; +}; + +static struct nfsd4_minorversion_ops nfsd4_minorversion[] = { + [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) }, +}; + +static __be32 nfsd4_decode_compound(struct nfsd4_compoundargs *argp) { DECODE_HEAD; struct nfsd4_op *op; + struct nfsd4_minorversion_ops *ops; int i; /* @@ -1019,6 +1096,10 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) } } + if (argp->minorversion >= ARRAY_SIZE(nfsd4_minorversion)) + argp->opcnt = 0; + + ops = &nfsd4_minorversion[argp->minorversion]; for (i = 0; i < argp->opcnt; i++) { op = &argp->ops[i]; op->replay = NULL; @@ -1056,120 +1137,11 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) } op->opnum = ntohl(*argp->p++); - switch (op->opnum) { - case 2: /* Reserved operation */ - op->opnum = OP_ILLEGAL; - if (argp->minorversion == 0) - op->status = nfserr_op_illegal; - else - op->status = nfserr_minor_vers_mismatch; - break; - case OP_ACCESS: - op->status = nfsd4_decode_access(argp, &op->u.access); - break; - case OP_CLOSE: - op->status = nfsd4_decode_close(argp, &op->u.close); - break; - case OP_COMMIT: - op->status = nfsd4_decode_commit(argp, &op->u.commit); - break; - case OP_CREATE: - op->status = nfsd4_decode_create(argp, &op->u.create); - break; - case OP_DELEGRETURN: - op->status = nfsd4_decode_delegreturn(argp, &op->u.delegreturn); - break; - case OP_GETATTR: - op->status = nfsd4_decode_getattr(argp, &op->u.getattr); - break; - case OP_GETFH: - op->status = nfs_ok; - break; - case OP_LINK: - op->status = nfsd4_decode_link(argp, &op->u.link); - break; - case OP_LOCK: - op->status = nfsd4_decode_lock(argp, &op->u.lock); - break; - case OP_LOCKT: - op->status = nfsd4_decode_lockt(argp, &op->u.lockt); - break; - case OP_LOCKU: - op->status = nfsd4_decode_locku(argp, &op->u.locku); - break; - case OP_LOOKUP: - op->status = nfsd4_decode_lookup(argp, &op->u.lookup); - break; - case OP_LOOKUPP: - op->status = nfs_ok; - break; - case OP_NVERIFY: - op->status = nfsd4_decode_verify(argp, &op->u.nverify); - break; - case OP_OPEN: - op->status = nfsd4_decode_open(argp, &op->u.open); - break; - case OP_OPEN_CONFIRM: - op->status = nfsd4_decode_open_confirm(argp, &op->u.open_confirm); - break; - case OP_OPEN_DOWNGRADE: - op->status = nfsd4_decode_open_downgrade(argp, &op->u.open_downgrade); - break; - case OP_PUTFH: - op->status = nfsd4_decode_putfh(argp, &op->u.putfh); - break; - case OP_PUTROOTFH: - op->status = nfs_ok; - break; - case OP_READ: - op->status = nfsd4_decode_read(argp, &op->u.read); - break; - case OP_READDIR: - op->status = nfsd4_decode_readdir(argp, &op->u.readdir); - break; - case OP_READLINK: - op->status = nfs_ok; - break; - case OP_REMOVE: - op->status = nfsd4_decode_remove(argp, &op->u.remove); - break; - case OP_RENAME: - op->status = nfsd4_decode_rename(argp, &op->u.rename); - break; - case OP_RESTOREFH: - op->status = nfs_ok; - break; - case OP_RENEW: - op->status = nfsd4_decode_renew(argp, &op->u.renew); - break; - case OP_SAVEFH: - op->status = nfs_ok; - break; - case OP_SECINFO: - op->status = nfsd4_decode_secinfo(argp, &op->u.secinfo); - break; - case OP_SETATTR: - op->status = nfsd4_decode_setattr(argp, &op->u.setattr); - break; - case OP_SETCLIENTID: - op->status = nfsd4_decode_setclientid(argp, &op->u.setclientid); - break; - case OP_SETCLIENTID_CONFIRM: - op->status = nfsd4_decode_setclientid_confirm(argp, &op->u.setclientid_confirm); - break; - case OP_VERIFY: - op->status = nfsd4_decode_verify(argp, &op->u.verify); - break; - case OP_WRITE: - op->status = nfsd4_decode_write(argp, &op->u.write); - break; - case OP_RELEASE_LOCKOWNER: - op->status = nfsd4_decode_release_lockowner(argp, &op->u.release_lockowner); - break; - default: + if (op->opnum >= OP_ACCESS && op->opnum < ops->nops) + op->status = ops->decoders[op->opnum](argp, &op->u); + else { op->opnum = OP_ILLEGAL; op->status = nfserr_op_illegal; - break; } if (op->status) { @@ -1201,11 +1173,11 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) *p++ = htonl((u32)((n) >> 32)); \ *p++ = htonl((u32)(n)); \ } while (0) -#define WRITEMEM(ptr,nbytes) do { \ +#define WRITEMEM(ptr,nbytes) do { if (nbytes > 0) { \ *(p + XDR_QUADLEN(nbytes) -1) = 0; \ memcpy(p, ptr, nbytes); \ p += XDR_QUADLEN(nbytes); \ -} while (0) +}} while (0) #define WRITECINFO(c) do { \ *p++ = htonl(c.atomic); \ *p++ = htonl(c.before_ctime_sec); \ @@ -1224,7 +1196,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) * Header routine to setup seqid operation replay cache */ #define ENCODE_SEQID_OP_HEAD \ - __be32 *p; \ __be32 *save; \ \ save = resp->p; @@ -1992,6 +1963,17 @@ fail: } static void +nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid) +{ + ENCODE_HEAD; + + RESERVE_SPACE(sizeof(stateid_t)); + WRITE32(sid->si_generation); + WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); + ADJUST_ARGS(); +} + +static __be32 nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) { ENCODE_HEAD; @@ -2002,24 +1984,23 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ WRITE32(access->ac_resp_access); ADJUST_ARGS(); } + return nfserr; } -static void +static __be32 nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) { ENCODE_SEQID_OP_HEAD; - if (!nfserr) { - RESERVE_SPACE(sizeof(stateid_t)); - WRITE32(close->cl_stateid.si_generation); - WRITEMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); - } + if (!nfserr) + nfsd4_encode_stateid(resp, &close->cl_stateid); + ENCODE_SEQID_OP_TAIL(close->cl_stateowner); + return nfserr; } -static void +static __be32 nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit) { ENCODE_HEAD; @@ -2029,9 +2010,10 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ WRITEMEM(commit->co_verf.data, 8); ADJUST_ARGS(); } + return nfserr; } -static void +static __be32 nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create) { ENCODE_HEAD; @@ -2044,6 +2026,7 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ WRITE32(create->cr_bmval[1]); ADJUST_ARGS(); } + return nfserr; } static __be32 @@ -2064,9 +2047,10 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 return nfserr; } -static void -nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh *fhp) +static __be32 +nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp) { + struct svc_fh *fhp = *fhpp; unsigned int len; ENCODE_HEAD; @@ -2077,6 +2061,7 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh WRITEMEM(&fhp->fh_handle.fh_base, len); ADJUST_ARGS(); } + return nfserr; } /* @@ -2104,46 +2089,42 @@ nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denie ADJUST_ARGS(); } -static void +static __be32 nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock) { ENCODE_SEQID_OP_HEAD; - if (!nfserr) { - RESERVE_SPACE(4 + sizeof(stateid_t)); - WRITE32(lock->lk_resp_stateid.si_generation); - WRITEMEM(&lock->lk_resp_stateid.si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); - } else if (nfserr == nfserr_denied) + if (!nfserr) + nfsd4_encode_stateid(resp, &lock->lk_resp_stateid); + else if (nfserr == nfserr_denied) nfsd4_encode_lock_denied(resp, &lock->lk_denied); ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner); + return nfserr; } -static void +static __be32 nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt) { if (nfserr == nfserr_denied) nfsd4_encode_lock_denied(resp, &lockt->lt_denied); + return nfserr; } -static void +static __be32 nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku) { ENCODE_SEQID_OP_HEAD; - if (!nfserr) { - RESERVE_SPACE(sizeof(stateid_t)); - WRITE32(locku->lu_stateid.si_generation); - WRITEMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); - } - + if (!nfserr) + nfsd4_encode_stateid(resp, &locku->lu_stateid); + ENCODE_SEQID_OP_TAIL(locku->lu_stateowner); + return nfserr; } -static void +static __be32 nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link) { ENCODE_HEAD; @@ -2153,20 +2134,21 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li WRITECINFO(link->li_cinfo); ADJUST_ARGS(); } + return nfserr; } -static void +static __be32 nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) { + ENCODE_HEAD; ENCODE_SEQID_OP_HEAD; if (nfserr) goto out; - RESERVE_SPACE(36 + sizeof(stateid_t)); - WRITE32(open->op_stateid.si_generation); - WRITEMEM(&open->op_stateid.si_opaque, sizeof(stateid_opaque_t)); + nfsd4_encode_stateid(resp, &open->op_stateid); + RESERVE_SPACE(40); WRITECINFO(open->op_cinfo); WRITE32(open->op_rflags); WRITE32(2); @@ -2179,8 +2161,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op case NFS4_OPEN_DELEGATE_NONE: break; case NFS4_OPEN_DELEGATE_READ: - RESERVE_SPACE(20 + sizeof(stateid_t)); - WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t)); + nfsd4_encode_stateid(resp, &open->op_delegate_stateid); + RESERVE_SPACE(20); WRITE32(open->op_recall); /* @@ -2193,8 +2175,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op ADJUST_ARGS(); break; case NFS4_OPEN_DELEGATE_WRITE: - RESERVE_SPACE(32 + sizeof(stateid_t)); - WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t)); + nfsd4_encode_stateid(resp, &open->op_delegate_stateid); + RESERVE_SPACE(32); WRITE32(0); /* @@ -2219,36 +2201,31 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op /* XXX save filehandle here */ out: ENCODE_SEQID_OP_TAIL(open->op_stateowner); + return nfserr; } -static void +static __be32 nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) { ENCODE_SEQID_OP_HEAD; - - if (!nfserr) { - RESERVE_SPACE(sizeof(stateid_t)); - WRITE32(oc->oc_resp_stateid.si_generation); - WRITEMEM(&oc->oc_resp_stateid.si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); - } + + if (!nfserr) + nfsd4_encode_stateid(resp, &oc->oc_resp_stateid); ENCODE_SEQID_OP_TAIL(oc->oc_stateowner); + return nfserr; } -static void +static __be32 nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) { ENCODE_SEQID_OP_HEAD; - - if (!nfserr) { - RESERVE_SPACE(sizeof(stateid_t)); - WRITE32(od->od_stateid.si_generation); - WRITEMEM(&od->od_stateid.si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); - } + + if (!nfserr) + nfsd4_encode_stateid(resp, &od->od_stateid); ENCODE_SEQID_OP_TAIL(od->od_stateowner); + return nfserr; } static __be32 @@ -2443,7 +2420,7 @@ err_no_verf: return nfserr; } -static void +static __be32 nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove) { ENCODE_HEAD; @@ -2453,9 +2430,10 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ WRITECINFO(remove->rm_cinfo); ADJUST_ARGS(); } + return nfserr; } -static void +static __be32 nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename) { ENCODE_HEAD; @@ -2466,9 +2444,10 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ WRITECINFO(rename->rn_tinfo); ADJUST_ARGS(); } + return nfserr; } -static void +static __be32 nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_secinfo *secinfo) { @@ -2532,13 +2511,14 @@ nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, out: if (exp) exp_put(exp); + return nfserr; } /* * The SETATTR encode routine is special -- it always encodes a bitmap, * regardless of the error status. */ -static void +static __be32 nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr) { ENCODE_HEAD; @@ -2555,9 +2535,10 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 WRITE32(setattr->sa_bmval[1]); } ADJUST_ARGS(); + return nfserr; } -static void +static __be32 nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd) { ENCODE_HEAD; @@ -2574,9 +2555,10 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n WRITE32(0); ADJUST_ARGS(); } + return nfserr; } -static void +static __be32 nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write) { ENCODE_HEAD; @@ -2588,8 +2570,56 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w WRITEMEM(write->wr_verifier.data, 8); ADJUST_ARGS(); } + return nfserr; } +static __be32 +nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) +{ + return nfserr; +} + +typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); + +static nfsd4_enc nfsd4_enc_ops[] = { + [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access, + [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close, + [OP_COMMIT] = (nfsd4_enc)nfsd4_encode_commit, + [OP_CREATE] = (nfsd4_enc)nfsd4_encode_create, + [OP_DELEGPURGE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_DELEGRETURN] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETATTR] = (nfsd4_enc)nfsd4_encode_getattr, + [OP_GETFH] = (nfsd4_enc)nfsd4_encode_getfh, + [OP_LINK] = (nfsd4_enc)nfsd4_encode_link, + [OP_LOCK] = (nfsd4_enc)nfsd4_encode_lock, + [OP_LOCKT] = (nfsd4_enc)nfsd4_encode_lockt, + [OP_LOCKU] = (nfsd4_enc)nfsd4_encode_locku, + [OP_LOOKUP] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LOOKUPP] = (nfsd4_enc)nfsd4_encode_noop, + [OP_NVERIFY] = (nfsd4_enc)nfsd4_encode_noop, + [OP_OPEN] = (nfsd4_enc)nfsd4_encode_open, + [OP_OPEN_CONFIRM] = (nfsd4_enc)nfsd4_encode_open_confirm, + [OP_OPEN_DOWNGRADE] = (nfsd4_enc)nfsd4_encode_open_downgrade, + [OP_PUTFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_PUTPUBFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_PUTROOTFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_READ] = (nfsd4_enc)nfsd4_encode_read, + [OP_READDIR] = (nfsd4_enc)nfsd4_encode_readdir, + [OP_READLINK] = (nfsd4_enc)nfsd4_encode_readlink, + [OP_REMOVE] = (nfsd4_enc)nfsd4_encode_remove, + [OP_RENAME] = (nfsd4_enc)nfsd4_encode_rename, + [OP_RENEW] = (nfsd4_enc)nfsd4_encode_noop, + [OP_RESTOREFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SAVEFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SECINFO] = (nfsd4_enc)nfsd4_encode_secinfo, + [OP_SETATTR] = (nfsd4_enc)nfsd4_encode_setattr, + [OP_SETCLIENTID] = (nfsd4_enc)nfsd4_encode_setclientid, + [OP_SETCLIENTID_CONFIRM] = (nfsd4_enc)nfsd4_encode_noop, + [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop, + [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write, + [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop, +}; + void nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) { @@ -2601,101 +2631,12 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) statp = p++; /* to be backfilled at the end */ ADJUST_ARGS(); - switch (op->opnum) { - case OP_ACCESS: - nfsd4_encode_access(resp, op->status, &op->u.access); - break; - case OP_CLOSE: - nfsd4_encode_close(resp, op->status, &op->u.close); - break; - case OP_COMMIT: - nfsd4_encode_commit(resp, op->status, &op->u.commit); - break; - case OP_CREATE: - nfsd4_encode_create(resp, op->status, &op->u.create); - break; - case OP_DELEGRETURN: - break; - case OP_GETATTR: - op->status = nfsd4_encode_getattr(resp, op->status, &op->u.getattr); - break; - case OP_GETFH: - nfsd4_encode_getfh(resp, op->status, op->u.getfh); - break; - case OP_LINK: - nfsd4_encode_link(resp, op->status, &op->u.link); - break; - case OP_LOCK: - nfsd4_encode_lock(resp, op->status, &op->u.lock); - break; - case OP_LOCKT: - nfsd4_encode_lockt(resp, op->status, &op->u.lockt); - break; - case OP_LOCKU: - nfsd4_encode_locku(resp, op->status, &op->u.locku); - break; - case OP_LOOKUP: - break; - case OP_LOOKUPP: - break; - case OP_NVERIFY: - break; - case OP_OPEN: - nfsd4_encode_open(resp, op->status, &op->u.open); - break; - case OP_OPEN_CONFIRM: - nfsd4_encode_open_confirm(resp, op->status, &op->u.open_confirm); - break; - case OP_OPEN_DOWNGRADE: - nfsd4_encode_open_downgrade(resp, op->status, &op->u.open_downgrade); - break; - case OP_PUTFH: - break; - case OP_PUTROOTFH: - break; - case OP_READ: - op->status = nfsd4_encode_read(resp, op->status, &op->u.read); - break; - case OP_READDIR: - op->status = nfsd4_encode_readdir(resp, op->status, &op->u.readdir); - break; - case OP_READLINK: - op->status = nfsd4_encode_readlink(resp, op->status, &op->u.readlink); - break; - case OP_REMOVE: - nfsd4_encode_remove(resp, op->status, &op->u.remove); - break; - case OP_RENAME: - nfsd4_encode_rename(resp, op->status, &op->u.rename); - break; - case OP_RENEW: - break; - case OP_RESTOREFH: - break; - case OP_SAVEFH: - break; - case OP_SECINFO: - nfsd4_encode_secinfo(resp, op->status, &op->u.secinfo); - break; - case OP_SETATTR: - nfsd4_encode_setattr(resp, op->status, &op->u.setattr); - break; - case OP_SETCLIENTID: - nfsd4_encode_setclientid(resp, op->status, &op->u.setclientid); - break; - case OP_SETCLIENTID_CONFIRM: - break; - case OP_VERIFY: - break; - case OP_WRITE: - nfsd4_encode_write(resp, op->status, &op->u.write); - break; - case OP_RELEASE_LOCKOWNER: - break; - default: - break; - } - + if (op->opnum == OP_ILLEGAL) + goto status; + BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || + !nfsd4_enc_ops[op->opnum]); + op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); +status: /* * Note: We write the status directly, instead of using WRITE32(), * since it is already in network byte order. diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 5ac00c4fee91..97543df58242 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -12,6 +12,7 @@ #include <linux/time.h> #include <linux/errno.h> #include <linux/fs.h> +#include <linux/namei.h> #include <linux/fcntl.h> #include <linux/net.h> #include <linux/in.h> @@ -310,9 +311,12 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size) static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size) { - __be32 server_ip; - char *fo_path, c; + struct sockaddr_in sin = { + .sin_family = AF_INET, + }; int b1, b2, b3, b4; + char c; + char *fo_path; /* sanity check */ if (size == 0) @@ -326,11 +330,13 @@ static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size) return -EINVAL; /* get ipv4 address */ - if (sscanf(fo_path, "%u.%u.%u.%u%c", &b1, &b2, &b3, &b4, &c) != 4) + if (sscanf(fo_path, NIPQUAD_FMT "%c", &b1, &b2, &b3, &b4, &c) != 4) return -EINVAL; - server_ip = htonl((((((b1<<8)|b2)<<8)|b3)<<8)|b4); + if (b1 > 255 || b2 > 255 || b3 > 255 || b4 > 255) + return -EINVAL; + sin.sin_addr.s_addr = htonl((b1 << 24) | (b2 << 16) | (b3 << 8) | b4); - return nlmsvc_unlock_all_by_ip(server_ip); + return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin); } static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size) @@ -450,22 +456,26 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) int i; int rv; int len; - int npools = nfsd_nrpools(); + int npools; int *nthreads; + mutex_lock(&nfsd_mutex); + npools = nfsd_nrpools(); if (npools == 0) { /* * NFS is shut down. The admin can start it by * writing to the threads file but NOT the pool_threads * file, sorry. Report zero threads. */ + mutex_unlock(&nfsd_mutex); strcpy(buf, "0\n"); return strlen(buf); } nthreads = kcalloc(npools, sizeof(int), GFP_KERNEL); + rv = -ENOMEM; if (nthreads == NULL) - return -ENOMEM; + goto out_free; if (size > 0) { for (i = 0; i < npools; i++) { @@ -496,14 +506,16 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) mesg += len; } + mutex_unlock(&nfsd_mutex); return (mesg-buf); out_free: kfree(nthreads); + mutex_unlock(&nfsd_mutex); return rv; } -static ssize_t write_versions(struct file *file, char *buf, size_t size) +static ssize_t __write_versions(struct file *file, char *buf, size_t size) { /* * Format: @@ -566,14 +578,23 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size) return len; } -static ssize_t write_ports(struct file *file, char *buf, size_t size) +static ssize_t write_versions(struct file *file, char *buf, size_t size) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __write_versions(file, buf, size); + mutex_unlock(&nfsd_mutex); + return rv; +} + +static ssize_t __write_ports(struct file *file, char *buf, size_t size) { if (size == 0) { int len = 0; - lock_kernel(); + if (nfsd_serv) len = svc_xprt_names(nfsd_serv, buf, 0); - unlock_kernel(); return len; } /* Either a single 'fd' number is written, in which @@ -593,19 +614,16 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size) return -EINVAL; err = nfsd_create_serv(); if (!err) { - int proto = 0; - err = svc_addsock(nfsd_serv, fd, buf, &proto); + err = svc_addsock(nfsd_serv, fd, buf); if (err >= 0) { - err = lockd_up(proto); + err = lockd_up(); if (err < 0) svc_sock_names(buf+strlen(buf)+1, nfsd_serv, buf); } /* Decrease the count, but don't shutdown the * the service */ - lock_kernel(); nfsd_serv->sv_nrthreads--; - unlock_kernel(); } return err < 0 ? err : 0; } @@ -614,10 +632,8 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size) int len = 0; if (!toclose) return -ENOMEM; - lock_kernel(); if (nfsd_serv) len = svc_sock_names(buf, nfsd_serv, toclose); - unlock_kernel(); if (len >= 0) lockd_down(); kfree(toclose); @@ -655,7 +671,6 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size) if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) { if (port == 0) return -EINVAL; - lock_kernel(); if (nfsd_serv) { xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port); @@ -666,13 +681,23 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size) } else err = -ENOTCONN; } - unlock_kernel(); return err < 0 ? err : 0; } } return -EINVAL; } +static ssize_t write_ports(struct file *file, char *buf, size_t size) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __write_ports(file, buf, size); + mutex_unlock(&nfsd_mutex); + return rv; +} + + int nfsd_max_blksize; static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) @@ -691,13 +716,13 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) if (bsize > NFSSVC_MAXBLKSIZE) bsize = NFSSVC_MAXBLKSIZE; bsize &= ~(1024-1); - lock_kernel(); + mutex_lock(&nfsd_mutex); if (nfsd_serv && nfsd_serv->sv_nrthreads) { - unlock_kernel(); + mutex_unlock(&nfsd_mutex); return -EBUSY; } nfsd_max_blksize = bsize; - unlock_kernel(); + mutex_unlock(&nfsd_mutex); } return sprintf(buf, "%d\n", nfsd_max_blksize); } @@ -705,16 +730,17 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) #ifdef CONFIG_NFSD_V4 extern time_t nfs4_leasetime(void); -static ssize_t write_leasetime(struct file *file, char *buf, size_t size) +static ssize_t __write_leasetime(struct file *file, char *buf, size_t size) { /* if size > 10 seconds, call * nfs4_reset_lease() then write out the new lease (seconds) as reply */ char *mesg = buf; - int rv; + int rv, lease; if (size > 0) { - int lease; + if (nfsd_serv) + return -EBUSY; rv = get_int(&mesg, &lease); if (rv) return rv; @@ -726,24 +752,52 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size) return strlen(buf); } -static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) +static ssize_t write_leasetime(struct file *file, char *buf, size_t size) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __write_leasetime(file, buf, size); + mutex_unlock(&nfsd_mutex); + return rv; +} + +extern char *nfs4_recoverydir(void); + +static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size) { char *mesg = buf; char *recdir; int len, status; - if (size == 0 || size > PATH_MAX || buf[size-1] != '\n') - return -EINVAL; - buf[size-1] = 0; + if (size > 0) { + if (nfsd_serv) + return -EBUSY; + if (size > PATH_MAX || buf[size-1] != '\n') + return -EINVAL; + buf[size-1] = 0; - recdir = mesg; - len = qword_get(&mesg, recdir, size); - if (len <= 0) - return -EINVAL; + recdir = mesg; + len = qword_get(&mesg, recdir, size); + if (len <= 0) + return -EINVAL; - status = nfs4_reset_recoverydir(recdir); + status = nfs4_reset_recoverydir(recdir); + } + sprintf(buf, "%s\n", nfs4_recoverydir()); return strlen(buf); } + +static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __write_recoverydir(file, buf, size); + mutex_unlock(&nfsd_mutex); + return rv; +} + #endif /*----------------------------------------------------------------------------*/ diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 100ae5641162..cd25d91895a1 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -51,7 +51,7 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) /* make sure parents give x permission to user */ int err; parent = dget_parent(tdentry); - err = permission(parent->d_inode, MAY_EXEC, NULL); + err = inode_permission(parent->d_inode, MAY_EXEC); if (err < 0) { dput(parent); break; @@ -176,9 +176,24 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) if (IS_ERR(exp)) return nfserrno(PTR_ERR(exp)); - error = nfsd_setuser_and_check_port(rqstp, exp); - if (error) - goto out; + if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) { + /* Elevate privileges so that the lack of 'r' or 'x' + * permission on some parent directory will + * not stop exportfs_decode_fh from being able + * to reconnect a directory into the dentry cache. + * The same problem can affect "SUBTREECHECK" exports, + * but as nfsd_acceptable depends on correct + * access control settings being in effect, we cannot + * fix that case easily. + */ + current->cap_effective = + cap_raise_nfsd_set(current->cap_effective, + current->cap_permitted); + } else { + error = nfsd_setuser_and_check_port(rqstp, exp); + if (error) + goto out; + } /* * Look up the dentry using the NFS file handle. @@ -215,6 +230,14 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) goto out; } + if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) { + error = nfsd_setuser_and_check_port(rqstp, exp); + if (error) { + dput(dentry); + goto out; + } + } + if (S_ISDIR(dentry->d_inode->i_mode) && (dentry->d_flags & DCACHE_DISCONNECTED)) { printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %s/%s\n", @@ -279,17 +302,27 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) if (error) goto out; - if (!(access & MAY_LOCK)) { - /* - * pseudoflavor restrictions are not enforced on NLM, - * which clients virtually always use auth_sys for, - * even while using RPCSEC_GSS for NFS. - */ - error = check_nfsd_access(exp, rqstp); - if (error) - goto out; - } + /* + * pseudoflavor restrictions are not enforced on NLM, + * which clients virtually always use auth_sys for, + * even while using RPCSEC_GSS for NFS. + */ + if (access & NFSD_MAY_LOCK) + goto skip_pseudoflavor_check; + /* + * Clients may expect to be able to use auth_sys during mount, + * even if they use gss for everything else; see section 2.3.2 + * of rfc 2623. + */ + if (access & NFSD_MAY_BYPASS_GSS_ON_ROOT + && exp->ex_path.dentry == dentry) + goto skip_pseudoflavor_check; + + error = check_nfsd_access(exp, rqstp); + if (error) + goto out; +skip_pseudoflavor_check: /* Finally, check access permissions. */ error = nfsd_permission(rqstp, exp, dentry, access); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 6cfc96a12483..5cffeca7acef 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -65,7 +65,8 @@ nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP); + nfserr = fh_verify(rqstp, &resp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); return nfsd_return_attrs(nfserr, resp); } @@ -215,11 +216,11 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, SVCFH_fmt(dirfhp), argp->len, argp->name); /* First verify the parent file handle */ - nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, MAY_EXEC); + nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_EXEC); if (nfserr) goto done; /* must fh_put dirfhp even on error */ - /* Check for MAY_WRITE in nfsd_create if necessary */ + /* Check for NFSD_MAY_WRITE in nfsd_create if necessary */ nfserr = nfserr_acces; if (!argp->len) @@ -281,7 +282,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, nfserr = nfsd_permission(rqstp, newfhp->fh_export, newfhp->fh_dentry, - MAY_WRITE|MAY_LOCAL_ACCESS); + NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS); if (nfserr && nfserr != nfserr_rofs) goto out_unlock; } @@ -521,7 +522,8 @@ nfsd_proc_statfs(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh)); - nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats); + nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, + NFSD_MAY_BYPASS_GSS_ON_ROOT); fh_put(&argp->fh); return nfserr; } @@ -614,6 +616,7 @@ nfserrno (int errno) #endif { nfserr_stale, -ESTALE }, { nfserr_jukebox, -ETIMEDOUT }, + { nfserr_jukebox, -ERESTARTSYS }, { nfserr_dropit, -EAGAIN }, { nfserr_dropit, -ENOMEM }, { nfserr_badname, -ESRCH }, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 941041f4b136..59eeb46f82c5 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -21,6 +21,7 @@ #include <linux/smp_lock.h> #include <linux/freezer.h> #include <linux/fs_struct.h> +#include <linux/kthread.h> #include <linux/sunrpc/types.h> #include <linux/sunrpc/stats.h> @@ -36,28 +37,38 @@ #define NFSDDBG_FACILITY NFSDDBG_SVC -/* these signals will be delivered to an nfsd thread - * when handling a request - */ -#define ALLOWED_SIGS (sigmask(SIGKILL)) -/* these signals will be delivered to an nfsd thread - * when not handling a request. i.e. when waiting - */ -#define SHUTDOWN_SIGS (sigmask(SIGKILL) | sigmask(SIGHUP) | sigmask(SIGINT) | sigmask(SIGQUIT)) -/* if the last thread dies with SIGHUP, then the exports table is - * left unchanged ( like 2.4-{0-9} ). Any other signal will clear - * the exports table (like 2.2). - */ -#define SIG_NOCLEAN SIGHUP - extern struct svc_program nfsd_program; -static void nfsd(struct svc_rqst *rqstp); +static int nfsd(void *vrqstp); struct timeval nfssvc_boot; - struct svc_serv *nfsd_serv; static atomic_t nfsd_busy; static unsigned long nfsd_last_call; static DEFINE_SPINLOCK(nfsd_call_lock); +/* + * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members + * of the svc_serv struct. In particular, ->sv_nrthreads but also to some + * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt + * + * If (out side the lock) nfsd_serv is non-NULL, then it must point to a + * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. That number + * of nfsd threads must exist and each must listed in ->sp_all_threads in each + * entry of ->sv_pools[]. + * + * Transitions of the thread count between zero and non-zero are of particular + * interest since the svc_serv needs to be created and initialized at that + * point, or freed. + * + * Finally, the nfsd_mutex also protects some of the global variables that are + * accessed when nfsd starts and that are settable via the write_* routines in + * nfsctl.c. In particular: + * + * user_recovery_dirname + * user_lease_time + * nfsd_versions + */ +DEFINE_MUTEX(nfsd_mutex); +struct svc_serv *nfsd_serv; + #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static struct svc_stat nfsd_acl_svcstats; static struct svc_version * nfsd_acl_version[] = { @@ -145,13 +156,14 @@ int nfsd_vers(int vers, enum vers_op change) int nfsd_nrthreads(void) { - if (nfsd_serv == NULL) - return 0; - else - return nfsd_serv->sv_nrthreads; + int rv = 0; + mutex_lock(&nfsd_mutex); + if (nfsd_serv) + rv = nfsd_serv->sv_nrthreads; + mutex_unlock(&nfsd_mutex); + return rv; } -static int killsig; /* signal that was used to kill last nfsd */ static void nfsd_last_thread(struct svc_serv *serv) { /* When last nfsd thread exits we need to do some clean-up */ @@ -162,11 +174,9 @@ static void nfsd_last_thread(struct svc_serv *serv) nfsd_racache_shutdown(); nfs4_state_shutdown(); - printk(KERN_WARNING "nfsd: last server has exited\n"); - if (killsig != SIG_NOCLEAN) { - printk(KERN_WARNING "nfsd: unexporting all filesystems\n"); - nfsd_export_flush(); - } + printk(KERN_WARNING "nfsd: last server has exited, flushing export " + "cache\n"); + nfsd_export_flush(); } void nfsd_reset_versions(void) @@ -190,13 +200,14 @@ void nfsd_reset_versions(void) } } + int nfsd_create_serv(void) { int err = 0; - lock_kernel(); + + WARN_ON(!mutex_is_locked(&nfsd_mutex)); if (nfsd_serv) { svc_get(nfsd_serv); - unlock_kernel(); return 0; } if (nfsd_max_blksize == 0) { @@ -217,13 +228,12 @@ int nfsd_create_serv(void) } atomic_set(&nfsd_busy, 0); - nfsd_serv = svc_create_pooled(&nfsd_program, - nfsd_max_blksize, - nfsd_last_thread, - nfsd, SIG_NOCLEAN, THIS_MODULE); + nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, + AF_INET, + nfsd_last_thread, nfsd, THIS_MODULE); if (nfsd_serv == NULL) err = -ENOMEM; - unlock_kernel(); + do_gettimeofday(&nfssvc_boot); /* record boot time */ return err; } @@ -234,25 +244,20 @@ static int nfsd_init_socks(int port) if (!list_empty(&nfsd_serv->sv_permsocks)) return 0; - error = lockd_up(IPPROTO_UDP); - if (error >= 0) { - error = svc_create_xprt(nfsd_serv, "udp", port, + error = svc_create_xprt(nfsd_serv, "udp", port, SVC_SOCK_DEFAULTS); - if (error < 0) - lockd_down(); - } if (error < 0) return error; - error = lockd_up(IPPROTO_TCP); - if (error >= 0) { - error = svc_create_xprt(nfsd_serv, "tcp", port, + error = svc_create_xprt(nfsd_serv, "tcp", port, SVC_SOCK_DEFAULTS); - if (error < 0) - lockd_down(); - } if (error < 0) return error; + + error = lockd_up(); + if (error < 0) + return error; + return 0; } @@ -282,6 +287,8 @@ int nfsd_set_nrthreads(int n, int *nthreads) int tot = 0; int err = 0; + WARN_ON(!mutex_is_locked(&nfsd_mutex)); + if (nfsd_serv == NULL || n <= 0) return 0; @@ -316,7 +323,6 @@ int nfsd_set_nrthreads(int n, int *nthreads) nthreads[0] = 1; /* apply the new numbers */ - lock_kernel(); svc_get(nfsd_serv); for (i = 0; i < n; i++) { err = svc_set_num_threads(nfsd_serv, &nfsd_serv->sv_pools[i], @@ -325,7 +331,6 @@ int nfsd_set_nrthreads(int n, int *nthreads) break; } svc_destroy(nfsd_serv); - unlock_kernel(); return err; } @@ -334,8 +339,8 @@ int nfsd_svc(unsigned short port, int nrservs) { int error; - - lock_kernel(); + + mutex_lock(&nfsd_mutex); dprintk("nfsd: creating service\n"); error = -EINVAL; if (nrservs <= 0) @@ -363,7 +368,7 @@ nfsd_svc(unsigned short port, int nrservs) failure: svc_destroy(nfsd_serv); /* Release server */ out: - unlock_kernel(); + mutex_unlock(&nfsd_mutex); return error; } @@ -391,18 +396,17 @@ update_thread_usage(int busy_threads) /* * This is the NFS server kernel thread */ -static void -nfsd(struct svc_rqst *rqstp) +static int +nfsd(void *vrqstp) { + struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; struct fs_struct *fsp; - int err; - sigset_t shutdown_mask, allowed_mask; + int err, preverr = 0; /* Lock module and set up kernel thread */ - lock_kernel(); - daemonize("nfsd"); + mutex_lock(&nfsd_mutex); - /* After daemonize() this kernel thread shares current->fs + /* At this point, the thread shares current->fs * with the init process. We need to create files with a * umask of 0 instead of init's umask. */ fsp = copy_fs_struct(current->fs); @@ -414,14 +418,17 @@ nfsd(struct svc_rqst *rqstp) current->fs = fsp; current->fs->umask = 0; - siginitsetinv(&shutdown_mask, SHUTDOWN_SIGS); - siginitsetinv(&allowed_mask, ALLOWED_SIGS); + /* + * thread is spawned with all signals set to SIG_IGN, re-enable + * the ones that will bring down the thread + */ + allow_signal(SIGKILL); + allow_signal(SIGHUP); + allow_signal(SIGINT); + allow_signal(SIGQUIT); nfsdstats.th_cnt++; - - rqstp->rq_task = current; - - unlock_kernel(); + mutex_unlock(&nfsd_mutex); /* * We want less throttling in balance_dirty_pages() so that nfs to @@ -435,26 +442,30 @@ nfsd(struct svc_rqst *rqstp) * The main request loop */ for (;;) { - /* Block all but the shutdown signals */ - sigprocmask(SIG_SETMASK, &shutdown_mask, NULL); - /* * Find a socket with data available and call its * recvfrom routine. */ while ((err = svc_recv(rqstp, 60*60*HZ)) == -EAGAIN) ; - if (err < 0) + if (err == -EINTR) break; + else if (err < 0) { + if (err != preverr) { + printk(KERN_WARNING "%s: unexpected error " + "from svc_recv (%d)\n", __func__, -err); + preverr = err; + } + schedule_timeout_uninterruptible(HZ); + continue; + } + update_thread_usage(atomic_read(&nfsd_busy)); atomic_inc(&nfsd_busy); /* Lock the export hash tables for reading. */ exp_readlock(); - /* Process request with signals blocked. */ - sigprocmask(SIG_SETMASK, &allowed_mask, NULL); - svc_process(rqstp); /* Unlock export hash tables */ @@ -463,22 +474,10 @@ nfsd(struct svc_rqst *rqstp) atomic_dec(&nfsd_busy); } - if (err != -EINTR) { - printk(KERN_WARNING "nfsd: terminating on error %d\n", -err); - } else { - unsigned int signo; - - for (signo = 1; signo <= _NSIG; signo++) - if (sigismember(¤t->pending.signal, signo) && - !sigismember(¤t->blocked, signo)) - break; - killsig = signo; - } /* Clear signals before calling svc_exit_thread() */ flush_signals(current); - lock_kernel(); - + mutex_lock(&nfsd_mutex); nfsdstats.th_cnt --; out: @@ -486,8 +485,9 @@ out: svc_exit_thread(rqstp); /* Release module */ - unlock_kernel(); + mutex_unlock(&nfsd_mutex); module_put_and_exit(0); + return 0; } static __be32 map_new_errors(u32 vers, __be32 nfserr) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index a3a291f771f4..aa1d0d6489a1 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -83,7 +83,6 @@ struct raparm_hbucket { spinlock_t pb_lock; } ____cacheline_aligned_in_smp; -static struct raparms * raparml; #define RAPARM_HASH_BITS 4 #define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS) #define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1) @@ -144,7 +143,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name); /* Obtain dentry and export. */ - err = fh_verify(rqstp, fhp, S_IFDIR, MAY_EXEC); + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); if (err) return err; @@ -262,14 +261,14 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, { struct dentry *dentry; struct inode *inode; - int accmode = MAY_SATTR; + int accmode = NFSD_MAY_SATTR; int ftype = 0; __be32 err; int host_err; int size_change = 0; if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) - accmode |= MAY_WRITE|MAY_OWNER_OVERRIDE; + accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; if (iap->ia_valid & ATTR_SIZE) ftype = S_IFREG; @@ -331,7 +330,8 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, */ if (iap->ia_valid & ATTR_SIZE) { if (iap->ia_size < inode->i_size) { - err = nfsd_permission(rqstp, fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE); + err = nfsd_permission(rqstp, fhp->fh_export, dentry, + NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE); if (err) goto out; } @@ -462,7 +462,7 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int flags = 0; /* Get inode */ - error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, MAY_SATTR); + error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR); if (error) return error; @@ -563,20 +563,20 @@ struct accessmap { int how; }; static struct accessmap nfs3_regaccess[] = { - { NFS3_ACCESS_READ, MAY_READ }, - { NFS3_ACCESS_EXECUTE, MAY_EXEC }, - { NFS3_ACCESS_MODIFY, MAY_WRITE|MAY_TRUNC }, - { NFS3_ACCESS_EXTEND, MAY_WRITE }, + { NFS3_ACCESS_READ, NFSD_MAY_READ }, + { NFS3_ACCESS_EXECUTE, NFSD_MAY_EXEC }, + { NFS3_ACCESS_MODIFY, NFSD_MAY_WRITE|NFSD_MAY_TRUNC }, + { NFS3_ACCESS_EXTEND, NFSD_MAY_WRITE }, { 0, 0 } }; static struct accessmap nfs3_diraccess[] = { - { NFS3_ACCESS_READ, MAY_READ }, - { NFS3_ACCESS_LOOKUP, MAY_EXEC }, - { NFS3_ACCESS_MODIFY, MAY_EXEC|MAY_WRITE|MAY_TRUNC }, - { NFS3_ACCESS_EXTEND, MAY_EXEC|MAY_WRITE }, - { NFS3_ACCESS_DELETE, MAY_REMOVE }, + { NFS3_ACCESS_READ, NFSD_MAY_READ }, + { NFS3_ACCESS_LOOKUP, NFSD_MAY_EXEC }, + { NFS3_ACCESS_MODIFY, NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC}, + { NFS3_ACCESS_EXTEND, NFSD_MAY_EXEC|NFSD_MAY_WRITE }, + { NFS3_ACCESS_DELETE, NFSD_MAY_REMOVE }, { 0, 0 } }; @@ -589,10 +589,10 @@ static struct accessmap nfs3_anyaccess[] = { * mainly at mode bits, and we make sure to ignore read-only * filesystem checks */ - { NFS3_ACCESS_READ, MAY_READ }, - { NFS3_ACCESS_EXECUTE, MAY_EXEC }, - { NFS3_ACCESS_MODIFY, MAY_WRITE|MAY_LOCAL_ACCESS }, - { NFS3_ACCESS_EXTEND, MAY_WRITE|MAY_LOCAL_ACCESS }, + { NFS3_ACCESS_READ, NFSD_MAY_READ }, + { NFS3_ACCESS_EXECUTE, NFSD_MAY_EXEC }, + { NFS3_ACCESS_MODIFY, NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS }, + { NFS3_ACCESS_EXTEND, NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS }, { 0, 0 } }; @@ -606,7 +606,7 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor u32 query, result = 0, sresult = 0; __be32 error; - error = fh_verify(rqstp, fhp, 0, MAY_NOP); + error = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); if (error) goto out; @@ -678,7 +678,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, * and (hopefully) checked permission - so allow OWNER_OVERRIDE * in case a chmod has now revoked permission. */ - err = fh_verify(rqstp, fhp, type, access | MAY_OWNER_OVERRIDE); + err = fh_verify(rqstp, fhp, type, access | NFSD_MAY_OWNER_OVERRIDE); if (err) goto out; @@ -689,7 +689,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, * or any access when mandatory locking enabled */ err = nfserr_perm; - if (IS_APPEND(inode) && (access & MAY_WRITE)) + if (IS_APPEND(inode) && (access & NFSD_MAY_WRITE)) goto out; /* * We must ignore files (but only files) which might have mandatory @@ -706,14 +706,14 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, * Check to see if there are any leases on this file. * This may block while leases are broken. */ - host_err = break_lease(inode, O_NONBLOCK | ((access & MAY_WRITE) ? FMODE_WRITE : 0)); + host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? FMODE_WRITE : 0)); if (host_err == -EWOULDBLOCK) host_err = -ETIMEDOUT; if (host_err) /* NOMEM or WOULDBLOCK */ goto out_nfserr; - if (access & MAY_WRITE) { - if (access & MAY_READ) + if (access & NFSD_MAY_WRITE) { + if (access & NFSD_MAY_READ) flags = O_RDWR|O_LARGEFILE; else flags = O_WRONLY|O_LARGEFILE; @@ -1069,12 +1069,12 @@ nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (file) { err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, - MAY_READ|MAY_OWNER_OVERRIDE); + NFSD_MAY_READ|NFSD_MAY_OWNER_OVERRIDE); if (err) goto out; err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); } else { - err = nfsd_open(rqstp, fhp, S_IFREG, MAY_READ, &file); + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); if (err) goto out; err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); @@ -1098,13 +1098,13 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (file) { err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, - MAY_WRITE|MAY_OWNER_OVERRIDE); + NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE); if (err) goto out; err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stablep); } else { - err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file); + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); if (err) goto out; @@ -1136,7 +1136,8 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, if ((u64)count > ~(u64)offset) return nfserr_inval; - if ((err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file)) != 0) + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); + if (err) return err; if (EX_ISSYNC(fhp->fh_export)) { if (file->f_op && file->f_op->fsync) { @@ -1197,7 +1198,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, if (isdotent(fname, flen)) goto out; - err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE); + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; @@ -1248,36 +1249,34 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, iap->ia_mode = 0; iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type; + err = nfserr_inval; + if (!S_ISREG(type) && !S_ISDIR(type) && !special_file(type)) { + printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n", + type); + goto out; + } + + host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (host_err) + goto out_nfserr; + /* * Get the dir op function pointer. */ err = 0; switch (type) { case S_IFREG: - host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); - if (host_err) - goto out_nfserr; host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); break; case S_IFDIR: - host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); - if (host_err) - goto out_nfserr; host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); break; case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: - host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); - if (host_err) - goto out_nfserr; host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); break; - default: - printk("nfsd: bad file type %o in nfsd_create\n", type); - host_err = -EINVAL; - goto out_nfserr; } if (host_err < 0) { mnt_drop_write(fhp->fh_export->ex_path.mnt); @@ -1289,7 +1288,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, write_inode_now(dchild->d_inode, 1); } - err2 = nfsd_create_setattr(rqstp, resfhp, iap); if (err2) err = err2; @@ -1334,7 +1332,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; if (!(iap->ia_valid & ATTR_MODE)) iap->ia_mode = 0; - err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE); + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; @@ -1471,7 +1469,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) __be32 err; int host_err; - err = fh_verify(rqstp, fhp, S_IFLNK, MAY_NOP); + err = fh_verify(rqstp, fhp, S_IFLNK, NFSD_MAY_NOP); if (err) goto out; @@ -1517,7 +1515,6 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, struct dentry *dentry, *dnew; __be32 err, cerr; int host_err; - umode_t mode; err = nfserr_noent; if (!flen || !plen) @@ -1526,7 +1523,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, if (isdotent(fname, flen)) goto out; - err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE); + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; fh_lock(fhp); @@ -1536,11 +1533,6 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, if (IS_ERR(dnew)) goto out_nfserr; - mode = S_IALLUGO; - /* Only the MODE ATTRibute is even vaguely meaningful */ - if (iap && (iap->ia_valid & ATTR_MODE)) - mode = iap->ia_mode & S_IALLUGO; - host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); if (host_err) goto out_nfserr; @@ -1552,11 +1544,11 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, else { strncpy(path_alloced, path, plen); path_alloced[plen] = 0; - host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced, mode); + host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced); kfree(path_alloced); } } else - host_err = vfs_symlink(dentry->d_inode, dnew, path, mode); + host_err = vfs_symlink(dentry->d_inode, dnew, path); if (!host_err) { if (EX_ISSYNC(fhp->fh_export)) @@ -1591,10 +1583,10 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, __be32 err; int host_err; - err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_CREATE); + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; - err = fh_verify(rqstp, tfhp, -S_IFDIR, MAY_NOP); + err = fh_verify(rqstp, tfhp, -S_IFDIR, NFSD_MAY_NOP); if (err) goto out; @@ -1661,10 +1653,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, __be32 err; int host_err; - err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_REMOVE); + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); if (err) goto out; - err = fh_verify(rqstp, tfhp, S_IFDIR, MAY_CREATE); + err = fh_verify(rqstp, tfhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; @@ -1768,7 +1760,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, err = nfserr_acces; if (!flen || isdotent(fname, flen)) goto out; - err = fh_verify(rqstp, fhp, S_IFDIR, MAY_REMOVE); + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_REMOVE); if (err) goto out; @@ -1834,7 +1826,7 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, struct file *file; loff_t offset = *offsetp; - err = nfsd_open(rqstp, fhp, S_IFDIR, MAY_READ, &file); + err = nfsd_open(rqstp, fhp, S_IFDIR, NFSD_MAY_READ, &file); if (err) goto out; @@ -1873,9 +1865,9 @@ out: * N.B. After this call fhp needs an fh_put */ __be32 -nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat) +nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access) { - __be32 err = fh_verify(rqstp, fhp, 0, MAY_NOP); + __be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access); if (!err && vfs_statfs(fhp->fh_dentry,stat)) err = nfserr_io; return err; @@ -1896,18 +1888,18 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, struct inode *inode = dentry->d_inode; int err; - if (acc == MAY_NOP) + if (acc == NFSD_MAY_NOP) return 0; #if 0 dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n", acc, - (acc & MAY_READ)? " read" : "", - (acc & MAY_WRITE)? " write" : "", - (acc & MAY_EXEC)? " exec" : "", - (acc & MAY_SATTR)? " sattr" : "", - (acc & MAY_TRUNC)? " trunc" : "", - (acc & MAY_LOCK)? " lock" : "", - (acc & MAY_OWNER_OVERRIDE)? " owneroverride" : "", + (acc & NFSD_MAY_READ)? " read" : "", + (acc & NFSD_MAY_WRITE)? " write" : "", + (acc & NFSD_MAY_EXEC)? " exec" : "", + (acc & NFSD_MAY_SATTR)? " sattr" : "", + (acc & NFSD_MAY_TRUNC)? " trunc" : "", + (acc & NFSD_MAY_LOCK)? " lock" : "", + (acc & NFSD_MAY_OWNER_OVERRIDE)? " owneroverride" : "", inode->i_mode, IS_IMMUTABLE(inode)? " immut" : "", IS_APPEND(inode)? " append" : "", @@ -1920,18 +1912,18 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, * system. But if it is IRIX doing check on write-access for a * device special file, we ignore rofs. */ - if (!(acc & MAY_LOCAL_ACCESS)) - if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) { + if (!(acc & NFSD_MAY_LOCAL_ACCESS)) + if (acc & (NFSD_MAY_WRITE | NFSD_MAY_SATTR | NFSD_MAY_TRUNC)) { if (exp_rdonly(rqstp, exp) || __mnt_is_readonly(exp->ex_path.mnt)) return nfserr_rofs; - if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode)) + if (/* (acc & NFSD_MAY_WRITE) && */ IS_IMMUTABLE(inode)) return nfserr_perm; } - if ((acc & MAY_TRUNC) && IS_APPEND(inode)) + if ((acc & NFSD_MAY_TRUNC) && IS_APPEND(inode)) return nfserr_perm; - if (acc & MAY_LOCK) { + if (acc & NFSD_MAY_LOCK) { /* If we cannot rely on authentication in NLM requests, * just allow locks, otherwise require read permission, or * ownership @@ -1939,7 +1931,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, if (exp->ex_flags & NFSEXP_NOAUTHNLM) return 0; else - acc = MAY_READ | MAY_OWNER_OVERRIDE; + acc = NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE; } /* * The file owner always gets access permission for accesses that @@ -1955,16 +1947,17 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, * We must trust the client to do permission checking - using "ACCESS" * with NFSv3. */ - if ((acc & MAY_OWNER_OVERRIDE) && + if ((acc & NFSD_MAY_OWNER_OVERRIDE) && inode->i_uid == current->fsuid) return 0; - err = permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC), NULL); + /* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */ + err = inode_permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC)); /* Allow read access to binaries even when mode 111 */ if (err == -EACCES && S_ISREG(inode->i_mode) && - acc == (MAY_READ | MAY_OWNER_OVERRIDE)) - err = permission(inode, MAY_EXEC, NULL); + acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) + err = inode_permission(inode, MAY_EXEC); return err? nfserrno(err) : 0; } @@ -1972,11 +1965,20 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, void nfsd_racache_shutdown(void) { - if (!raparml) - return; + struct raparms *raparm, *last_raparm; + unsigned int i; + dprintk("nfsd: freeing readahead buffers.\n"); - kfree(raparml); - raparml = NULL; + + for (i = 0; i < RAPARM_HASH_SIZE; i++) { + raparm = raparm_hash[i].pb_head; + while(raparm) { + last_raparm = raparm; + raparm = raparm->p_next; + kfree(last_raparm); + } + raparm_hash[i].pb_head = NULL; + } } /* * Initialize readahead param cache @@ -1987,35 +1989,38 @@ nfsd_racache_init(int cache_size) int i; int j = 0; int nperbucket; + struct raparms **raparm = NULL; - if (raparml) + if (raparm_hash[0].pb_head) return 0; - if (cache_size < 2*RAPARM_HASH_SIZE) - cache_size = 2*RAPARM_HASH_SIZE; - raparml = kcalloc(cache_size, sizeof(struct raparms), GFP_KERNEL); - - if (!raparml) { - printk(KERN_WARNING - "nfsd: Could not allocate memory read-ahead cache.\n"); - return -ENOMEM; - } + nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); + if (nperbucket < 2) + nperbucket = 2; + cache_size = nperbucket * RAPARM_HASH_SIZE; dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); - for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) { - raparm_hash[i].pb_head = NULL; + + for (i = 0; i < RAPARM_HASH_SIZE; i++) { spin_lock_init(&raparm_hash[i].pb_lock); - } - nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); - for (i = 0; i < cache_size - 1; i++) { - if (i % nperbucket == 0) - raparm_hash[j++].pb_head = raparml + i; - if (i % nperbucket < nperbucket-1) - raparml[i].p_next = raparml + i + 1; + + raparm = &raparm_hash[i].pb_head; + for (j = 0; j < nperbucket; j++) { + *raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL); + if (!*raparm) + goto out_nomem; + raparm = &(*raparm)->p_next; + } + *raparm = NULL; } nfsdstats.ra_size = cache_size; return 0; + +out_nomem: + dprintk("nfsd: kmalloc failed, freeing readahead buffers\n"); + nfsd_racache_shutdown(); + return -ENOMEM; } #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 00e9ccde8e42..b38f944f0667 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1194,7 +1194,7 @@ lock_retry_remap: tbh = bhs[i]; if (!tbh) continue; - if (unlikely(test_set_buffer_locked(tbh))) + if (!trylock_buffer(tbh)) BUG(); /* The buffer dirty state is now irrelevant, just clean it. */ clear_buffer_dirty(tbh); diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c index 33ff314cc507..9669541d0119 100644 --- a/fs/ntfs/compress.c +++ b/fs/ntfs/compress.c @@ -665,7 +665,7 @@ lock_retry_remap: for (i = 0; i < nr_bhs; i++) { struct buffer_head *tbh = bhs[i]; - if (unlikely(test_set_buffer_locked(tbh))) + if (!trylock_buffer(tbh)) continue; if (unlikely(buffer_uptodate(tbh))) { unlock_buffer(tbh); diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 3c5550cd11d6..d020866d4232 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2118,7 +2118,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb, goto out; if (!count) goto out; - err = remove_suid(file->f_path.dentry); + err = file_remove_suid(file); if (err) goto out; file_update_time(file); diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 790defb847e7..17d32ca6bc35 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -586,7 +586,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { struct buffer_head *tbh = bhs[i_bhs]; - if (unlikely(test_set_buffer_locked(tbh))) + if (!trylock_buffer(tbh)) BUG(); BUG_ON(!buffer_uptodate(tbh)); clear_buffer_dirty(tbh); @@ -779,7 +779,7 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync) for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { struct buffer_head *tbh = bhs[i_bhs]; - if (unlikely(test_set_buffer_locked(tbh))) + if (!trylock_buffer(tbh)) BUG(); BUG_ON(!buffer_uptodate(tbh)); clear_buffer_dirty(tbh); diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index e1781c8b1650..9e8a95be7a1e 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -174,7 +174,6 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, // TODO: Consider moving this lot to a separate function! (AIA) handle_name: { - struct dentry *real_dent, *new_dent; MFT_RECORD *m; ntfs_attr_search_ctx *ctx; ntfs_inode *ni = NTFS_I(dent_inode); @@ -255,93 +254,9 @@ handle_name: } nls_name.hash = full_name_hash(nls_name.name, nls_name.len); - /* - * Note: No need for dent->d_lock lock as i_mutex is held on the - * parent inode. - */ - - /* Does a dentry matching the nls_name exist already? */ - real_dent = d_lookup(dent->d_parent, &nls_name); - /* If not, create it now. */ - if (!real_dent) { - real_dent = d_alloc(dent->d_parent, &nls_name); - kfree(nls_name.name); - if (!real_dent) { - err = -ENOMEM; - goto err_out; - } - new_dent = d_splice_alias(dent_inode, real_dent); - if (new_dent) - dput(real_dent); - else - new_dent = real_dent; - ntfs_debug("Done. (Created new dentry.)"); - return new_dent; - } + dent = d_add_ci(dent, dent_inode, &nls_name); kfree(nls_name.name); - /* Matching dentry exists, check if it is negative. */ - if (real_dent->d_inode) { - if (unlikely(real_dent->d_inode != dent_inode)) { - /* This can happen because bad inodes are unhashed. */ - BUG_ON(!is_bad_inode(dent_inode)); - BUG_ON(!is_bad_inode(real_dent->d_inode)); - } - /* - * Already have the inode and the dentry attached, decrement - * the reference count to balance the ntfs_iget() we did - * earlier on. We found the dentry using d_lookup() so it - * cannot be disconnected and thus we do not need to worry - * about any NFS/disconnectedness issues here. - */ - iput(dent_inode); - ntfs_debug("Done. (Already had inode and dentry.)"); - return real_dent; - } - /* - * Negative dentry: instantiate it unless the inode is a directory and - * has a 'disconnected' dentry (i.e. IS_ROOT and DCACHE_DISCONNECTED), - * in which case d_move() that in place of the found dentry. - */ - if (!S_ISDIR(dent_inode->i_mode)) { - /* Not a directory; everything is easy. */ - d_instantiate(real_dent, dent_inode); - ntfs_debug("Done. (Already had negative file dentry.)"); - return real_dent; - } - spin_lock(&dcache_lock); - if (list_empty(&dent_inode->i_dentry)) { - /* - * Directory without a 'disconnected' dentry; we need to do - * d_instantiate() by hand because it takes dcache_lock which - * we already hold. - */ - list_add(&real_dent->d_alias, &dent_inode->i_dentry); - real_dent->d_inode = dent_inode; - spin_unlock(&dcache_lock); - security_d_instantiate(real_dent, dent_inode); - ntfs_debug("Done. (Already had negative directory dentry.)"); - return real_dent; - } - /* - * Directory with a 'disconnected' dentry; get a reference to the - * 'disconnected' dentry. - */ - new_dent = list_entry(dent_inode->i_dentry.next, struct dentry, - d_alias); - dget_locked(new_dent); - spin_unlock(&dcache_lock); - /* Do security vodoo. */ - security_d_instantiate(real_dent, dent_inode); - /* Move new_dent in place of real_dent. */ - d_move(new_dent, real_dent); - /* Balance the ntfs_iget() we did above. */ - iput(dent_inode); - /* Throw away real_dent. */ - dput(real_dent); - /* Use new_dent as the actual dentry. */ - ntfs_debug("Done. (Already had negative, disconnected directory " - "dentry.)"); - return new_dent; + return dent; eio_err_out: ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk."); diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 3e76f3b216bc..4a46743b5077 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3080,7 +3080,7 @@ struct kmem_cache *ntfs_inode_cache; struct kmem_cache *ntfs_big_inode_cache; /* Init once constructor for the inode slab cache. */ -static void ntfs_big_inode_init_once(struct kmem_cache *cachep, void *foo) +static void ntfs_big_inode_init_once(void *foo) { ntfs_inode *ni = (ntfs_inode *)foo; diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h index 3a8af75351e8..4087fbdac327 100644 --- a/fs/ntfs/usnjrnl.h +++ b/fs/ntfs/usnjrnl.h @@ -113,7 +113,7 @@ typedef struct { * Reason flags (32-bit). Cumulative flags describing the change(s) to the * file since it was last opened. I think the names speak for themselves but * if you disagree check out the descriptions in the Linux NTFS project NTFS - * documentation: http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html + * documentation: http://www.linux-ntfs.org/ */ enum { USN_REASON_DATA_OVERWRITE = const_cpu_to_le32(0x00000001), @@ -145,7 +145,7 @@ typedef le32 USN_REASON_FLAGS; * Source info flags (32-bit). Information about the source of the change(s) * to the file. For detailed descriptions of what these mean, see the Linux * NTFS project NTFS documentation: - * http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html + * http://www.linux-ntfs.org/ */ enum { USN_SOURCE_DATA_MANAGEMENT = const_cpu_to_le32(0x00000001), diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index f6956de56fdb..589dcdfdfe3c 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -34,7 +34,8 @@ ocfs2-objs := \ symlink.o \ sysfile.o \ uptodate.o \ - ver.o + ver.o \ + xattr.o ocfs2_stackglue-objs := stackglue.o ocfs2_stack_o2cb-objs := stack_o2cb.o diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 10bfb466e068..0cc2deb9394c 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -49,6 +49,340 @@ #include "buffer_head_io.h" + +/* + * Operations for a specific extent tree type. + * + * To implement an on-disk btree (extent tree) type in ocfs2, add + * an ocfs2_extent_tree_operations structure and the matching + * ocfs2_init_<thingy>_extent_tree() function. That's pretty much it + * for the allocation portion of the extent tree. + */ +struct ocfs2_extent_tree_operations { + /* + * last_eb_blk is the block number of the right most leaf extent + * block. Most on-disk structures containing an extent tree store + * this value for fast access. The ->eo_set_last_eb_blk() and + * ->eo_get_last_eb_blk() operations access this value. They are + * both required. + */ + void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et, + u64 blkno); + u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et); + + /* + * The on-disk structure usually keeps track of how many total + * clusters are stored in this extent tree. This function updates + * that value. new_clusters is the delta, and must be + * added to the total. Required. + */ + void (*eo_update_clusters)(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 new_clusters); + + /* + * If ->eo_insert_check() exists, it is called before rec is + * inserted into the extent tree. It is optional. + */ + int (*eo_insert_check)(struct inode *inode, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); + int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et); + + /* + * -------------------------------------------------------------- + * The remaining are internal to ocfs2_extent_tree and don't have + * accessor functions + */ + + /* + * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el. + * It is required. + */ + void (*eo_fill_root_el)(struct ocfs2_extent_tree *et); + + /* + * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if + * it exists. If it does not, et->et_max_leaf_clusters is set + * to 0 (unlimited). Optional. + */ + void (*eo_fill_max_leaf_clusters)(struct inode *inode, + struct ocfs2_extent_tree *et); +}; + + +/* + * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check + * in the methods. + */ +static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et); +static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno); +static void ocfs2_dinode_update_clusters(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters); +static int ocfs2_dinode_insert_check(struct inode *inode, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); +static int ocfs2_dinode_sanity_check(struct inode *inode, + struct ocfs2_extent_tree *et); +static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); +static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { + .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, + .eo_update_clusters = ocfs2_dinode_update_clusters, + .eo_insert_check = ocfs2_dinode_insert_check, + .eo_sanity_check = ocfs2_dinode_sanity_check, + .eo_fill_root_el = ocfs2_dinode_fill_root_el, +}; + +static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_dinode *di = et->et_object; + + BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + di->i_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dinode *di = et->et_object; + + BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + return le64_to_cpu(di->i_last_eb_blk); +} + +static void ocfs2_dinode_update_clusters(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_dinode *di = et->et_object; + + le32_add_cpu(&di->i_clusters, clusters); + spin_lock(&OCFS2_I(inode)->ip_lock); + OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); + spin_unlock(&OCFS2_I(inode)->ip_lock); +} + +static int ocfs2_dinode_insert_check(struct inode *inode, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL); + mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && + (OCFS2_I(inode)->ip_clusters != rec->e_cpos), + "Device %s, asking for sparse allocation: inode %llu, " + "cpos %u, clusters %u\n", + osb->dev_str, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + rec->e_cpos, + OCFS2_I(inode)->ip_clusters); + + return 0; +} + +static int ocfs2_dinode_sanity_check(struct inode *inode, + struct ocfs2_extent_tree *et) +{ + int ret = 0; + struct ocfs2_dinode *di; + + BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + + di = et->et_object; + if (!OCFS2_IS_VALID_DINODE(di)) { + ret = -EIO; + ocfs2_error(inode->i_sb, + "Inode %llu has invalid path root", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + } + + return ret; +} + +static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dinode *di = et->et_object; + + et->et_root_el = &di->id2.i_list; +} + + +static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_value_root *xv = et->et_object; + + et->et_root_el = &xv->xr_list; +} + +static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_xattr_value_root *xv = + (struct ocfs2_xattr_value_root *)et->et_object; + + xv->xr_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_value_root *xv = + (struct ocfs2_xattr_value_root *) et->et_object; + + return le64_to_cpu(xv->xr_last_eb_blk); +} + +static void ocfs2_xattr_value_update_clusters(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_xattr_value_root *xv = + (struct ocfs2_xattr_value_root *)et->et_object; + + le32_add_cpu(&xv->xr_clusters, clusters); +} + +static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { + .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk, + .eo_update_clusters = ocfs2_xattr_value_update_clusters, + .eo_fill_root_el = ocfs2_xattr_value_fill_root_el, +}; + +static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_block *xb = et->et_object; + + et->et_root_el = &xb->xb_attrs.xb_root.xt_list; +} + +static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode, + struct ocfs2_extent_tree *et) +{ + et->et_max_leaf_clusters = + ocfs2_clusters_for_bytes(inode->i_sb, + OCFS2_MAX_XATTR_TREE_LEAF_SIZE); +} + +static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_xattr_block *xb = et->et_object; + struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; + + xt->xt_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_block *xb = et->et_object; + struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; + + return le64_to_cpu(xt->xt_last_eb_blk); +} + +static void ocfs2_xattr_tree_update_clusters(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_xattr_block *xb = et->et_object; + + le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters); +} + +static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { + .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk, + .eo_update_clusters = ocfs2_xattr_tree_update_clusters, + .eo_fill_root_el = ocfs2_xattr_tree_fill_root_el, + .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters, +}; + +static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh, + void *obj, + struct ocfs2_extent_tree_operations *ops) +{ + et->et_ops = ops; + et->et_root_bh = bh; + if (!obj) + obj = (void *)bh->b_data; + et->et_object = obj; + + et->et_ops->eo_fill_root_el(et); + if (!et->et_ops->eo_fill_max_leaf_clusters) + et->et_max_leaf_clusters = 0; + else + et->et_ops->eo_fill_max_leaf_clusters(inode, et); +} + +void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops); +} + +void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, inode, bh, NULL, + &ocfs2_xattr_tree_et_ops); +} + +void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh, + struct ocfs2_xattr_value_root *xv) +{ + __ocfs2_init_extent_tree(et, inode, bh, xv, + &ocfs2_xattr_value_et_ops); +} + +static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 new_last_eb_blk) +{ + et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk); +} + +static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + return et->et_ops->eo_get_last_eb_blk(et); +} + +static inline void ocfs2_et_update_clusters(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters) +{ + et->et_ops->eo_update_clusters(inode, et, clusters); +} + +static inline int ocfs2_et_insert_check(struct inode *inode, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + int ret = 0; + + if (et->et_ops->eo_insert_check) + ret = et->et_ops->eo_insert_check(inode, et, rec); + return ret; +} + +static inline int ocfs2_et_sanity_check(struct inode *inode, + struct ocfs2_extent_tree *et) +{ + int ret = 0; + + if (et->et_ops->eo_sanity_check) + ret = et->et_ops->eo_sanity_check(inode, et); + return ret; +} + static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, struct ocfs2_extent_block *eb); @@ -205,17 +539,6 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, } /* - * Allocate and initialize a new path based on a disk inode tree. - */ -static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh) -{ - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; - struct ocfs2_extent_list *el = &di->id2.i_list; - - return ocfs2_new_path(di_bh, el); -} - -/* * Convenience function to journal all components in a path. */ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, @@ -368,39 +691,35 @@ struct ocfs2_merge_ctxt { */ int ocfs2_num_free_extents(struct ocfs2_super *osb, struct inode *inode, - struct ocfs2_dinode *fe) + struct ocfs2_extent_tree *et) { int retval; - struct ocfs2_extent_list *el; + struct ocfs2_extent_list *el = NULL; struct ocfs2_extent_block *eb; struct buffer_head *eb_bh = NULL; + u64 last_eb_blk = 0; mlog_entry_void(); - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); - retval = -EIO; - goto bail; - } + el = et->et_root_el; + last_eb_blk = ocfs2_et_get_last_eb_blk(et); - if (fe->i_last_eb_blk) { - retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), - &eb_bh, OCFS2_BH_CACHED, inode); + if (last_eb_blk) { + retval = ocfs2_read_block(inode, last_eb_blk, + &eb_bh); if (retval < 0) { mlog_errno(retval); goto bail; } eb = (struct ocfs2_extent_block *) eb_bh->b_data; el = &eb->h_list; - } else - el = &fe->id2.i_list; + } BUG_ON(el->l_tree_depth != 0); retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); bail: - if (eb_bh) - brelse(eb_bh); + brelse(eb_bh); mlog_exit(retval); return retval; @@ -486,8 +805,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, bail: if (status < 0) { for(i = 0; i < wanted; i++) { - if (bhs[i]) - brelse(bhs[i]); + brelse(bhs[i]); bhs[i] = NULL; } } @@ -531,7 +849,7 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el) static int ocfs2_add_branch(struct ocfs2_super *osb, handle_t *handle, struct inode *inode, - struct buffer_head *fe_bh, + struct ocfs2_extent_tree *et, struct buffer_head *eb_bh, struct buffer_head **last_eb_bh, struct ocfs2_alloc_context *meta_ac) @@ -540,7 +858,6 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, u64 next_blkno, new_last_eb_blk; struct buffer_head *bh; struct buffer_head **new_eb_bhs = NULL; - struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *eb_el; struct ocfs2_extent_list *el; @@ -550,13 +867,11 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, BUG_ON(!last_eb_bh || !*last_eb_bh); - fe = (struct ocfs2_dinode *) fe_bh->b_data; - if (eb_bh) { eb = (struct ocfs2_extent_block *) eb_bh->b_data; el = &eb->h_list; } else - el = &fe->id2.i_list; + el = et->et_root_el; /* we never add a branch to a leaf. */ BUG_ON(!el->l_tree_depth); @@ -646,7 +961,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, mlog_errno(status); goto bail; } - status = ocfs2_journal_access(handle, inode, fe_bh, + status = ocfs2_journal_access(handle, inode, et->et_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -662,7 +977,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, } /* Link the new branch into the rest of the tree (el will - * either be on the fe, or the extent block passed in. */ + * either be on the root_bh, or the extent block passed in. */ i = le16_to_cpu(el->l_next_free_rec); el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); @@ -671,7 +986,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, /* fe needs a new last extent block pointer, as does the * next_leaf on the previously last-extent-block. */ - fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); + ocfs2_et_set_last_eb_blk(et, new_last_eb_blk); eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); @@ -679,7 +994,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, status = ocfs2_journal_dirty(handle, *last_eb_bh); if (status < 0) mlog_errno(status); - status = ocfs2_journal_dirty(handle, fe_bh); + status = ocfs2_journal_dirty(handle, et->et_root_bh); if (status < 0) mlog_errno(status); if (eb_bh) { @@ -700,8 +1015,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, bail: if (new_eb_bhs) { for (i = 0; i < new_blocks; i++) - if (new_eb_bhs[i]) - brelse(new_eb_bhs[i]); + brelse(new_eb_bhs[i]); kfree(new_eb_bhs); } @@ -717,16 +1031,15 @@ bail: static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, handle_t *handle, struct inode *inode, - struct buffer_head *fe_bh, + struct ocfs2_extent_tree *et, struct ocfs2_alloc_context *meta_ac, struct buffer_head **ret_new_eb_bh) { int status, i; u32 new_clusters; struct buffer_head *new_eb_bh = NULL; - struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; - struct ocfs2_extent_list *fe_el; + struct ocfs2_extent_list *root_el; struct ocfs2_extent_list *eb_el; mlog_entry_void(); @@ -746,8 +1059,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, } eb_el = &eb->h_list; - fe = (struct ocfs2_dinode *) fe_bh->b_data; - fe_el = &fe->id2.i_list; + root_el = et->et_root_el; status = ocfs2_journal_access(handle, inode, new_eb_bh, OCFS2_JOURNAL_ACCESS_CREATE); @@ -756,11 +1068,11 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, goto bail; } - /* copy the fe data into the new extent block */ - eb_el->l_tree_depth = fe_el->l_tree_depth; - eb_el->l_next_free_rec = fe_el->l_next_free_rec; - for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) - eb_el->l_recs[i] = fe_el->l_recs[i]; + /* copy the root extent list data into the new extent block */ + eb_el->l_tree_depth = root_el->l_tree_depth; + eb_el->l_next_free_rec = root_el->l_next_free_rec; + for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++) + eb_el->l_recs[i] = root_el->l_recs[i]; status = ocfs2_journal_dirty(handle, new_eb_bh); if (status < 0) { @@ -768,7 +1080,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, goto bail; } - status = ocfs2_journal_access(handle, inode, fe_bh, + status = ocfs2_journal_access(handle, inode, et->et_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -777,21 +1089,21 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, new_clusters = ocfs2_sum_rightmost_rec(eb_el); - /* update fe now */ - le16_add_cpu(&fe_el->l_tree_depth, 1); - fe_el->l_recs[0].e_cpos = 0; - fe_el->l_recs[0].e_blkno = eb->h_blkno; - fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); - for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) - memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); - fe_el->l_next_free_rec = cpu_to_le16(1); + /* update root_bh now */ + le16_add_cpu(&root_el->l_tree_depth, 1); + root_el->l_recs[0].e_cpos = 0; + root_el->l_recs[0].e_blkno = eb->h_blkno; + root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); + for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) + memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); + root_el->l_next_free_rec = cpu_to_le16(1); /* If this is our 1st tree depth shift, then last_eb_blk * becomes the allocated extent block */ - if (fe_el->l_tree_depth == cpu_to_le16(1)) - fe->i_last_eb_blk = eb->h_blkno; + if (root_el->l_tree_depth == cpu_to_le16(1)) + ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); - status = ocfs2_journal_dirty(handle, fe_bh); + status = ocfs2_journal_dirty(handle, et->et_root_bh); if (status < 0) { mlog_errno(status); goto bail; @@ -801,8 +1113,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, new_eb_bh = NULL; status = 0; bail: - if (new_eb_bh) - brelse(new_eb_bh); + brelse(new_eb_bh); mlog_exit(status); return status; @@ -817,22 +1128,21 @@ bail: * 1) a lowest extent block is found, then we pass it back in * *lowest_eb_bh and return '0' * - * 2) the search fails to find anything, but the dinode has room. We + * 2) the search fails to find anything, but the root_el has room. We * pass NULL back in *lowest_eb_bh, but still return '0' * - * 3) the search fails to find anything AND the dinode is full, in + * 3) the search fails to find anything AND the root_el is full, in * which case we return > 0 * * return status < 0 indicates an error. */ static int ocfs2_find_branch_target(struct ocfs2_super *osb, struct inode *inode, - struct buffer_head *fe_bh, + struct ocfs2_extent_tree *et, struct buffer_head **target_bh) { int status = 0, i; u64 blkno; - struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; struct buffer_head *bh = NULL; @@ -842,8 +1152,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb, *target_bh = NULL; - fe = (struct ocfs2_dinode *) fe_bh->b_data; - el = &fe->id2.i_list; + el = et->et_root_el; while(le16_to_cpu(el->l_tree_depth) > 1) { if (le16_to_cpu(el->l_next_free_rec) == 0) { @@ -864,13 +1173,10 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb, goto bail; } - if (bh) { - brelse(bh); - bh = NULL; - } + brelse(bh); + bh = NULL; - status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED, - inode); + status = ocfs2_read_block(inode, blkno, &bh); if (status < 0) { mlog_errno(status); goto bail; @@ -886,8 +1192,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb, if (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count)) { - if (lowest_bh) - brelse(lowest_bh); + brelse(lowest_bh); lowest_bh = bh; get_bh(lowest_bh); } @@ -895,14 +1200,13 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb, /* If we didn't find one and the fe doesn't have any room, * then return '1' */ - if (!lowest_bh - && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count)) + el = et->et_root_el; + if (!lowest_bh && (el->l_next_free_rec == el->l_count)) status = 1; *target_bh = lowest_bh; bail: - if (bh) - brelse(bh); + brelse(bh); mlog_exit(status); return status; @@ -919,19 +1223,19 @@ bail: * *last_eb_bh will be updated by ocfs2_add_branch(). */ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, - struct buffer_head *di_bh, int *final_depth, + struct ocfs2_extent_tree *et, int *final_depth, struct buffer_head **last_eb_bh, struct ocfs2_alloc_context *meta_ac) { int ret, shift; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; - int depth = le16_to_cpu(di->id2.i_list.l_tree_depth); + struct ocfs2_extent_list *el = et->et_root_el; + int depth = le16_to_cpu(el->l_tree_depth); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct buffer_head *bh = NULL; BUG_ON(meta_ac == NULL); - shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh); + shift = ocfs2_find_branch_target(osb, inode, et, &bh); if (shift < 0) { ret = shift; mlog_errno(ret); @@ -948,7 +1252,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, /* ocfs2_shift_tree_depth will return us a buffer with * the new extent block (so we can pass that to * ocfs2_add_branch). */ - ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh, + ret = ocfs2_shift_tree_depth(osb, handle, inode, et, meta_ac, &bh); if (ret < 0) { mlog_errno(ret); @@ -975,7 +1279,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, /* call ocfs2_add_branch to add the final part of the tree with * the new data. */ mlog(0, "add branch. bh = %p\n", bh); - ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh, + ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh, meta_ac); if (ret < 0) { mlog_errno(ret); @@ -990,15 +1294,6 @@ out: } /* - * This is only valid for leaf nodes, which are the only ones that can - * have empty extents anyway. - */ -static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) -{ - return !rec->e_leaf_clusters; -} - -/* * This function will discard the rightmost extent record. */ static void ocfs2_shift_records_right(struct ocfs2_extent_list *el) @@ -1245,8 +1540,7 @@ static int __ocfs2_find_path(struct inode *inode, brelse(bh); bh = NULL; - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, - &bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, blkno, &bh); if (ret) { mlog_errno(ret); goto out; @@ -2067,11 +2361,11 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, struct ocfs2_path *right_path, int subtree_index, struct ocfs2_cached_dealloc_ctxt *dealloc, - int *deleted) + int *deleted, + struct ocfs2_extent_tree *et) { int ret, i, del_right_subtree = 0, right_has_empty = 0; - struct buffer_head *root_bh, *di_bh = path_root_bh(right_path); - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path); struct ocfs2_extent_list *right_leaf_el, *left_leaf_el; struct ocfs2_extent_block *eb; @@ -2123,7 +2417,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, * We have to update i_last_eb_blk during the meta * data delete. */ - ret = ocfs2_journal_access(handle, inode, di_bh, + ret = ocfs2_journal_access(handle, inode, et_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -2198,7 +2492,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, ocfs2_update_edge_lengths(inode, handle, left_path); eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; - di->i_last_eb_blk = eb->h_blkno; + ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); /* * Removal of the extent in the left leaf was skipped @@ -2208,7 +2502,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, if (right_has_empty) ocfs2_remove_empty_extent(left_leaf_el); - ret = ocfs2_journal_dirty(handle, di_bh); + ret = ocfs2_journal_dirty(handle, et_root_bh); if (ret) mlog_errno(ret); @@ -2331,7 +2625,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, int orig_credits, struct ocfs2_path *path, struct ocfs2_cached_dealloc_ctxt *dealloc, - struct ocfs2_path **empty_extent_path) + struct ocfs2_path **empty_extent_path, + struct ocfs2_extent_tree *et) { int ret, subtree_root, deleted; u32 right_cpos; @@ -2404,7 +2699,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, ret = ocfs2_rotate_subtree_left(inode, handle, left_path, right_path, subtree_root, - dealloc, &deleted); + dealloc, &deleted, et); if (ret == -EAGAIN) { /* * The rotation has to temporarily stop due to @@ -2447,29 +2742,20 @@ out: } static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, - struct ocfs2_path *path, - struct ocfs2_cached_dealloc_ctxt *dealloc) + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_extent_tree *et) { int ret, subtree_index; u32 cpos; struct ocfs2_path *left_path = NULL; - struct ocfs2_dinode *di; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; - /* - * XXX: This code assumes that the root is an inode, which is - * true for now but may change as tree code gets generic. - */ - di = (struct ocfs2_dinode *)path_root_bh(path)->b_data; - if (!OCFS2_IS_VALID_DINODE(di)) { - ret = -EIO; - ocfs2_error(inode->i_sb, - "Inode %llu has invalid path root", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - goto out; - } + ret = ocfs2_et_sanity_check(inode, et); + if (ret) + goto out; /* * There's two ways we handle this depending on * whether path is the only existing one. @@ -2526,7 +2812,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, ocfs2_update_edge_lengths(inode, handle, left_path); eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; - di->i_last_eb_blk = eb->h_blkno; + ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); } else { /* * 'path' is also the leftmost path which @@ -2537,12 +2823,12 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, */ ocfs2_unlink_path(inode, handle, dealloc, path, 1); - el = &di->id2.i_list; + el = et->et_root_el; el->l_tree_depth = 0; el->l_next_free_rec = 0; memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); - di->i_last_eb_blk = 0; + ocfs2_et_set_last_eb_blk(et, 0); } ocfs2_journal_dirty(handle, path_root_bh(path)); @@ -2570,7 +2856,8 @@ out: */ static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, struct ocfs2_path *path, - struct ocfs2_cached_dealloc_ctxt *dealloc) + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_extent_tree *et) { int ret, orig_credits = handle->h_buffer_credits; struct ocfs2_path *tmp_path = NULL, *restart_path = NULL; @@ -2584,7 +2871,7 @@ static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, if (path->p_tree_depth == 0) { rightmost_no_delete: /* - * In-inode extents. This is trivially handled, so do + * Inline extents. This is trivially handled, so do * it up front. */ ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, @@ -2638,7 +2925,7 @@ rightmost_no_delete: */ ret = ocfs2_remove_rightmost_path(inode, handle, path, - dealloc); + dealloc, et); if (ret) mlog_errno(ret); goto out; @@ -2650,7 +2937,7 @@ rightmost_no_delete: */ try_rotate: ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path, - dealloc, &restart_path); + dealloc, &restart_path, et); if (ret && ret != -EAGAIN) { mlog_errno(ret); goto out; @@ -2662,7 +2949,7 @@ try_rotate: ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, tmp_path, dealloc, - &restart_path); + &restart_path, et); if (ret && ret != -EAGAIN) { mlog_errno(ret); goto out; @@ -2948,6 +3235,7 @@ static int ocfs2_merge_rec_left(struct inode *inode, handle_t *handle, struct ocfs2_extent_rec *split_rec, struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_extent_tree *et, int index) { int ret, i, subtree_index = 0, has_empty_extent = 0; @@ -3068,7 +3356,8 @@ static int ocfs2_merge_rec_left(struct inode *inode, le16_to_cpu(el->l_next_free_rec) == 1) { ret = ocfs2_remove_rightmost_path(inode, handle, - right_path, dealloc); + right_path, + dealloc, et); if (ret) { mlog_errno(ret); goto out; @@ -3095,7 +3384,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, int split_index, struct ocfs2_extent_rec *split_rec, struct ocfs2_cached_dealloc_ctxt *dealloc, - struct ocfs2_merge_ctxt *ctxt) + struct ocfs2_merge_ctxt *ctxt, + struct ocfs2_extent_tree *et) { int ret = 0; @@ -3113,7 +3403,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * illegal. */ ret = ocfs2_rotate_tree_left(inode, handle, path, - dealloc); + dealloc, et); if (ret) { mlog_errno(ret); goto out; @@ -3156,7 +3446,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); /* The merge left us with an empty extent, remove it. */ - ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); + ret = ocfs2_rotate_tree_left(inode, handle, path, + dealloc, et); if (ret) { mlog_errno(ret); goto out; @@ -3170,7 +3461,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, */ ret = ocfs2_merge_rec_left(inode, path, handle, rec, - dealloc, + dealloc, et, split_index); if (ret) { @@ -3179,7 +3470,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, } ret = ocfs2_rotate_tree_left(inode, handle, path, - dealloc); + dealloc, et); /* * Error from this last rotate is not critical, so * print but don't bubble it up. @@ -3199,7 +3490,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, ret = ocfs2_merge_rec_left(inode, path, handle, split_rec, - dealloc, + dealloc, et, split_index); if (ret) { mlog_errno(ret); @@ -3222,7 +3513,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * our leaf. Try to rotate it away. */ ret = ocfs2_rotate_tree_left(inode, handle, path, - dealloc); + dealloc, et); if (ret) mlog_errno(ret); ret = 0; @@ -3356,16 +3647,6 @@ rotate: ocfs2_rotate_leaf(el, insert_rec); } -static inline void ocfs2_update_dinode_clusters(struct inode *inode, - struct ocfs2_dinode *di, - u32 clusters) -{ - le32_add_cpu(&di->i_clusters, clusters); - spin_lock(&OCFS2_I(inode)->ip_lock); - OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); - spin_unlock(&OCFS2_I(inode)->ip_lock); -} - static void ocfs2_adjust_rightmost_records(struct inode *inode, handle_t *handle, struct ocfs2_path *path, @@ -3567,8 +3848,8 @@ static void ocfs2_split_record(struct inode *inode, } /* - * This function only does inserts on an allocation b-tree. For dinode - * lists, ocfs2_insert_at_leaf() is called directly. + * This function only does inserts on an allocation b-tree. For tree + * depth = 0, ocfs2_insert_at_leaf() is called directly. * * right_path is the path we want to do the actual insert * in. left_path should only be passed in if we need to update that @@ -3665,7 +3946,7 @@ out: static int ocfs2_do_insert_extent(struct inode *inode, handle_t *handle, - struct buffer_head *di_bh, + struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *insert_rec, struct ocfs2_insert_type *type) { @@ -3673,13 +3954,11 @@ static int ocfs2_do_insert_extent(struct inode *inode, u32 cpos; struct ocfs2_path *right_path = NULL; struct ocfs2_path *left_path = NULL; - struct ocfs2_dinode *di; struct ocfs2_extent_list *el; - di = (struct ocfs2_dinode *) di_bh->b_data; - el = &di->id2.i_list; + el = et->et_root_el; - ret = ocfs2_journal_access(handle, inode, di_bh, + ret = ocfs2_journal_access(handle, inode, et->et_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -3691,7 +3970,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, goto out_update_clusters; } - right_path = ocfs2_new_inode_path(di_bh); + right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el); if (!right_path) { ret = -ENOMEM; mlog_errno(ret); @@ -3741,7 +4020,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, * ocfs2_rotate_tree_right() might have extended the * transaction without re-journaling our tree root. */ - ret = ocfs2_journal_access(handle, inode, di_bh, + ret = ocfs2_journal_access(handle, inode, et->et_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -3766,10 +4045,10 @@ static int ocfs2_do_insert_extent(struct inode *inode, out_update_clusters: if (type->ins_split == SPLIT_NONE) - ocfs2_update_dinode_clusters(inode, di, - le16_to_cpu(insert_rec->e_leaf_clusters)); + ocfs2_et_update_clusters(inode, et, + le16_to_cpu(insert_rec->e_leaf_clusters)); - ret = ocfs2_journal_dirty(handle, di_bh); + ret = ocfs2_journal_dirty(handle, et->et_root_bh); if (ret) mlog_errno(ret); @@ -3899,7 +4178,8 @@ out: static void ocfs2_figure_contig_type(struct inode *inode, struct ocfs2_insert_type *insert, struct ocfs2_extent_list *el, - struct ocfs2_extent_rec *insert_rec) + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_extent_tree *et) { int i; enum ocfs2_contig_type contig_type = CONTIG_NONE; @@ -3915,6 +4195,21 @@ static void ocfs2_figure_contig_type(struct inode *inode, } } insert->ins_contig = contig_type; + + if (insert->ins_contig != CONTIG_NONE) { + struct ocfs2_extent_rec *rec = + &el->l_recs[insert->ins_contig_index]; + unsigned int len = le16_to_cpu(rec->e_leaf_clusters) + + le16_to_cpu(insert_rec->e_leaf_clusters); + + /* + * Caller might want us to limit the size of extents, don't + * calculate contiguousness if we might exceed that limit. + */ + if (et->et_max_leaf_clusters && + (len > et->et_max_leaf_clusters)) + insert->ins_contig = CONTIG_NONE; + } } /* @@ -3923,8 +4218,8 @@ static void ocfs2_figure_contig_type(struct inode *inode, * ocfs2_figure_appending_type() will figure out whether we'll have to * insert at the tail of the rightmost leaf. * - * This should also work against the dinode list for tree's with 0 - * depth. If we consider the dinode list to be the rightmost leaf node + * This should also work against the root extent list for tree's with 0 + * depth. If we consider the root extent list to be the rightmost leaf node * then the logic here makes sense. */ static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert, @@ -3975,14 +4270,13 @@ set_tail_append: * structure. */ static int ocfs2_figure_insert_type(struct inode *inode, - struct buffer_head *di_bh, + struct ocfs2_extent_tree *et, struct buffer_head **last_eb_bh, struct ocfs2_extent_rec *insert_rec, int *free_records, struct ocfs2_insert_type *insert) { int ret; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; struct ocfs2_path *path = NULL; @@ -3990,7 +4284,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, insert->ins_split = SPLIT_NONE; - el = &di->id2.i_list; + el = et->et_root_el; insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); if (el->l_tree_depth) { @@ -4000,9 +4294,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, * ocfs2_figure_insert_type() and ocfs2_add_branch() * may want it later. */ - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - le64_to_cpu(di->i_last_eb_blk), &bh, - OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh); if (ret) { mlog_exit(ret); goto out; @@ -4023,12 +4315,12 @@ static int ocfs2_figure_insert_type(struct inode *inode, le16_to_cpu(el->l_next_free_rec); if (!insert->ins_tree_depth) { - ocfs2_figure_contig_type(inode, insert, el, insert_rec); + ocfs2_figure_contig_type(inode, insert, el, insert_rec, et); ocfs2_figure_appending_type(insert, el, insert_rec); return 0; } - path = ocfs2_new_inode_path(di_bh); + path = ocfs2_new_path(et->et_root_bh, et->et_root_el); if (!path) { ret = -ENOMEM; mlog_errno(ret); @@ -4057,7 +4349,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, * into two types of appends: simple record append, or a * rotate inside the tail leaf. */ - ocfs2_figure_contig_type(inode, insert, el, insert_rec); + ocfs2_figure_contig_type(inode, insert, el, insert_rec, et); /* * The insert code isn't quite ready to deal with all cases of @@ -4078,7 +4370,8 @@ static int ocfs2_figure_insert_type(struct inode *inode, * the case that we're doing a tail append, so maybe we can * take advantage of that information somehow. */ - if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) { + if (ocfs2_et_get_last_eb_blk(et) == + path_leaf_bh(path)->b_blocknr) { /* * Ok, ocfs2_find_path() returned us the rightmost * tree path. This might be an appending insert. There are @@ -4108,7 +4401,7 @@ out: int ocfs2_insert_extent(struct ocfs2_super *osb, handle_t *handle, struct inode *inode, - struct buffer_head *fe_bh, + struct ocfs2_extent_tree *et, u32 cpos, u64 start_blk, u32 new_clusters, @@ -4121,26 +4414,21 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, struct ocfs2_insert_type insert = {0, }; struct ocfs2_extent_rec rec; - BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL); - mlog(0, "add %u clusters at position %u to inode %llu\n", new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); - mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && - (OCFS2_I(inode)->ip_clusters != cpos), - "Device %s, asking for sparse allocation: inode %llu, " - "cpos %u, clusters %u\n", - osb->dev_str, - (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, - OCFS2_I(inode)->ip_clusters); - memset(&rec, 0, sizeof(rec)); rec.e_cpos = cpu_to_le32(cpos); rec.e_blkno = cpu_to_le64(start_blk); rec.e_leaf_clusters = cpu_to_le16(new_clusters); rec.e_flags = flags; + status = ocfs2_et_insert_check(inode, et, &rec); + if (status) { + mlog_errno(status); + goto bail; + } - status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, + status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec, &free_records, &insert); if (status < 0) { mlog_errno(status); @@ -4154,7 +4442,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, free_records, insert.ins_tree_depth); if (insert.ins_contig == CONTIG_NONE && free_records == 0) { - status = ocfs2_grow_tree(inode, handle, fe_bh, + status = ocfs2_grow_tree(inode, handle, et, &insert.ins_tree_depth, &last_eb_bh, meta_ac); if (status) { @@ -4164,17 +4452,124 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, } /* Finally, we can add clusters. This might rotate the tree for us. */ - status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert); + status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert); if (status < 0) mlog_errno(status); - else + else if (et->et_ops == &ocfs2_dinode_et_ops) ocfs2_extent_map_insert_rec(inode, &rec); bail: - if (last_eb_bh) - brelse(last_eb_bh); + brelse(last_eb_bh); + + mlog_exit(status); + return status; +} + +/* + * Allcate and add clusters into the extent b-tree. + * The new clusters(clusters_to_add) will be inserted at logical_offset. + * The extent b-tree's root is specified by et, and + * it is not limited to the file storage. Any extent tree can use this + * function if it implements the proper ocfs2_extent_tree. + */ +int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, + struct inode *inode, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct ocfs2_extent_tree *et, + handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret) +{ + int status = 0; + int free_extents; + enum ocfs2_alloc_restarted reason = RESTART_NONE; + u32 bit_off, num_bits; + u64 block; + u8 flags = 0; + + BUG_ON(!clusters_to_add); + + if (mark_unwritten) + flags = OCFS2_EXT_UNWRITTEN; + + free_extents = ocfs2_num_free_extents(osb, inode, et); + if (free_extents < 0) { + status = free_extents; + mlog_errno(status); + goto leave; + } + + /* there are two cases which could cause us to EAGAIN in the + * we-need-more-metadata case: + * 1) we haven't reserved *any* + * 2) we are so fragmented, we've needed to add metadata too + * many times. */ + if (!free_extents && !meta_ac) { + mlog(0, "we haven't reserved any metadata!\n"); + status = -EAGAIN; + reason = RESTART_META; + goto leave; + } else if ((!free_extents) + && (ocfs2_alloc_context_bits_left(meta_ac) + < ocfs2_extend_meta_needed(et->et_root_el))) { + mlog(0, "filesystem is really fragmented...\n"); + status = -EAGAIN; + reason = RESTART_META; + goto leave; + } + + status = __ocfs2_claim_clusters(osb, handle, data_ac, 1, + clusters_to_add, &bit_off, &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + BUG_ON(num_bits > clusters_to_add); + + /* reserve our write early -- insert_extent may update the inode */ + status = ocfs2_journal_access(handle, inode, et->et_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + block = ocfs2_clusters_to_blocks(osb->sb, bit_off); + mlog(0, "Allocating %u clusters at block %u for inode %llu\n", + num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); + status = ocfs2_insert_extent(osb, handle, inode, et, + *logical_offset, block, + num_bits, flags, meta_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_dirty(handle, et->et_root_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + clusters_to_add -= num_bits; + *logical_offset += num_bits; + + if (clusters_to_add) { + mlog(0, "need to alloc once more, wanted = %u\n", + clusters_to_add); + status = -EAGAIN; + reason = RESTART_TRANS; + } + +leave: mlog_exit(status); + if (reason_ret) + *reason_ret = reason; return status; } @@ -4201,7 +4596,7 @@ static void ocfs2_make_right_split_rec(struct super_block *sb, static int ocfs2_split_and_insert(struct inode *inode, handle_t *handle, struct ocfs2_path *path, - struct buffer_head *di_bh, + struct ocfs2_extent_tree *et, struct buffer_head **last_eb_bh, int split_index, struct ocfs2_extent_rec *orig_split_rec, @@ -4215,7 +4610,6 @@ static int ocfs2_split_and_insert(struct inode *inode, struct ocfs2_extent_rec split_rec = *orig_split_rec; struct ocfs2_insert_type insert; struct ocfs2_extent_block *eb; - struct ocfs2_dinode *di; leftright: /* @@ -4224,8 +4618,7 @@ leftright: */ rec = path_leaf_el(path)->l_recs[split_index]; - di = (struct ocfs2_dinode *)di_bh->b_data; - rightmost_el = &di->id2.i_list; + rightmost_el = et->et_root_el; depth = le16_to_cpu(rightmost_el->l_tree_depth); if (depth) { @@ -4236,8 +4629,8 @@ leftright: if (le16_to_cpu(rightmost_el->l_next_free_rec) == le16_to_cpu(rightmost_el->l_count)) { - ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh, - meta_ac); + ret = ocfs2_grow_tree(inode, handle, et, + &depth, last_eb_bh, meta_ac); if (ret) { mlog_errno(ret); goto out; @@ -4274,8 +4667,7 @@ leftright: do_leftright = 1; } - ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, - &insert); + ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert); if (ret) { mlog_errno(ret); goto out; @@ -4317,8 +4709,9 @@ out: * of the tree is required. All other cases will degrade into a less * optimal tree layout. * - * last_eb_bh should be the rightmost leaf block for any inode with a - * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call. + * last_eb_bh should be the rightmost leaf block for any extent + * btree. Since a split may grow the tree or a merge might shrink it, + * the caller cannot trust the contents of that buffer after this call. * * This code is optimized for readability - several passes might be * made over certain portions of the tree. All of those blocks will @@ -4326,7 +4719,7 @@ out: * extra overhead is not expressed in terms of disk reads. */ static int __ocfs2_mark_extent_written(struct inode *inode, - struct buffer_head *di_bh, + struct ocfs2_extent_tree *et, handle_t *handle, struct ocfs2_path *path, int split_index, @@ -4366,11 +4759,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode, */ if (path->p_tree_depth) { struct ocfs2_extent_block *eb; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - le64_to_cpu(di->i_last_eb_blk), - &last_eb_bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), + &last_eb_bh); if (ret) { mlog_exit(ret); goto out; @@ -4403,7 +4794,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode, if (ctxt.c_split_covers_rec) el->l_recs[split_index] = *split_rec; else - ret = ocfs2_split_and_insert(inode, handle, path, di_bh, + ret = ocfs2_split_and_insert(inode, handle, path, et, &last_eb_bh, split_index, split_rec, meta_ac); if (ret) @@ -4411,7 +4802,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode, } else { ret = ocfs2_try_to_merge_extent(inode, handle, path, split_index, split_rec, - dealloc, &ctxt); + dealloc, &ctxt, et); if (ret) mlog_errno(ret); } @@ -4429,7 +4820,8 @@ out: * * The caller is responsible for passing down meta_ac if we'll need it. */ -int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, +int ocfs2_mark_extent_written(struct inode *inode, + struct ocfs2_extent_tree *et, handle_t *handle, u32 cpos, u32 len, u32 phys, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc) @@ -4455,10 +4847,14 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, /* * XXX: This should be fixed up so that we just re-insert the * next extent records. + * + * XXX: This is a hack on the extent tree, maybe it should be + * an op? */ - ocfs2_extent_map_trunc(inode, 0); + if (et->et_ops == &ocfs2_dinode_et_ops) + ocfs2_extent_map_trunc(inode, 0); - left_path = ocfs2_new_inode_path(di_bh); + left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el); if (!left_path) { ret = -ENOMEM; mlog_errno(ret); @@ -4489,8 +4885,9 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags; split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN; - ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path, - index, &split_rec, meta_ac, dealloc); + ret = __ocfs2_mark_extent_written(inode, et, handle, left_path, + index, &split_rec, meta_ac, + dealloc); if (ret) mlog_errno(ret); @@ -4499,13 +4896,12 @@ out: return ret; } -static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, +static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et, handle_t *handle, struct ocfs2_path *path, int index, u32 new_range, struct ocfs2_alloc_context *meta_ac) { int ret, depth, credits = handle->h_buffer_credits; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct buffer_head *last_eb_bh = NULL; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *rightmost_el, *el; @@ -4522,9 +4918,8 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, depth = path->p_tree_depth; if (depth > 0) { - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - le64_to_cpu(di->i_last_eb_blk), - &last_eb_bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), + &last_eb_bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -4535,7 +4930,8 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, } else rightmost_el = path_leaf_el(path); - credits += path->p_tree_depth + ocfs2_extend_meta_needed(di); + credits += path->p_tree_depth + + ocfs2_extend_meta_needed(et->et_root_el); ret = ocfs2_extend_trans(handle, credits); if (ret) { mlog_errno(ret); @@ -4544,7 +4940,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, if (le16_to_cpu(rightmost_el->l_next_free_rec) == le16_to_cpu(rightmost_el->l_count)) { - ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh, + ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh, meta_ac); if (ret) { mlog_errno(ret); @@ -4558,7 +4954,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, insert.ins_split = SPLIT_RIGHT; insert.ins_tree_depth = depth; - ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert); + ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert); if (ret) mlog_errno(ret); @@ -4570,7 +4966,8 @@ out: static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, struct ocfs2_path *path, int index, struct ocfs2_cached_dealloc_ctxt *dealloc, - u32 cpos, u32 len) + u32 cpos, u32 len, + struct ocfs2_extent_tree *et) { int ret; u32 left_cpos, rec_range, trunc_range; @@ -4582,7 +4979,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, struct ocfs2_extent_block *eb; if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { - ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); + ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et); if (ret) { mlog_errno(ret); goto out; @@ -4713,7 +5110,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, ocfs2_journal_dirty(handle, path_leaf_bh(path)); - ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); + ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et); if (ret) { mlog_errno(ret); goto out; @@ -4724,7 +5121,8 @@ out: return ret; } -int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, +int ocfs2_remove_extent(struct inode *inode, + struct ocfs2_extent_tree *et, u32 cpos, u32 len, handle_t *handle, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc) @@ -4733,11 +5131,11 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, u32 rec_range, trunc_range; struct ocfs2_extent_rec *rec; struct ocfs2_extent_list *el; - struct ocfs2_path *path; + struct ocfs2_path *path = NULL; ocfs2_extent_map_trunc(inode, 0); - path = ocfs2_new_inode_path(di_bh); + path = ocfs2_new_path(et->et_root_bh, et->et_root_el); if (!path) { ret = -ENOMEM; mlog_errno(ret); @@ -4790,13 +5188,13 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) { ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, - cpos, len); + cpos, len, et); if (ret) { mlog_errno(ret); goto out; } } else { - ret = ocfs2_split_tree(inode, di_bh, handle, path, index, + ret = ocfs2_split_tree(inode, et, handle, path, index, trunc_range, meta_ac); if (ret) { mlog_errno(ret); @@ -4845,7 +5243,7 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, } ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, - cpos, len); + cpos, len, et); if (ret) { mlog_errno(ret); goto out; @@ -5188,8 +5586,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, goto bail; } - status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, - OCFS2_BH_CACHED, inode); + status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); if (status < 0) { iput(inode); mlog_errno(status); @@ -5264,8 +5661,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, bail: if (tl_inode) iput(tl_inode); - if (tl_bh) - brelse(tl_bh); + brelse(tl_bh); if (status < 0 && (*tl_copy)) { kfree(*tl_copy); @@ -6008,20 +6404,13 @@ bail: return status; } -static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh) +static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh) { set_buffer_uptodate(bh); mark_buffer_dirty(bh); return 0; } -static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh) -{ - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - return ocfs2_journal_dirty_data(handle, bh); -} - static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, unsigned int from, unsigned int to, struct page *page, int zero, u64 *phys) @@ -6040,17 +6429,18 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, * here if they aren't - ocfs2_map_page_blocks() * might've skipped some */ - if (ocfs2_should_order_data(inode)) { - ret = walk_page_buffers(handle, - page_buffers(page), - from, to, &partial, - ocfs2_ordered_zero_func); - if (ret < 0) - mlog_errno(ret); - } else { + ret = walk_page_buffers(handle, page_buffers(page), + from, to, &partial, + ocfs2_zero_func); + if (ret < 0) + mlog_errno(ret); + else if (ocfs2_should_order_data(inode)) { + ret = ocfs2_jbd2_file_inode(handle, inode); +#ifdef CONFIG_OCFS2_COMPAT_JBD ret = walk_page_buffers(handle, page_buffers(page), from, to, &partial, - ocfs2_writeback_zero_func); + ocfs2_journal_dirty_data); +#endif if (ret < 0) mlog_errno(ret); } @@ -6215,20 +6605,29 @@ out: return ret; } -static void ocfs2_zero_dinode_id2(struct inode *inode, struct ocfs2_dinode *di) +static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode, + struct ocfs2_dinode *di) { unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits; + unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); - memset(&di->id2, 0, blocksize - offsetof(struct ocfs2_dinode, id2)); + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) + memset(&di->id2, 0, blocksize - + offsetof(struct ocfs2_dinode, id2) - + xattrsize); + else + memset(&di->id2, 0, blocksize - + offsetof(struct ocfs2_dinode, id2)); } void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di) { - ocfs2_zero_dinode_id2(inode, di); + ocfs2_zero_dinode_id2_with_xattr(inode, di); di->id2.i_list.l_tree_depth = 0; di->id2.i_list.l_next_free_rec = 0; - di->id2.i_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb)); + di->id2.i_list.l_count = cpu_to_le16( + ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di)); } void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) @@ -6245,9 +6644,10 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) * We clear the entire i_data structure here so that all * fields can be properly initialized. */ - ocfs2_zero_dinode_id2(inode, di); + ocfs2_zero_dinode_id2_with_xattr(inode, di); - idata->id_count = cpu_to_le16(ocfs2_max_inline_data(inode->i_sb)); + idata->id_count = cpu_to_le16( + ocfs2_max_inline_data_with_xattr(inode->i_sb, di)); } int ocfs2_convert_inline_data_to_extents(struct inode *inode, @@ -6262,6 +6662,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct ocfs2_alloc_context *data_ac = NULL; struct page **pages = NULL; loff_t end = osb->s_clustersize; + struct ocfs2_extent_tree et; has_data = i_size_read(inode) ? 1 : 0; @@ -6361,7 +6762,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, * this proves to be false, we could always re-build * the in-inode data from our pages. */ - ret = ocfs2_insert_extent(osb, handle, inode, di_bh, + ocfs2_init_dinode_extent_tree(&et, inode, di_bh); + ret = ocfs2_insert_extent(osb, handle, inode, &et, 0, block, 1, 0, NULL); if (ret) { mlog_errno(ret); @@ -6404,13 +6806,14 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, handle_t *handle = NULL; struct inode *tl_inode = osb->osb_tl_inode; struct ocfs2_path *path = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; mlog_entry_void(); new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, i_size_read(inode)); - path = ocfs2_new_inode_path(fe_bh); + path = ocfs2_new_path(fe_bh, &di->id2.i_list); if (!path) { status = -ENOMEM; mlog_errno(status); @@ -6581,8 +6984,8 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); if (fe->id2.i_list.l_tree_depth) { - status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), - &last_eb_bh, OCFS2_BH_CACHED, inode); + status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk), + &last_eb_bh); if (status < 0) { mlog_errno(status); goto bail; @@ -6695,8 +7098,7 @@ static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) mlog(ML_NOTICE, "Truncate completion has non-empty dealloc context\n"); - if (tc->tc_last_eb_bh) - brelse(tc->tc_last_eb_bh); + brelse(tc->tc_last_eb_bh); kfree(tc); } diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 42ff94bd8011..70257c84cfbe 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -26,30 +26,102 @@ #ifndef OCFS2_ALLOC_H #define OCFS2_ALLOC_H + +/* + * For xattr tree leaf, we limit the leaf byte size to be 64K. + */ +#define OCFS2_MAX_XATTR_TREE_LEAF_SIZE 65536 + +/* + * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract + * the b-tree operations in ocfs2. Now all the b-tree operations are not + * limited to ocfs2_dinode only. Any data which need to allocate clusters + * to store can use b-tree. And it only needs to implement its ocfs2_extent_tree + * and operation. + * + * ocfs2_extent_tree becomes the first-class object for extent tree + * manipulation. Callers of the alloc.c code need to fill it via one of + * the ocfs2_init_*_extent_tree() operations below. + * + * ocfs2_extent_tree contains info for the root of the b-tree, it must have a + * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree + * functions. + * ocfs2_extent_tree_operations abstract the normal operations we do for + * the root of extent b-tree. + */ +struct ocfs2_extent_tree_operations; +struct ocfs2_extent_tree { + struct ocfs2_extent_tree_operations *et_ops; + struct buffer_head *et_root_bh; + struct ocfs2_extent_list *et_root_el; + void *et_object; + unsigned int et_max_leaf_clusters; +}; + +/* + * ocfs2_init_*_extent_tree() will fill an ocfs2_extent_tree from the + * specified object buffer. + */ +void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh); +void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh); +void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh, + struct ocfs2_xattr_value_root *xv); + struct ocfs2_alloc_context; int ocfs2_insert_extent(struct ocfs2_super *osb, handle_t *handle, struct inode *inode, - struct buffer_head *fe_bh, + struct ocfs2_extent_tree *et, u32 cpos, u64 start_blk, u32 new_clusters, u8 flags, struct ocfs2_alloc_context *meta_ac); + +enum ocfs2_alloc_restarted { + RESTART_NONE = 0, + RESTART_TRANS, + RESTART_META +}; +int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, + struct inode *inode, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct ocfs2_extent_tree *et, + handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret); struct ocfs2_cached_dealloc_ctxt; -int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, +int ocfs2_mark_extent_written(struct inode *inode, + struct ocfs2_extent_tree *et, handle_t *handle, u32 cpos, u32 len, u32 phys, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc); -int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, +int ocfs2_remove_extent(struct inode *inode, + struct ocfs2_extent_tree *et, u32 cpos, u32 len, handle_t *handle, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc); int ocfs2_num_free_extents(struct ocfs2_super *osb, struct inode *inode, - struct ocfs2_dinode *fe); -/* how many new metadata chunks would an allocation need at maximum? */ -static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe) + struct ocfs2_extent_tree *et); + +/* + * how many new metadata chunks would an allocation need at maximum? + * + * Please note that the caller must make sure that root_el is the root + * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise + * the result may be wrong. + */ +static inline int ocfs2_extend_meta_needed(struct ocfs2_extent_list *root_el) { /* * Rather than do all the work of determining how much we need @@ -59,7 +131,7 @@ static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe) * new tree_depth==0 extent_block, and one block at the new * top-of-the tree. */ - return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2; + return le16_to_cpu(root_el->l_tree_depth) + 2; } void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di); @@ -146,4 +218,13 @@ static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el, return le16_to_cpu(rec->e_leaf_clusters); } +/* + * This is only valid for leaf nodes, which are the only ones that can + * have empty extents anyway. + */ +static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) +{ + return !rec->e_leaf_clusters; +} + #endif /* OCFS2_ALLOC_H */ diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 17964c0505a9..c22543b33420 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -68,9 +68,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, goto bail; } - status = ocfs2_read_block(OCFS2_SB(inode->i_sb), - OCFS2_I(inode)->ip_blkno, - &bh, OCFS2_BH_CACHED, inode); + status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); if (status < 0) { mlog_errno(status); goto bail; @@ -128,8 +126,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, err = 0; bail: - if (bh) - brelse(bh); + brelse(bh); mlog_exit(err); return err; @@ -174,10 +171,17 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, * need to use BH_New is when we're extending i_size on a file * system which doesn't support holes, in which case BH_New * allows block_prepare_write() to zero. + * + * If we see this on a sparse file system, then a truncate has + * raced us and removed the cluster. In this case, we clear + * the buffers dirty and uptodate bits and let the buffer code + * ignore it as a hole. */ - mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb), - "ino %lu, iblock %llu\n", inode->i_ino, - (unsigned long long)iblock); + if (create && p_blkno == 0 && ocfs2_sparse_alloc(osb)) { + clear_buffer_dirty(bh_result); + clear_buffer_uptodate(bh_result); + goto bail; + } /* Treat the unwritten extent as a hole for zeroing purposes. */ if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) @@ -254,13 +258,11 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page) { int ret; struct buffer_head *di_bh = NULL; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); BUG_ON(!PageLocked(page)); BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); - ret = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &di_bh, - OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -478,11 +480,14 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode, } if (ocfs2_should_order_data(inode)) { + ret = ocfs2_jbd2_file_inode(handle, inode); +#ifdef CONFIG_OCFS2_COMPAT_JBD ret = walk_page_buffers(handle, page_buffers(page), from, to, NULL, ocfs2_journal_dirty_data); - if (ret < 0) +#endif + if (ret < 0) mlog_errno(ret); } out: @@ -587,7 +592,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, goto bail; } - if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) { + if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) { ocfs2_error(inode->i_sb, "Inode %llu has a hole at block %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -662,7 +667,7 @@ static void ocfs2_invalidatepage(struct page *page, unsigned long offset) { journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; - journal_invalidatepage(journal, page, offset); + jbd2_journal_invalidatepage(journal, page, offset); } static int ocfs2_releasepage(struct page *page, gfp_t wait) @@ -671,7 +676,7 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait) if (!page_has_buffers(page)) return 0; - return journal_try_to_free_buffers(journal, page, wait); + return jbd2_journal_try_to_free_buffers(journal, page, wait); } static ssize_t ocfs2_direct_IO(int rw, @@ -1066,12 +1071,19 @@ static void ocfs2_write_failure(struct inode *inode, for(i = 0; i < wc->w_num_pages; i++) { tmppage = wc->w_pages[i]; - if (ocfs2_should_order_data(inode)) - walk_page_buffers(wc->w_handle, page_buffers(tmppage), - from, to, NULL, - ocfs2_journal_dirty_data); + if (page_has_buffers(tmppage)) { + if (ocfs2_should_order_data(inode)) { + ocfs2_jbd2_file_inode(wc->w_handle, inode); +#ifdef CONFIG_OCFS2_COMPAT_JBD + walk_page_buffers(wc->w_handle, + page_buffers(tmppage), + from, to, NULL, + ocfs2_journal_dirty_data); +#endif + } - block_commit_write(tmppage, from, to); + block_commit_write(tmppage, from, to); + } } } @@ -1232,6 +1244,7 @@ static int ocfs2_write_cluster(struct address_space *mapping, int ret, i, new, should_zero = 0; u64 v_blkno, p_blkno; struct inode *inode = mapping->host; + struct ocfs2_extent_tree et; new = phys == 0 ? 1 : 0; if (new || unwritten) @@ -1245,10 +1258,10 @@ static int ocfs2_write_cluster(struct address_space *mapping, * any additional semaphores or cluster locks. */ tmp_pos = cpos; - ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, - &tmp_pos, 1, 0, wc->w_di_bh, - wc->w_handle, data_ac, - meta_ac, NULL); + ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode, + &tmp_pos, 1, 0, wc->w_di_bh, + wc->w_handle, data_ac, + meta_ac, NULL); /* * This shouldn't happen because we must have already * calculated the correct meta data allocation required. The @@ -1266,7 +1279,8 @@ static int ocfs2_write_cluster(struct address_space *mapping, goto out; } } else if (unwritten) { - ret = ocfs2_mark_extent_written(inode, wc->w_di_bh, + ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh); + ret = ocfs2_mark_extent_written(inode, &et, wc->w_handle, cpos, 1, phys, meta_ac, &wc->w_dealloc); if (ret < 0) { @@ -1655,6 +1669,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, struct ocfs2_alloc_context *data_ac = NULL; struct ocfs2_alloc_context *meta_ac = NULL; handle_t *handle; + struct ocfs2_extent_tree et; ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); if (ret) { @@ -1702,14 +1717,23 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, * ocfs2_lock_allocators(). It greatly over-estimates * the work to be done. */ - ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc, - extents_to_split, &data_ac, &meta_ac); + mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u," + " clusters_to_add = %u, extents_to_split = %u\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (long long)i_size_read(inode), le32_to_cpu(di->i_clusters), + clusters_to_alloc, extents_to_split); + + ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh); + ret = ocfs2_lock_allocators(inode, &et, + clusters_to_alloc, extents_to_split, + &data_ac, &meta_ac); if (ret) { mlog_errno(ret); goto out; } - credits = ocfs2_calc_extend_credits(inode->i_sb, di, + credits = ocfs2_calc_extend_credits(inode->i_sb, + &di->id2.i_list, clusters_to_alloc); } @@ -1894,12 +1918,18 @@ int ocfs2_write_end_nolock(struct address_space *mapping, to = PAGE_CACHE_SIZE; } - if (ocfs2_should_order_data(inode)) - walk_page_buffers(wc->w_handle, page_buffers(tmppage), - from, to, NULL, - ocfs2_journal_dirty_data); - - block_commit_write(tmppage, from, to); + if (page_has_buffers(tmppage)) { + if (ocfs2_should_order_data(inode)) { + ocfs2_jbd2_file_inode(wc->w_handle, inode); +#ifdef CONFIG_OCFS2_COMPAT_JBD + walk_page_buffers(wc->w_handle, + page_buffers(tmppage), + from, to, NULL, + ocfs2_journal_dirty_data); +#endif + } + block_commit_write(tmppage, from, to); + } } out_write_size: diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index f136639f5b41..7e947c672469 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -66,7 +66,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, /* remove from dirty list before I/O. */ clear_buffer_dirty(bh); - get_bh(bh); /* for end_buffer_write_sync() */ + get_bh(bh); /* for end_buffer_write_sync() */ bh->b_end_io = end_buffer_write_sync; submit_bh(WRITE, bh); @@ -88,22 +88,103 @@ out: return ret; } -int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, - struct buffer_head *bhs[], int flags, - struct inode *inode) +int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, + unsigned int nr, struct buffer_head *bhs[]) +{ + int status = 0; + unsigned int i; + struct buffer_head *bh; + + if (!nr) { + mlog(ML_BH_IO, "No buffers will be read!\n"); + goto bail; + } + + for (i = 0 ; i < nr ; i++) { + if (bhs[i] == NULL) { + bhs[i] = sb_getblk(osb->sb, block++); + if (bhs[i] == NULL) { + status = -EIO; + mlog_errno(status); + goto bail; + } + } + bh = bhs[i]; + + if (buffer_jbd(bh)) { + mlog(ML_ERROR, + "trying to sync read a jbd " + "managed bh (blocknr = %llu), skipping\n", + (unsigned long long)bh->b_blocknr); + continue; + } + + if (buffer_dirty(bh)) { + /* This should probably be a BUG, or + * at least return an error. */ + mlog(ML_ERROR, + "trying to sync read a dirty " + "buffer! (blocknr = %llu), skipping\n", + (unsigned long long)bh->b_blocknr); + continue; + } + + lock_buffer(bh); + if (buffer_jbd(bh)) { + mlog(ML_ERROR, + "block %llu had the JBD bit set " + "while I was in lock_buffer!", + (unsigned long long)bh->b_blocknr); + BUG(); + } + + clear_buffer_uptodate(bh); + get_bh(bh); /* for end_buffer_read_sync() */ + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ, bh); + } + + for (i = nr; i > 0; i--) { + bh = bhs[i - 1]; + + if (buffer_jbd(bh)) { + mlog(ML_ERROR, + "the journal got the buffer while it was " + "locked for io! (blocknr = %llu)\n", + (unsigned long long)bh->b_blocknr); + BUG(); + } + + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + /* Status won't be cleared from here on out, + * so we can safely record this and loop back + * to cleanup the other buffers. */ + status = -EIO; + put_bh(bh); + bhs[i - 1] = NULL; + } + } + +bail: + return status; +} + +int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, + struct buffer_head *bhs[], int flags) { int status = 0; - struct super_block *sb; int i, ignore_cache = 0; struct buffer_head *bh; - mlog_entry("(block=(%llu), nr=(%d), flags=%d, inode=%p)\n", - (unsigned long long)block, nr, flags, inode); + mlog_entry("(inode=%p, block=(%llu), nr=(%d), flags=%d)\n", + inode, (unsigned long long)block, nr, flags); + BUG_ON(!inode); BUG_ON((flags & OCFS2_BH_READAHEAD) && - (!inode || !(flags & OCFS2_BH_CACHED))); + (flags & OCFS2_BH_IGNORE_CACHE)); - if (osb == NULL || osb->sb == NULL || bhs == NULL) { + if (bhs == NULL) { status = -EINVAL; mlog_errno(status); goto bail; @@ -122,26 +203,19 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, goto bail; } - sb = osb->sb; - - if (flags & OCFS2_BH_CACHED && !inode) - flags &= ~OCFS2_BH_CACHED; - - if (inode) - mutex_lock(&OCFS2_I(inode)->ip_io_mutex); + mutex_lock(&OCFS2_I(inode)->ip_io_mutex); for (i = 0 ; i < nr ; i++) { if (bhs[i] == NULL) { - bhs[i] = sb_getblk(sb, block++); + bhs[i] = sb_getblk(inode->i_sb, block++); if (bhs[i] == NULL) { - if (inode) - mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); + mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); status = -EIO; mlog_errno(status); goto bail; } } bh = bhs[i]; - ignore_cache = 0; + ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE); /* There are three read-ahead cases here which we need to * be concerned with. All three assume a buffer has @@ -167,26 +241,27 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, * before our is-it-in-flight check. */ - if (flags & OCFS2_BH_CACHED && - !ocfs2_buffer_uptodate(inode, bh)) { + if (!ignore_cache && !ocfs2_buffer_uptodate(inode, bh)) { mlog(ML_UPTODATE, "bh (%llu), inode %llu not uptodate\n", (unsigned long long)bh->b_blocknr, (unsigned long long)OCFS2_I(inode)->ip_blkno); + /* We're using ignore_cache here to say + * "go to disk" */ ignore_cache = 1; } /* XXX: Can we ever get this and *not* have the cached * flag set? */ if (buffer_jbd(bh)) { - if (!(flags & OCFS2_BH_CACHED) || ignore_cache) + if (ignore_cache) mlog(ML_BH_IO, "trying to sync read a jbd " "managed bh (blocknr = %llu)\n", (unsigned long long)bh->b_blocknr); continue; } - if (!(flags & OCFS2_BH_CACHED) || ignore_cache) { + if (ignore_cache) { if (buffer_dirty(bh)) { /* This should probably be a BUG, or * at least return an error. */ @@ -221,7 +296,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, * previously read-ahead buffer may have * completed I/O while we were waiting for the * buffer lock. */ - if ((flags & OCFS2_BH_CACHED) + if (!(flags & OCFS2_BH_IGNORE_CACHE) && !(flags & OCFS2_BH_READAHEAD) && ocfs2_buffer_uptodate(inode, bh)) { unlock_buffer(bh); @@ -265,15 +340,14 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, /* Always set the buffer in the cache, even if it was * a forced read, or read-ahead which hasn't yet * completed. */ - if (inode) - ocfs2_set_buffer_uptodate(inode, bh); + ocfs2_set_buffer_uptodate(inode, bh); } - if (inode) - mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); + mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", (unsigned long long)block, nr, - (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes", flags); + ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes", + flags); bail: diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h index c2e78614c3e5..75e1dcb1ade7 100644 --- a/fs/ocfs2/buffer_head_io.h +++ b/fs/ocfs2/buffer_head_io.h @@ -31,31 +31,29 @@ void ocfs2_end_buffer_io_sync(struct buffer_head *bh, int uptodate); -static inline int ocfs2_read_block(struct ocfs2_super *osb, +static inline int ocfs2_read_block(struct inode *inode, u64 off, - struct buffer_head **bh, - int flags, - struct inode *inode); + struct buffer_head **bh); int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, struct inode *inode); -int ocfs2_read_blocks(struct ocfs2_super *osb, +int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, struct buffer_head *bhs[], - int flags, - struct inode *inode); + int flags); +int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, + unsigned int nr, struct buffer_head *bhs[]); int ocfs2_write_super_or_backup(struct ocfs2_super *osb, struct buffer_head *bh); -#define OCFS2_BH_CACHED 1 +#define OCFS2_BH_IGNORE_CACHE 1 #define OCFS2_BH_READAHEAD 8 -static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off, - struct buffer_head **bh, int flags, - struct inode *inode) +static inline int ocfs2_read_block(struct inode *inode, u64 off, + struct buffer_head **bh) { int status = 0; @@ -65,8 +63,7 @@ static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off, goto bail; } - status = ocfs2_read_blocks(osb, off, 1, bh, - flags, inode); + status = ocfs2_read_blocks(inode, off, 1, bh, 0); bail: return status; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f02ccb34604d..7dce1612553e 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1493,24 +1493,18 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g const char *name) { struct o2hb_region *reg = NULL; - struct config_item *ret = NULL; reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL); if (reg == NULL) - goto out; /* ENOMEM */ + return ERR_PTR(-ENOMEM); config_item_init_type_name(®->hr_item, name, &o2hb_region_type); - ret = ®->hr_item; - spin_lock(&o2hb_live_lock); list_add_tail(®->hr_all_item, &o2hb_all_regions); spin_unlock(&o2hb_live_lock); -out: - if (ret == NULL) - kfree(reg); - return ret; + return ®->hr_item; } static void o2hb_heartbeat_group_drop_item(struct config_group *group, diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index 23c732f27529..d8a0cb92cef6 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -109,6 +109,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { define_mask(CONN), define_mask(QUORUM), define_mask(EXPORT), + define_mask(XATTR), define_mask(ERROR), define_mask(NOTICE), define_mask(KTHREAD), diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 597e064bb94f..57670c680471 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -112,6 +112,7 @@ #define ML_CONN 0x0000000004000000ULL /* net connection management */ #define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */ #define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */ +#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ /* bits that are infrequently given and frequently matched in the high word */ #define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ #define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 7bf3c0ea7bd9..52276c02f710 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -138,18 +138,20 @@ static int nst_seq_show(struct seq_file *seq, void *v) " message id: %d\n" " message type: %u\n" " message key: 0x%08x\n" - " sock acquiry: %lu.%lu\n" - " send start: %lu.%lu\n" - " wait start: %lu.%lu\n", + " sock acquiry: %lu.%ld\n" + " send start: %lu.%ld\n" + " wait start: %lu.%ld\n", nst, (unsigned long)nst->st_task->pid, (unsigned long)nst->st_task->tgid, nst->st_task->comm, nst->st_node, nst->st_sc, nst->st_id, nst->st_msg_type, nst->st_msg_key, - nst->st_sock_time.tv_sec, nst->st_sock_time.tv_usec, - nst->st_send_time.tv_sec, nst->st_send_time.tv_usec, + nst->st_sock_time.tv_sec, + (long)nst->st_sock_time.tv_usec, + nst->st_send_time.tv_sec, + (long)nst->st_send_time.tv_usec, nst->st_status_time.tv_sec, - nst->st_status_time.tv_usec); + (long)nst->st_status_time.tv_usec); } spin_unlock(&o2net_debug_lock); @@ -274,7 +276,7 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) return sc; /* unused, just needs to be null when done */ } -#define TV_SEC_USEC(TV) TV.tv_sec, TV.tv_usec +#define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec static int sc_seq_show(struct seq_file *seq, void *v) { @@ -307,12 +309,12 @@ static int sc_seq_show(struct seq_file *seq, void *v) " remote node: %s\n" " page off: %zu\n" " handshake ok: %u\n" - " timer: %lu.%lu\n" - " data ready: %lu.%lu\n" - " advance start: %lu.%lu\n" - " advance stop: %lu.%lu\n" - " func start: %lu.%lu\n" - " func stop: %lu.%lu\n" + " timer: %lu.%ld\n" + " data ready: %lu.%ld\n" + " advance start: %lu.%ld\n" + " advance stop: %lu.%ld\n" + " func start: %lu.%ld\n" + " func stop: %lu.%ld\n" " func key: %u\n" " func type: %u\n", sc, diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index cfdb08b484ed..816a3f61330c 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -648,26 +648,19 @@ static struct config_item *o2nm_node_group_make_item(struct config_group *group, const char *name) { struct o2nm_node *node = NULL; - struct config_item *ret = NULL; if (strlen(name) > O2NM_MAX_NAME_LEN) - goto out; /* ENAMETOOLONG */ + return ERR_PTR(-ENAMETOOLONG); node = kzalloc(sizeof(struct o2nm_node), GFP_KERNEL); if (node == NULL) - goto out; /* ENOMEM */ + return ERR_PTR(-ENOMEM); strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */ config_item_init_type_name(&node->nd_item, name, &o2nm_node_type); spin_lock_init(&node->nd_lock); - ret = &node->nd_item; - -out: - if (ret == NULL) - kfree(node); - - return ret; + return &node->nd_item; } static void o2nm_node_group_drop_item(struct config_group *group, @@ -762,7 +755,7 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g /* this runs under the parent dir's i_mutex; there can be only * one caller in here at a time */ if (o2nm_single_cluster) - goto out; /* ENOSPC */ + return ERR_PTR(-ENOSPC); cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL); ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL); @@ -795,6 +788,7 @@ out: kfree(ns); o2hb_free_hb_set(o2hb_group); kfree(defs); + ret = ERR_PTR(-ENOMEM); } return ret; diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index a27d61581bd6..2bcf706d9dd3 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -143,8 +143,8 @@ static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); #ifdef CONFIG_DEBUG_FS -void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, - u32 msgkey, struct task_struct *task, u8 node) +static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, + u32 msgkey, struct task_struct *task, u8 node) { INIT_LIST_HEAD(&nst->st_net_debug_item); nst->st_task = task; @@ -153,31 +153,61 @@ void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, nst->st_node = node; } -void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) +static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) { do_gettimeofday(&nst->st_sock_time); } -void o2net_set_nst_send_time(struct o2net_send_tracking *nst) +static void o2net_set_nst_send_time(struct o2net_send_tracking *nst) { do_gettimeofday(&nst->st_send_time); } -void o2net_set_nst_status_time(struct o2net_send_tracking *nst) +static void o2net_set_nst_status_time(struct o2net_send_tracking *nst) { do_gettimeofday(&nst->st_status_time); } -void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, +static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, struct o2net_sock_container *sc) { nst->st_sc = sc; } -void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) +static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) { nst->st_id = msg_id; } + +#else /* CONFIG_DEBUG_FS */ + +static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, + u32 msgkey, struct task_struct *task, u8 node) +{ +} + +static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) +{ +} + +static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) +{ +} + +static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) +{ +} + +static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, + struct o2net_sock_container *sc) +{ +} + +static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, + u32 msg_id) +{ +} + #endif /* CONFIG_DEBUG_FS */ static inline int o2net_reconnect_delay(void) diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 18307ff81b77..8d58cfe410b1 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -224,42 +224,10 @@ struct o2net_send_tracking { struct timeval st_send_time; struct timeval st_status_time; }; - -void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, - u32 msgkey, struct task_struct *task, u8 node); -void o2net_set_nst_sock_time(struct o2net_send_tracking *nst); -void o2net_set_nst_send_time(struct o2net_send_tracking *nst); -void o2net_set_nst_status_time(struct o2net_send_tracking *nst); -void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, - struct o2net_sock_container *sc); -void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id); - #else struct o2net_send_tracking { u32 dummy; }; - -static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, - u32 msgkey, struct task_struct *task, u8 node) -{ -} -static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) -{ -} -static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) -{ -} -static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) -{ -} -static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, - struct o2net_sock_container *sc) -{ -} -static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, - u32 msg_id) -{ -} #endif /* CONFIG_DEBUG_FS */ #endif /* O2CLUSTER_TCP_INTERNAL_H */ diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 8a1875848080..026e6eb85187 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -82,6 +82,49 @@ static int ocfs2_do_extend_dir(struct super_block *sb, struct ocfs2_alloc_context *meta_ac, struct buffer_head **new_bh); +static struct buffer_head *ocfs2_bread(struct inode *inode, + int block, int *err, int reada) +{ + struct buffer_head *bh = NULL; + int tmperr; + u64 p_blkno; + int readflags = 0; + + if (reada) + readflags |= OCFS2_BH_READAHEAD; + + if (((u64)block << inode->i_sb->s_blocksize_bits) >= + i_size_read(inode)) { + BUG_ON(!reada); + return NULL; + } + + down_read(&OCFS2_I(inode)->ip_alloc_sem); + tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, + NULL); + up_read(&OCFS2_I(inode)->ip_alloc_sem); + if (tmperr < 0) { + mlog_errno(tmperr); + goto fail; + } + + tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags); + if (tmperr < 0) + goto fail; + + tmperr = 0; + + *err = 0; + return bh; + +fail: + brelse(bh); + bh = NULL; + + *err = -EIO; + return NULL; +} + /* * bh passed here can be an inode block or a dir data block, depending * on the inode inline data flag. @@ -188,8 +231,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name, struct ocfs2_dinode *di; struct ocfs2_inline_data *data; - ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno, - &di_bh, OCFS2_BH_CACHED, dir); + ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -260,14 +302,13 @@ restart: } if ((bh = bh_use[ra_ptr++]) == NULL) goto next; - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - /* read error, skip block & hope for the best */ + if (ocfs2_read_block(dir, block, &bh)) { + /* read error, skip block & hope for the best. + * ocfs2_read_block() has released the bh. */ ocfs2_error(dir->i_sb, "reading directory %llu, " "offset %lu\n", (unsigned long long)OCFS2_I(dir)->ip_blkno, block); - brelse(bh); goto next; } i = ocfs2_search_dirblock(bh, dir, name, namelen, @@ -417,8 +458,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle, struct ocfs2_dinode *di; struct ocfs2_inline_data *data; - ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno, - &di_bh, OCFS2_BH_CACHED, dir); + ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -596,8 +636,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode, struct ocfs2_inline_data *data; struct ocfs2_dir_entry *de; - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno, - &di_bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); if (ret) { mlog(ML_ERROR, "Unable to read inode block for dir %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -716,8 +755,7 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode, for (i = ra_sectors >> (sb->s_blocksize_bits - 9); i > 0; i--) { tmp = ocfs2_bread(inode, ++blk, &err, 1); - if (tmp) - brelse(tmp); + brelse(tmp); } last_ra_blk = blk; ra_sectors = 8; @@ -899,10 +937,8 @@ int ocfs2_find_files_on_disk(const char *name, leave: if (status < 0) { *dirent = NULL; - if (*dirent_bh) { - brelse(*dirent_bh); - *dirent_bh = NULL; - } + brelse(*dirent_bh); + *dirent_bh = NULL; } mlog_exit(status); @@ -951,8 +987,7 @@ int ocfs2_check_dir_for_entry(struct inode *dir, ret = 0; bail: - if (dirent_bh) - brelse(dirent_bh); + brelse(dirent_bh); mlog_exit(ret); return ret; @@ -1127,8 +1162,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, status = 0; bail: - if (new_bh) - brelse(new_bh); + brelse(new_bh); mlog_exit(status); return status; @@ -1192,6 +1226,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, struct buffer_head *dirdata_bh = NULL; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; handle_t *handle; + struct ocfs2_extent_tree et; + + ocfs2_init_dinode_extent_tree(&et, dir, di_bh); alloc = ocfs2_clusters_for_bytes(sb, bytes); @@ -1300,19 +1337,24 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, di->i_size = cpu_to_le64(sb->s_blocksize); di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec); di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec); - dir->i_blocks = ocfs2_inode_sector_count(dir); /* * This should never fail as our extent list is empty and all * related blocks have been journaled already. */ - ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 0, blkno, len, 0, - NULL); + ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, blkno, len, + 0, NULL); if (ret) { mlog_errno(ret); - goto out; + goto out_commit; } + /* + * Set i_blocks after the extent insert for the most up to + * date ip_clusters value. + */ + dir->i_blocks = ocfs2_inode_sector_count(dir); + ret = ocfs2_journal_dirty(handle, di_bh); if (ret) { mlog_errno(ret); @@ -1332,11 +1374,11 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, } blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off); - ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 1, blkno, - len, 0, NULL); + ret = ocfs2_insert_extent(osb, handle, dir, &et, 1, + blkno, len, 0, NULL); if (ret) { mlog_errno(ret); - goto out; + goto out_commit; } } @@ -1378,9 +1420,9 @@ static int ocfs2_do_extend_dir(struct super_block *sb, if (extend) { u32 offset = OCFS2_I(dir)->ip_clusters; - status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset, - 1, 0, parent_fe_bh, handle, - data_ac, meta_ac, NULL); + status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset, + 1, 0, parent_fe_bh, handle, + data_ac, meta_ac, NULL); BUG_ON(status == -EAGAIN); if (status < 0) { mlog_errno(status); @@ -1425,12 +1467,14 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, int credits, num_free_extents, drop_alloc_sem = 0; loff_t dir_i_size; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + struct ocfs2_extent_list *el = &fe->id2.i_list; struct ocfs2_alloc_context *data_ac = NULL; struct ocfs2_alloc_context *meta_ac = NULL; handle_t *handle = NULL; struct buffer_head *new_bh = NULL; struct ocfs2_dir_entry * de; struct super_block *sb = osb->sb; + struct ocfs2_extent_tree et; mlog_entry_void(); @@ -1474,7 +1518,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, spin_lock(&OCFS2_I(dir)->ip_lock); if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) { spin_unlock(&OCFS2_I(dir)->ip_lock); - num_free_extents = ocfs2_num_free_extents(osb, dir, fe); + ocfs2_init_dinode_extent_tree(&et, dir, parent_fe_bh); + num_free_extents = ocfs2_num_free_extents(osb, dir, &et); if (num_free_extents < 0) { status = num_free_extents; mlog_errno(status); @@ -1482,7 +1527,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, } if (!num_free_extents) { - status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac); + status = ocfs2_reserve_new_metadata(osb, el, &meta_ac); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -1497,7 +1542,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, goto bail; } - credits = ocfs2_calc_extend_credits(sb, fe, 1); + credits = ocfs2_calc_extend_credits(sb, el, 1); } else { spin_unlock(&OCFS2_I(dir)->ip_lock); credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; @@ -1563,8 +1608,7 @@ bail: if (meta_ac) ocfs2_free_alloc_context(meta_ac); - if (new_bh) - brelse(new_bh); + brelse(new_bh); mlog_exit(status); return status; @@ -1691,8 +1735,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, status = 0; bail: - if (bh) - brelse(bh); + brelse(bh); mlog_exit(status); return status; @@ -1751,7 +1794,6 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, *ret_de_bh = bh; bh = NULL; out: - if (bh) - brelse(bh); + brelse(bh); return ret; } diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index e48aba698b77..533a789c3ef8 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -267,8 +267,7 @@ static ssize_t dlmfs_file_write(struct file *filp, return writelen; } -static void dlmfs_init_once(struct kmem_cache *cachep, - void *foo) +static void dlmfs_init_once(void *foo) { struct dlmfs_inode_private *ip = (struct dlmfs_inode_private *) foo; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index efc015c6128a..44f87caf3683 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -606,7 +606,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, res->last_used = 0; + spin_lock(&dlm->spinlock); list_add_tail(&res->tracking, &dlm->tracking_list); + spin_unlock(&dlm->spinlock); memset(res->lvb, 0, DLM_LVB_LEN); memset(res->refmap, 0, sizeof(res->refmap)); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 394d25a131a5..ec684426034b 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -31,6 +31,7 @@ #include <linux/pagemap.h> #include <linux/debugfs.h> #include <linux/seq_file.h> +#include <linux/time.h> #define MLOG_MASK_PREFIX ML_DLM_GLUE #include <cluster/masklog.h> @@ -59,6 +60,9 @@ struct ocfs2_mask_waiter { struct completion mw_complete; unsigned long mw_mask; unsigned long mw_goal; +#ifdef CONFIG_OCFS2_FS_STATS + unsigned long long mw_lock_start; +#endif }; static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres); @@ -366,6 +370,75 @@ static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res) spin_unlock(&ocfs2_dlm_tracking_lock); } +#ifdef CONFIG_OCFS2_FS_STATS +static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res) +{ + res->l_lock_num_prmode = 0; + res->l_lock_num_prmode_failed = 0; + res->l_lock_total_prmode = 0; + res->l_lock_max_prmode = 0; + res->l_lock_num_exmode = 0; + res->l_lock_num_exmode_failed = 0; + res->l_lock_total_exmode = 0; + res->l_lock_max_exmode = 0; + res->l_lock_refresh = 0; +} + +static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level, + struct ocfs2_mask_waiter *mw, int ret) +{ + unsigned long long *num, *sum; + unsigned int *max, *failed; + struct timespec ts = current_kernel_time(); + unsigned long long time = timespec_to_ns(&ts) - mw->mw_lock_start; + + if (level == LKM_PRMODE) { + num = &res->l_lock_num_prmode; + sum = &res->l_lock_total_prmode; + max = &res->l_lock_max_prmode; + failed = &res->l_lock_num_prmode_failed; + } else if (level == LKM_EXMODE) { + num = &res->l_lock_num_exmode; + sum = &res->l_lock_total_exmode; + max = &res->l_lock_max_exmode; + failed = &res->l_lock_num_exmode_failed; + } else + return; + + (*num)++; + (*sum) += time; + if (time > *max) + *max = time; + if (ret) + (*failed)++; +} + +static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) +{ + lockres->l_lock_refresh++; +} + +static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw) +{ + struct timespec ts = current_kernel_time(); + mw->mw_lock_start = timespec_to_ns(&ts); +} +#else +static inline void ocfs2_init_lock_stats(struct ocfs2_lock_res *res) +{ +} +static inline void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, + int level, struct ocfs2_mask_waiter *mw, int ret) +{ +} +static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) +{ +} +static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw) +{ +} +#endif + static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, struct ocfs2_lock_res *res, enum ocfs2_lock_type type, @@ -385,6 +458,8 @@ static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, res->l_flags = OCFS2_LOCK_INITIALIZED; ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug); + + ocfs2_init_lock_stats(res); } void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) @@ -1048,6 +1123,7 @@ static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw) { INIT_LIST_HEAD(&mw->mw_item); init_completion(&mw->mw_complete); + ocfs2_init_start_time(mw); } static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw) @@ -1254,6 +1330,7 @@ out: goto again; mlog_errno(ret); } + ocfs2_update_lock_stats(lockres, level, &mw, ret); mlog_exit(ret); return ret; @@ -1554,8 +1631,8 @@ out: */ int ocfs2_file_lock(struct file *file, int ex, int trylock) { - int ret, level = ex ? LKM_EXMODE : LKM_PRMODE; - unsigned int lkm_flags = trylock ? LKM_NOQUEUE : 0; + int ret, level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + unsigned int lkm_flags = trylock ? DLM_LKF_NOQUEUE : 0; unsigned long flags; struct ocfs2_file_private *fp = file->private_data; struct ocfs2_lock_res *lockres = &fp->fp_flock; @@ -1582,7 +1659,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock) * Get the lock at NLMODE to start - that way we * can cancel the upconvert request if need be. */ - ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0); + ret = ocfs2_lock_create(osb, lockres, DLM_LOCK_NL, 0); if (ret < 0) { mlog_errno(ret); goto out; @@ -1597,7 +1674,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock) } lockres->l_action = OCFS2_AST_CONVERT; - lkm_flags |= LKM_CONVERT; + lkm_flags |= DLM_LKF_CONVERT; lockres->l_requested = level; lockres_or_flags(lockres, OCFS2_LOCK_BUSY); @@ -1664,7 +1741,7 @@ void ocfs2_file_unlock(struct file *file) if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) return; - if (lockres->l_level == LKM_NLMODE) + if (lockres->l_level == DLM_LOCK_NL) return; mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n", @@ -1678,11 +1755,11 @@ void ocfs2_file_unlock(struct file *file) lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); lockres->l_blocking = DLM_LOCK_EX; - gen = ocfs2_prepare_downconvert(lockres, LKM_NLMODE); + gen = ocfs2_prepare_downconvert(lockres, DLM_LOCK_NL); lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); spin_unlock_irqrestore(&lockres->l_lock, flags); - ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0, gen); + ret = ocfs2_downconvert_lock(osb, lockres, DLM_LOCK_NL, 0, gen); if (ret) { mlog_errno(ret); return; @@ -1947,8 +2024,7 @@ static int ocfs2_inode_lock_update(struct inode *inode, } else { /* Boo, we have to go to disk. */ /* read bh, cast, ocfs2_refresh_inode */ - status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno, - bh, OCFS2_BH_CACHED, inode); + status = ocfs2_read_block(inode, oi->ip_blkno, bh); if (status < 0) { mlog_errno(status); goto bail_refresh; @@ -1983,6 +2059,7 @@ static int ocfs2_inode_lock_update(struct inode *inode, le32_to_cpu(fe->i_flags)); ocfs2_refresh_inode(inode, fe); + ocfs2_track_lock_refresh(lockres); } status = 0; @@ -2008,11 +2085,7 @@ static int ocfs2_assign_bh(struct inode *inode, return 0; } - status = ocfs2_read_block(OCFS2_SB(inode->i_sb), - OCFS2_I(inode)->ip_blkno, - ret_bh, - OCFS2_BH_CACHED, - inode); + status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh); if (status < 0) mlog_errno(status); @@ -2267,6 +2340,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb, if (status < 0) mlog_errno(status); + ocfs2_track_lock_refresh(lockres); } bail: mlog_exit(status); @@ -2461,7 +2535,7 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos) } /* So that debugfs.ocfs2 can determine which format is being used */ -#define OCFS2_DLM_DEBUG_STR_VERSION 1 +#define OCFS2_DLM_DEBUG_STR_VERSION 2 static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) { int i; @@ -2502,6 +2576,47 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) for(i = 0; i < DLM_LVB_LEN; i++) seq_printf(m, "0x%x\t", lvb[i]); +#ifdef CONFIG_OCFS2_FS_STATS +# define lock_num_prmode(_l) (_l)->l_lock_num_prmode +# define lock_num_exmode(_l) (_l)->l_lock_num_exmode +# define lock_num_prmode_failed(_l) (_l)->l_lock_num_prmode_failed +# define lock_num_exmode_failed(_l) (_l)->l_lock_num_exmode_failed +# define lock_total_prmode(_l) (_l)->l_lock_total_prmode +# define lock_total_exmode(_l) (_l)->l_lock_total_exmode +# define lock_max_prmode(_l) (_l)->l_lock_max_prmode +# define lock_max_exmode(_l) (_l)->l_lock_max_exmode +# define lock_refresh(_l) (_l)->l_lock_refresh +#else +# define lock_num_prmode(_l) (0ULL) +# define lock_num_exmode(_l) (0ULL) +# define lock_num_prmode_failed(_l) (0) +# define lock_num_exmode_failed(_l) (0) +# define lock_total_prmode(_l) (0ULL) +# define lock_total_exmode(_l) (0ULL) +# define lock_max_prmode(_l) (0) +# define lock_max_exmode(_l) (0) +# define lock_refresh(_l) (0) +#endif + /* The following seq_print was added in version 2 of this output */ + seq_printf(m, "%llu\t" + "%llu\t" + "%u\t" + "%u\t" + "%llu\t" + "%llu\t" + "%u\t" + "%u\t" + "%u\t", + lock_num_prmode(lockres), + lock_num_exmode(lockres), + lock_num_prmode_failed(lockres), + lock_num_exmode_failed(lockres), + lock_total_prmode(lockres), + lock_total_exmode(lockres), + lock_max_prmode(lockres), + lock_max_exmode(lockres), + lock_refresh(lockres)); + /* End the line */ seq_printf(m, "\n"); return 0; diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index c58668a326fe..2baedac58234 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -25,6 +25,7 @@ #include <linux/fs.h> #include <linux/init.h> #include <linux/types.h> +#include <linux/fiemap.h> #define MLOG_MASK_PREFIX ML_EXTENT_MAP #include <cluster/masklog.h> @@ -32,6 +33,7 @@ #include "ocfs2.h" #include "alloc.h" +#include "dlmglue.h" #include "extent_map.h" #include "inode.h" #include "super.h" @@ -282,6 +284,50 @@ out: kfree(new_emi); } +static int ocfs2_last_eb_is_empty(struct inode *inode, + struct ocfs2_dinode *di) +{ + int ret, next_free; + u64 last_eb_blk = le64_to_cpu(di->i_last_eb_blk); + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + + ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + ret = -EROFS; + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + goto out; + } + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + + next_free = le16_to_cpu(el->l_next_free_rec); + + if (next_free == 0 || + (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) + ret = 1; + +out: + brelse(eb_bh); + return ret; +} + /* * Return the 1st index within el which contains an extent start * larger than v_cluster. @@ -335,9 +381,9 @@ static int ocfs2_figure_hole_clusters(struct inode *inode, if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) goto no_more_extents; - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), + ret = ocfs2_read_block(inode, le64_to_cpu(eb->h_next_leaf_blk), - &next_eb_bh, OCFS2_BH_CACHED, inode); + &next_eb_bh); if (ret) { mlog_errno(ret); goto out; @@ -373,42 +419,28 @@ out: return ret; } -int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, - u32 *p_cluster, u32 *num_clusters, - unsigned int *extent_flags) +static int ocfs2_get_clusters_nocache(struct inode *inode, + struct buffer_head *di_bh, + u32 v_cluster, unsigned int *hole_len, + struct ocfs2_extent_rec *ret_rec, + unsigned int *is_last) { - int ret, i; - unsigned int flags = 0; - struct buffer_head *di_bh = NULL; - struct buffer_head *eb_bh = NULL; + int i, ret, tree_height, len; struct ocfs2_dinode *di; - struct ocfs2_extent_block *eb; + struct ocfs2_extent_block *uninitialized_var(eb); struct ocfs2_extent_list *el; struct ocfs2_extent_rec *rec; - u32 coff; - - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { - ret = -ERANGE; - mlog_errno(ret); - goto out; - } - - ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster, - num_clusters, extent_flags); - if (ret == 0) - goto out; + struct buffer_head *eb_bh = NULL; - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno, - &di_bh, OCFS2_BH_CACHED, inode); - if (ret) { - mlog_errno(ret); - goto out; - } + memset(ret_rec, 0, sizeof(*ret_rec)); + if (is_last) + *is_last = 0; di = (struct ocfs2_dinode *) di_bh->b_data; el = &di->id2.i_list; + tree_height = le16_to_cpu(el->l_tree_depth); - if (el->l_tree_depth) { + if (tree_height > 0) { ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); if (ret) { mlog_errno(ret); @@ -431,46 +463,202 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, i = ocfs2_search_extent_list(el, v_cluster); if (i == -1) { /* - * A hole was found. Return some canned values that - * callers can key on. If asked for, num_clusters will - * be populated with the size of the hole. + * Holes can be larger than the maximum size of an + * extent, so we return their lengths in a seperate + * field. */ - *p_cluster = 0; - if (num_clusters) { + if (hole_len) { ret = ocfs2_figure_hole_clusters(inode, el, eb_bh, - v_cluster, - num_clusters); + v_cluster, &len); if (ret) { mlog_errno(ret); goto out; } + + *hole_len = len; + } + goto out_hole; + } + + rec = &el->l_recs[i]; + + BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); + + if (!rec->e_blkno) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0)", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } + + *ret_rec = *rec; + + /* + * Checking for last extent is potentially expensive - we + * might have to look at the next leaf over to see if it's + * empty. + * + * The first two checks are to see whether the caller even + * cares for this information, and if the extent is at least + * the last in it's list. + * + * If those hold true, then the extent is last if any of the + * additional conditions hold true: + * - Extent list is in-inode + * - Extent list is right-most + * - Extent list is 2nd to rightmost, with empty right-most + */ + if (is_last) { + if (i == (le16_to_cpu(el->l_next_free_rec) - 1)) { + if (tree_height == 0) + *is_last = 1; + else if (eb->h_blkno == di->i_last_eb_blk) + *is_last = 1; + else if (eb->h_next_leaf_blk == di->i_last_eb_blk) { + ret = ocfs2_last_eb_is_empty(inode, di); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + if (ret == 1) + *is_last = 1; + } + } + } + +out_hole: + ret = 0; +out: + brelse(eb_bh); + return ret; +} + +static void ocfs2_relative_extent_offsets(struct super_block *sb, + u32 v_cluster, + struct ocfs2_extent_rec *rec, + u32 *p_cluster, u32 *num_clusters) + +{ + u32 coff = v_cluster - le32_to_cpu(rec->e_cpos); + + *p_cluster = ocfs2_blocks_to_clusters(sb, le64_to_cpu(rec->e_blkno)); + *p_cluster = *p_cluster + coff; + + if (num_clusters) + *num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff; +} + +int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, + u32 *p_cluster, u32 *num_clusters, + struct ocfs2_extent_list *el) +{ + int ret = 0, i; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec; + u32 coff; + + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "xattr leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + i = ocfs2_search_extent_list(el, v_cluster); + if (i == -1) { + ret = -EROFS; + mlog_errno(ret); + goto out; } else { rec = &el->l_recs[i]; - BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); if (!rec->e_blkno) { ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0)", inode->i_ino, + "record (%u, %u, 0) in xattr", inode->i_ino, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); ret = -EROFS; goto out; } - coff = v_cluster - le32_to_cpu(rec->e_cpos); - *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb, le64_to_cpu(rec->e_blkno)); *p_cluster = *p_cluster + coff; - if (num_clusters) *num_clusters = ocfs2_rec_clusters(el, rec) - coff; + } +out: + if (eb_bh) + brelse(eb_bh); + return ret; +} + +int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, + u32 *p_cluster, u32 *num_clusters, + unsigned int *extent_flags) +{ + int ret; + unsigned int uninitialized_var(hole_len), flags = 0; + struct buffer_head *di_bh = NULL; + struct ocfs2_extent_rec rec; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = -ERANGE; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster, + num_clusters, extent_flags); + if (ret == 0) + goto out; + + ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } - flags = rec->e_flags; + ret = ocfs2_get_clusters_nocache(inode, di_bh, v_cluster, &hole_len, + &rec, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (rec.e_blkno == 0ULL) { + /* + * A hole was found. Return some canned values that + * callers can key on. If asked for, num_clusters will + * be populated with the size of the hole. + */ + *p_cluster = 0; + if (num_clusters) { + *num_clusters = hole_len; + } + } else { + ocfs2_relative_extent_offsets(inode->i_sb, v_cluster, &rec, + p_cluster, num_clusters); + flags = rec.e_flags; - ocfs2_extent_map_insert_rec(inode, rec); + ocfs2_extent_map_insert_rec(inode, &rec); } if (extent_flags) @@ -478,7 +666,6 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, out: brelse(di_bh); - brelse(eb_bh); return ret; } @@ -521,3 +708,114 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, out: return ret; } + +static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, + struct fiemap_extent_info *fieinfo, + u64 map_start) +{ + int ret; + unsigned int id_count; + struct ocfs2_dinode *di; + u64 phys; + u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + di = (struct ocfs2_dinode *)di_bh->b_data; + id_count = le16_to_cpu(di->id2.i_data.id_count); + + if (map_start < id_count) { + phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits; + phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data); + + ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count, + flags); + if (ret < 0) + return ret; + } + + return 0; +} + +#define OCFS2_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) + +int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 map_start, u64 map_len) +{ + int ret, is_last; + u32 mapping_end, cpos; + unsigned int hole_size; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u64 len_bytes, phys_bytes, virt_bytes; + struct buffer_head *di_bh = NULL; + struct ocfs2_extent_rec rec; + + ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS); + if (ret) + return ret; + + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + down_read(&OCFS2_I(inode)->ip_alloc_sem); + + /* + * Handle inline-data separately. + */ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start); + goto out_unlock; + } + + cpos = map_start >> osb->s_clustersize_bits; + mapping_end = ocfs2_clusters_for_bytes(inode->i_sb, + map_start + map_len); + mapping_end -= cpos; + is_last = 0; + while (cpos < mapping_end && !is_last) { + u32 fe_flags; + + ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, + &hole_size, &rec, &is_last); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (rec.e_blkno == 0ULL) { + cpos += hole_size; + continue; + } + + fe_flags = 0; + if (rec.e_flags & OCFS2_EXT_UNWRITTEN) + fe_flags |= FIEMAP_EXTENT_UNWRITTEN; + if (is_last) + fe_flags |= FIEMAP_EXTENT_LAST; + len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits; + phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits; + virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits; + + ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes, + len_bytes, fe_flags); + if (ret) + break; + + cpos = le32_to_cpu(rec.e_cpos)+ le16_to_cpu(rec.e_leaf_clusters); + } + + if (ret > 0) + ret = 0; + +out_unlock: + brelse(di_bh); + + up_read(&OCFS2_I(inode)->ip_alloc_sem); + + ocfs2_inode_unlock(inode, 0); +out: + + return ret; +} diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index de91e3e41a22..1c4aa8b06f34 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h @@ -50,4 +50,11 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster, int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, u64 *ret_count, unsigned int *extent_flags); +int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 map_start, u64 map_len); + +int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, + u32 *p_cluster, u32 *num_clusters, + struct ocfs2_extent_list *el); + #endif /* _EXTENT_MAP_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 57e0d30cde98..8d3225a78073 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -55,6 +55,7 @@ #include "mmap.h" #include "suballoc.h" #include "super.h" +#include "xattr.h" #include "buffer_head_io.h" @@ -184,7 +185,7 @@ static int ocfs2_sync_file(struct file *file, goto bail; journal = osb->journal->j_journal; - err = journal_force_commit(journal); + err = jbd2_journal_force_commit(journal); bail: mlog_exit(err); @@ -488,7 +489,7 @@ bail: } /* - * extend allocation only here. + * extend file allocation only here. * we'll update all the disk stuff, and oip->alloc_size * * expect stuff to be locked, a transaction started and enough data / @@ -497,189 +498,25 @@ bail: * Will return -EAGAIN, and a reason if a restart is needed. * If passed in, *reason will always be set, even in error. */ -int ocfs2_do_extend_allocation(struct ocfs2_super *osb, - struct inode *inode, - u32 *logical_offset, - u32 clusters_to_add, - int mark_unwritten, - struct buffer_head *fe_bh, - handle_t *handle, - struct ocfs2_alloc_context *data_ac, - struct ocfs2_alloc_context *meta_ac, - enum ocfs2_alloc_restarted *reason_ret) +int ocfs2_add_inode_data(struct ocfs2_super *osb, + struct inode *inode, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct buffer_head *fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret) { - int status = 0; - int free_extents; - struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; - enum ocfs2_alloc_restarted reason = RESTART_NONE; - u32 bit_off, num_bits; - u64 block; - u8 flags = 0; - - BUG_ON(!clusters_to_add); - - if (mark_unwritten) - flags = OCFS2_EXT_UNWRITTEN; - - free_extents = ocfs2_num_free_extents(osb, inode, fe); - if (free_extents < 0) { - status = free_extents; - mlog_errno(status); - goto leave; - } - - /* there are two cases which could cause us to EAGAIN in the - * we-need-more-metadata case: - * 1) we haven't reserved *any* - * 2) we are so fragmented, we've needed to add metadata too - * many times. */ - if (!free_extents && !meta_ac) { - mlog(0, "we haven't reserved any metadata!\n"); - status = -EAGAIN; - reason = RESTART_META; - goto leave; - } else if ((!free_extents) - && (ocfs2_alloc_context_bits_left(meta_ac) - < ocfs2_extend_meta_needed(fe))) { - mlog(0, "filesystem is really fragmented...\n"); - status = -EAGAIN; - reason = RESTART_META; - goto leave; - } - - status = __ocfs2_claim_clusters(osb, handle, data_ac, 1, - clusters_to_add, &bit_off, &num_bits); - if (status < 0) { - if (status != -ENOSPC) - mlog_errno(status); - goto leave; - } - - BUG_ON(num_bits > clusters_to_add); - - /* reserve our write early -- insert_extent may update the inode */ - status = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto leave; - } - - block = ocfs2_clusters_to_blocks(osb->sb, bit_off); - mlog(0, "Allocating %u clusters at block %u for inode %llu\n", - num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = ocfs2_insert_extent(osb, handle, inode, fe_bh, - *logical_offset, block, num_bits, - flags, meta_ac); - if (status < 0) { - mlog_errno(status); - goto leave; - } - - status = ocfs2_journal_dirty(handle, fe_bh); - if (status < 0) { - mlog_errno(status); - goto leave; - } - - clusters_to_add -= num_bits; - *logical_offset += num_bits; - - if (clusters_to_add) { - mlog(0, "need to alloc once more, clusters = %u, wanted = " - "%u\n", fe->i_clusters, clusters_to_add); - status = -EAGAIN; - reason = RESTART_TRANS; - } - -leave: - mlog_exit(status); - if (reason_ret) - *reason_ret = reason; - return status; -} - -/* - * For a given allocation, determine which allocators will need to be - * accessed, and lock them, reserving the appropriate number of bits. - * - * Sparse file systems call this from ocfs2_write_begin_nolock() - * and ocfs2_allocate_unwritten_extents(). - * - * File systems which don't support holes call this from - * ocfs2_extend_allocation(). - */ -int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, - u32 clusters_to_add, u32 extents_to_split, - struct ocfs2_alloc_context **data_ac, - struct ocfs2_alloc_context **meta_ac) -{ - int ret = 0, num_free_extents; - unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - - *meta_ac = NULL; - if (data_ac) - *data_ac = NULL; - - BUG_ON(clusters_to_add != 0 && data_ac == NULL); - - mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " - "clusters_to_add = %u, extents_to_split = %u\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode), - le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split); - - num_free_extents = ocfs2_num_free_extents(osb, inode, di); - if (num_free_extents < 0) { - ret = num_free_extents; - mlog_errno(ret); - goto out; - } - - /* - * Sparse allocation file systems need to be more conservative - * with reserving room for expansion - the actual allocation - * happens while we've got a journal handle open so re-taking - * a cluster lock (because we ran out of room for another - * extent) will violate ordering rules. - * - * Most of the time we'll only be seeing this 1 cluster at a time - * anyway. - * - * Always lock for any unwritten extents - we might want to - * add blocks during a split. - */ - if (!num_free_extents || - (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) { - ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); - if (ret < 0) { - if (ret != -ENOSPC) - mlog_errno(ret); - goto out; - } - } - - if (clusters_to_add == 0) - goto out; - - ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); - if (ret < 0) { - if (ret != -ENOSPC) - mlog_errno(ret); - goto out; - } + int ret; + struct ocfs2_extent_tree et; -out: - if (ret) { - if (*meta_ac) { - ocfs2_free_alloc_context(*meta_ac); - *meta_ac = NULL; - } - - /* - * We cannot have an error and a non null *data_ac. - */ - } + ocfs2_init_dinode_extent_tree(&et, inode, fe_bh); + ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset, + clusters_to_add, mark_unwritten, + &et, handle, + data_ac, meta_ac, reason_ret); return ret; } @@ -698,6 +535,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, struct ocfs2_alloc_context *meta_ac = NULL; enum ocfs2_alloc_restarted why; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_tree et; mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); @@ -707,8 +545,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, */ BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); - status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, - OCFS2_BH_CACHED, inode); + status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); if (status < 0) { mlog_errno(status); goto leave; @@ -724,14 +561,21 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, restart_all: BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); - status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac, - &meta_ac); + mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " + "clusters_to_add = %u\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters), + clusters_to_add); + ocfs2_init_dinode_extent_tree(&et, inode, bh); + status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, + &data_ac, &meta_ac); if (status) { mlog_errno(status); goto leave; } - credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); + credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list, + clusters_to_add); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { status = PTR_ERR(handle); @@ -753,16 +597,16 @@ restarted_transaction: prev_clusters = OCFS2_I(inode)->ip_clusters; - status = ocfs2_do_extend_allocation(osb, - inode, - &logical_start, - clusters_to_add, - mark_unwritten, - bh, - handle, - data_ac, - meta_ac, - &why); + status = ocfs2_add_inode_data(osb, + inode, + &logical_start, + clusters_to_add, + mark_unwritten, + bh, + handle, + data_ac, + meta_ac, + &why); if ((status < 0) && (status != -EAGAIN)) { if (status != -ENOSPC) mlog_errno(status); @@ -789,7 +633,7 @@ restarted_transaction: mlog(0, "restarting transaction.\n"); /* TODO: This can be more intelligent. */ credits = ocfs2_calc_extend_credits(osb->sb, - fe, + &fe->id2.i_list, clusters_to_add); status = ocfs2_extend_trans(handle, credits); if (status < 0) { @@ -826,10 +670,8 @@ leave: restart_func = 0; goto restart_all; } - if (bh) { - brelse(bh); - bh = NULL; - } + brelse(bh); + bh = NULL; mlog_exit(status); return status; @@ -1096,9 +938,15 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) goto bail_unlock; } - if (i_size_read(inode) > attr->ia_size) + if (i_size_read(inode) > attr->ia_size) { + if (ocfs2_should_order_data(inode)) { + status = ocfs2_begin_ordered_truncate(inode, + attr->ia_size); + if (status) + goto bail_unlock; + } status = ocfs2_truncate_file(inode, bh, attr->ia_size); - else + } else status = ocfs2_extend_file(inode, bh, attr->ia_size); if (status < 0) { if (status != -ENOSPC) @@ -1140,8 +988,7 @@ bail_unlock_rw: if (size_change) ocfs2_rw_unlock(inode, 1); bail: - if (bh) - brelse(bh); + brelse(bh); mlog_exit(status); return status; @@ -1176,7 +1023,7 @@ bail: return err; } -int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) +int ocfs2_permission(struct inode *inode, int mask) { int ret; @@ -1284,8 +1131,7 @@ static int ocfs2_write_remove_suid(struct inode *inode) struct buffer_head *bh = NULL; struct ocfs2_inode_info *oi = OCFS2_I(inode); - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, oi->ip_blkno, &bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -1311,9 +1157,8 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode, struct buffer_head *di_bh = NULL; if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - OCFS2_I(inode)->ip_blkno, &di_bh, - OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, + &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -1394,8 +1239,11 @@ static int __ocfs2_remove_inode_range(struct inode *inode, handle_t *handle; struct ocfs2_alloc_context *meta_ac = NULL; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_extent_tree et; + + ocfs2_init_dinode_extent_tree(&et, inode, di_bh); - ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac); + ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); if (ret) { mlog_errno(ret); return ret; @@ -1425,7 +1273,7 @@ static int __ocfs2_remove_inode_range(struct inode *inode, goto out; } - ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac, + ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac, dealloc); if (ret) { mlog_errno(ret); @@ -1766,8 +1614,8 @@ out_inode_unlock: out_rw_unlock: ocfs2_rw_unlock(inode, 1); - mutex_unlock(&inode->i_mutex); out: + mutex_unlock(&inode->i_mutex); return ret; } @@ -2040,7 +1888,7 @@ out_dio: */ if (old_size != i_size_read(inode) || old_clusters != OCFS2_I(inode)->ip_clusters) { - ret = journal_force_commit(osb->journal->j_journal); + ret = jbd2_journal_force_commit(osb->journal->j_journal); if (ret < 0) written = ret; } @@ -2202,7 +2050,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); if (ret == -EINVAL) - mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); + mlog(0, "generic_file_aio_read returned -EINVAL\n"); /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); @@ -2227,7 +2075,12 @@ const struct inode_operations ocfs2_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, .permission = ocfs2_permission, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, .fallocate = ocfs2_fallocate, + .fiemap = ocfs2_fiemap, }; const struct inode_operations ocfs2_special_file_iops = { @@ -2236,6 +2089,10 @@ const struct inode_operations ocfs2_special_file_iops = { .permission = ocfs2_permission, }; +/* + * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with + * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! + */ const struct file_operations ocfs2_fops = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -2250,6 +2107,7 @@ const struct file_operations ocfs2_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ocfs2_compat_ioctl, #endif + .lock = ocfs2_lock, .flock = ocfs2_flock, .splice_read = ocfs2_file_splice_read, .splice_write = ocfs2_file_splice_write, @@ -2266,5 +2124,51 @@ const struct file_operations ocfs2_dops = { #ifdef CONFIG_COMPAT .compat_ioctl = ocfs2_compat_ioctl, #endif + .lock = ocfs2_lock, + .flock = ocfs2_flock, +}; + +/* + * POSIX-lockless variants of our file_operations. + * + * These will be used if the underlying cluster stack does not support + * posix file locking, if the user passes the "localflocks" mount + * option, or if we have a local-only fs. + * + * ocfs2_flock is in here because all stacks handle UNIX file locks, + * so we still want it in the case of no stack support for + * plocks. Internally, it will do the right thing when asked to ignore + * the cluster. + */ +const struct file_operations ocfs2_fops_no_plocks = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .mmap = ocfs2_mmap, + .fsync = ocfs2_sync_file, + .release = ocfs2_file_release, + .open = ocfs2_file_open, + .aio_read = ocfs2_file_aio_read, + .aio_write = ocfs2_file_aio_write, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif + .flock = ocfs2_flock, + .splice_read = ocfs2_file_splice_read, + .splice_write = ocfs2_file_splice_write, +}; + +const struct file_operations ocfs2_dops_no_plocks = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = ocfs2_readdir, + .fsync = ocfs2_sync_file, + .release = ocfs2_dir_release, + .open = ocfs2_dir_open, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif .flock = ocfs2_flock, }; diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 048ddcaf5c80..e92382cbca5f 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -28,9 +28,12 @@ extern const struct file_operations ocfs2_fops; extern const struct file_operations ocfs2_dops; +extern const struct file_operations ocfs2_fops_no_plocks; +extern const struct file_operations ocfs2_dops_no_plocks; extern const struct inode_operations ocfs2_file_iops; extern const struct inode_operations ocfs2_special_file_iops; struct ocfs2_alloc_context; +enum ocfs2_alloc_restarted; struct ocfs2_file_private { struct file *fp_file; @@ -38,32 +41,22 @@ struct ocfs2_file_private { struct ocfs2_lock_res fp_flock; }; -enum ocfs2_alloc_restarted { - RESTART_NONE = 0, - RESTART_TRANS, - RESTART_META -}; -int ocfs2_do_extend_allocation(struct ocfs2_super *osb, - struct inode *inode, - u32 *logical_offset, - u32 clusters_to_add, - int mark_unwritten, - struct buffer_head *fe_bh, - handle_t *handle, - struct ocfs2_alloc_context *data_ac, - struct ocfs2_alloc_context *meta_ac, - enum ocfs2_alloc_restarted *reason_ret); +int ocfs2_add_inode_data(struct ocfs2_super *osb, + struct inode *inode, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct buffer_head *fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret); int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to); -int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, - u32 clusters_to_add, u32 extents_to_split, - struct ocfs2_alloc_context **data_ac, - struct ocfs2_alloc_context **meta_ac); int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -int ocfs2_permission(struct inode *inode, int mask, - struct nameidata *nd); +int ocfs2_permission(struct inode *inode, int mask); int ocfs2_should_update_atime(struct inode *inode, struct vfsmount *vfsmnt); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 7e9e4c79aec7..4903688f72a9 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -49,6 +49,7 @@ #include "symlink.h" #include "sysfile.h" #include "uptodate.h" +#include "xattr.h" #include "buffer_head_io.h" @@ -219,6 +220,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, struct super_block *sb; struct ocfs2_super *osb; int status = -EINVAL; + int use_plocks = 1; mlog_entry("(0x%p, size:%llu)\n", inode, (unsigned long long)le64_to_cpu(fe->i_size)); @@ -226,6 +228,10 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, sb = inode->i_sb; osb = OCFS2_SB(sb); + if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) || + ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks()) + use_plocks = 0; + /* this means that read_inode cannot create a superblock inode * today. change if needed. */ if (!OCFS2_IS_VALID_DINODE(fe) || @@ -295,13 +301,19 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, switch (inode->i_mode & S_IFMT) { case S_IFREG: - inode->i_fop = &ocfs2_fops; + if (use_plocks) + inode->i_fop = &ocfs2_fops; + else + inode->i_fop = &ocfs2_fops_no_plocks; inode->i_op = &ocfs2_file_iops; i_size_write(inode, le64_to_cpu(fe->i_size)); break; case S_IFDIR: inode->i_op = &ocfs2_dir_iops; - inode->i_fop = &ocfs2_dops; + if (use_plocks) + inode->i_fop = &ocfs2_dops; + else + inode->i_fop = &ocfs2_dops_no_plocks; i_size_write(inode, le64_to_cpu(fe->i_size)); break; case S_IFLNK: @@ -448,8 +460,11 @@ static int ocfs2_read_locked_inode(struct inode *inode, } } - status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, - can_lock ? inode : NULL); + if (can_lock) + status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh, + OCFS2_BH_IGNORE_CACHE); + else + status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); if (status < 0) { mlog_errno(status); goto bail; @@ -522,6 +537,9 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, * data and fast symlinks. */ if (fe->i_clusters) { + if (ocfs2_should_order_data(inode)) + ocfs2_begin_ordered_truncate(inode, 0); + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); @@ -730,6 +748,13 @@ static int ocfs2_wipe_inode(struct inode *inode, goto bail_unlock_dir; } + /*Free extended attribute resources associated with this inode.*/ + status = ocfs2_xattr_remove(inode, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_dir; + } + status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, orphan_dir_bh); if (status < 0) @@ -1081,6 +1106,8 @@ void ocfs2_clear_inode(struct inode *inode) oi->ip_last_trans = 0; oi->ip_dir_start_lookup = 0; oi->ip_blkno = 0ULL; + jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal, + &oi->ip_jinode); bail: mlog_exit_void(); @@ -1107,58 +1134,6 @@ void ocfs2_drop_inode(struct inode *inode) } /* - * TODO: this should probably be merged into ocfs2_get_block - * - * However, you now need to pay attention to the cont_prepare_write() - * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much - * expects never to extend). - */ -struct buffer_head *ocfs2_bread(struct inode *inode, - int block, int *err, int reada) -{ - struct buffer_head *bh = NULL; - int tmperr; - u64 p_blkno; - int readflags = OCFS2_BH_CACHED; - - if (reada) - readflags |= OCFS2_BH_READAHEAD; - - if (((u64)block << inode->i_sb->s_blocksize_bits) >= - i_size_read(inode)) { - BUG_ON(!reada); - return NULL; - } - - down_read(&OCFS2_I(inode)->ip_alloc_sem); - tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, - NULL); - up_read(&OCFS2_I(inode)->ip_alloc_sem); - if (tmperr < 0) { - mlog_errno(tmperr); - goto fail; - } - - tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh, - readflags, inode); - if (tmperr < 0) - goto fail; - - tmperr = 0; - - *err = 0; - return bh; - -fail: - if (bh) { - brelse(bh); - bh = NULL; - } - *err = -EIO; - return NULL; -} - -/* * This is called from our getattr. */ int ocfs2_inode_revalidate(struct dentry *dentry) diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 390a85596aa0..2f37af9bcc4a 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -40,6 +40,9 @@ struct ocfs2_inode_info /* protects allocation changes on this inode. */ struct rw_semaphore ip_alloc_sem; + /* protects extended attribute changes on this inode */ + struct rw_semaphore ip_xattr_sem; + /* These fields are protected by ip_lock */ spinlock_t ip_lock; u32 ip_open_count; @@ -68,6 +71,7 @@ struct ocfs2_inode_info struct ocfs2_extent_map ip_extent_map; struct inode vfs_inode; + struct jbd2_inode ip_jinode; }; /* @@ -113,8 +117,6 @@ extern struct kmem_cache *ocfs2_inode_cache; extern const struct address_space_operations ocfs2_aops; -struct buffer_head *ocfs2_bread(struct inode *inode, int block, - int *err, int reada); void ocfs2_clear_inode(struct inode *inode); void ocfs2_delete_inode(struct inode *inode); void ocfs2_drop_inode(struct inode *inode); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 7b142f0ce995..9fcd36dcc9a0 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -102,8 +102,7 @@ bail_unlock: bail: mutex_unlock(&inode->i_mutex); - if (bh) - brelse(bh); + brelse(bh); mlog_exit(status); return status; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 9698338adc39..81e40677eecb 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -57,7 +57,7 @@ static int __ocfs2_recovery_thread(void *arg); static int ocfs2_commit_cache(struct ocfs2_super *osb); static int ocfs2_wait_on_mount(struct ocfs2_super *osb); static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, - int dirty); + int dirty, int replayed); static int ocfs2_trylock_journal(struct ocfs2_super *osb, int slot_num); static int ocfs2_recover_orphans(struct ocfs2_super *osb, @@ -215,9 +215,9 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb) goto finally; } - journal_lock_updates(journal->j_journal); - status = journal_flush(journal->j_journal); - journal_unlock_updates(journal->j_journal); + jbd2_journal_lock_updates(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal); + jbd2_journal_unlock_updates(journal->j_journal); if (status < 0) { up_write(&journal->j_trans_barrier); mlog_errno(status); @@ -264,7 +264,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) down_read(&osb->journal->j_trans_barrier); - handle = journal_start(journal, max_buffs); + handle = jbd2_journal_start(journal, max_buffs); if (IS_ERR(handle)) { up_read(&osb->journal->j_trans_barrier); @@ -290,7 +290,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb, BUG_ON(!handle); - ret = journal_stop(handle); + ret = jbd2_journal_stop(handle); if (ret < 0) mlog_errno(ret); @@ -304,7 +304,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb, * transaction. extend_trans will either extend the current handle by * nblocks, or commit it and start a new one with nblocks credits. * - * This might call journal_restart() which will commit dirty buffers + * This might call jbd2_journal_restart() which will commit dirty buffers * and then restart the transaction. Before calling * ocfs2_extend_trans(), any changed blocks should have been * dirtied. After calling it, all blocks which need to be changed must @@ -329,10 +329,10 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks) mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); -#ifdef OCFS2_DEBUG_FS +#ifdef CONFIG_OCFS2_DEBUG_FS status = 1; #else - status = journal_extend(handle, nblocks); + status = jbd2_journal_extend(handle, nblocks); if (status < 0) { mlog_errno(status); goto bail; @@ -340,8 +340,10 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks) #endif if (status > 0) { - mlog(0, "journal_extend failed, trying journal_restart\n"); - status = journal_restart(handle, nblocks); + mlog(0, + "jbd2_journal_extend failed, trying " + "jbd2_journal_restart\n"); + status = jbd2_journal_restart(handle, nblocks); if (status < 0) { mlog_errno(status); goto bail; @@ -393,11 +395,11 @@ int ocfs2_journal_access(handle_t *handle, switch (type) { case OCFS2_JOURNAL_ACCESS_CREATE: case OCFS2_JOURNAL_ACCESS_WRITE: - status = journal_get_write_access(handle, bh); + status = jbd2_journal_get_write_access(handle, bh); break; case OCFS2_JOURNAL_ACCESS_UNDO: - status = journal_get_undo_access(handle, bh); + status = jbd2_journal_get_undo_access(handle, bh); break; default: @@ -422,7 +424,7 @@ int ocfs2_journal_dirty(handle_t *handle, mlog_entry("(bh->b_blocknr=%llu)\n", (unsigned long long)bh->b_blocknr); - status = journal_dirty_metadata(handle, bh); + status = jbd2_journal_dirty_metadata(handle, bh); if (status < 0) mlog(ML_ERROR, "Could not dirty metadata buffer. " "(bh->b_blocknr=%llu)\n", @@ -432,6 +434,7 @@ int ocfs2_journal_dirty(handle_t *handle, return status; } +#ifdef CONFIG_OCFS2_COMPAT_JBD int ocfs2_journal_dirty_data(handle_t *handle, struct buffer_head *bh) { @@ -443,8 +446,9 @@ int ocfs2_journal_dirty_data(handle_t *handle, return err; } +#endif -#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD_DEFAULT_MAX_COMMIT_AGE) +#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) void ocfs2_set_journal_params(struct ocfs2_super *osb) { @@ -457,9 +461,9 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb) spin_lock(&journal->j_state_lock); journal->j_commit_interval = commit_interval; if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) - journal->j_flags |= JFS_BARRIER; + journal->j_flags |= JBD2_BARRIER; else - journal->j_flags &= ~JFS_BARRIER; + journal->j_flags &= ~JBD2_BARRIER; spin_unlock(&journal->j_state_lock); } @@ -524,14 +528,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters); /* call the kernels journal init function now */ - j_journal = journal_init_inode(inode); + j_journal = jbd2_journal_init_inode(inode); if (j_journal == NULL) { mlog(ML_ERROR, "Linux journal layer error\n"); status = -EINVAL; goto done; } - mlog(0, "Returned from journal_init_inode\n"); + mlog(0, "Returned from jbd2_journal_init_inode\n"); mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen); *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) & @@ -550,8 +554,7 @@ done: if (status < 0) { if (inode_lock) ocfs2_inode_unlock(inode, 1); - if (bh != NULL) - brelse(bh); + brelse(bh); if (inode) { OCFS2_I(inode)->ip_open_count--; iput(inode); @@ -562,8 +565,18 @@ done: return status; } +static void ocfs2_bump_recovery_generation(struct ocfs2_dinode *di) +{ + le32_add_cpu(&(di->id1.journal1.ij_recovery_generation), 1); +} + +static u32 ocfs2_get_recovery_generation(struct ocfs2_dinode *di) +{ + return le32_to_cpu(di->id1.journal1.ij_recovery_generation); +} + static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, - int dirty) + int dirty, int replayed) { int status; unsigned int flags; @@ -593,6 +606,9 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, flags &= ~OCFS2_JOURNAL_DIRTY_FL; fe->id1.journal1.ij_flags = cpu_to_le32(flags); + if (replayed) + ocfs2_bump_recovery_generation(fe); + status = ocfs2_write_block(osb, bh, journal->j_inode); if (status < 0) mlog_errno(status); @@ -626,7 +642,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) if (journal->j_state != OCFS2_JOURNAL_LOADED) goto done; - /* need to inc inode use count as journal_destroy will iput. */ + /* need to inc inode use count - jbd2_journal_destroy will iput. */ if (!igrab(inode)) BUG(); @@ -655,9 +671,9 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0); if (ocfs2_mount_local(osb)) { - journal_lock_updates(journal->j_journal); - status = journal_flush(journal->j_journal); - journal_unlock_updates(journal->j_journal); + jbd2_journal_lock_updates(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal); + jbd2_journal_unlock_updates(journal->j_journal); if (status < 0) mlog_errno(status); } @@ -667,13 +683,13 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) * Do not toggle if flush was unsuccessful otherwise * will leave dirty metadata in a "clean" journal */ - status = ocfs2_journal_toggle_dirty(osb, 0); + status = ocfs2_journal_toggle_dirty(osb, 0, 0); if (status < 0) mlog_errno(status); } /* Shutdown the kernel journal system */ - journal_destroy(journal->j_journal); + jbd2_journal_destroy(journal->j_journal); OCFS2_I(inode)->ip_open_count--; @@ -698,19 +714,19 @@ static void ocfs2_clear_journal_error(struct super_block *sb, { int olderr; - olderr = journal_errno(journal); + olderr = jbd2_journal_errno(journal); if (olderr) { mlog(ML_ERROR, "File system error %d recorded in " "journal %u.\n", olderr, slot); mlog(ML_ERROR, "File system on device %s needs checking.\n", sb->s_id); - journal_ack_err(journal); - journal_clear_err(journal); + jbd2_journal_ack_err(journal); + jbd2_journal_clear_err(journal); } } -int ocfs2_journal_load(struct ocfs2_journal *journal, int local) +int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed) { int status = 0; struct ocfs2_super *osb; @@ -721,7 +737,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local) osb = journal->j_osb; - status = journal_load(journal->j_journal); + status = jbd2_journal_load(journal->j_journal); if (status < 0) { mlog(ML_ERROR, "Failed to load journal!\n"); goto done; @@ -729,7 +745,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local) ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num); - status = ocfs2_journal_toggle_dirty(osb, 1); + status = ocfs2_journal_toggle_dirty(osb, 1, replayed); if (status < 0) { mlog_errno(status); goto done; @@ -765,13 +781,13 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full) BUG_ON(!journal); - status = journal_wipe(journal->j_journal, full); + status = jbd2_journal_wipe(journal->j_journal, full); if (status < 0) { mlog_errno(status); goto bail; } - status = ocfs2_journal_toggle_dirty(journal->j_osb, 0); + status = ocfs2_journal_toggle_dirty(journal->j_osb, 0, 0); if (status < 0) mlog_errno(status); @@ -834,9 +850,8 @@ static int ocfs2_force_read_journal(struct inode *inode) /* We are reading journal data which should not * be put in the uptodate cache */ - status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb), - p_blkno, p_blocks, bhs, 0, - NULL); + status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb), + p_blkno, p_blocks, bhs); if (status < 0) { mlog_errno(status); goto bail; @@ -852,8 +867,7 @@ static int ocfs2_force_read_journal(struct inode *inode) bail: for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++) - if (bhs[i]) - brelse(bhs[i]); + brelse(bhs[i]); mlog_exit(status); return status; } @@ -1034,6 +1048,12 @@ restart: spin_unlock(&osb->osb_lock); mlog(0, "All nodes recovered\n"); + /* Refresh all journal recovery generations from disk */ + status = ocfs2_check_journals_nolocks(osb); + status = (status == -EROFS) ? 0 : status; + if (status < 0) + mlog_errno(status); + ocfs2_super_unlock(osb, 1); /* We always run recovery on our own orphan dir - the dead @@ -1096,6 +1116,43 @@ out: mlog_exit_void(); } +static int ocfs2_read_journal_inode(struct ocfs2_super *osb, + int slot_num, + struct buffer_head **bh, + struct inode **ret_inode) +{ + int status = -EACCES; + struct inode *inode = NULL; + + BUG_ON(slot_num >= osb->max_slots); + + inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, + slot_num); + if (!inode || is_bad_inode(inode)) { + mlog_errno(status); + goto bail; + } + SET_INODE_JOURNAL(inode); + + status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh, + OCFS2_BH_IGNORE_CACHE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = 0; + +bail: + if (inode) { + if (status || !ret_inode) + iput(inode); + else + *ret_inode = inode; + } + return status; +} + /* Does the actual journal replay and marks the journal inode as * clean. Will only replay if the journal inode is marked dirty. */ static int ocfs2_replay_journal(struct ocfs2_super *osb, @@ -1109,22 +1166,36 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, struct ocfs2_dinode *fe; journal_t *journal = NULL; struct buffer_head *bh = NULL; + u32 slot_reco_gen; - inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, - slot_num); - if (inode == NULL) { - status = -EACCES; + status = ocfs2_read_journal_inode(osb, slot_num, &bh, &inode); + if (status) { mlog_errno(status); goto done; } - if (is_bad_inode(inode)) { - status = -EACCES; - iput(inode); - inode = NULL; - mlog_errno(status); + + fe = (struct ocfs2_dinode *)bh->b_data; + slot_reco_gen = ocfs2_get_recovery_generation(fe); + brelse(bh); + bh = NULL; + + /* + * As the fs recovery is asynchronous, there is a small chance that + * another node mounted (and recovered) the slot before the recovery + * thread could get the lock. To handle that, we dirty read the journal + * inode for that slot to get the recovery generation. If it is + * different than what we expected, the slot has been recovered. + * If not, it needs recovery. + */ + if (osb->slot_recovery_generations[slot_num] != slot_reco_gen) { + mlog(0, "Slot %u already recovered (old/new=%u/%u)\n", slot_num, + osb->slot_recovery_generations[slot_num], slot_reco_gen); + osb->slot_recovery_generations[slot_num] = slot_reco_gen; + status = -EBUSY; goto done; } - SET_INODE_JOURNAL(inode); + + /* Continue with recovery as the journal has not yet been recovered */ status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); if (status < 0) { @@ -1138,9 +1209,12 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, fe = (struct ocfs2_dinode *) bh->b_data; flags = le32_to_cpu(fe->id1.journal1.ij_flags); + slot_reco_gen = ocfs2_get_recovery_generation(fe); if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) { mlog(0, "No recovery required for node %d\n", node_num); + /* Refresh recovery generation for the slot */ + osb->slot_recovery_generations[slot_num] = slot_reco_gen; goto done; } @@ -1157,19 +1231,19 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, } mlog(0, "calling journal_init_inode\n"); - journal = journal_init_inode(inode); + journal = jbd2_journal_init_inode(inode); if (journal == NULL) { mlog(ML_ERROR, "Linux journal layer error\n"); status = -EIO; goto done; } - status = journal_load(journal); + status = jbd2_journal_load(journal); if (status < 0) { mlog_errno(status); if (!igrab(inode)) BUG(); - journal_destroy(journal); + jbd2_journal_destroy(journal); goto done; } @@ -1177,9 +1251,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, /* wipe the journal */ mlog(0, "flushing the journal.\n"); - journal_lock_updates(journal); - status = journal_flush(journal); - journal_unlock_updates(journal); + jbd2_journal_lock_updates(journal); + status = jbd2_journal_flush(journal); + jbd2_journal_unlock_updates(journal); if (status < 0) mlog_errno(status); @@ -1188,6 +1262,11 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, flags &= ~OCFS2_JOURNAL_DIRTY_FL; fe->id1.journal1.ij_flags = cpu_to_le32(flags); + /* Increment recovery generation to indicate successful recovery */ + ocfs2_bump_recovery_generation(fe); + osb->slot_recovery_generations[slot_num] = + ocfs2_get_recovery_generation(fe); + status = ocfs2_write_block(osb, bh, inode); if (status < 0) mlog_errno(status); @@ -1195,7 +1274,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, if (!igrab(inode)) BUG(); - journal_destroy(journal); + jbd2_journal_destroy(journal); done: /* drop the lock on this nodes journal */ @@ -1205,8 +1284,7 @@ done: if (inode) iput(inode); - if (bh) - brelse(bh); + brelse(bh); mlog_exit(status); return status; @@ -1252,6 +1330,13 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, status = ocfs2_replay_journal(osb, node_num, slot_num); if (status < 0) { + if (status == -EBUSY) { + mlog(0, "Skipping recovery for slot %u (node %u) " + "as another node has recovered it\n", slot_num, + node_num); + status = 0; + goto done; + } mlog_errno(status); goto done; } @@ -1334,21 +1419,46 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) { unsigned int node_num; int status, i; + u32 gen; + struct buffer_head *bh = NULL; + struct ocfs2_dinode *di; /* This is called with the super block cluster lock, so we * know that the slot map can't change underneath us. */ - spin_lock(&osb->osb_lock); for (i = 0; i < osb->max_slots; i++) { - if (i == osb->slot_num) + /* Read journal inode to get the recovery generation */ + status = ocfs2_read_journal_inode(osb, i, &bh, NULL); + if (status) { + mlog_errno(status); + goto bail; + } + di = (struct ocfs2_dinode *)bh->b_data; + gen = ocfs2_get_recovery_generation(di); + brelse(bh); + bh = NULL; + + spin_lock(&osb->osb_lock); + osb->slot_recovery_generations[i] = gen; + + mlog(0, "Slot %u recovery generation is %u\n", i, + osb->slot_recovery_generations[i]); + + if (i == osb->slot_num) { + spin_unlock(&osb->osb_lock); continue; + } status = ocfs2_slot_to_node_num_locked(osb, i, &node_num); - if (status == -ENOENT) + if (status == -ENOENT) { + spin_unlock(&osb->osb_lock); continue; + } - if (__ocfs2_recovery_map_test(osb, node_num)) + if (__ocfs2_recovery_map_test(osb, node_num)) { + spin_unlock(&osb->osb_lock); continue; + } spin_unlock(&osb->osb_lock); /* Ok, we have a slot occupied by another node which @@ -1364,10 +1474,7 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) mlog_errno(status); goto bail; } - - spin_lock(&osb->osb_lock); } - spin_unlock(&osb->osb_lock); status = 0; bail: @@ -1603,49 +1710,41 @@ static int ocfs2_commit_thread(void *arg) return 0; } -/* Look for a dirty journal without taking any cluster locks. Used for - * hard readonly access to determine whether the file system journals - * require recovery. */ +/* Reads all the journal inodes without taking any cluster locks. Used + * for hard readonly access to determine whether any journal requires + * recovery. Also used to refresh the recovery generation numbers after + * a journal has been recovered by another node. + */ int ocfs2_check_journals_nolocks(struct ocfs2_super *osb) { int ret = 0; unsigned int slot; - struct buffer_head *di_bh; + struct buffer_head *di_bh = NULL; struct ocfs2_dinode *di; - struct inode *journal = NULL; + int journal_dirty = 0; for(slot = 0; slot < osb->max_slots; slot++) { - journal = ocfs2_get_system_file_inode(osb, - JOURNAL_SYSTEM_INODE, - slot); - if (!journal || is_bad_inode(journal)) { - ret = -EACCES; - mlog_errno(ret); - goto out; - } - - di_bh = NULL; - ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh, - 0, journal); - if (ret < 0) { + ret = ocfs2_read_journal_inode(osb, slot, &di_bh, NULL); + if (ret) { mlog_errno(ret); goto out; } di = (struct ocfs2_dinode *) di_bh->b_data; + osb->slot_recovery_generations[slot] = + ocfs2_get_recovery_generation(di); + if (le32_to_cpu(di->id1.journal1.ij_flags) & OCFS2_JOURNAL_DIRTY_FL) - ret = -EROFS; + journal_dirty = 1; brelse(di_bh); - if (ret) - break; + di_bh = NULL; } out: - if (journal) - iput(journal); - + if (journal_dirty) + ret = -EROFS; return ret; } diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index db82be2532ed..d4d14e9a3cea 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -27,7 +27,12 @@ #define OCFS2_JOURNAL_H #include <linux/fs.h> -#include <linux/jbd.h> +#ifndef CONFIG_OCFS2_COMPAT_JBD +# include <linux/jbd2.h> +#else +# include <linux/jbd.h> +# include "ocfs2_jbd_compat.h" +#endif enum ocfs2_journal_state { OCFS2_JOURNAL_FREE = 0, @@ -161,7 +166,8 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, void ocfs2_journal_shutdown(struct ocfs2_super *osb); int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full); -int ocfs2_journal_load(struct ocfs2_journal *journal, int local); +int ocfs2_journal_load(struct ocfs2_journal *journal, int local, + int replayed); int ocfs2_check_journals_nolocks(struct ocfs2_super *osb); void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num); @@ -214,8 +220,8 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode) * buffer. Will have to call ocfs2_journal_dirty once * we've actually dirtied it. Type is one of . or . * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. - * ocfs2_journal_dirty_data - Indicate that a data buffer should go out before - * the current handle commits. + * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before + * the current handle commits. */ /* You must always start_trans with a number of buffs > 0, but it's @@ -267,8 +273,10 @@ int ocfs2_journal_access(handle_t *handle, */ int ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh); +#ifdef CONFIG_OCFS2_COMPAT_JBD int ocfs2_journal_dirty_data(handle_t *handle, struct buffer_head *bh); +#endif /* * Credit Macros: @@ -282,6 +290,9 @@ int ocfs2_journal_dirty_data(handle_t *handle, /* simple file updates like chmod, etc. */ #define OCFS2_INODE_UPDATE_CREDITS 1 +/* extended attribute block update */ +#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1 + /* group extend. inode update and last group update. */ #define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) @@ -339,11 +350,23 @@ int ocfs2_journal_dirty_data(handle_t *handle, #define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \ + OCFS2_UNLINK_CREDITS) +/* global bitmap dinode, group desc., relinked group, + * suballocator dinode, group desc., relinked group, + * dinode, xattr block */ +#define OCFS2_XATTR_BLOCK_CREATE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + \ + + OCFS2_INODE_UPDATE_CREDITS \ + + OCFS2_XATTR_BLOCK_UPDATE_CREDITS) + +/* + * Please note that the caller must make sure that root_el is the root + * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise + * the result may be wrong. + */ static inline int ocfs2_calc_extend_credits(struct super_block *sb, - struct ocfs2_dinode *fe, + struct ocfs2_extent_list *root_el, u32 bits_wanted) { - int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks; + int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks; /* bitmap dinode, group desc. + relinked group. */ bitmap_blocks = OCFS2_SUBALLOC_ALLOC; @@ -354,16 +377,16 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb, * however many metadata chunks needed * a remaining suballoc * alloc. */ sysfile_bitmap_blocks = 1 + - (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe); + (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(root_el); /* this does not include *new* metadata blocks, which are - * accounted for in sysfile_bitmap_blocks. fe + + * accounted for in sysfile_bitmap_blocks. root_el + * prev. last_eb_blk + blocks along edge of tree. * calc_symlink_credits passes because we just need 1 * credit for the dinode there. */ - dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth); + extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth); - return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks; + return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks; } static inline int ocfs2_calc_symlink_credits(struct super_block *sb) @@ -414,4 +437,16 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, return credits; } +static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) +{ + return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode); +} + +static inline int ocfs2_begin_ordered_truncate(struct inode *inode, + loff_t new_size) +{ + return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode, + new_size); +} + #endif /* OCFS2_JOURNAL_H */ diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index be774bdc8b36..687b28713c32 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -28,6 +28,7 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/bitops.h> +#include <linux/debugfs.h> #define MLOG_MASK_PREFIX ML_DISK_ALLOC #include <cluster/masklog.h> @@ -47,8 +48,6 @@ #define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab)) -static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb); - static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc); static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, @@ -75,24 +74,129 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, struct inode *local_alloc_inode); -static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb) +#ifdef CONFIG_OCFS2_FS_STATS + +static int ocfs2_la_debug_open(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return 0; +} + +#define LA_DEBUG_BUF_SZ PAGE_CACHE_SIZE +#define LA_DEBUG_VER 1 +static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + static DEFINE_MUTEX(la_debug_mutex); + struct ocfs2_super *osb = file->private_data; + int written, ret; + char *buf = osb->local_alloc_debug_buf; + + mutex_lock(&la_debug_mutex); + memset(buf, 0, LA_DEBUG_BUF_SZ); + + written = snprintf(buf, LA_DEBUG_BUF_SZ, + "0x%x\t0x%llx\t%u\t%u\t0x%x\n", + LA_DEBUG_VER, + (unsigned long long)osb->la_last_gd, + osb->local_alloc_default_bits, + osb->local_alloc_bits, osb->local_alloc_state); + + ret = simple_read_from_buffer(userbuf, count, ppos, buf, written); + + mutex_unlock(&la_debug_mutex); + return ret; +} + +static const struct file_operations ocfs2_la_debug_fops = { + .open = ocfs2_la_debug_open, + .read = ocfs2_la_debug_read, +}; + +static void ocfs2_init_la_debug(struct ocfs2_super *osb) +{ + osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS); + if (!osb->local_alloc_debug_buf) + return; + + osb->local_alloc_debug = debugfs_create_file("local_alloc_stats", + S_IFREG|S_IRUSR, + osb->osb_debug_root, + osb, + &ocfs2_la_debug_fops); + if (!osb->local_alloc_debug) { + kfree(osb->local_alloc_debug_buf); + osb->local_alloc_debug_buf = NULL; + } +} + +static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb) +{ + if (osb->local_alloc_debug) + debugfs_remove(osb->local_alloc_debug); + + if (osb->local_alloc_debug_buf) + kfree(osb->local_alloc_debug_buf); + + osb->local_alloc_debug_buf = NULL; + osb->local_alloc_debug = NULL; +} +#else /* CONFIG_OCFS2_FS_STATS */ +static void ocfs2_init_la_debug(struct ocfs2_super *osb) +{ + return; +} +static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb) +{ + return; +} +#endif + +static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb) { - BUG_ON(osb->s_clustersize_bits > 20); + return (osb->local_alloc_state == OCFS2_LA_THROTTLED || + osb->local_alloc_state == OCFS2_LA_ENABLED); +} - /* Size local alloc windows by the megabyte */ - return osb->local_alloc_size << (20 - osb->s_clustersize_bits); +void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, + unsigned int num_clusters) +{ + spin_lock(&osb->osb_lock); + if (osb->local_alloc_state == OCFS2_LA_DISABLED || + osb->local_alloc_state == OCFS2_LA_THROTTLED) + if (num_clusters >= osb->local_alloc_default_bits) { + cancel_delayed_work(&osb->la_enable_wq); + osb->local_alloc_state = OCFS2_LA_ENABLED; + } + spin_unlock(&osb->osb_lock); +} + +void ocfs2_la_enable_worker(struct work_struct *work) +{ + struct ocfs2_super *osb = + container_of(work, struct ocfs2_super, + la_enable_wq.work); + spin_lock(&osb->osb_lock); + osb->local_alloc_state = OCFS2_LA_ENABLED; + spin_unlock(&osb->osb_lock); } /* * Tell us whether a given allocation should use the local alloc * file. Otherwise, it has to go to the main bitmap. + * + * This function does semi-dirty reads of local alloc size and state! + * This is ok however, as the values are re-checked once under mutex. */ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) { - int la_bits = ocfs2_local_alloc_window_bits(osb); int ret = 0; + int la_bits; + + spin_lock(&osb->osb_lock); + la_bits = osb->local_alloc_bits; - if (osb->local_alloc_state != OCFS2_LA_ENABLED) + if (!ocfs2_la_state_enabled(osb)) goto bail; /* la_bits should be at least twice the size (in clusters) of @@ -106,6 +210,7 @@ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) bail: mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n", osb->local_alloc_state, (unsigned long long)bits, la_bits, ret); + spin_unlock(&osb->osb_lock); return ret; } @@ -120,14 +225,18 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) mlog_entry_void(); - if (osb->local_alloc_size == 0) + ocfs2_init_la_debug(osb); + + if (osb->local_alloc_bits == 0) goto bail; - if (ocfs2_local_alloc_window_bits(osb) >= osb->bitmap_cpg) { + if (osb->local_alloc_bits >= osb->bitmap_cpg) { mlog(ML_NOTICE, "Requested local alloc window %d is larger " "than max possible %u. Using defaults.\n", - ocfs2_local_alloc_window_bits(osb), (osb->bitmap_cpg - 1)); - osb->local_alloc_size = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; + osb->local_alloc_bits, (osb->bitmap_cpg - 1)); + osb->local_alloc_bits = + ocfs2_megabytes_to_clusters(osb->sb, + OCFS2_DEFAULT_LOCAL_ALLOC_SIZE); } /* read the alloc off disk */ @@ -139,8 +248,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) goto bail; } - status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, - &alloc_bh, 0, inode); + status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, + &alloc_bh, OCFS2_BH_IGNORE_CACHE); if (status < 0) { mlog_errno(status); goto bail; @@ -185,13 +294,14 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) bail: if (status < 0) - if (alloc_bh) - brelse(alloc_bh); + brelse(alloc_bh); if (inode) iput(inode); - mlog(0, "Local alloc window bits = %d\n", - ocfs2_local_alloc_window_bits(osb)); + if (status < 0) + ocfs2_shutdown_la_debug(osb); + + mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits); mlog_exit(status); return status; @@ -217,6 +327,11 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) mlog_entry_void(); + cancel_delayed_work(&osb->la_enable_wq); + flush_workqueue(ocfs2_wq); + + ocfs2_shutdown_la_debug(osb); + if (osb->local_alloc_state == OCFS2_LA_UNUSED) goto out; @@ -295,8 +410,7 @@ out_commit: ocfs2_commit_trans(osb, handle); out_unlock: - if (main_bm_bh) - brelse(main_bm_bh); + brelse(main_bm_bh); ocfs2_inode_unlock(main_bm_inode, 1); @@ -345,8 +459,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, mutex_lock(&inode->i_mutex); - status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, - &alloc_bh, 0, inode); + status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, + &alloc_bh, OCFS2_BH_IGNORE_CACHE); if (status < 0) { mlog_errno(status); goto bail; @@ -372,8 +486,7 @@ bail: *alloc_copy = NULL; } - if (alloc_bh) - brelse(alloc_bh); + brelse(alloc_bh); if (inode) { mutex_unlock(&inode->i_mutex); @@ -441,8 +554,7 @@ out_unlock: out_mutex: mutex_unlock(&main_bm_inode->i_mutex); - if (main_bm_bh) - brelse(main_bm_bh); + brelse(main_bm_bh); iput(main_bm_inode); @@ -453,8 +565,48 @@ out: return status; } +/* Check to see if the local alloc window is within ac->ac_max_block */ +static int ocfs2_local_alloc_in_range(struct inode *inode, + struct ocfs2_alloc_context *ac, + u32 bits_wanted) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *alloc; + struct ocfs2_local_alloc *la; + int start; + u64 block_off; + + if (!ac->ac_max_block) + return 1; + + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted); + if (start == -1) { + mlog_errno(-ENOSPC); + return 0; + } + + /* + * Converting (bm_off + start + bits_wanted) to blocks gives us + * the blkno just past our actual allocation. This is perfect + * to compare with ac_max_block. + */ + block_off = ocfs2_clusters_to_blocks(inode->i_sb, + le32_to_cpu(la->la_bm_off) + + start + bits_wanted); + mlog(0, "Checking %llu against %llu\n", + (unsigned long long)block_off, + (unsigned long long)ac->ac_max_block); + if (block_off > ac->ac_max_block) + return 0; + + return 1; +} + /* - * make sure we've got at least bitswanted contiguous bits in the + * make sure we've got at least bits_wanted contiguous bits in the * local alloc. You lose them when you drop i_mutex. * * We will add ourselves to the transaction passed in, but may start @@ -485,20 +637,22 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, mutex_lock(&local_alloc_inode->i_mutex); - if (osb->local_alloc_state != OCFS2_LA_ENABLED) { - status = -ENOSPC; - goto bail; - } - - if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) { - mlog(0, "Asking for more than my max window size!\n"); + /* + * We must double check state and allocator bits because + * another process may have changed them while holding i_mutex. + */ + spin_lock(&osb->osb_lock); + if (!ocfs2_la_state_enabled(osb) || + (bits_wanted > osb->local_alloc_bits)) { + spin_unlock(&osb->osb_lock); status = -ENOSPC; goto bail; } + spin_unlock(&osb->osb_lock); alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; -#ifdef OCFS2_DEBUG_FS +#ifdef CONFIG_OCFS2_DEBUG_FS if (le32_to_cpu(alloc->id1.bitmap1.i_used) != ocfs2_local_alloc_count_bits(alloc)) { ocfs2_error(osb->sb, "local alloc inode %llu says it has " @@ -522,6 +676,36 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, mlog_errno(status); goto bail; } + + /* + * Under certain conditions, the window slide code + * might have reduced the number of bits available or + * disabled the the local alloc entirely. Re-check + * here and return -ENOSPC if necessary. + */ + status = -ENOSPC; + if (!ocfs2_la_state_enabled(osb)) + goto bail; + + free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) - + le32_to_cpu(alloc->id1.bitmap1.i_used); + if (bits_wanted > free_bits) + goto bail; + } + + if (ac->ac_max_block) + mlog(0, "Calling in_range for max block %llu\n", + (unsigned long long)ac->ac_max_block); + + if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac, + bits_wanted)) { + /* + * The window is outside ac->ac_max_block. + * This errno tells the caller to keep localalloc enabled + * but to get the allocation from the main bitmap. + */ + status = -EFBIG; + goto bail; } ac->ac_inode = local_alloc_inode; @@ -789,6 +973,85 @@ bail: return status; } +enum ocfs2_la_event { + OCFS2_LA_EVENT_SLIDE, /* Normal window slide. */ + OCFS2_LA_EVENT_FRAGMENTED, /* The global bitmap has + * enough bits theoretically + * free, but a contiguous + * allocation could not be + * found. */ + OCFS2_LA_EVENT_ENOSPC, /* Global bitmap doesn't have + * enough bits free to satisfy + * our request. */ +}; +#define OCFS2_LA_ENABLE_INTERVAL (30 * HZ) +/* + * Given an event, calculate the size of our next local alloc window. + * + * This should always be called under i_mutex of the local alloc inode + * so that local alloc disabling doesn't race with processes trying to + * use the allocator. + * + * Returns the state which the local alloc was left in. This value can + * be ignored by some paths. + */ +static int ocfs2_recalc_la_window(struct ocfs2_super *osb, + enum ocfs2_la_event event) +{ + unsigned int bits; + int state; + + spin_lock(&osb->osb_lock); + if (osb->local_alloc_state == OCFS2_LA_DISABLED) { + WARN_ON_ONCE(osb->local_alloc_state == OCFS2_LA_DISABLED); + goto out_unlock; + } + + /* + * ENOSPC and fragmentation are treated similarly for now. + */ + if (event == OCFS2_LA_EVENT_ENOSPC || + event == OCFS2_LA_EVENT_FRAGMENTED) { + /* + * We ran out of contiguous space in the primary + * bitmap. Drastically reduce the number of bits used + * by local alloc until we have to disable it. + */ + bits = osb->local_alloc_bits >> 1; + if (bits > ocfs2_megabytes_to_clusters(osb->sb, 1)) { + /* + * By setting state to THROTTLED, we'll keep + * the number of local alloc bits used down + * until an event occurs which would give us + * reason to assume the bitmap situation might + * have changed. + */ + osb->local_alloc_state = OCFS2_LA_THROTTLED; + osb->local_alloc_bits = bits; + } else { + osb->local_alloc_state = OCFS2_LA_DISABLED; + } + queue_delayed_work(ocfs2_wq, &osb->la_enable_wq, + OCFS2_LA_ENABLE_INTERVAL); + goto out_unlock; + } + + /* + * Don't increase the size of the local alloc window until we + * know we might be able to fulfill the request. Otherwise, we + * risk bouncing around the global bitmap during periods of + * low space. + */ + if (osb->local_alloc_state != OCFS2_LA_THROTTLED) + osb->local_alloc_bits = osb->local_alloc_default_bits; + +out_unlock: + state = osb->local_alloc_state; + spin_unlock(&osb->osb_lock); + + return state; +} + static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, struct ocfs2_alloc_context **ac, struct inode **bitmap_inode, @@ -803,12 +1066,21 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, goto bail; } - (*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb); +retry_enospc: + (*ac)->ac_bits_wanted = osb->local_alloc_bits; status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); + if (status == -ENOSPC) { + if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) == + OCFS2_LA_DISABLED) + goto bail; + + ocfs2_free_ac_resource(*ac); + memset(*ac, 0, sizeof(struct ocfs2_alloc_context)); + goto retry_enospc; + } if (status < 0) { - if (status != -ENOSPC) - mlog_errno(status); + mlog_errno(status); goto bail; } @@ -849,7 +1121,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, "one\n"); mlog(0, "Allocating %u clusters for a new window.\n", - ocfs2_local_alloc_window_bits(osb)); + osb->local_alloc_bits); /* Instruct the allocation code to try the most recently used * cluster group. We'll re-record the group used this pass @@ -859,9 +1131,36 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, /* we used the generic suballoc reserve function, but we set * everything up nicely, so there's no reason why we can't use * the more specific cluster api to claim bits. */ - status = ocfs2_claim_clusters(osb, handle, ac, - ocfs2_local_alloc_window_bits(osb), + status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits, &cluster_off, &cluster_count); + if (status == -ENOSPC) { +retry_enospc: + /* + * Note: We could also try syncing the journal here to + * allow use of any free bits which the current + * transaction can't give us access to. --Mark + */ + if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_FRAGMENTED) == + OCFS2_LA_DISABLED) + goto bail; + + status = ocfs2_claim_clusters(osb, handle, ac, + osb->local_alloc_bits, + &cluster_off, + &cluster_count); + if (status == -ENOSPC) + goto retry_enospc; + /* + * We only shrunk the *minimum* number of in our + * request - it's entirely possible that the allocator + * might give us more than we asked for. + */ + if (status == 0) { + spin_lock(&osb->osb_lock); + osb->local_alloc_bits = cluster_count; + spin_unlock(&osb->osb_lock); + } + } if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -905,6 +1204,8 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, mlog_entry_void(); + ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE); + /* This will lock the main bitmap for us. */ status = ocfs2_local_alloc_reserve_for_window(osb, &ac, @@ -976,8 +1277,7 @@ bail: if (handle) ocfs2_commit_trans(osb, handle); - if (main_bm_bh) - brelse(main_bm_bh); + brelse(main_bm_bh); if (main_bm_inode) iput(main_bm_inode); diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h index 3f76631e110c..ac5ea9f86653 100644 --- a/fs/ocfs2/localalloc.h +++ b/fs/ocfs2/localalloc.h @@ -52,4 +52,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, u32 *bit_off, u32 *num_bits); +void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, + unsigned int num_clusters); +void ocfs2_la_enable_worker(struct work_struct *work); + #endif /* OCFS2_LOCALALLOC_H */ diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index 203f87143877..544ac6245175 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -24,6 +24,7 @@ */ #include <linux/fs.h> +#include <linux/fcntl.h> #define MLOG_MASK_PREFIX ML_INODE #include <cluster/masklog.h> @@ -32,6 +33,7 @@ #include "dlmglue.h" #include "file.h" +#include "inode.h" #include "locks.h" static int ocfs2_do_flock(struct file *file, struct inode *inode, @@ -123,3 +125,16 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl) else return ocfs2_do_flock(file, inode, cmd, fl); } + +int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct inode *inode = file->f_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (!(fl->fl_flags & FL_POSIX)) + return -ENOLCK; + if (__mandatory_lock(inode)) + return -ENOLCK; + + return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl); +} diff --git a/fs/ocfs2/locks.h b/fs/ocfs2/locks.h index 9743ef2324ec..496d488b271f 100644 --- a/fs/ocfs2/locks.h +++ b/fs/ocfs2/locks.h @@ -27,5 +27,6 @@ #define OCFS2_LOCKS_H int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl); +int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl); #endif /* OCFS2_LOCKS_H */ diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index d5d808fe0140..485a6aa0ad39 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -60,6 +60,7 @@ #include "symlink.h" #include "sysfile.h" #include "uptodate.h" +#include "xattr.h" #include "buffer_head_io.h" @@ -327,14 +328,9 @@ leave: if (status == -ENOSPC) mlog(0, "Disk is full\n"); - if (new_fe_bh) - brelse(new_fe_bh); - - if (de_bh) - brelse(de_bh); - - if (parent_fe_bh) - brelse(parent_fe_bh); + brelse(new_fe_bh); + brelse(de_bh); + brelse(parent_fe_bh); if ((status < 0) && inode) iput(inode); @@ -647,12 +643,9 @@ out_unlock_inode: out: ocfs2_inode_unlock(dir, 1); - if (de_bh) - brelse(de_bh); - if (fe_bh) - brelse(fe_bh); - if (parent_fe_bh) - brelse(parent_fe_bh); + brelse(de_bh); + brelse(fe_bh); + brelse(parent_fe_bh); mlog_exit(err); @@ -851,17 +844,10 @@ leave: iput(orphan_dir); } - if (fe_bh) - brelse(fe_bh); - - if (dirent_bh) - brelse(dirent_bh); - - if (parent_node_bh) - brelse(parent_node_bh); - - if (orphan_entry_bh) - brelse(orphan_entry_bh); + brelse(fe_bh); + brelse(dirent_bh); + brelse(parent_node_bh); + brelse(orphan_entry_bh); mlog_exit(status); @@ -1372,24 +1358,15 @@ bail: if (new_inode) iput(new_inode); - if (newfe_bh) - brelse(newfe_bh); - if (old_inode_bh) - brelse(old_inode_bh); - if (old_dir_bh) - brelse(old_dir_bh); - if (new_dir_bh) - brelse(new_dir_bh); - if (new_de_bh) - brelse(new_de_bh); - if (old_de_bh) - brelse(old_de_bh); - if (old_inode_de_bh) - brelse(old_inode_de_bh); - if (orphan_entry_bh) - brelse(orphan_entry_bh); - if (insert_entry_bh) - brelse(insert_entry_bh); + brelse(newfe_bh); + brelse(old_inode_bh); + brelse(old_dir_bh); + brelse(new_dir_bh); + brelse(new_de_bh); + brelse(old_de_bh); + brelse(old_inode_de_bh); + brelse(orphan_entry_bh); + brelse(insert_entry_bh); mlog_exit(status); @@ -1492,8 +1469,7 @@ bail: if (bhs) { for(i = 0; i < blocks; i++) - if (bhs[i]) - brelse(bhs[i]); + brelse(bhs[i]); kfree(bhs); } @@ -1598,10 +1574,10 @@ static int ocfs2_symlink(struct inode *dir, u32 offset = 0; inode->i_op = &ocfs2_symlink_inode_operations; - status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 0, - new_fe_bh, - handle, data_ac, NULL, - NULL); + status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, + new_fe_bh, + handle, data_ac, NULL, + NULL); if (status < 0) { if (status != -ENOSPC && status != -EINTR) { mlog(ML_ERROR, @@ -1659,12 +1635,9 @@ bail: ocfs2_inode_unlock(dir, 1); - if (new_fe_bh) - brelse(new_fe_bh); - if (parent_fe_bh) - brelse(parent_fe_bh); - if (de_bh) - brelse(de_bh); + brelse(new_fe_bh); + brelse(parent_fe_bh); + brelse(de_bh); if (inode_ac) ocfs2_free_alloc_context(inode_ac); if (data_ac) @@ -1759,8 +1732,7 @@ leave: iput(orphan_dir_inode); } - if (orphan_dir_bh) - brelse(orphan_dir_bh); + brelse(orphan_dir_bh); mlog_exit(status); return status; @@ -1780,10 +1752,9 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); - status = ocfs2_read_block(osb, + status = ocfs2_read_block(orphan_dir_inode, OCFS2_I(orphan_dir_inode)->ip_blkno, - &orphan_dir_bh, OCFS2_BH_CACHED, - orphan_dir_inode); + &orphan_dir_bh); if (status < 0) { mlog_errno(status); goto leave; @@ -1829,8 +1800,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); leave: - if (orphan_dir_bh) - brelse(orphan_dir_bh); + brelse(orphan_dir_bh); mlog_exit(status); return status; @@ -1898,8 +1868,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, } leave: - if (target_de_bh) - brelse(target_de_bh); + brelse(target_de_bh); mlog_exit(status); return status; @@ -1918,4 +1887,8 @@ const struct inode_operations ocfs2_dir_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, .permission = ocfs2_permission, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, }; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 31692379c170..a21a465490c4 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -34,7 +34,12 @@ #include <linux/workqueue.h> #include <linux/kref.h> #include <linux/mutex.h> -#include <linux/jbd.h> +#ifndef CONFIG_OCFS2_COMPAT_JBD +# include <linux/jbd2.h> +#else +# include <linux/jbd.h> +# include "ocfs2_jbd_compat.h" +#endif /* For union ocfs2_dlm_lksb */ #include "stackglue.h" @@ -132,6 +137,18 @@ struct ocfs2_lock_res { wait_queue_head_t l_event; struct list_head l_debug_list; + +#ifdef CONFIG_OCFS2_FS_STATS + unsigned long long l_lock_num_prmode; /* PR acquires */ + unsigned long long l_lock_num_exmode; /* EX acquires */ + unsigned int l_lock_num_prmode_failed; /* Failed PR gets */ + unsigned int l_lock_num_exmode_failed; /* Failed EX gets */ + unsigned long long l_lock_total_prmode; /* Tot wait for PR */ + unsigned long long l_lock_total_exmode; /* Tot wait for EX */ + unsigned int l_lock_max_prmode; /* Max wait for PR */ + unsigned int l_lock_max_exmode; /* Max wait for EX */ + unsigned int l_lock_refresh; /* Disk refreshes */ +#endif }; struct ocfs2_dlm_debug { @@ -159,9 +176,13 @@ struct ocfs2_alloc_stats enum ocfs2_local_alloc_state { - OCFS2_LA_UNUSED = 0, - OCFS2_LA_ENABLED, - OCFS2_LA_DISABLED + OCFS2_LA_UNUSED = 0, /* Local alloc will never be used for + * this mountpoint. */ + OCFS2_LA_ENABLED, /* Local alloc is in use. */ + OCFS2_LA_THROTTLED, /* Local alloc is in use, but number + * of bits has been reduced. */ + OCFS2_LA_DISABLED /* Local alloc has temporarily been + * disabled. */ }; enum ocfs2_mount_options @@ -172,6 +193,8 @@ enum ocfs2_mount_options OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ + OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ + OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ }; #define OCFS2_OSB_SOFT_RO 0x0001 @@ -192,6 +215,8 @@ struct ocfs2_super struct ocfs2_slot_info *slot_info; + u32 *slot_recovery_generations; + spinlock_t node_map_lock; u64 root_blkno; @@ -200,6 +225,7 @@ struct ocfs2_super u32 bitmap_cpg; u8 *uuid; char *uuid_str; + u32 uuid_hash; u8 *vol_label; u64 first_cluster_group_blkno; u32 fs_generation; @@ -227,6 +253,7 @@ struct ocfs2_super int s_sectsize_bits; int s_clustersize; int s_clustersize_bits; + unsigned int s_xattr_inline_size; atomic_t vol_state; struct mutex recovery_lock; @@ -238,11 +265,27 @@ struct ocfs2_super struct ocfs2_journal *journal; unsigned long osb_commit_interval; - int local_alloc_size; - enum ocfs2_local_alloc_state local_alloc_state; + struct delayed_work la_enable_wq; + + /* + * Must hold local alloc i_mutex and osb->osb_lock to change + * local_alloc_bits. Reads can be done under either lock. + */ + unsigned int local_alloc_bits; + unsigned int local_alloc_default_bits; + + enum ocfs2_local_alloc_state local_alloc_state; /* protected + * by osb_lock */ + struct buffer_head *local_alloc_bh; + u64 la_last_gd; +#ifdef CONFIG_OCFS2_FS_STATS + struct dentry *local_alloc_debug; + char *local_alloc_debug_buf; +#endif + /* Next two fields are for local node slot recovery during * mount. */ int dirty; @@ -326,6 +369,13 @@ static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb) return 0; } +static inline int ocfs2_supports_xattr(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR) + return 1; + return 0; +} + /* set / clear functions because cluster events can make these happen * in parallel so we want the transitions to be atomic. this also * means that any future flags osb_flags must be protected by spinlock @@ -540,6 +590,14 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb) return pages_per_cluster; } +static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb, + unsigned int megs) +{ + BUILD_BUG_ON(OCFS2_MAX_CLUSTERSIZE > 1048576); + + return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); +} + static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb) { spin_lock(&osb->osb_lock); diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 52c426665154..f24ce3d3f956 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -64,6 +64,7 @@ #define OCFS2_INODE_SIGNATURE "INODE01" #define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01" #define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" +#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" /* Compatibility flags */ #define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ @@ -90,7 +91,8 @@ | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \ | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ - | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK) + | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ + | OCFS2_FEATURE_INCOMPAT_XATTR) #define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN /* @@ -127,10 +129,6 @@ /* Support for data packed into inode blocks */ #define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040 -/* Support for the extended slot map */ -#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100 - - /* * Support for alternate, userspace cluster stacks. If set, the superblock * field s_cluster_info contains a tag for the alternate stack in use as @@ -142,6 +140,12 @@ */ #define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080 +/* Support for the extended slot map */ +#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100 + +/* Support for extended attributes */ +#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 + /* * backup superblock flag is used to indicate that this volume * has backup superblocks. @@ -299,6 +303,12 @@ struct ocfs2_new_group_input { */ #define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8 +/* + * Inline extended attribute size (in bytes) + * The value chosen should be aligned to 16 byte boundaries. + */ +#define OCFS2_MIN_XATTR_INLINE_SIZE 256 + struct ocfs2_system_inode_info { char *si_name; int si_iflags; @@ -563,7 +573,7 @@ struct ocfs2_super_block { /*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts before tunefs required */ __le16 s_tunefs_flag; - __le32 s_reserved1; + __le32 s_uuid_hash; /* hash value of uuid */ __le64 s_first_cluster_group; /* Block offset of 1st cluster * group header */ /*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ @@ -571,7 +581,11 @@ struct ocfs2_super_block { /*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace stack. Only valid with INCOMPAT flag. */ -/*B8*/ __le64 s_reserved2[17]; /* Fill out superblock */ +/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size + for this fs*/ + __le16 s_reserved0; + __le32 s_reserved1; +/*C0*/ __le64 s_reserved2[16]; /* Fill out superblock */ /*140*/ /* @@ -621,7 +635,8 @@ struct ocfs2_dinode { belongs to */ __le16 i_suballoc_bit; /* Bit offset in suballocator block group */ -/*10*/ __le32 i_reserved0; +/*10*/ __le16 i_reserved0; + __le16 i_xattr_inline_size; __le32 i_clusters; /* Cluster count */ __le32 i_uid; /* Owner UID */ __le32 i_gid; /* Owning GID */ @@ -640,11 +655,12 @@ struct ocfs2_dinode { __le32 i_atime_nsec; __le32 i_ctime_nsec; __le32 i_mtime_nsec; - __le32 i_attr; +/*70*/ __le32 i_attr; __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL was set in i_flags */ __le16 i_dyn_features; -/*70*/ __le64 i_reserved2[8]; + __le64 i_xattr_loc; +/*80*/ __le64 i_reserved2[7]; /*B8*/ union { __le64 i_pad1; /* Generic way to refer to this 64bit union */ @@ -660,7 +676,10 @@ struct ocfs2_dinode { struct { /* Info for journal system inodes */ __le32 ij_flags; /* Mounted, version, etc. */ - __le32 ij_pad; + __le32 ij_recovery_generation; /* Incremented when the + journal is recovered + after an unclean + shutdown */ } journal1; } id1; /* Inode type dependant 1 */ /*C0*/ union { @@ -712,6 +731,136 @@ struct ocfs2_group_desc /*40*/ __u8 bg_bitmap[0]; }; +/* + * On disk extended attribute structure for OCFS2. + */ + +/* + * ocfs2_xattr_entry indicates one extend attribute. + * + * Note that it can be stored in inode, one block or one xattr bucket. + */ +struct ocfs2_xattr_entry { + __le32 xe_name_hash; /* hash value of xattr prefix+suffix. */ + __le16 xe_name_offset; /* byte offset from the 1st etnry in the local + local xattr storage(inode, xattr block or + xattr bucket). */ + __u8 xe_name_len; /* xattr name len, does't include prefix. */ + __u8 xe_type; /* the low 7 bits indicates the name prefix's + * type and the highest 1 bits indicate whether + * the EA is stored in the local storage. */ + __le64 xe_value_size; /* real xattr value length. */ +}; + +/* + * On disk structure for xattr header. + * + * One ocfs2_xattr_header describes how many ocfs2_xattr_entry records in + * the local xattr storage. + */ +struct ocfs2_xattr_header { + __le16 xh_count; /* contains the count of how + many records are in the + local xattr storage. */ + __le16 xh_free_start; /* current offset for storing + xattr. */ + __le16 xh_name_value_len; /* total length of name/value + length in this bucket. */ + __le16 xh_num_buckets; /* bucket nums in one extent + record, only valid in the + first bucket. */ + __le64 xh_csum; + struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */ +}; + +/* + * On disk structure for xattr value root. + * + * It is used when one extended attribute's size is larger, and we will save it + * in an outside cluster. It will stored in a b-tree like file content. + */ +struct ocfs2_xattr_value_root { +/*00*/ __le32 xr_clusters; /* clusters covered by xattr value. */ + __le32 xr_reserved0; + __le64 xr_last_eb_blk; /* Pointer to last extent block */ +/*10*/ struct ocfs2_extent_list xr_list; /* Extent record list */ +}; + +/* + * On disk structure for xattr tree root. + * + * It is used when there are too many extended attributes for one file. These + * attributes will be organized and stored in an indexed-btree. + */ +struct ocfs2_xattr_tree_root { +/*00*/ __le32 xt_clusters; /* clusters covered by xattr. */ + __le32 xt_reserved0; + __le64 xt_last_eb_blk; /* Pointer to last extent block */ +/*10*/ struct ocfs2_extent_list xt_list; /* Extent record list */ +}; + +#define OCFS2_XATTR_INDEXED 0x1 +#define OCFS2_HASH_SHIFT 5 +#define OCFS2_XATTR_ROUND 3 +#define OCFS2_XATTR_SIZE(size) (((size) + OCFS2_XATTR_ROUND) & \ + ~(OCFS2_XATTR_ROUND)) + +#define OCFS2_XATTR_BUCKET_SIZE 4096 +#define OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET (OCFS2_XATTR_BUCKET_SIZE \ + / OCFS2_MIN_BLOCKSIZE) + +/* + * On disk structure for xattr block. + */ +struct ocfs2_xattr_block { +/*00*/ __u8 xb_signature[8]; /* Signature for verification */ + __le16 xb_suballoc_slot; /* Slot suballocator this + block belongs to. */ + __le16 xb_suballoc_bit; /* Bit offset in suballocator + block group */ + __le32 xb_fs_generation; /* Must match super block */ +/*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */ + __le64 xb_csum; +/*20*/ __le16 xb_flags; /* Indicates whether this block contains + real xattr or a xattr tree. */ + __le16 xb_reserved0; + __le32 xb_reserved1; + __le64 xb_reserved2; +/*30*/ union { + struct ocfs2_xattr_header xb_header; /* xattr header if this + block contains xattr */ + struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this + block cotains xattr + tree. */ + } xb_attrs; +}; + +#define OCFS2_XATTR_ENTRY_LOCAL 0x80 +#define OCFS2_XATTR_TYPE_MASK 0x7F +static inline void ocfs2_xattr_set_local(struct ocfs2_xattr_entry *xe, + int local) +{ + if (local) + xe->xe_type |= OCFS2_XATTR_ENTRY_LOCAL; + else + xe->xe_type &= ~OCFS2_XATTR_ENTRY_LOCAL; +} + +static inline int ocfs2_xattr_is_local(struct ocfs2_xattr_entry *xe) +{ + return xe->xe_type & OCFS2_XATTR_ENTRY_LOCAL; +} + +static inline void ocfs2_xattr_set_type(struct ocfs2_xattr_entry *xe, int type) +{ + xe->xe_type |= type & OCFS2_XATTR_TYPE_MASK; +} + +static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe) +{ + return xe->xe_type & OCFS2_XATTR_TYPE_MASK; +} + #ifdef __KERNEL__ static inline int ocfs2_fast_symlink_chars(struct super_block *sb) { @@ -725,6 +874,20 @@ static inline int ocfs2_max_inline_data(struct super_block *sb) offsetof(struct ocfs2_dinode, id2.i_data.id_data); } +static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb, + struct ocfs2_dinode *di) +{ + unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); + + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) + return sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_data.id_data) - + xattrsize; + else + return sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_data.id_data); +} + static inline int ocfs2_extent_recs_per_inode(struct super_block *sb) { int size; @@ -735,6 +898,24 @@ static inline int ocfs2_extent_recs_per_inode(struct super_block *sb) return size / sizeof(struct ocfs2_extent_rec); } +static inline int ocfs2_extent_recs_per_inode_with_xattr( + struct super_block *sb, + struct ocfs2_dinode *di) +{ + int size; + unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); + + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_list.l_recs) - + xattrsize; + else + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) { int size; @@ -798,6 +979,17 @@ static inline u64 ocfs2_backup_super_blkno(struct super_block *sb, int index) return 0; } + +static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_root.xt_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} #else static inline int ocfs2_fast_symlink_chars(int blocksize) { @@ -881,6 +1073,17 @@ static inline uint64_t ocfs2_backup_super_blkno(int blocksize, int index) return 0; } + +static inline int ocfs2_xattr_recs_per_xb(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_root.xt_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} #endif /* __KERNEL__ */ @@ -901,7 +1104,7 @@ static inline int ocfs2_sprintf_system_inode_name(char *buf, int len, * list has a copy per slot. */ if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE) - chars = snprintf(buf, len, + chars = snprintf(buf, len, "%s", ocfs2_system_inodes[type].si_name); else chars = snprintf(buf, len, diff --git a/fs/ocfs2/ocfs2_jbd_compat.h b/fs/ocfs2/ocfs2_jbd_compat.h new file mode 100644 index 000000000000..b91c78f8f558 --- /dev/null +++ b/fs/ocfs2/ocfs2_jbd_compat.h @@ -0,0 +1,82 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_jbd_compat.h + * + * Compatibility defines for JBD. + * + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_JBD_COMPAT_H +#define OCFS2_JBD_COMPAT_H + +#ifndef CONFIG_OCFS2_COMPAT_JBD +# error Should not have been included +#endif + +struct jbd2_inode { + unsigned int dummy; +}; + +#define JBD2_BARRIER JFS_BARRIER +#define JBD2_DEFAULT_MAX_COMMIT_AGE JBD_DEFAULT_MAX_COMMIT_AGE + +#define jbd2_journal_ack_err journal_ack_err +#define jbd2_journal_clear_err journal_clear_err +#define jbd2_journal_destroy journal_destroy +#define jbd2_journal_dirty_metadata journal_dirty_metadata +#define jbd2_journal_errno journal_errno +#define jbd2_journal_extend journal_extend +#define jbd2_journal_flush journal_flush +#define jbd2_journal_force_commit journal_force_commit +#define jbd2_journal_get_write_access journal_get_write_access +#define jbd2_journal_get_undo_access journal_get_undo_access +#define jbd2_journal_init_inode journal_init_inode +#define jbd2_journal_invalidatepage journal_invalidatepage +#define jbd2_journal_load journal_load +#define jbd2_journal_lock_updates journal_lock_updates +#define jbd2_journal_restart journal_restart +#define jbd2_journal_start journal_start +#define jbd2_journal_start_commit journal_start_commit +#define jbd2_journal_stop journal_stop +#define jbd2_journal_try_to_free_buffers journal_try_to_free_buffers +#define jbd2_journal_unlock_updates journal_unlock_updates +#define jbd2_journal_wipe journal_wipe +#define jbd2_log_wait_commit log_wait_commit + +static inline int jbd2_journal_file_inode(handle_t *handle, + struct jbd2_inode *inode) +{ + return 0; +} + +static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, + loff_t new_size) +{ + return 0; +} + +static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, + struct inode *inode) +{ + return; +} + +static inline void jbd2_journal_release_jbd_inode(journal_t *journal, + struct jbd2_inode *jinode) +{ + return; +} + + +#endif /* OCFS2_JBD_COMPAT_H */ diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index 8166968e9015..ffd48db229a7 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -200,7 +200,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data) if (cluster > clusters) break; - ret = ocfs2_read_block(osb, blkno, &backup, 0, NULL); + ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup); if (ret < 0) { mlog_errno(ret); break; @@ -236,8 +236,8 @@ static void ocfs2_update_super_and_backups(struct inode *inode, * update the superblock last. * It doesn't matter if the write failed. */ - ret = ocfs2_read_block(osb, OCFS2_SUPER_BLOCK_BLKNO, - &super_bh, 0, NULL); + ret = ocfs2_read_blocks_sync(osb, OCFS2_SUPER_BLOCK_BLKNO, 1, + &super_bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -332,8 +332,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters) lgd_blkno = ocfs2_which_cluster_group(main_bm_inode, first_new_cluster - 1); - ret = ocfs2_read_block(osb, lgd_blkno, &group_bh, OCFS2_BH_CACHED, - main_bm_inode); + ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh); if (ret < 0) { mlog_errno(ret); goto out_unlock; @@ -540,7 +539,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) goto out_unlock; } - ret = ocfs2_read_block(osb, input->group, &group_bh, 0, NULL); + ret = ocfs2_read_blocks_sync(osb, input->group, 1, &group_bh); if (ret < 0) { mlog(ML_ERROR, "Can't read the group descriptor # %llu " "from the device.", (unsigned long long)input->group); diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index bb5ff8939bf1..bdda2d8f8508 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -150,8 +150,8 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb) * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If * this is not true, the read of -1 (UINT64_MAX) will fail. */ - ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0, - si->si_inode); + ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh, + OCFS2_BH_IGNORE_CACHE); if (ret == 0) { spin_lock(&osb->osb_lock); ocfs2_update_slot_info(si); @@ -404,7 +404,8 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, (unsigned long long)blkno); bh = NULL; /* Acquire a fresh bh */ - status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode); + status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh, + OCFS2_BH_IGNORE_CACHE); if (status < 0) { mlog_errno(status); goto bail; diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index c021280dd462..faec2d879357 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -21,12 +21,14 @@ #include <linux/fs.h> #include <linux/miscdevice.h> #include <linux/mutex.h> +#include <linux/smp_lock.h> #include <linux/reboot.h> #include <asm/uaccess.h> #include "ocfs2.h" /* For struct ocfs2_lock_res */ #include "stackglue.h" +#include <linux/dlm_plock.h> /* * The control protocol starts with a handshake. Until the handshake @@ -549,26 +551,17 @@ static ssize_t ocfs2_control_read(struct file *file, size_t count, loff_t *ppos) { - char *proto_string = OCFS2_CONTROL_PROTO; - size_t to_write = 0; - - if (*ppos >= OCFS2_CONTROL_PROTO_LEN) - return 0; - - to_write = OCFS2_CONTROL_PROTO_LEN - *ppos; - if (to_write > count) - to_write = count; - if (copy_to_user(buf, proto_string + *ppos, to_write)) - return -EFAULT; + ssize_t ret; - *ppos += to_write; + ret = simple_read_from_buffer(buf, count, ppos, + OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN); /* Have we read the whole protocol list? */ - if (*ppos >= OCFS2_CONTROL_PROTO_LEN) + if (ret > 0 && *ppos >= OCFS2_CONTROL_PROTO_LEN) ocfs2_control_set_handshake_state(file, OCFS2_CONTROL_HANDSHAKE_READ); - return to_write; + return ret; } static int ocfs2_control_release(struct inode *inode, struct file *file) @@ -619,10 +612,12 @@ static int ocfs2_control_open(struct inode *inode, struct file *file) return -ENOMEM; p->op_this_node = -1; + lock_kernel(); mutex_lock(&ocfs2_control_lock); file->private_data = p; list_add(&p->op_list, &ocfs2_control_private_list); mutex_unlock(&ocfs2_control_lock); + unlock_kernel(); return 0; } @@ -752,6 +747,37 @@ static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) { } +static int user_plock(struct ocfs2_cluster_connection *conn, + u64 ino, + struct file *file, + int cmd, + struct file_lock *fl) +{ + /* + * This more or less just demuxes the plock request into any + * one of three dlm calls. + * + * Internally, fs/dlm will pass these to a misc device, which + * a userspace daemon will read and write to. + * + * For now, cancel requests (which happen internally only), + * are turned into unlocks. Most of this function taken from + * gfs2_lock. + */ + + if (cmd == F_CANCELLK) { + cmd = F_SETLK; + fl->fl_type = F_UNLCK; + } + + if (IS_GETLK(cmd)) + return dlm_posix_get(conn->cc_lockspace, ino, file, fl); + else if (fl->fl_type == F_UNLCK) + return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl); + else + return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl); +} + /* * Compare a requested locking protocol version against the current one. * @@ -845,6 +871,7 @@ static struct ocfs2_stack_operations ocfs2_user_plugin_ops = { .dlm_unlock = user_dlm_unlock, .lock_status = user_dlm_lock_status, .lock_lvb = user_dlm_lvb, + .plock = user_plock, .dump_lksb = user_dlm_dump_lksb, }; diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 10e149ae5e3a..68b668b0e60a 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -97,13 +97,14 @@ static int ocfs2_stack_driver_request(const char *stack_name, goto out; } - /* Ok, the stack is pinned */ - p->sp_count++; active_stack = p; - rc = 0; out: + /* If we found it, pin it */ + if (!rc) + active_stack->sp_count++; + spin_unlock(&ocfs2_stack_lock); return rc; } @@ -287,6 +288,26 @@ void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) } EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb); +int ocfs2_stack_supports_plocks(void) +{ + return active_stack && active_stack->sp_ops->plock; +} +EXPORT_SYMBOL_GPL(ocfs2_stack_supports_plocks); + +/* + * ocfs2_plock() can only be safely called if + * ocfs2_stack_supports_plocks() returned true + */ +int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, + struct file *file, int cmd, struct file_lock *fl) +{ + WARN_ON_ONCE(active_stack->sp_ops->plock == NULL); + if (active_stack->sp_ops->plock) + return active_stack->sp_ops->plock(conn, ino, file, cmd, fl); + return -EOPNOTSUPP; +} +EXPORT_SYMBOL_GPL(ocfs2_plock); + int ocfs2_cluster_connect(const char *stack_name, const char *group, int grouplen, diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index db56281dd1be..c571af375ef8 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -28,6 +28,10 @@ #include "dlm/dlmapi.h" #include <linux/dlm.h> +/* Needed for plock-related prototypes */ +struct file; +struct file_lock; + /* * dlmconstants.h does not have a LOCAL flag. We hope to remove it * some day, but right now we need it. Let's fake it. This value is larger @@ -187,6 +191,17 @@ struct ocfs2_stack_operations { void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb); /* + * Cluster-aware posix locks + * + * This is NULL for stacks which do not support posix locks. + */ + int (*plock)(struct ocfs2_cluster_connection *conn, + u64 ino, + struct file *file, + int cmd, + struct file_lock *fl); + + /* * This is an optoinal debugging hook. If provided, the * stack can dump debugging information about this lock. */ @@ -240,6 +255,10 @@ int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb); void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb); void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb); +int ocfs2_stack_supports_plocks(void); +int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, + struct file *file, int cmd, struct file_lock *fl); + void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index d2d278fb9819..c5ff18b46b57 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -62,15 +62,18 @@ static int ocfs2_block_group_fill(handle_t *handle, struct ocfs2_chain_list *cl); static int ocfs2_block_group_alloc(struct ocfs2_super *osb, struct inode *alloc_inode, - struct buffer_head *bh); + struct buffer_head *bh, + u64 max_block); static int ocfs2_cluster_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, + u64 max_block, u16 *bit_off, u16 *bits_found); static int ocfs2_block_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, + u64 max_block, u16 *bit_off, u16 *bits_found); static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, struct ocfs2_alloc_context *ac, @@ -110,8 +113,11 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode, u64 data_blkno, u64 *bg_blkno, u16 *bg_bit_off); +static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, + u32 bits_wanted, u64 max_block, + struct ocfs2_alloc_context **ac); -static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) +void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) { struct inode *inode = ac->ac_inode; @@ -124,10 +130,8 @@ static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) iput(inode); ac->ac_inode = NULL; } - if (ac->ac_bh) { - brelse(ac->ac_bh); - ac->ac_bh = NULL; - } + brelse(ac->ac_bh); + ac->ac_bh = NULL; } void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) @@ -276,7 +280,8 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl) */ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, struct inode *alloc_inode, - struct buffer_head *bh) + struct buffer_head *bh, + u64 max_block) { int status, credits; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; @@ -294,9 +299,9 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, mlog_entry_void(); cl = &fe->id2.i_chain; - status = ocfs2_reserve_clusters(osb, - le16_to_cpu(cl->cl_cpg), - &ac); + status = ocfs2_reserve_clusters_with_limit(osb, + le16_to_cpu(cl->cl_cpg), + max_block, &ac); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -394,8 +399,7 @@ bail: if (ac) ocfs2_free_alloc_context(ac); - if (bg_bh) - brelse(bg_bh); + brelse(bg_bh); mlog_exit(status); return status; @@ -469,7 +473,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, goto bail; } - status = ocfs2_block_group_alloc(osb, alloc_inode, bh); + status = ocfs2_block_group_alloc(osb, alloc_inode, bh, + ac->ac_max_block); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -486,16 +491,15 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, get_bh(bh); ac->ac_bh = bh; bail: - if (bh) - brelse(bh); + brelse(bh); mlog_exit(status); return status; } -int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, - struct ocfs2_dinode *fe, - struct ocfs2_alloc_context **ac) +int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, + int blocks, + struct ocfs2_alloc_context **ac) { int status; u32 slot; @@ -507,7 +511,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, goto bail; } - (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe); + (*ac)->ac_bits_wanted = blocks; (*ac)->ac_which = OCFS2_AC_USE_META; slot = osb->slot_num; (*ac)->ac_group_search = ocfs2_block_group_search; @@ -532,6 +536,15 @@ bail: return status; } +int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, + struct ocfs2_extent_list *root_el, + struct ocfs2_alloc_context **ac) +{ + return ocfs2_reserve_new_metadata_blocks(osb, + ocfs2_extend_meta_needed(root_el), + ac); +} + static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb, struct ocfs2_alloc_context *ac) { @@ -582,6 +595,14 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb, (*ac)->ac_group_search = ocfs2_block_group_search; /* + * stat(2) can't handle i_ino > 32bits, so we tell the + * lower levels not to allocate us a block group past that + * limit. The 'inode64' mount option avoids this behavior. + */ + if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64)) + (*ac)->ac_max_block = (u32)~0U; + + /* * slot is set when we successfully steal inode from other nodes. * It is reset in 3 places: * 1. when we flush the truncate log @@ -661,9 +682,9 @@ bail: /* Callers don't need to care which bitmap (local alloc or main) to * use so we figure it out for them, but unfortunately this clutters * things a bit. */ -int ocfs2_reserve_clusters(struct ocfs2_super *osb, - u32 bits_wanted, - struct ocfs2_alloc_context **ac) +static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, + u32 bits_wanted, u64 max_block, + struct ocfs2_alloc_context **ac) { int status; @@ -677,24 +698,20 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb, } (*ac)->ac_bits_wanted = bits_wanted; + (*ac)->ac_max_block = max_block; status = -ENOSPC; if (ocfs2_alloc_should_use_local(osb, bits_wanted)) { status = ocfs2_reserve_local_alloc_bits(osb, bits_wanted, *ac); - if ((status < 0) && (status != -ENOSPC)) { + if (status == -EFBIG) { + /* The local alloc window is outside ac_max_block. + * use the main bitmap. */ + status = -ENOSPC; + } else if ((status < 0) && (status != -ENOSPC)) { mlog_errno(status); goto bail; - } else if (status == -ENOSPC) { - /* reserve_local_bits will return enospc with - * the local alloc inode still locked, so we - * can change this safely here. */ - mlog(0, "Disabling local alloc\n"); - /* We set to OCFS2_LA_DISABLED so that umount - * can clean up what's left of the local - * allocation */ - osb->local_alloc_state = OCFS2_LA_DISABLED; } } @@ -718,6 +735,13 @@ bail: return status; } +int ocfs2_reserve_clusters(struct ocfs2_super *osb, + u32 bits_wanted, + struct ocfs2_alloc_context **ac) +{ + return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac); +} + /* * More or less lifted from ext3. I'll leave their description below: * @@ -1000,11 +1024,14 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg static int ocfs2_cluster_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, + u64 max_block, u16 *bit_off, u16 *bits_found) { int search = -ENOSPC; int ret; + u64 blkoff; struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); u16 tmp_off, tmp_found; unsigned int max_bits, gd_cluster_off; @@ -1037,6 +1064,17 @@ static int ocfs2_cluster_group_search(struct inode *inode, if (ret) return ret; + if (max_block) { + blkoff = ocfs2_clusters_to_blocks(inode->i_sb, + gd_cluster_off + + tmp_off + tmp_found); + mlog(0, "Checking %llu against %llu\n", + (unsigned long long)blkoff, + (unsigned long long)max_block); + if (blkoff > max_block) + return -ENOSPC; + } + /* ocfs2_block_group_find_clear_bits() might * return success, but we still want to return * -ENOSPC unless it found the minimum number @@ -1045,6 +1083,12 @@ static int ocfs2_cluster_group_search(struct inode *inode, *bit_off = tmp_off; *bits_found = tmp_found; search = 0; /* success */ + } else if (tmp_found) { + /* + * Don't show bits which we'll be returning + * for allocation to the local alloc bitmap. + */ + ocfs2_local_alloc_seen_free_bits(osb, tmp_found); } } @@ -1054,19 +1098,31 @@ static int ocfs2_cluster_group_search(struct inode *inode, static int ocfs2_block_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, + u64 max_block, u16 *bit_off, u16 *bits_found) { int ret = -ENOSPC; + u64 blkoff; struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; BUG_ON(min_bits != 1); BUG_ON(ocfs2_is_cluster_bitmap(inode)); - if (bg->bg_free_bits_count) + if (bg->bg_free_bits_count) { ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), group_bh, bits_wanted, le16_to_cpu(bg->bg_bits), bit_off, bits_found); + if (!ret && max_block) { + blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off + + *bits_found; + mlog(0, "Checking %llu against %llu\n", + (unsigned long long)blkoff, + (unsigned long long)max_block); + if (blkoff > max_block) + ret = -ENOSPC; + } + } return ret; } @@ -1116,8 +1172,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, struct ocfs2_group_desc *gd; struct inode *alloc_inode = ac->ac_inode; - ret = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), gd_blkno, - &group_bh, OCFS2_BH_CACHED, alloc_inode); + ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh); if (ret < 0) { mlog_errno(ret); return ret; @@ -1131,7 +1186,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, } ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, - bit_off, &found); + ac->ac_max_block, bit_off, &found); if (ret < 0) { if (ret != -ENOSPC) mlog_errno(ret); @@ -1186,9 +1241,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, bits_wanted, chain, (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno); - status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), + status = ocfs2_read_block(alloc_inode, le64_to_cpu(cl->cl_recs[chain].c_blkno), - &group_bh, OCFS2_BH_CACHED, alloc_inode); + &group_bh); if (status < 0) { mlog_errno(status); goto bail; @@ -1204,21 +1259,20 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, /* for now, the chain search is a bit simplistic. We just use * the 1st group with any empty bits. */ while ((status = ac->ac_group_search(alloc_inode, group_bh, - bits_wanted, min_bits, bit_off, + bits_wanted, min_bits, + ac->ac_max_block, bit_off, &tmp_bits)) == -ENOSPC) { if (!bg->bg_next_group) break; - if (prev_group_bh) { - brelse(prev_group_bh); - prev_group_bh = NULL; - } + brelse(prev_group_bh); + prev_group_bh = NULL; + next_group = le64_to_cpu(bg->bg_next_group); prev_group_bh = group_bh; group_bh = NULL; - status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), - next_group, &group_bh, - OCFS2_BH_CACHED, alloc_inode); + status = ocfs2_read_block(alloc_inode, + next_group, &group_bh); if (status < 0) { mlog_errno(status); goto bail; @@ -1307,10 +1361,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, *bg_blkno = le64_to_cpu(bg->bg_blkno); *bits_left = le16_to_cpu(bg->bg_free_bits_count); bail: - if (group_bh) - brelse(group_bh); - if (prev_group_bh) - brelse(prev_group_bh); + brelse(group_bh); + brelse(prev_group_bh); mlog_exit(status); return status; @@ -1723,7 +1775,6 @@ int ocfs2_free_suballoc_bits(handle_t *handle, { int status = 0; u32 tmp_used; - struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; struct ocfs2_chain_list *cl = &fe->id2.i_chain; struct buffer_head *group_bh = NULL; @@ -1742,8 +1793,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle, (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count, (unsigned long long)bg_blkno, start_bit); - status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED, - alloc_inode); + status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh); if (status < 0) { mlog_errno(status); goto bail; @@ -1784,8 +1834,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle, } bail: - if (group_bh) - brelse(group_bh); + brelse(group_bh); mlog_exit(status); return status; @@ -1838,9 +1887,15 @@ int ocfs2_free_clusters(handle_t *handle, status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, bg_start_bit, bg_blkno, num_clusters); - if (status < 0) + if (status < 0) { mlog_errno(status); + goto out; + } + ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb), + num_clusters); + +out: mlog_exit(status); return status; } @@ -1891,3 +1946,84 @@ static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe) (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno); } } + +/* + * For a given allocation, determine which allocators will need to be + * accessed, and lock them, reserving the appropriate number of bits. + * + * Sparse file systems call this from ocfs2_write_begin_nolock() + * and ocfs2_allocate_unwritten_extents(). + * + * File systems which don't support holes call this from + * ocfs2_extend_allocation(). + */ +int ocfs2_lock_allocators(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters_to_add, u32 extents_to_split, + struct ocfs2_alloc_context **data_ac, + struct ocfs2_alloc_context **meta_ac) +{ + int ret = 0, num_free_extents; + unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + *meta_ac = NULL; + if (data_ac) + *data_ac = NULL; + + BUG_ON(clusters_to_add != 0 && data_ac == NULL); + + num_free_extents = ocfs2_num_free_extents(osb, inode, et); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + /* + * Sparse allocation file systems need to be more conservative + * with reserving room for expansion - the actual allocation + * happens while we've got a journal handle open so re-taking + * a cluster lock (because we ran out of room for another + * extent) will violate ordering rules. + * + * Most of the time we'll only be seeing this 1 cluster at a time + * anyway. + * + * Always lock for any unwritten extents - we might want to + * add blocks during a split. + */ + if (!num_free_extents || + (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) { + ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + } + + if (clusters_to_add == 0) + goto out; + + ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + *meta_ac = NULL; + } + + /* + * We cannot have an error and a non null *data_ac. + */ + } + + return ret; +} diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 544c600662bd..4df159d8f450 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -28,10 +28,11 @@ typedef int (group_search_t)(struct inode *, struct buffer_head *, - u32, - u32, - u16 *, - u16 *); + u32, /* bits_wanted */ + u32, /* min_bits */ + u64, /* max_block */ + u16 *, /* *bit_off */ + u16 *); /* *bits_found */ struct ocfs2_alloc_context { struct inode *ac_inode; /* which bitmap are we allocating from? */ @@ -51,6 +52,8 @@ struct ocfs2_alloc_context { group_search_t *ac_group_search; u64 ac_last_group; + u64 ac_max_block; /* Highest block number to allocate. 0 is + is the same as ~0 - unlimited */ }; void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); @@ -59,9 +62,17 @@ static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac) return ac->ac_bits_wanted - ac->ac_bits_given; } +/* + * Please note that the caller must make sure that root_el is the root + * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise + * the result may be wrong. + */ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, - struct ocfs2_dinode *fe, + struct ocfs2_extent_list *root_el, struct ocfs2_alloc_context **ac); +int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, + int blocks, + struct ocfs2_alloc_context **ac); int ocfs2_reserve_new_inode(struct ocfs2_super *osb, struct ocfs2_alloc_context **ac); int ocfs2_reserve_clusters(struct ocfs2_super *osb, @@ -147,6 +158,7 @@ static inline int ocfs2_is_cluster_bitmap(struct inode *inode) * apis above. */ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, struct ocfs2_alloc_context *ac); +void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac); /* given a cluster offset, calculate which block group it belongs to * and return that block offset. */ @@ -156,4 +168,8 @@ u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster); int ocfs2_check_group_descriptor(struct super_block *sb, struct ocfs2_dinode *di, struct ocfs2_group_desc *gd); +int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et, + u32 clusters_to_add, u32 extents_to_split, + struct ocfs2_alloc_context **data_ac, + struct ocfs2_alloc_context **meta_ac); #endif /* _CHAINALLOC_H_ */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index df63ba20ae90..304b63ac78cf 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -64,6 +64,7 @@ #include "sysfile.h" #include "uptodate.h" #include "ver.h" +#include "xattr.h" #include "buffer_head_io.h" @@ -154,10 +155,13 @@ enum { Opt_localalloc, Opt_localflocks, Opt_stack, + Opt_user_xattr, + Opt_nouser_xattr, + Opt_inode64, Opt_err, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_barrier, "barrier=%u"}, {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, @@ -173,6 +177,9 @@ static match_table_t tokens = { {Opt_localalloc, "localalloc=%d"}, {Opt_localflocks, "localflocks"}, {Opt_stack, "cluster_stack=%s"}, + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_inode64, "inode64"}, {Opt_err, NULL} }; @@ -205,10 +212,11 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait) ocfs2_schedule_truncate_log_flush(osb, 0); } - if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) { + if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal, + &target)) { if (wait) - log_wait_commit(OCFS2_SB(sb)->journal->j_journal, - target); + jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal, + target); } return 0; } @@ -325,6 +333,7 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb) if (!oi) return NULL; + jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode); return &oi->vfs_inode; } @@ -406,6 +415,15 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) goto out; } + /* Probably don't want this on remount; it might + * mess with other nodes */ + if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) && + (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) { + ret = -EINVAL; + mlog(ML_ERROR, "Cannot enable inode64 on remount\n"); + goto out; + } + /* We're going to/from readonly mode. */ if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { /* Lock here so the check of HARD_RO and the potential @@ -637,7 +655,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->s_atime_quantum = parsed_options.atime_quantum; osb->preferred_slot = parsed_options.slot; osb->osb_commit_interval = parsed_options.commit_interval; - osb->local_alloc_size = parsed_options.localalloc_opt; + osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); + osb->local_alloc_bits = osb->local_alloc_default_bits; status = ocfs2_verify_userspace_stack(osb, &parsed_options); if (status) @@ -743,8 +762,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) return status; read_super_error: - if (bh != NULL) - brelse(bh); + brelse(bh); if (inode) iput(inode); @@ -847,6 +865,12 @@ static int ocfs2_parse_options(struct super_block *sb, case Opt_data_writeback: mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK; break; + case Opt_user_xattr: + mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR; + break; + case Opt_nouser_xattr: + mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR; + break; case Opt_atime_quantum: if (match_int(&args[0], &option)) { status = 0; @@ -873,7 +897,7 @@ static int ocfs2_parse_options(struct super_block *sb, if (option < 0) return 0; if (option == 0) - option = JBD_DEFAULT_MAX_COMMIT_AGE; + option = JBD2_DEFAULT_MAX_COMMIT_AGE; mopt->commit_interval = HZ * option; break; case Opt_localalloc: @@ -918,6 +942,9 @@ static int ocfs2_parse_options(struct super_block *sb, OCFS2_STACK_LABEL_LEN); mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; break; + case Opt_inode64: + mopt->mount_opt |= OCFS2_MOUNT_INODE64; + break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -938,6 +965,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) { struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb); unsigned long opts = osb->s_mount_opt; + unsigned int local_alloc_megs; if (opts & OCFS2_MOUNT_HB_LOCAL) seq_printf(s, ",_netdev,heartbeat=local"); @@ -970,8 +998,9 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) seq_printf(s, ",commit=%u", (unsigned) (osb->osb_commit_interval / HZ)); - if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE) - seq_printf(s, ",localalloc=%d", osb->local_alloc_size); + local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits); + if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE) + seq_printf(s, ",localalloc=%d", local_alloc_megs); if (opts & OCFS2_MOUNT_LOCALFLOCKS) seq_printf(s, ",localflocks,"); @@ -980,6 +1009,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, osb->osb_cluster_stack); + if (opts & OCFS2_MOUNT_NOUSERXATTR) + seq_printf(s, ",nouser_xattr"); + else + seq_printf(s, ",user_xattr"); + + if (opts & OCFS2_MOUNT_INODE64) + seq_printf(s, ",inode64"); + return 0; } @@ -1118,7 +1155,7 @@ bail: return status; } -static void ocfs2_inode_init_once(struct kmem_cache *cachep, void *data) +static void ocfs2_inode_init_once(void *data) { struct ocfs2_inode_info *oi = data; @@ -1132,6 +1169,7 @@ static void ocfs2_inode_init_once(struct kmem_cache *cachep, void *data) oi->ip_dir_start_lookup = 0; init_rwsem(&oi->ip_alloc_sem); + init_rwsem(&oi->ip_xattr_sem); mutex_init(&oi->ip_io_mutex); oi->ip_blkno = 0ULL; @@ -1375,6 +1413,7 @@ static int ocfs2_initialize_super(struct super_block *sb, sb->s_fs_info = osb; sb->s_op = &ocfs2_sops; sb->s_export_op = &ocfs2_export_ops; + sb->s_xattr = ocfs2_xattr_handlers; sb->s_time_gran = 1; sb->s_flags |= MS_NOATIME; /* this is needed to support O_LARGEFILE */ @@ -1421,8 +1460,12 @@ static int ocfs2_initialize_super(struct super_block *sb, osb->slot_num = OCFS2_INVALID_SLOT; + osb->s_xattr_inline_size = le16_to_cpu( + di->id2.i_super.s_xattr_inline_size); + osb->local_alloc_state = OCFS2_LA_UNUSED; osb->local_alloc_bh = NULL; + INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker); init_waitqueue_head(&osb->osb_mount_event); @@ -1442,6 +1485,15 @@ static int ocfs2_initialize_super(struct super_block *sb, } mlog(0, "max_slots for this device: %u\n", osb->max_slots); + osb->slot_recovery_generations = + kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations), + GFP_KERNEL); + if (!osb->slot_recovery_generations) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + init_waitqueue_head(&osb->osb_wipe_event); osb->osb_orphan_wipes = kcalloc(osb->max_slots, sizeof(*osb->osb_orphan_wipes), @@ -1559,6 +1611,7 @@ static int ocfs2_initialize_super(struct super_block *sb, osb->first_cluster_group_blkno = le64_to_cpu(di->id2.i_super.s_first_cluster_group); osb->fs_generation = le32_to_cpu(di->i_fs_generation); + osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash); mlog(0, "vol_label: %s\n", osb->vol_label); mlog(0, "uuid: %s\n", osb->uuid_str); mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n", @@ -1703,7 +1756,11 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) local = ocfs2_mount_local(osb); /* will play back anything left in the journal. */ - ocfs2_journal_load(osb->journal, local); + status = ocfs2_journal_load(osb->journal, local, dirty); + if (status < 0) { + mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status); + goto finally; + } if (dirty) { /* recover my local alloc if we didn't unmount cleanly. */ @@ -1764,6 +1821,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) ocfs2_free_slot_info(osb); kfree(osb->osb_orphan_wipes); + kfree(osb->slot_recovery_generations); /* FIXME * This belongs in journal shutdown, but because we have to * allocate osb->journal at the start of ocfs2_initalize_osb(), diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index ba9dbb51d25b..cbd03dfdc7b9 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -50,6 +50,7 @@ #include "inode.h" #include "journal.h" #include "symlink.h" +#include "xattr.h" #include "buffer_head_io.h" @@ -83,11 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode, mlog_entry_void(); - status = ocfs2_read_block(OCFS2_SB(inode->i_sb), - OCFS2_I(inode)->ip_blkno, - bh, - OCFS2_BH_CACHED, - inode); + status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh); if (status < 0) { mlog_errno(status); link = ERR_PTR(status); @@ -157,8 +154,7 @@ bail: kunmap(page); page_cache_release(page); } - if (bh) - brelse(bh); + brelse(bh); return ERR_PTR(status); } @@ -168,10 +164,18 @@ const struct inode_operations ocfs2_symlink_inode_operations = { .follow_link = ocfs2_follow_link, .getattr = ocfs2_getattr, .setattr = ocfs2_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, }; const struct inode_operations ocfs2_fast_symlink_inode_operations = { .readlink = ocfs2_readlink, .follow_link = ocfs2_follow_link, .getattr = ocfs2_getattr, .setattr = ocfs2_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, }; diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index 4da8851f2b23..187b99ff0368 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -53,7 +53,11 @@ #include <linux/highmem.h> #include <linux/buffer_head.h> #include <linux/rbtree.h> -#include <linux/jbd.h> +#ifndef CONFIG_OCFS2_COMPAT_JBD +# include <linux/jbd2.h> +#else +# include <linux/jbd.h> +#endif #define MLOG_MASK_PREFIX ML_UPTODATE @@ -511,14 +515,10 @@ static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci, ci->ci_num_cached--; } -/* Called when we remove a chunk of metadata from an inode. We don't - * bother reverting things to an inlined array in the case of a remove - * which moves us back under the limit. */ -void ocfs2_remove_from_cache(struct inode *inode, - struct buffer_head *bh) +static void ocfs2_remove_block_from_cache(struct inode *inode, + sector_t block) { int index; - sector_t block = bh->b_blocknr; struct ocfs2_meta_cache_item *item = NULL; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; @@ -544,6 +544,30 @@ void ocfs2_remove_from_cache(struct inode *inode, kmem_cache_free(ocfs2_uptodate_cachep, item); } +/* + * Called when we remove a chunk of metadata from an inode. We don't + * bother reverting things to an inlined array in the case of a remove + * which moves us back under the limit. + */ +void ocfs2_remove_from_cache(struct inode *inode, + struct buffer_head *bh) +{ + sector_t block = bh->b_blocknr; + + ocfs2_remove_block_from_cache(inode, block); +} + +/* Called when we remove xattr clusters from an inode. */ +void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode, + sector_t block, + u32 c_len) +{ + unsigned int i, b_len = ocfs2_clusters_to_blocks(inode->i_sb, 1) * c_len; + + for (i = 0; i < b_len; i++, block++) + ocfs2_remove_block_from_cache(inode, block); +} + int __init init_ocfs2_uptodate_cache(void) { ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate", diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h index 2e73206059a8..531b4b3a0c47 100644 --- a/fs/ocfs2/uptodate.h +++ b/fs/ocfs2/uptodate.h @@ -40,6 +40,9 @@ void ocfs2_set_new_buffer_uptodate(struct inode *inode, struct buffer_head *bh); void ocfs2_remove_from_cache(struct inode *inode, struct buffer_head *bh); +void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode, + sector_t block, + u32 c_len); int ocfs2_buffer_read_ahead(struct inode *inode, struct buffer_head *bh); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c new file mode 100644 index 000000000000..c25780a70dfd --- /dev/null +++ b/fs/ocfs2/xattr.c @@ -0,0 +1,4834 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * xattr.c + * + * Copyright (C) 2008 Oracle. All rights reserved. + * + * CREDITS: + * Lots of code in this file is taken from ext3. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/capability.h> +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/uio.h> +#include <linux/sched.h> +#include <linux/splice.h> +#include <linux/mount.h> +#include <linux/writeback.h> +#include <linux/falloc.h> +#include <linux/sort.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/string.h> + +#define MLOG_MASK_PREFIX ML_XATTR +#include <cluster/masklog.h> + +#include "ocfs2.h" +#include "alloc.h" +#include "dlmglue.h" +#include "file.h" +#include "symlink.h" +#include "sysfile.h" +#include "inode.h" +#include "journal.h" +#include "ocfs2_fs.h" +#include "suballoc.h" +#include "uptodate.h" +#include "buffer_head_io.h" +#include "super.h" +#include "xattr.h" + + +struct ocfs2_xattr_def_value_root { + struct ocfs2_xattr_value_root xv; + struct ocfs2_extent_rec er; +}; + +struct ocfs2_xattr_bucket { + struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET]; + struct ocfs2_xattr_header *xh; +}; + +#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) +#define OCFS2_XATTR_INLINE_SIZE 80 + +static struct ocfs2_xattr_def_value_root def_xv = { + .xv.xr_list.l_count = cpu_to_le16(1), +}; + +struct xattr_handler *ocfs2_xattr_handlers[] = { + &ocfs2_xattr_user_handler, + &ocfs2_xattr_trusted_handler, + NULL +}; + +static struct xattr_handler *ocfs2_xattr_handler_map[] = { + [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, + [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, +}; + +struct ocfs2_xattr_info { + int name_index; + const char *name; + const void *value; + size_t value_len; +}; + +struct ocfs2_xattr_search { + struct buffer_head *inode_bh; + /* + * xattr_bh point to the block buffer head which has extended attribute + * when extended attribute in inode, xattr_bh is equal to inode_bh. + */ + struct buffer_head *xattr_bh; + struct ocfs2_xattr_header *header; + struct ocfs2_xattr_bucket bucket; + void *base; + void *end; + struct ocfs2_xattr_entry *here; + int not_found; +}; + +static int ocfs2_xattr_bucket_get_name_value(struct inode *inode, + struct ocfs2_xattr_header *xh, + int index, + int *block_off, + int *new_offset); + +static int ocfs2_xattr_index_block_find(struct inode *inode, + struct buffer_head *root_bh, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs); + +static int ocfs2_xattr_tree_list_index_block(struct inode *inode, + struct ocfs2_xattr_tree_root *xt, + char *buffer, + size_t buffer_size); + +static int ocfs2_xattr_create_index_block(struct inode *inode, + struct ocfs2_xattr_search *xs); + +static int ocfs2_xattr_set_entry_index_block(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs); + +static int ocfs2_delete_xattr_index_block(struct inode *inode, + struct buffer_head *xb_bh); + +static inline const char *ocfs2_xattr_prefix(int name_index) +{ + struct xattr_handler *handler = NULL; + + if (name_index > 0 && name_index < OCFS2_XATTR_MAX) + handler = ocfs2_xattr_handler_map[name_index]; + + return handler ? handler->prefix : NULL; +} + +static u32 ocfs2_xattr_name_hash(struct inode *inode, + const char *name, + int name_len) +{ + /* Get hash value of uuid from super block */ + u32 hash = OCFS2_SB(inode->i_sb)->uuid_hash; + int i; + + /* hash extended attribute name */ + for (i = 0; i < name_len; i++) { + hash = (hash << OCFS2_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - OCFS2_HASH_SHIFT)) ^ + *name++; + } + + return hash; +} + +/* + * ocfs2_xattr_hash_entry() + * + * Compute the hash of an extended attribute. + */ +static void ocfs2_xattr_hash_entry(struct inode *inode, + struct ocfs2_xattr_header *header, + struct ocfs2_xattr_entry *entry) +{ + u32 hash = 0; + char *name = (char *)header + le16_to_cpu(entry->xe_name_offset); + + hash = ocfs2_xattr_name_hash(inode, name, entry->xe_name_len); + entry->xe_name_hash = cpu_to_le32(hash); + + return; +} + +static int ocfs2_xattr_extend_allocation(struct inode *inode, + u32 clusters_to_add, + struct buffer_head *xattr_bh, + struct ocfs2_xattr_value_root *xv) +{ + int status = 0; + int restart_func = 0; + int credits = 0; + handle_t *handle = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + enum ocfs2_alloc_restarted why; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters); + struct ocfs2_extent_tree et; + + mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add); + + ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv); + +restart_all: + + status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, + &data_ac, &meta_ac); + if (status) { + mlog_errno(status); + goto leave; + } + + credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el, + clusters_to_add); + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + +restarted_transaction: + status = ocfs2_journal_access(handle, inode, xattr_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + prev_clusters = le32_to_cpu(xv->xr_clusters); + status = ocfs2_add_clusters_in_btree(osb, + inode, + &logical_start, + clusters_to_add, + 0, + &et, + handle, + data_ac, + meta_ac, + &why); + if ((status < 0) && (status != -EAGAIN)) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_dirty(handle, xattr_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters; + + if (why != RESTART_NONE && clusters_to_add) { + if (why == RESTART_META) { + mlog(0, "restarting function.\n"); + restart_func = 1; + } else { + BUG_ON(why != RESTART_TRANS); + + mlog(0, "restarting transaction.\n"); + /* TODO: This can be more intelligent. */ + credits = ocfs2_calc_extend_credits(osb->sb, + et.et_root_el, + clusters_to_add); + status = ocfs2_extend_trans(handle, credits); + if (status < 0) { + /* handle still has to be committed at + * this point. */ + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + goto restarted_transaction; + } + } + +leave: + if (handle) { + ocfs2_commit_trans(osb, handle); + handle = NULL; + } + if (data_ac) { + ocfs2_free_alloc_context(data_ac); + data_ac = NULL; + } + if (meta_ac) { + ocfs2_free_alloc_context(meta_ac); + meta_ac = NULL; + } + if ((!status) && restart_func) { + restart_func = 0; + goto restart_all; + } + + return status; +} + +static int __ocfs2_remove_xattr_range(struct inode *inode, + struct buffer_head *root_bh, + struct ocfs2_xattr_value_root *xv, + u32 cpos, u32 phys_cpos, u32 len, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + handle_t *handle; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_extent_tree et; + + ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv); + + ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac, + dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + le32_add_cpu(&xv->xr_clusters, -len); + + ret = ocfs2_journal_dirty(handle, root_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + mutex_unlock(&tl_inode->i_mutex); + + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + return ret; +} + +static int ocfs2_xattr_shrink_size(struct inode *inode, + u32 old_clusters, + u32 new_clusters, + struct buffer_head *root_bh, + struct ocfs2_xattr_value_root *xv) +{ + int ret = 0; + u32 trunc_len, cpos, phys_cpos, alloc_size; + u64 block; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_cached_dealloc_ctxt dealloc; + + ocfs2_init_dealloc_ctxt(&dealloc); + + if (old_clusters <= new_clusters) + return 0; + + cpos = new_clusters; + trunc_len = old_clusters - new_clusters; + while (trunc_len) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos, + &alloc_size, &xv->xr_list); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (alloc_size > trunc_len) + alloc_size = trunc_len; + + ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos, + phys_cpos, alloc_size, + &dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + ocfs2_remove_xattr_clusters_from_cache(inode, block, + alloc_size); + cpos += alloc_size; + trunc_len -= alloc_size; + } + +out: + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + + return ret; +} + +static int ocfs2_xattr_value_truncate(struct inode *inode, + struct buffer_head *root_bh, + struct ocfs2_xattr_value_root *xv, + int len) +{ + int ret; + u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len); + u32 old_clusters = le32_to_cpu(xv->xr_clusters); + + if (new_clusters == old_clusters) + return 0; + + if (new_clusters > old_clusters) + ret = ocfs2_xattr_extend_allocation(inode, + new_clusters - old_clusters, + root_bh, xv); + else + ret = ocfs2_xattr_shrink_size(inode, + old_clusters, new_clusters, + root_bh, xv); + + return ret; +} + +static int ocfs2_xattr_list_entry(char *buffer, size_t size, + size_t *result, const char *prefix, + const char *name, int name_len) +{ + char *p = buffer + *result; + int prefix_len = strlen(prefix); + int total_len = prefix_len + name_len + 1; + + *result += total_len; + + /* we are just looking for how big our buffer needs to be */ + if (!size) + return 0; + + if (*result > size) + return -ERANGE; + + memcpy(p, prefix, prefix_len); + memcpy(p + prefix_len, name, name_len); + p[prefix_len + name_len] = '\0'; + + return 0; +} + +static int ocfs2_xattr_list_entries(struct inode *inode, + struct ocfs2_xattr_header *header, + char *buffer, size_t buffer_size) +{ + size_t result = 0; + int i, type, ret; + const char *prefix, *name; + + for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) { + struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; + type = ocfs2_xattr_get_type(entry); + prefix = ocfs2_xattr_prefix(type); + + if (prefix) { + name = (const char *)header + + le16_to_cpu(entry->xe_name_offset); + + ret = ocfs2_xattr_list_entry(buffer, buffer_size, + &result, prefix, name, + entry->xe_name_len); + if (ret) + return ret; + } + } + + return result; +} + +static int ocfs2_xattr_ibody_list(struct inode *inode, + struct ocfs2_dinode *di, + char *buffer, + size_t buffer_size) +{ + struct ocfs2_xattr_header *header = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + int ret = 0; + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) + return ret; + + header = (struct ocfs2_xattr_header *) + ((void *)di + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + + ret = ocfs2_xattr_list_entries(inode, header, buffer, buffer_size); + + return ret; +} + +static int ocfs2_xattr_block_list(struct inode *inode, + struct ocfs2_dinode *di, + char *buffer, + size_t buffer_size) +{ + struct buffer_head *blk_bh = NULL; + struct ocfs2_xattr_block *xb; + int ret = 0; + + if (!di->i_xattr_loc) + return ret; + + ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + /*Verify the signature of xattr block*/ + if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE, + strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) { + ret = -EFAULT; + goto cleanup; + } + + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; + ret = ocfs2_xattr_list_entries(inode, header, + buffer, buffer_size); + } else { + struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; + ret = ocfs2_xattr_tree_list_index_block(inode, xt, + buffer, buffer_size); + } +cleanup: + brelse(blk_bh); + + return ret; +} + +ssize_t ocfs2_listxattr(struct dentry *dentry, + char *buffer, + size_t size) +{ + int ret = 0, i_ret = 0, b_ret = 0; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(dentry->d_inode); + + if (!ocfs2_supports_xattr(OCFS2_SB(dentry->d_sb))) + return -EOPNOTSUPP; + + if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) + return ret; + + ret = ocfs2_inode_lock(dentry->d_inode, &di_bh, 0); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + + down_read(&oi->ip_xattr_sem); + i_ret = ocfs2_xattr_ibody_list(dentry->d_inode, di, buffer, size); + if (i_ret < 0) + b_ret = 0; + else { + if (buffer) { + buffer += i_ret; + size -= i_ret; + } + b_ret = ocfs2_xattr_block_list(dentry->d_inode, di, + buffer, size); + if (b_ret < 0) + i_ret = 0; + } + up_read(&oi->ip_xattr_sem); + ocfs2_inode_unlock(dentry->d_inode, 0); + + brelse(di_bh); + + return i_ret + b_ret; +} + +static int ocfs2_xattr_find_entry(int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_xattr_entry *entry; + size_t name_len; + int i, cmp = 1; + + if (name == NULL) + return -EINVAL; + + name_len = strlen(name); + entry = xs->here; + for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) { + cmp = name_index - ocfs2_xattr_get_type(entry); + if (!cmp) + cmp = name_len - entry->xe_name_len; + if (!cmp) + cmp = memcmp(name, (xs->base + + le16_to_cpu(entry->xe_name_offset)), + name_len); + if (cmp == 0) + break; + entry += 1; + } + xs->here = entry; + + return cmp ? -ENODATA : 0; +} + +static int ocfs2_xattr_get_value_outside(struct inode *inode, + struct ocfs2_xattr_value_root *xv, + void *buffer, + size_t len) +{ + u32 cpos, p_cluster, num_clusters, bpc, clusters; + u64 blkno; + int i, ret = 0; + size_t cplen, blocksize; + struct buffer_head *bh = NULL; + struct ocfs2_extent_list *el; + + el = &xv->xr_list; + clusters = le32_to_cpu(xv->xr_clusters); + bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + blocksize = inode->i_sb->s_blocksize; + + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, + &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + /* Copy ocfs2_xattr_value */ + for (i = 0; i < num_clusters * bpc; i++, blkno++) { + ret = ocfs2_read_block(inode, blkno, &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + cplen = len >= blocksize ? blocksize : len; + memcpy(buffer, bh->b_data, cplen); + len -= cplen; + buffer += cplen; + + brelse(bh); + bh = NULL; + if (len == 0) + break; + } + cpos += num_clusters; + } +out: + return ret; +} + +static int ocfs2_xattr_ibody_get(struct inode *inode, + int name_index, + const char *name, + void *buffer, + size_t buffer_size, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + struct ocfs2_xattr_value_root *xv; + size_t size; + int ret = 0; + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) + return -ENODATA; + + xs->end = (void *)di + inode->i_sb->s_blocksize; + xs->header = (struct ocfs2_xattr_header *) + (xs->end - le16_to_cpu(di->i_xattr_inline_size)); + xs->base = (void *)xs->header; + xs->here = xs->header->xh_entries; + + ret = ocfs2_xattr_find_entry(name_index, name, xs); + if (ret) + return ret; + size = le64_to_cpu(xs->here->xe_value_size); + if (buffer) { + if (size > buffer_size) + return -ERANGE; + if (ocfs2_xattr_is_local(xs->here)) { + memcpy(buffer, (void *)xs->base + + le16_to_cpu(xs->here->xe_name_offset) + + OCFS2_XATTR_SIZE(xs->here->xe_name_len), size); + } else { + xv = (struct ocfs2_xattr_value_root *) + (xs->base + le16_to_cpu( + xs->here->xe_name_offset) + + OCFS2_XATTR_SIZE(xs->here->xe_name_len)); + ret = ocfs2_xattr_get_value_outside(inode, xv, + buffer, size); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + } + } + + return size; +} + +static int ocfs2_xattr_block_get(struct inode *inode, + int name_index, + const char *name, + void *buffer, + size_t buffer_size, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + struct buffer_head *blk_bh = NULL; + struct ocfs2_xattr_block *xb; + struct ocfs2_xattr_value_root *xv; + size_t size; + int ret = -ENODATA, name_offset, name_len, block_off, i; + + if (!di->i_xattr_loc) + return ret; + + memset(&xs->bucket, 0, sizeof(xs->bucket)); + + ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + /*Verify the signature of xattr block*/ + if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE, + strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) { + ret = -EFAULT; + goto cleanup; + } + + xs->xattr_bh = blk_bh; + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + xs->header = &xb->xb_attrs.xb_header; + xs->base = (void *)xs->header; + xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size; + xs->here = xs->header->xh_entries; + + ret = ocfs2_xattr_find_entry(name_index, name, xs); + } else + ret = ocfs2_xattr_index_block_find(inode, blk_bh, + name_index, + name, xs); + + if (ret) + goto cleanup; + size = le64_to_cpu(xs->here->xe_value_size); + if (buffer) { + ret = -ERANGE; + if (size > buffer_size) + goto cleanup; + + name_offset = le16_to_cpu(xs->here->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xs->here->xe_name_len); + i = xs->here - xs->header->xh_entries; + + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { + ret = ocfs2_xattr_bucket_get_name_value(inode, + xs->bucket.xh, + i, + &block_off, + &name_offset); + xs->base = xs->bucket.bhs[block_off]->b_data; + } + if (ocfs2_xattr_is_local(xs->here)) { + memcpy(buffer, (void *)xs->base + + name_offset + name_len, size); + } else { + xv = (struct ocfs2_xattr_value_root *) + (xs->base + name_offset + name_len); + ret = ocfs2_xattr_get_value_outside(inode, xv, + buffer, size); + if (ret < 0) { + mlog_errno(ret); + goto cleanup; + } + } + } + ret = size; +cleanup: + for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++) + brelse(xs->bucket.bhs[i]); + memset(&xs->bucket, 0, sizeof(xs->bucket)); + + brelse(blk_bh); + return ret; +} + +/* ocfs2_xattr_get() + * + * Copy an extended attribute into the buffer provided. + * Buffer is NULL to compute the size of buffer required. + */ +int ocfs2_xattr_get(struct inode *inode, + int name_index, + const char *name, + void *buffer, + size_t buffer_size) +{ + int ret; + struct ocfs2_dinode *di = NULL; + struct buffer_head *di_bh = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_xattr_search xis = { + .not_found = -ENODATA, + }; + struct ocfs2_xattr_search xbs = { + .not_found = -ENODATA, + }; + + if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) + ret = -ENODATA; + + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + xis.inode_bh = xbs.inode_bh = di_bh; + di = (struct ocfs2_dinode *)di_bh->b_data; + + down_read(&oi->ip_xattr_sem); + ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer, + buffer_size, &xis); + if (ret == -ENODATA) + ret = ocfs2_xattr_block_get(inode, name_index, name, buffer, + buffer_size, &xbs); + up_read(&oi->ip_xattr_sem); + ocfs2_inode_unlock(inode, 0); + + brelse(di_bh); + + return ret; +} + +static int __ocfs2_xattr_set_value_outside(struct inode *inode, + struct ocfs2_xattr_value_root *xv, + const void *value, + int value_len) +{ + int ret = 0, i, cp_len, credits; + u16 blocksize = inode->i_sb->s_blocksize; + u32 p_cluster, num_clusters; + u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len); + u64 blkno; + struct buffer_head *bh = NULL; + handle_t *handle; + + BUG_ON(clusters > le32_to_cpu(xv->xr_clusters)); + + credits = clusters * bpc; + handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, + &num_clusters, &xv->xr_list); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + + for (i = 0; i < num_clusters * bpc; i++, blkno++) { + ret = ocfs2_read_block(inode, blkno, &bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_journal_access(handle, + inode, + bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + cp_len = value_len > blocksize ? blocksize : value_len; + memcpy(bh->b_data, value, cp_len); + value_len -= cp_len; + value += cp_len; + if (cp_len < blocksize) + memset(bh->b_data + cp_len, 0, + blocksize - cp_len); + + ret = ocfs2_journal_dirty(handle, bh); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + brelse(bh); + bh = NULL; + + /* + * XXX: do we need to empty all the following + * blocks in this cluster? + */ + if (!value_len) + break; + } + cpos += num_clusters; + } +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + brelse(bh); + + return ret; +} + +static int ocfs2_xattr_cleanup(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + size_t offs) +{ + handle_t *handle = NULL; + int ret = 0; + size_t name_len = strlen(xi->name); + void *val = xs->base + offs; + size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), + OCFS2_XATTR_BLOCK_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + /* Decrease xattr count */ + le16_add_cpu(&xs->header->xh_count, -1); + /* Remove the xattr entry and tree root which has already be set*/ + memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry)); + memset(val, 0, size); + + ret = ocfs2_journal_dirty(handle, xs->xattr_bh); + if (ret < 0) + mlog_errno(ret); +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + return ret; +} + +static int ocfs2_xattr_update_entry(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + size_t offs) +{ + handle_t *handle = NULL; + int ret = 0; + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), + OCFS2_XATTR_BLOCK_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + xs->here->xe_name_offset = cpu_to_le16(offs); + xs->here->xe_value_size = cpu_to_le64(xi->value_len); + if (xi->value_len <= OCFS2_XATTR_INLINE_SIZE) + ocfs2_xattr_set_local(xs->here, 1); + else + ocfs2_xattr_set_local(xs->here, 0); + ocfs2_xattr_hash_entry(inode, xs->header, xs->here); + + ret = ocfs2_journal_dirty(handle, xs->xattr_bh); + if (ret < 0) + mlog_errno(ret); +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + return ret; +} + +/* + * ocfs2_xattr_set_value_outside() + * + * Set large size value in B tree. + */ +static int ocfs2_xattr_set_value_outside(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + size_t offs) +{ + size_t name_len = strlen(xi->name); + void *val = xs->base + offs; + struct ocfs2_xattr_value_root *xv = NULL; + size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; + int ret = 0; + + memset(val, 0, size); + memcpy(val, xi->name, name_len); + xv = (struct ocfs2_xattr_value_root *) + (val + OCFS2_XATTR_SIZE(name_len)); + xv->xr_clusters = 0; + xv->xr_last_eb_blk = 0; + xv->xr_list.l_tree_depth = 0; + xv->xr_list.l_count = cpu_to_le16(1); + xv->xr_list.l_next_free_rec = 0; + + ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv, + xi->value_len); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + ret = __ocfs2_xattr_set_value_outside(inode, xv, xi->value, + xi->value_len); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + ret = ocfs2_xattr_update_entry(inode, xi, xs, offs); + if (ret < 0) + mlog_errno(ret); + + return ret; +} + +/* + * ocfs2_xattr_set_entry_local() + * + * Set, replace or remove extended attribute in local. + */ +static void ocfs2_xattr_set_entry_local(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_entry *last, + size_t min_offs) +{ + size_t name_len = strlen(xi->name); + int i; + + if (xi->value && xs->not_found) { + /* Insert the new xattr entry. */ + le16_add_cpu(&xs->header->xh_count, 1); + ocfs2_xattr_set_type(last, xi->name_index); + ocfs2_xattr_set_local(last, 1); + last->xe_name_len = name_len; + } else { + void *first_val; + void *val; + size_t offs, size; + + first_val = xs->base + min_offs; + offs = le16_to_cpu(xs->here->xe_name_offset); + val = xs->base + offs; + + if (le64_to_cpu(xs->here->xe_value_size) > + OCFS2_XATTR_INLINE_SIZE) + size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_ROOT_SIZE; + else + size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); + + if (xi->value && size == OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(xi->value_len)) { + /* The old and the new value have the + same size. Just replace the value. */ + ocfs2_xattr_set_local(xs->here, 1); + xs->here->xe_value_size = cpu_to_le64(xi->value_len); + /* Clear value bytes. */ + memset(val + OCFS2_XATTR_SIZE(name_len), + 0, + OCFS2_XATTR_SIZE(xi->value_len)); + memcpy(val + OCFS2_XATTR_SIZE(name_len), + xi->value, + xi->value_len); + return; + } + /* Remove the old name+value. */ + memmove(first_val + size, first_val, val - first_val); + memset(first_val, 0, size); + xs->here->xe_name_hash = 0; + xs->here->xe_name_offset = 0; + ocfs2_xattr_set_local(xs->here, 1); + xs->here->xe_value_size = 0; + + min_offs += size; + + /* Adjust all value offsets. */ + last = xs->header->xh_entries; + for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) { + size_t o = le16_to_cpu(last->xe_name_offset); + + if (o < offs) + last->xe_name_offset = cpu_to_le16(o + size); + last += 1; + } + + if (!xi->value) { + /* Remove the old entry. */ + last -= 1; + memmove(xs->here, xs->here + 1, + (void *)last - (void *)xs->here); + memset(last, 0, sizeof(struct ocfs2_xattr_entry)); + le16_add_cpu(&xs->header->xh_count, -1); + } + } + if (xi->value) { + /* Insert the new name+value. */ + size_t size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(xi->value_len); + void *val = xs->base + min_offs - size; + + xs->here->xe_name_offset = cpu_to_le16(min_offs - size); + memset(val, 0, size); + memcpy(val, xi->name, name_len); + memcpy(val + OCFS2_XATTR_SIZE(name_len), + xi->value, + xi->value_len); + xs->here->xe_value_size = cpu_to_le64(xi->value_len); + ocfs2_xattr_set_local(xs->here, 1); + ocfs2_xattr_hash_entry(inode, xs->header, xs->here); + } + + return; +} + +/* + * ocfs2_xattr_set_entry() + * + * Set extended attribute entry into inode or block. + * + * If extended attribute value size > OCFS2_XATTR_INLINE_SIZE, + * We first insert tree root(ocfs2_xattr_value_root) with set_entry_local(), + * then set value in B tree with set_value_outside(). + */ +static int ocfs2_xattr_set_entry(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + int flag) +{ + struct ocfs2_xattr_entry *last; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name); + size_t size_l = 0; + handle_t *handle = NULL; + int free, i, ret; + struct ocfs2_xattr_info xi_l = { + .name_index = xi->name_index, + .name = xi->name, + .value = xi->value, + .value_len = xi->value_len, + }; + + /* Compute min_offs, last and free space. */ + last = xs->header->xh_entries; + + for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) { + size_t offs = le16_to_cpu(last->xe_name_offset); + if (offs < min_offs) + min_offs = offs; + last += 1; + } + + free = min_offs - ((void *)last - xs->base) - sizeof(__u32); + if (free < 0) + return -EFAULT; + + if (!xs->not_found) { + size_t size = 0; + if (ocfs2_xattr_is_local(xs->here)) + size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); + else + size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_ROOT_SIZE; + free += (size + sizeof(struct ocfs2_xattr_entry)); + } + /* Check free space in inode or block */ + if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { + if (free < sizeof(struct ocfs2_xattr_entry) + + OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_ROOT_SIZE) { + ret = -ENOSPC; + goto out; + } + size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; + xi_l.value = (void *)&def_xv; + xi_l.value_len = OCFS2_XATTR_ROOT_SIZE; + } else if (xi->value) { + if (free < sizeof(struct ocfs2_xattr_entry) + + OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(xi->value_len)) { + ret = -ENOSPC; + goto out; + } + } + + if (!xs->not_found) { + /* For existing extended attribute */ + size_t size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); + size_t offs = le16_to_cpu(xs->here->xe_name_offset); + void *val = xs->base + offs; + + if (ocfs2_xattr_is_local(xs->here) && size == size_l) { + /* Replace existing local xattr with tree root */ + ret = ocfs2_xattr_set_value_outside(inode, xi, xs, + offs); + if (ret < 0) + mlog_errno(ret); + goto out; + } else if (!ocfs2_xattr_is_local(xs->here)) { + /* For existing xattr which has value outside */ + struct ocfs2_xattr_value_root *xv = NULL; + xv = (struct ocfs2_xattr_value_root *)(val + + OCFS2_XATTR_SIZE(name_len)); + + if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { + /* + * If new value need set outside also, + * first truncate old value to new value, + * then set new value with set_value_outside(). + */ + ret = ocfs2_xattr_value_truncate(inode, + xs->xattr_bh, + xv, + xi->value_len); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = __ocfs2_xattr_set_value_outside(inode, + xv, + xi->value, + xi->value_len); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_update_entry(inode, + xi, + xs, + offs); + if (ret < 0) + mlog_errno(ret); + goto out; + } else { + /* + * If new value need set in local, + * just trucate old value to zero. + */ + ret = ocfs2_xattr_value_truncate(inode, + xs->xattr_bh, + xv, + 0); + if (ret < 0) + mlog_errno(ret); + } + } + } + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, xs->inode_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + if (!(flag & OCFS2_INLINE_XATTR_FL)) { + /* set extended attribute in external block. */ + ret = ocfs2_extend_trans(handle, + OCFS2_INODE_UPDATE_CREDITS + + OCFS2_XATTR_BLOCK_UPDATE_CREDITS); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + /* + * Set value in local, include set tree root in local. + * This is the first step for value size >INLINE_SIZE. + */ + ocfs2_xattr_set_entry_local(inode, &xi_l, xs, last, min_offs); + + if (!(flag & OCFS2_INLINE_XATTR_FL)) { + ret = ocfs2_journal_dirty(handle, xs->xattr_bh); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + } + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) && + (flag & OCFS2_INLINE_XATTR_FL)) { + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + unsigned int xattrsize = osb->s_xattr_inline_size; + + /* + * Adjust extent record count or inline data size + * to reserve space for extended attribute. + */ + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + struct ocfs2_inline_data *idata = &di->id2.i_data; + le16_add_cpu(&idata->id_count, -xattrsize); + } else if (!(ocfs2_inode_is_fast_symlink(inode))) { + struct ocfs2_extent_list *el = &di->id2.i_list; + le16_add_cpu(&el->l_count, -(xattrsize / + sizeof(struct ocfs2_extent_rec))); + } + di->i_xattr_inline_size = cpu_to_le16(xattrsize); + } + /* Update xattr flag */ + spin_lock(&oi->ip_lock); + oi->ip_dyn_features |= flag; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + spin_unlock(&oi->ip_lock); + /* Update inode ctime */ + inode->i_ctime = CURRENT_TIME; + di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + + ret = ocfs2_journal_dirty(handle, xs->inode_bh); + if (ret < 0) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); + + if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { + /* + * Set value outside in B tree. + * This is the second step for value size > INLINE_SIZE. + */ + size_t offs = le16_to_cpu(xs->here->xe_name_offset); + ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs); + if (ret < 0) { + int ret2; + + mlog_errno(ret); + /* + * If set value outside failed, we have to clean + * the junk tree root we have already set in local. + */ + ret2 = ocfs2_xattr_cleanup(inode, xi, xs, offs); + if (ret2 < 0) + mlog_errno(ret2); + } + } +out: + return ret; + +} + +static int ocfs2_remove_value_outside(struct inode*inode, + struct buffer_head *bh, + struct ocfs2_xattr_header *header) +{ + int ret = 0, i; + + for (i = 0; i < le16_to_cpu(header->xh_count); i++) { + struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; + + if (!ocfs2_xattr_is_local(entry)) { + struct ocfs2_xattr_value_root *xv; + void *val; + + val = (void *)header + + le16_to_cpu(entry->xe_name_offset); + xv = (struct ocfs2_xattr_value_root *) + (val + OCFS2_XATTR_SIZE(entry->xe_name_len)); + ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + } + } + + return ret; +} + +static int ocfs2_xattr_ibody_remove(struct inode *inode, + struct buffer_head *di_bh) +{ + + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_xattr_header *header; + int ret; + + header = (struct ocfs2_xattr_header *) + ((void *)di + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + + ret = ocfs2_remove_value_outside(inode, di_bh, header); + + return ret; +} + +static int ocfs2_xattr_block_remove(struct inode *inode, + struct buffer_head *blk_bh) +{ + struct ocfs2_xattr_block *xb; + int ret = 0; + + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header); + ret = ocfs2_remove_value_outside(inode, blk_bh, header); + } else + ret = ocfs2_delete_xattr_index_block(inode, blk_bh); + + return ret; +} + +static int ocfs2_xattr_free_block(struct inode *inode, + u64 block) +{ + struct inode *xb_alloc_inode; + struct buffer_head *xb_alloc_bh = NULL; + struct buffer_head *blk_bh = NULL; + struct ocfs2_xattr_block *xb; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle; + int ret = 0; + u64 blk, bg_blkno; + u16 bit; + + ret = ocfs2_read_block(inode, block, &blk_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + /*Verify the signature of xattr block*/ + if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE, + strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) { + ret = -EFAULT; + goto out; + } + + ret = ocfs2_xattr_block_remove(inode, blk_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + blk = le64_to_cpu(xb->xb_blkno); + bit = le16_to_cpu(xb->xb_suballoc_bit); + bg_blkno = ocfs2_which_suballoc_group(blk, bit); + + xb_alloc_inode = ocfs2_get_system_file_inode(osb, + EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(xb->xb_suballoc_slot)); + if (!xb_alloc_inode) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + mutex_lock(&xb_alloc_inode->i_mutex); + + ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto out_mutex; + } + + handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_free_suballoc_bits(handle, xb_alloc_inode, xb_alloc_bh, + bit, bg_blkno, 1); + if (ret < 0) + mlog_errno(ret); + + ocfs2_commit_trans(osb, handle); +out_unlock: + ocfs2_inode_unlock(xb_alloc_inode, 1); + brelse(xb_alloc_bh); +out_mutex: + mutex_unlock(&xb_alloc_inode->i_mutex); + iput(xb_alloc_inode); +out: + brelse(blk_bh); + return ret; +} + +/* + * ocfs2_xattr_remove() + * + * Free extended attribute resources associated with this inode. + */ +int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + handle_t *handle; + int ret; + + if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return 0; + + if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) + return 0; + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_xattr_ibody_remove(inode, di_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + if (di->i_xattr_loc) { + ret = ocfs2_xattr_free_block(inode, + le64_to_cpu(di->i_xattr_loc)); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + ret = ocfs2_journal_access(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + di->i_xattr_loc = 0; + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL); + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + spin_unlock(&oi->ip_lock); + + ret = ocfs2_journal_dirty(handle, di_bh); + if (ret < 0) + mlog_errno(ret); +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + return ret; +} + +static int ocfs2_xattr_has_space_inline(struct inode *inode, + struct ocfs2_dinode *di) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + unsigned int xattrsize = OCFS2_SB(inode->i_sb)->s_xattr_inline_size; + int free; + + if (xattrsize < OCFS2_MIN_XATTR_INLINE_SIZE) + return 0; + + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + struct ocfs2_inline_data *idata = &di->id2.i_data; + free = le16_to_cpu(idata->id_count) - le64_to_cpu(di->i_size); + } else if (ocfs2_inode_is_fast_symlink(inode)) { + free = ocfs2_fast_symlink_chars(inode->i_sb) - + le64_to_cpu(di->i_size); + } else { + struct ocfs2_extent_list *el = &di->id2.i_list; + free = (le16_to_cpu(el->l_count) - + le16_to_cpu(el->l_next_free_rec)) * + sizeof(struct ocfs2_extent_rec); + } + if (free >= xattrsize) + return 1; + + return 0; +} + +/* + * ocfs2_xattr_ibody_find() + * + * Find extended attribute in inode block and + * fill search info into struct ocfs2_xattr_search. + */ +static int ocfs2_xattr_ibody_find(struct inode *inode, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + int ret; + int has_space = 0; + + if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) + return 0; + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { + down_read(&oi->ip_alloc_sem); + has_space = ocfs2_xattr_has_space_inline(inode, di); + up_read(&oi->ip_alloc_sem); + if (!has_space) + return 0; + } + + xs->xattr_bh = xs->inode_bh; + xs->end = (void *)di + inode->i_sb->s_blocksize; + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) + xs->header = (struct ocfs2_xattr_header *) + (xs->end - le16_to_cpu(di->i_xattr_inline_size)); + else + xs->header = (struct ocfs2_xattr_header *) + (xs->end - OCFS2_SB(inode->i_sb)->s_xattr_inline_size); + xs->base = (void *)xs->header; + xs->here = xs->header->xh_entries; + + /* Find the named attribute. */ + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_xattr_find_entry(name_index, name, xs); + if (ret && ret != -ENODATA) + return ret; + xs->not_found = ret; + } + + return 0; +} + +/* + * ocfs2_xattr_ibody_set() + * + * Set, replace or remove an extended attribute into inode block. + * + */ +static int ocfs2_xattr_ibody_set(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + int ret; + + if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) + return -ENOSPC; + + down_write(&oi->ip_alloc_sem); + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { + if (!ocfs2_xattr_has_space_inline(inode, di)) { + ret = -ENOSPC; + goto out; + } + } + + ret = ocfs2_xattr_set_entry(inode, xi, xs, + (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL)); +out: + up_write(&oi->ip_alloc_sem); + + return ret; +} + +/* + * ocfs2_xattr_block_find() + * + * Find extended attribute in external block and + * fill search info into struct ocfs2_xattr_search. + */ +static int ocfs2_xattr_block_find(struct inode *inode, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + struct buffer_head *blk_bh = NULL; + struct ocfs2_xattr_block *xb; + int ret = 0; + + if (!di->i_xattr_loc) + return ret; + + ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + /*Verify the signature of xattr block*/ + if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE, + strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) { + ret = -EFAULT; + goto cleanup; + } + + xs->xattr_bh = blk_bh; + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + xs->header = &xb->xb_attrs.xb_header; + xs->base = (void *)xs->header; + xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size; + xs->here = xs->header->xh_entries; + + ret = ocfs2_xattr_find_entry(name_index, name, xs); + } else + ret = ocfs2_xattr_index_block_find(inode, blk_bh, + name_index, + name, xs); + + if (ret && ret != -ENODATA) { + xs->xattr_bh = NULL; + goto cleanup; + } + xs->not_found = ret; + return 0; +cleanup: + brelse(blk_bh); + + return ret; +} + +/* + * When all the xattrs are deleted from index btree, the ocfs2_xattr_tree + * will be erased and ocfs2_xattr_block will have its ocfs2_xattr_header + * re-initialized. + */ +static int ocfs2_restore_xattr_block(struct inode *inode, + struct ocfs2_xattr_search *xs) +{ + int ret; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; + struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list; + u16 xb_flags = le16_to_cpu(xb->xb_flags); + + BUG_ON(!(xb_flags & OCFS2_XATTR_INDEXED) || + le16_to_cpu(el->l_next_free_rec) != 0); + + handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + goto out; + } + + ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize - + offsetof(struct ocfs2_xattr_block, xb_attrs)); + + xb->xb_flags = cpu_to_le16(xb_flags & ~OCFS2_XATTR_INDEXED); + + ocfs2_journal_dirty(handle, xs->xattr_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +/* + * ocfs2_xattr_block_set() + * + * Set, replace or remove an extended attribute into external block. + * + */ +static int ocfs2_xattr_block_set(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs) +{ + struct buffer_head *new_bh = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + struct ocfs2_alloc_context *meta_ac = NULL; + handle_t *handle = NULL; + struct ocfs2_xattr_block *xblk = NULL; + u16 suballoc_bit_start; + u32 num_got; + u64 first_blkno; + int ret; + + if (!xs->xattr_bh) { + /* + * Alloc one external block for extended attribute + * outside of inode. + */ + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + handle = ocfs2_start_trans(osb, + OCFS2_XATTR_BLOCK_CREATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + ret = ocfs2_journal_access(handle, inode, xs->inode_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, + &suballoc_bit_start, &num_got, + &first_blkno); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + new_bh = sb_getblk(inode->i_sb, first_blkno); + ocfs2_set_new_buffer_uptodate(inode, new_bh); + + ret = ocfs2_journal_access(handle, inode, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + /* Initialize ocfs2_xattr_block */ + xs->xattr_bh = new_bh; + xblk = (struct ocfs2_xattr_block *)new_bh->b_data; + memset(xblk, 0, inode->i_sb->s_blocksize); + strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); + xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num); + xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); + xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); + xblk->xb_blkno = cpu_to_le64(first_blkno); + + xs->header = &xblk->xb_attrs.xb_header; + xs->base = (void *)xs->header; + xs->end = (void *)xblk + inode->i_sb->s_blocksize; + xs->here = xs->header->xh_entries; + + + ret = ocfs2_journal_dirty(handle, new_bh); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + di->i_xattr_loc = cpu_to_le64(first_blkno); + ret = ocfs2_journal_dirty(handle, xs->inode_bh); + if (ret < 0) + mlog_errno(ret); +out_commit: + ocfs2_commit_trans(osb, handle); +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + if (ret < 0) + return ret; + } else + xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; + + if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) { + /* Set extended attribute into external block */ + ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL); + if (!ret || ret != -ENOSPC) + goto end; + + ret = ocfs2_xattr_create_index_block(inode, xs); + if (ret) + goto end; + } + + ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs); + if (!ret && xblk->xb_attrs.xb_root.xt_list.l_next_free_rec == 0) + ret = ocfs2_restore_xattr_block(inode, xs); + +end: + + return ret; +} + +/* + * ocfs2_xattr_set() + * + * Set, replace or remove an extended attribute for this inode. + * value is NULL to remove an existing extended attribute, else either + * create or replace an extended attribute. + */ +int ocfs2_xattr_set(struct inode *inode, + int name_index, + const char *name, + const void *value, + size_t value_len, + int flags) +{ + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + int ret; + u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + struct ocfs2_xattr_info xi = { + .name_index = name_index, + .name = name, + .value = value, + .value_len = value_len, + }; + + struct ocfs2_xattr_search xis = { + .not_found = -ENODATA, + }; + + struct ocfs2_xattr_search xbs = { + .not_found = -ENODATA, + }; + + if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + xis.inode_bh = xbs.inode_bh = di_bh; + di = (struct ocfs2_dinode *)di_bh->b_data; + + down_write(&OCFS2_I(inode)->ip_xattr_sem); + /* + * Scan inode and external block to find the same name + * extended attribute and collect search infomation. + */ + ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis); + if (ret) + goto cleanup; + if (xis.not_found) { + ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs); + if (ret) + goto cleanup; + } + + if (xis.not_found && xbs.not_found) { + ret = -ENODATA; + if (flags & XATTR_REPLACE) + goto cleanup; + ret = 0; + if (!value) + goto cleanup; + } else { + ret = -EEXIST; + if (flags & XATTR_CREATE) + goto cleanup; + } + + if (!value) { + /* Remove existing extended attribute */ + if (!xis.not_found) + ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); + else if (!xbs.not_found) + ret = ocfs2_xattr_block_set(inode, &xi, &xbs); + } else { + /* We always try to set extended attribute into inode first*/ + ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); + if (!ret && !xbs.not_found) { + /* + * If succeed and that extended attribute existing in + * external block, then we will remove it. + */ + xi.value = NULL; + xi.value_len = 0; + ret = ocfs2_xattr_block_set(inode, &xi, &xbs); + } else if (ret == -ENOSPC) { + if (di->i_xattr_loc && !xbs.xattr_bh) { + ret = ocfs2_xattr_block_find(inode, name_index, + name, &xbs); + if (ret) + goto cleanup; + } + /* + * If no space in inode, we will set extended attribute + * into external block. + */ + ret = ocfs2_xattr_block_set(inode, &xi, &xbs); + if (ret) + goto cleanup; + if (!xis.not_found) { + /* + * If succeed and that extended attribute + * existing in inode, we will remove it. + */ + xi.value = NULL; + xi.value_len = 0; + ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); + } + } + } +cleanup: + up_write(&OCFS2_I(inode)->ip_xattr_sem); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + brelse(xbs.xattr_bh); + for (i = 0; i < blk_per_bucket; i++) + brelse(xbs.bucket.bhs[i]); + + return ret; +} + +/* + * Find the xattr extent rec which may contains name_hash. + * e_cpos will be the first name hash of the xattr rec. + * el must be the ocfs2_xattr_header.xb_attrs.xb_root.xt_list. + */ +static int ocfs2_xattr_get_rec(struct inode *inode, + u32 name_hash, + u64 *p_blkno, + u32 *e_cpos, + u32 *num_clusters, + struct ocfs2_extent_list *el) +{ + int ret = 0, i; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec = NULL; + u64 e_blkno = 0; + + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(inode, el, name_hash, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "xattr tree block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { + rec = &el->l_recs[i]; + + if (le32_to_cpu(rec->e_cpos) <= name_hash) { + e_blkno = le64_to_cpu(rec->e_blkno); + break; + } + } + + if (!e_blkno) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0) in xattr", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } + + *p_blkno = le64_to_cpu(rec->e_blkno); + *num_clusters = le16_to_cpu(rec->e_leaf_clusters); + if (e_cpos) + *e_cpos = le32_to_cpu(rec->e_cpos); +out: + brelse(eb_bh); + return ret; +} + +typedef int (xattr_bucket_func)(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para); + +static int ocfs2_find_xe_in_bucket(struct inode *inode, + struct buffer_head *header_bh, + int name_index, + const char *name, + u32 name_hash, + u16 *xe_index, + int *found) +{ + int i, ret = 0, cmp = 1, block_off, new_offset; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)header_bh->b_data; + size_t name_len = strlen(name); + struct ocfs2_xattr_entry *xe = NULL; + struct buffer_head *name_bh = NULL; + char *xe_name; + + /* + * We don't use binary search in the bucket because there + * may be multiple entries with the same name hash. + */ + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + + if (name_hash > le32_to_cpu(xe->xe_name_hash)) + continue; + else if (name_hash < le32_to_cpu(xe->xe_name_hash)) + break; + + cmp = name_index - ocfs2_xattr_get_type(xe); + if (!cmp) + cmp = name_len - xe->xe_name_len; + if (cmp) + continue; + + ret = ocfs2_xattr_bucket_get_name_value(inode, + xh, + i, + &block_off, + &new_offset); + if (ret) { + mlog_errno(ret); + break; + } + + ret = ocfs2_read_block(inode, header_bh->b_blocknr + block_off, + &name_bh); + if (ret) { + mlog_errno(ret); + break; + } + xe_name = name_bh->b_data + new_offset; + + cmp = memcmp(name, xe_name, name_len); + brelse(name_bh); + name_bh = NULL; + + if (cmp == 0) { + *xe_index = i; + *found = 1; + ret = 0; + break; + } + } + + return ret; +} + +/* + * Find the specified xattr entry in a series of buckets. + * This series start from p_blkno and last for num_clusters. + * The ocfs2_xattr_header.xh_num_buckets of the first bucket contains + * the num of the valid buckets. + * + * Return the buffer_head this xattr should reside in. And if the xattr's + * hash is in the gap of 2 buckets, return the lower bucket. + */ +static int ocfs2_xattr_bucket_find(struct inode *inode, + int name_index, + const char *name, + u32 name_hash, + u64 p_blkno, + u32 first_hash, + u32 num_clusters, + struct ocfs2_xattr_search *xs) +{ + int ret, found = 0; + struct buffer_head *bh = NULL; + struct buffer_head *lower_bh = NULL; + struct ocfs2_xattr_header *xh = NULL; + struct ocfs2_xattr_entry *xe = NULL; + u16 index = 0; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + int low_bucket = 0, bucket, high_bucket; + u32 last_hash; + u64 blkno; + + ret = ocfs2_read_block(inode, p_blkno, &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + xh = (struct ocfs2_xattr_header *)bh->b_data; + high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1; + + while (low_bucket <= high_bucket) { + brelse(bh); + bh = NULL; + bucket = (low_bucket + high_bucket) / 2; + + blkno = p_blkno + bucket * blk_per_bucket; + + ret = ocfs2_read_block(inode, blkno, &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + xh = (struct ocfs2_xattr_header *)bh->b_data; + xe = &xh->xh_entries[0]; + if (name_hash < le32_to_cpu(xe->xe_name_hash)) { + high_bucket = bucket - 1; + continue; + } + + /* + * Check whether the hash of the last entry in our + * bucket is larger than the search one. for an empty + * bucket, the last one is also the first one. + */ + if (xh->xh_count) + xe = &xh->xh_entries[le16_to_cpu(xh->xh_count) - 1]; + + last_hash = le32_to_cpu(xe->xe_name_hash); + + /* record lower_bh which may be the insert place. */ + brelse(lower_bh); + lower_bh = bh; + bh = NULL; + + if (name_hash > le32_to_cpu(xe->xe_name_hash)) { + low_bucket = bucket + 1; + continue; + } + + /* the searched xattr should reside in this bucket if exists. */ + ret = ocfs2_find_xe_in_bucket(inode, lower_bh, + name_index, name, name_hash, + &index, &found); + if (ret) { + mlog_errno(ret); + goto out; + } + break; + } + + /* + * Record the bucket we have found. + * When the xattr's hash value is in the gap of 2 buckets, we will + * always set it to the previous bucket. + */ + if (!lower_bh) { + /* + * We can't find any bucket whose first name_hash is less + * than the find name_hash. + */ + BUG_ON(bh->b_blocknr != p_blkno); + lower_bh = bh; + bh = NULL; + } + xs->bucket.bhs[0] = lower_bh; + xs->bucket.xh = (struct ocfs2_xattr_header *) + xs->bucket.bhs[0]->b_data; + lower_bh = NULL; + + xs->header = xs->bucket.xh; + xs->base = xs->bucket.bhs[0]->b_data; + xs->end = xs->base + inode->i_sb->s_blocksize; + + if (found) { + /* + * If we have found the xattr enty, read all the blocks in + * this bucket. + */ + ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1, + blk_per_bucket - 1, &xs->bucket.bhs[1], + OCFS2_BH_CACHED); + if (ret) { + mlog_errno(ret); + goto out; + } + + xs->here = &xs->header->xh_entries[index]; + mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name, + (unsigned long long)xs->bucket.bhs[0]->b_blocknr, index); + } else + ret = -ENODATA; + +out: + brelse(bh); + brelse(lower_bh); + return ret; +} + +static int ocfs2_xattr_index_block_find(struct inode *inode, + struct buffer_head *root_bh, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + int ret; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)root_bh->b_data; + struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root; + struct ocfs2_extent_list *el = &xb_root->xt_list; + u64 p_blkno = 0; + u32 first_hash, num_clusters = 0; + u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name)); + + if (le16_to_cpu(el->l_next_free_rec) == 0) + return -ENODATA; + + mlog(0, "find xattr %s, hash = %u, index = %d in xattr tree\n", + name, name_hash, name_index); + + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash, + &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash); + + mlog(0, "find xattr extent rec %u clusters from %llu, the first hash " + "in the rec is %u\n", num_clusters, p_blkno, first_hash); + + ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash, + p_blkno, first_hash, num_clusters, xs); + +out: + return ret; +} + +static int ocfs2_iterate_xattr_buckets(struct inode *inode, + u64 blkno, + u32 clusters, + xattr_bucket_func *func, + void *para) +{ + int i, j, ret = 0; + int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)); + u32 num_buckets = clusters * bpc; + struct ocfs2_xattr_bucket bucket; + + memset(&bucket, 0, sizeof(bucket)); + + mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n", + clusters, blkno); + + for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) { + ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, + bucket.bhs, OCFS2_BH_CACHED); + if (ret) { + mlog_errno(ret); + goto out; + } + + bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data; + /* + * The real bucket num in this series of blocks is stored + * in the 1st bucket. + */ + if (i == 0) + num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets); + + mlog(0, "iterating xattr bucket %llu, first hash %u\n", blkno, + le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash)); + if (func) { + ret = func(inode, &bucket, para); + if (ret) { + mlog_errno(ret); + break; + } + } + + for (j = 0; j < blk_per_bucket; j++) + brelse(bucket.bhs[j]); + memset(&bucket, 0, sizeof(bucket)); + } + +out: + for (j = 0; j < blk_per_bucket; j++) + brelse(bucket.bhs[j]); + + return ret; +} + +struct ocfs2_xattr_tree_list { + char *buffer; + size_t buffer_size; + size_t result; +}; + +static int ocfs2_xattr_bucket_get_name_value(struct inode *inode, + struct ocfs2_xattr_header *xh, + int index, + int *block_off, + int *new_offset) +{ + u16 name_offset; + + if (index < 0 || index >= le16_to_cpu(xh->xh_count)) + return -EINVAL; + + name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset); + + *block_off = name_offset >> inode->i_sb->s_blocksize_bits; + *new_offset = name_offset % inode->i_sb->s_blocksize; + + return 0; +} + +static int ocfs2_list_xattr_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + int ret = 0, type; + struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para; + int i, block_off, new_offset; + const char *prefix, *name; + + for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) { + struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i]; + type = ocfs2_xattr_get_type(entry); + prefix = ocfs2_xattr_prefix(type); + + if (prefix) { + ret = ocfs2_xattr_bucket_get_name_value(inode, + bucket->xh, + i, + &block_off, + &new_offset); + if (ret) + break; + + name = (const char *)bucket->bhs[block_off]->b_data + + new_offset; + ret = ocfs2_xattr_list_entry(xl->buffer, + xl->buffer_size, + &xl->result, + prefix, name, + entry->xe_name_len); + if (ret) + break; + } + } + + return ret; +} + +static int ocfs2_xattr_tree_list_index_block(struct inode *inode, + struct ocfs2_xattr_tree_root *xt, + char *buffer, + size_t buffer_size) +{ + struct ocfs2_extent_list *el = &xt->xt_list; + int ret = 0; + u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0; + u64 p_blkno = 0; + struct ocfs2_xattr_tree_list xl = { + .buffer = buffer, + .buffer_size = buffer_size, + .result = 0, + }; + + if (le16_to_cpu(el->l_next_free_rec) == 0) + return 0; + + while (name_hash > 0) { + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, + &e_cpos, &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters, + ocfs2_list_xattr_bucket, + &xl); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (e_cpos == 0) + break; + + name_hash = e_cpos - 1; + } + + ret = xl.result; +out: + return ret; +} + +static int cmp_xe(const void *a, const void *b) +{ + const struct ocfs2_xattr_entry *l = a, *r = b; + u32 l_hash = le32_to_cpu(l->xe_name_hash); + u32 r_hash = le32_to_cpu(r->xe_name_hash); + + if (l_hash > r_hash) + return 1; + if (l_hash < r_hash) + return -1; + return 0; +} + +static void swap_xe(void *a, void *b, int size) +{ + struct ocfs2_xattr_entry *l = a, *r = b, tmp; + + tmp = *l; + memcpy(l, r, sizeof(struct ocfs2_xattr_entry)); + memcpy(r, &tmp, sizeof(struct ocfs2_xattr_entry)); +} + +/* + * When the ocfs2_xattr_block is filled up, new bucket will be created + * and all the xattr entries will be moved to the new bucket. + * Note: we need to sort the entries since they are not saved in order + * in the ocfs2_xattr_block. + */ +static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, + struct buffer_head *xb_bh, + struct buffer_head *xh_bh, + struct buffer_head *data_bh) +{ + int i, blocksize = inode->i_sb->s_blocksize; + u16 offset, size, off_change; + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xb_bh->b_data; + struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)xh_bh->b_data; + u16 count = le16_to_cpu(xb_xh->xh_count); + char *target = xh_bh->b_data, *src = xb_bh->b_data; + + mlog(0, "cp xattr from block %llu to bucket %llu\n", + (unsigned long long)xb_bh->b_blocknr, + (unsigned long long)xh_bh->b_blocknr); + + memset(xh_bh->b_data, 0, blocksize); + if (data_bh) + memset(data_bh->b_data, 0, blocksize); + /* + * Since the xe_name_offset is based on ocfs2_xattr_header, + * there is a offset change corresponding to the change of + * ocfs2_xattr_header's position. + */ + off_change = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header); + xe = &xb_xh->xh_entries[count - 1]; + offset = le16_to_cpu(xe->xe_name_offset) + off_change; + size = blocksize - offset; + + /* copy all the names and values. */ + if (data_bh) + target = data_bh->b_data; + memcpy(target + offset, src + offset, size); + + /* Init new header now. */ + xh->xh_count = xb_xh->xh_count; + xh->xh_num_buckets = cpu_to_le16(1); + xh->xh_name_value_len = cpu_to_le16(size); + xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size); + + /* copy all the entries. */ + target = xh_bh->b_data; + offset = offsetof(struct ocfs2_xattr_header, xh_entries); + size = count * sizeof(struct ocfs2_xattr_entry); + memcpy(target + offset, (char *)xb_xh + offset, size); + + /* Change the xe offset for all the xe because of the move. */ + off_change = OCFS2_XATTR_BUCKET_SIZE - blocksize + + offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header); + for (i = 0; i < count; i++) + le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change); + + mlog(0, "copy entry: start = %u, size = %u, offset_change = %u\n", + offset, size, off_change); + + sort(target + offset, count, sizeof(struct ocfs2_xattr_entry), + cmp_xe, swap_xe); +} + +/* + * After we move xattr from block to index btree, we have to + * update ocfs2_xattr_search to the new xe and base. + * + * When the entry is in xattr block, xattr_bh indicates the storage place. + * While if the entry is in index b-tree, "bucket" indicates the + * real place of the xattr. + */ +static int ocfs2_xattr_update_xattr_search(struct inode *inode, + struct ocfs2_xattr_search *xs, + struct buffer_head *old_bh, + struct buffer_head *new_bh) +{ + int ret = 0; + char *buf = old_bh->b_data; + struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf; + struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header; + int i, blocksize = inode->i_sb->s_blocksize; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + xs->bucket.bhs[0] = new_bh; + get_bh(new_bh); + xs->bucket.xh = (struct ocfs2_xattr_header *)xs->bucket.bhs[0]->b_data; + xs->header = xs->bucket.xh; + + xs->base = new_bh->b_data; + xs->end = xs->base + inode->i_sb->s_blocksize; + + if (!xs->not_found) { + if (OCFS2_XATTR_BUCKET_SIZE != blocksize) { + ret = ocfs2_read_blocks(inode, + xs->bucket.bhs[0]->b_blocknr + 1, + blk_per_bucket - 1, &xs->bucket.bhs[1], + OCFS2_BH_CACHED); + if (ret) { + mlog_errno(ret); + return ret; + } + + i = xs->here - old_xh->xh_entries; + xs->here = &xs->header->xh_entries[i]; + } + } + + return ret; +} + +static int ocfs2_xattr_create_index_block(struct inode *inode, + struct ocfs2_xattr_search *xs) +{ + int ret, credits = OCFS2_SUBALLOC_ALLOC; + u32 bit_off, len; + u64 blkno; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_alloc_context *data_ac; + struct buffer_head *xh_bh = NULL, *data_bh = NULL; + struct buffer_head *xb_bh = xs->xattr_bh; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xb_bh->b_data; + struct ocfs2_xattr_tree_root *xr; + u16 xb_flags = le16_to_cpu(xb->xb_flags); + u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + mlog(0, "create xattr index block for %llu\n", + (unsigned long long)xb_bh->b_blocknr); + + BUG_ON(xb_flags & OCFS2_XATTR_INDEXED); + + ret = ocfs2_reserve_clusters(osb, 1, &data_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * XXX: + * We can use this lock for now, and maybe move to a dedicated mutex + * if performance becomes a problem later. + */ + down_write(&oi->ip_alloc_sem); + + /* + * 3 more credits, one for xattr block update, one for the 1st block + * of the new xattr bucket and one for the value/data. + */ + credits += 3; + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_sem; + } + + ret = ocfs2_journal_access(handle, inode, xb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * The bucket may spread in many blocks, and + * we will only touch the 1st block and the last block + * in the whole bucket(one for entry and one for data). + */ + blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); + + mlog(0, "allocate 1 cluster from %llu to xattr block\n", blkno); + + xh_bh = sb_getblk(inode->i_sb, blkno); + if (!xh_bh) { + ret = -EIO; + mlog_errno(ret); + goto out_commit; + } + + ocfs2_set_new_buffer_uptodate(inode, xh_bh); + + ret = ocfs2_journal_access(handle, inode, xh_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + if (bpb > 1) { + data_bh = sb_getblk(inode->i_sb, blkno + bpb - 1); + if (!data_bh) { + ret = -EIO; + mlog_errno(ret); + goto out_commit; + } + + ocfs2_set_new_buffer_uptodate(inode, data_bh); + + ret = ocfs2_journal_access(handle, inode, data_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh); + + ocfs2_journal_dirty(handle, xh_bh); + if (data_bh) + ocfs2_journal_dirty(handle, data_bh); + + ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh); + + /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */ + memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize - + offsetof(struct ocfs2_xattr_block, xb_attrs)); + + xr = &xb->xb_attrs.xb_root; + xr->xt_clusters = cpu_to_le32(1); + xr->xt_last_eb_blk = 0; + xr->xt_list.l_tree_depth = 0; + xr->xt_list.l_count = cpu_to_le16(ocfs2_xattr_recs_per_xb(inode->i_sb)); + xr->xt_list.l_next_free_rec = cpu_to_le16(1); + + xr->xt_list.l_recs[0].e_cpos = 0; + xr->xt_list.l_recs[0].e_blkno = cpu_to_le64(blkno); + xr->xt_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1); + + xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED); + + ret = ocfs2_journal_dirty(handle, xb_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + +out_commit: + ocfs2_commit_trans(osb, handle); + +out_sem: + up_write(&oi->ip_alloc_sem); + +out: + if (data_ac) + ocfs2_free_alloc_context(data_ac); + + brelse(xh_bh); + brelse(data_bh); + + return ret; +} + +static int cmp_xe_offset(const void *a, const void *b) +{ + const struct ocfs2_xattr_entry *l = a, *r = b; + u32 l_name_offset = le16_to_cpu(l->xe_name_offset); + u32 r_name_offset = le16_to_cpu(r->xe_name_offset); + + if (l_name_offset < r_name_offset) + return 1; + if (l_name_offset > r_name_offset) + return -1; + return 0; +} + +/* + * defrag a xattr bucket if we find that the bucket has some + * holes beteen name/value pairs. + * We will move all the name/value pairs to the end of the bucket + * so that we can spare some space for insertion. + */ +static int ocfs2_defrag_xattr_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket) +{ + int ret, i; + size_t end, offset, len, value_len; + struct ocfs2_xattr_header *xh; + char *entries, *buf, *bucket_buf = NULL; + u64 blkno = bucket->bhs[0]->b_blocknr; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + u16 xh_free_start; + size_t blocksize = inode->i_sb->s_blocksize; + handle_t *handle; + struct buffer_head **bhs; + struct ocfs2_xattr_entry *xe; + + bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, + GFP_NOFS); + if (!bhs) + return -ENOMEM; + + ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs, + OCFS2_BH_CACHED); + if (ret) + goto out; + + /* + * In order to make the operation more efficient and generic, + * we copy all the blocks into a contiguous memory and do the + * defragment there, so if anything is error, we will not touch + * the real block. + */ + bucket_buf = kmalloc(OCFS2_XATTR_BUCKET_SIZE, GFP_NOFS); + if (!bucket_buf) { + ret = -EIO; + goto out; + } + + buf = bucket_buf; + for (i = 0; i < blk_per_bucket; i++, buf += blocksize) + memcpy(buf, bhs[i]->b_data, blocksize); + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), blk_per_bucket); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + for (i = 0; i < blk_per_bucket; i++) { + ret = ocfs2_journal_access(handle, inode, bhs[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto commit; + } + } + + xh = (struct ocfs2_xattr_header *)bucket_buf; + entries = (char *)xh->xh_entries; + xh_free_start = le16_to_cpu(xh->xh_free_start); + + mlog(0, "adjust xattr bucket in %llu, count = %u, " + "xh_free_start = %u, xh_name_value_len = %u.\n", + blkno, le16_to_cpu(xh->xh_count), xh_free_start, + le16_to_cpu(xh->xh_name_value_len)); + + /* + * sort all the entries by their offset. + * the largest will be the first, so that we can + * move them to the end one by one. + */ + sort(entries, le16_to_cpu(xh->xh_count), + sizeof(struct ocfs2_xattr_entry), + cmp_xe_offset, swap_xe); + + /* Move all name/values to the end of the bucket. */ + xe = xh->xh_entries; + end = OCFS2_XATTR_BUCKET_SIZE; + for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) { + offset = le16_to_cpu(xe->xe_name_offset); + if (ocfs2_xattr_is_local(xe)) + value_len = OCFS2_XATTR_SIZE( + le64_to_cpu(xe->xe_value_size)); + else + value_len = OCFS2_XATTR_ROOT_SIZE; + len = OCFS2_XATTR_SIZE(xe->xe_name_len) + value_len; + + /* + * We must make sure that the name/value pair + * exist in the same block. So adjust end to + * the previous block end if needed. + */ + if (((end - len) / blocksize != + (end - 1) / blocksize)) + end = end - end % blocksize; + + if (end > offset + len) { + memmove(bucket_buf + end - len, + bucket_buf + offset, len); + xe->xe_name_offset = cpu_to_le16(end - len); + } + + mlog_bug_on_msg(end < offset + len, "Defrag check failed for " + "bucket %llu\n", (unsigned long long)blkno); + + end -= len; + } + + mlog_bug_on_msg(xh_free_start > end, "Defrag check failed for " + "bucket %llu\n", (unsigned long long)blkno); + + if (xh_free_start == end) + goto commit; + + memset(bucket_buf + xh_free_start, 0, end - xh_free_start); + xh->xh_free_start = cpu_to_le16(end); + + /* sort the entries by their name_hash. */ + sort(entries, le16_to_cpu(xh->xh_count), + sizeof(struct ocfs2_xattr_entry), + cmp_xe, swap_xe); + + buf = bucket_buf; + for (i = 0; i < blk_per_bucket; i++, buf += blocksize) { + memcpy(bhs[i]->b_data, buf, blocksize); + ocfs2_journal_dirty(handle, bhs[i]); + } + +commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + + if (bhs) { + for (i = 0; i < blk_per_bucket; i++) + brelse(bhs[i]); + } + kfree(bhs); + + kfree(bucket_buf); + return ret; +} + +/* + * Move half nums of the xattr bucket in the previous cluster to this new + * cluster. We only touch the last cluster of the previous extend record. + * + * first_bh is the first buffer_head of a series of bucket in the same + * extent rec and header_bh is the header of one bucket in this cluster. + * They will be updated if we move the data header_bh contains to the new + * cluster. first_hash will be set as the 1st xe's name_hash of the new cluster. + */ +static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode, + handle_t *handle, + struct buffer_head **first_bh, + struct buffer_head **header_bh, + u64 new_blkno, + u64 prev_blkno, + u32 num_clusters, + u32 *first_hash) +{ + int i, ret, credits; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); + int blocksize = inode->i_sb->s_blocksize; + struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL; + struct ocfs2_xattr_header *new_xh; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)((*first_bh)->b_data); + + BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets); + BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize); + + prev_bh = *first_bh; + get_bh(prev_bh); + xh = (struct ocfs2_xattr_header *)prev_bh->b_data; + + prev_blkno += (num_clusters - 1) * bpc + bpc / 2; + + mlog(0, "move half of xattrs in cluster %llu to %llu\n", + prev_blkno, new_blkno); + + /* + * We need to update the 1st half of the new cluster and + * 1 more for the update of the 1st bucket of the previous + * extent record. + */ + credits = bpc / 2 + 1; + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, prev_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = 0; i < bpc / 2; i++, prev_blkno++, new_blkno++) { + old_bh = new_bh = NULL; + new_bh = sb_getblk(inode->i_sb, new_blkno); + if (!new_bh) { + ret = -EIO; + mlog_errno(ret); + goto out; + } + + ocfs2_set_new_buffer_uptodate(inode, new_bh); + + ret = ocfs2_journal_access(handle, inode, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + brelse(new_bh); + goto out; + } + + ret = ocfs2_read_block(inode, prev_blkno, &old_bh); + if (ret < 0) { + mlog_errno(ret); + brelse(new_bh); + goto out; + } + + memcpy(new_bh->b_data, old_bh->b_data, blocksize); + + if (i == 0) { + new_xh = (struct ocfs2_xattr_header *)new_bh->b_data; + new_xh->xh_num_buckets = cpu_to_le16(num_buckets / 2); + + if (first_hash) + *first_hash = le32_to_cpu( + new_xh->xh_entries[0].xe_name_hash); + new_first_bh = new_bh; + get_bh(new_first_bh); + } + + ocfs2_journal_dirty(handle, new_bh); + + if (*header_bh == old_bh) { + brelse(*header_bh); + *header_bh = new_bh; + get_bh(*header_bh); + + brelse(*first_bh); + *first_bh = new_first_bh; + get_bh(*first_bh); + } + brelse(new_bh); + brelse(old_bh); + } + + le16_add_cpu(&xh->xh_num_buckets, -(num_buckets / 2)); + + ocfs2_journal_dirty(handle, prev_bh); +out: + brelse(prev_bh); + brelse(new_first_bh); + return ret; +} + +static int ocfs2_read_xattr_bucket(struct inode *inode, + u64 blkno, + struct buffer_head **bhs, + int new) +{ + int ret = 0; + u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + if (!new) + return ocfs2_read_blocks(inode, blkno, + blk_per_bucket, bhs, + OCFS2_BH_CACHED); + + for (i = 0; i < blk_per_bucket; i++) { + bhs[i] = sb_getblk(inode->i_sb, blkno + i); + if (bhs[i] == NULL) { + ret = -EIO; + mlog_errno(ret); + break; + } + ocfs2_set_new_buffer_uptodate(inode, bhs[i]); + } + + return ret; +} + +/* + * Move half num of the xattrs in old bucket(blk) to new bucket(new_blk). + * first_hash will record the 1st hash of the new bucket. + */ +static int ocfs2_half_xattr_bucket(struct inode *inode, + handle_t *handle, + u64 blk, + u64 new_blk, + u32 *first_hash, + int new_bucket_head) +{ + int ret, i; + u16 count, start, len, name_value_len, xe_len, name_offset; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + struct buffer_head **s_bhs, **t_bhs = NULL; + struct ocfs2_xattr_header *xh; + struct ocfs2_xattr_entry *xe; + int blocksize = inode->i_sb->s_blocksize; + + mlog(0, "move half of xattrs from bucket %llu to %llu\n", + blk, new_blk); + + s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS); + if (!s_bhs) + return -ENOMEM; + + ret = ocfs2_read_xattr_bucket(inode, blk, s_bhs, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, s_bhs[0], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS); + if (!t_bhs) { + ret = -ENOMEM; + goto out; + } + + ret = ocfs2_read_xattr_bucket(inode, new_blk, t_bhs, new_bucket_head); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = 0; i < blk_per_bucket; i++) { + ret = ocfs2_journal_access(handle, inode, t_bhs[i], + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* copy the whole bucket to the new first. */ + for (i = 0; i < blk_per_bucket; i++) + memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize); + + /* update the new bucket. */ + xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data; + count = le16_to_cpu(xh->xh_count); + start = count / 2; + + /* + * Calculate the total name/value len and xh_free_start for + * the old bucket first. + */ + name_offset = OCFS2_XATTR_BUCKET_SIZE; + name_value_len = 0; + for (i = 0; i < start; i++) { + xe = &xh->xh_entries[i]; + xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + if (ocfs2_xattr_is_local(xe)) + xe_len += + OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size)); + else + xe_len += OCFS2_XATTR_ROOT_SIZE; + name_value_len += xe_len; + if (le16_to_cpu(xe->xe_name_offset) < name_offset) + name_offset = le16_to_cpu(xe->xe_name_offset); + } + + /* + * Now begin the modification to the new bucket. + * + * In the new bucket, We just move the xattr entry to the beginning + * and don't touch the name/value. So there will be some holes in the + * bucket, and they will be removed when ocfs2_defrag_xattr_bucket is + * called. + */ + xe = &xh->xh_entries[start]; + len = sizeof(struct ocfs2_xattr_entry) * (count - start); + mlog(0, "mv xattr entry len %d from %d to %d\n", len, + (int)((char *)xe - (char *)xh), + (int)((char *)xh->xh_entries - (char *)xh)); + memmove((char *)xh->xh_entries, (char *)xe, len); + xe = &xh->xh_entries[count - start]; + len = sizeof(struct ocfs2_xattr_entry) * start; + memset((char *)xe, 0, len); + + le16_add_cpu(&xh->xh_count, -start); + le16_add_cpu(&xh->xh_name_value_len, -name_value_len); + + /* Calculate xh_free_start for the new bucket. */ + xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE); + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + if (ocfs2_xattr_is_local(xe)) + xe_len += + OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size)); + else + xe_len += OCFS2_XATTR_ROOT_SIZE; + if (le16_to_cpu(xe->xe_name_offset) < + le16_to_cpu(xh->xh_free_start)) + xh->xh_free_start = xe->xe_name_offset; + } + + /* set xh->xh_num_buckets for the new xh. */ + if (new_bucket_head) + xh->xh_num_buckets = cpu_to_le16(1); + else + xh->xh_num_buckets = 0; + + for (i = 0; i < blk_per_bucket; i++) { + ocfs2_journal_dirty(handle, t_bhs[i]); + if (ret) + mlog_errno(ret); + } + + /* store the first_hash of the new bucket. */ + if (first_hash) + *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); + + /* + * Now only update the 1st block of the old bucket. + * Please note that the entry has been sorted already above. + */ + xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data; + memset(&xh->xh_entries[start], 0, + sizeof(struct ocfs2_xattr_entry) * (count - start)); + xh->xh_count = cpu_to_le16(start); + xh->xh_free_start = cpu_to_le16(name_offset); + xh->xh_name_value_len = cpu_to_le16(name_value_len); + + ocfs2_journal_dirty(handle, s_bhs[0]); + if (ret) + mlog_errno(ret); + +out: + if (s_bhs) { + for (i = 0; i < blk_per_bucket; i++) + brelse(s_bhs[i]); + } + kfree(s_bhs); + + if (t_bhs) { + for (i = 0; i < blk_per_bucket; i++) + brelse(t_bhs[i]); + } + kfree(t_bhs); + + return ret; +} + +/* + * Copy xattr from one bucket to another bucket. + * + * The caller must make sure that the journal transaction + * has enough space for journaling. + */ +static int ocfs2_cp_xattr_bucket(struct inode *inode, + handle_t *handle, + u64 s_blkno, + u64 t_blkno, + int t_is_new) +{ + int ret, i; + int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + int blocksize = inode->i_sb->s_blocksize; + struct buffer_head **s_bhs, **t_bhs = NULL; + + BUG_ON(s_blkno == t_blkno); + + mlog(0, "cp bucket %llu to %llu, target is %d\n", + s_blkno, t_blkno, t_is_new); + + s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, + GFP_NOFS); + if (!s_bhs) + return -ENOMEM; + + ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0); + if (ret) + goto out; + + t_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, + GFP_NOFS); + if (!t_bhs) { + ret = -ENOMEM; + goto out; + } + + ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new); + if (ret) + goto out; + + for (i = 0; i < blk_per_bucket; i++) { + ret = ocfs2_journal_access(handle, inode, t_bhs[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) + goto out; + } + + for (i = 0; i < blk_per_bucket; i++) { + memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize); + ocfs2_journal_dirty(handle, t_bhs[i]); + } + +out: + if (s_bhs) { + for (i = 0; i < blk_per_bucket; i++) + brelse(s_bhs[i]); + } + kfree(s_bhs); + + if (t_bhs) { + for (i = 0; i < blk_per_bucket; i++) + brelse(t_bhs[i]); + } + kfree(t_bhs); + + return ret; +} + +/* + * Copy one xattr cluster from src_blk to to_blk. + * The to_blk will become the first bucket header of the cluster, so its + * xh_num_buckets will be initialized as the bucket num in the cluster. + */ +static int ocfs2_cp_xattr_cluster(struct inode *inode, + handle_t *handle, + struct buffer_head *first_bh, + u64 src_blk, + u64 to_blk, + u32 *first_hash) +{ + int i, ret, credits; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); + struct buffer_head *bh = NULL; + struct ocfs2_xattr_header *xh; + u64 to_blk_start = to_blk; + + mlog(0, "cp xattrs from cluster %llu to %llu\n", src_blk, to_blk); + + /* + * We need to update the new cluster and 1 more for the update of + * the 1st bucket of the previous extent rec. + */ + credits = bpc + 1; + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, first_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = 0; i < num_buckets; i++) { + ret = ocfs2_cp_xattr_bucket(inode, handle, + src_blk, to_blk, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + } + + /* update the old bucket header. */ + xh = (struct ocfs2_xattr_header *)first_bh->b_data; + le16_add_cpu(&xh->xh_num_buckets, -num_buckets); + + ocfs2_journal_dirty(handle, first_bh); + + /* update the new bucket header. */ + ret = ocfs2_read_block(inode, to_blk_start, &bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + xh = (struct ocfs2_xattr_header *)bh->b_data; + xh->xh_num_buckets = cpu_to_le16(num_buckets); + + ocfs2_journal_dirty(handle, bh); + + if (first_hash) + *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); +out: + brelse(bh); + return ret; +} + +/* + * Move half of the xattrs in this cluster to the new cluster. + * This function should only be called when bucket size == cluster size. + * Otherwise ocfs2_mv_xattr_bucket_cross_cluster should be used instead. + */ +static int ocfs2_half_xattr_cluster(struct inode *inode, + handle_t *handle, + u64 prev_blk, + u64 new_blk, + u32 *first_hash) +{ + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + int ret, credits = 2 * blk_per_bucket; + + BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize); + + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + return ret; + } + + /* Move half of the xattr in start_blk to the next bucket. */ + return ocfs2_half_xattr_bucket(inode, handle, prev_blk, + new_blk, first_hash, 1); +} + +/* + * Move some xattrs from the old cluster to the new one since they are not + * contiguous in ocfs2 xattr tree. + * + * new_blk starts a new separate cluster, and we will move some xattrs from + * prev_blk to it. v_start will be set as the first name hash value in this + * new cluster so that it can be used as e_cpos during tree insertion and + * don't collide with our original b-tree operations. first_bh and header_bh + * will also be updated since they will be used in ocfs2_extend_xattr_bucket + * to extend the insert bucket. + * + * The problem is how much xattr should we move to the new one and when should + * we update first_bh and header_bh? + * 1. If cluster size > bucket size, that means the previous cluster has more + * than 1 bucket, so just move half nums of bucket into the new cluster and + * update the first_bh and header_bh if the insert bucket has been moved + * to the new cluster. + * 2. If cluster_size == bucket_size: + * a) If the previous extent rec has more than one cluster and the insert + * place isn't in the last cluster, copy the entire last cluster to the + * new one. This time, we don't need to upate the first_bh and header_bh + * since they will not be moved into the new cluster. + * b) Otherwise, move the bottom half of the xattrs in the last cluster into + * the new one. And we set the extend flag to zero if the insert place is + * moved into the new allocated cluster since no extend is needed. + */ +static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode, + handle_t *handle, + struct buffer_head **first_bh, + struct buffer_head **header_bh, + u64 new_blk, + u64 prev_blk, + u32 prev_clusters, + u32 *v_start, + int *extend) +{ + int ret = 0; + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + + mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n", + prev_blk, prev_clusters, new_blk); + + if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) + ret = ocfs2_mv_xattr_bucket_cross_cluster(inode, + handle, + first_bh, + header_bh, + new_blk, + prev_blk, + prev_clusters, + v_start); + else { + u64 last_blk = prev_blk + bpc * (prev_clusters - 1); + + if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk) + ret = ocfs2_cp_xattr_cluster(inode, handle, *first_bh, + last_blk, new_blk, + v_start); + else { + ret = ocfs2_half_xattr_cluster(inode, handle, + last_blk, new_blk, + v_start); + + if ((*header_bh)->b_blocknr == last_blk && extend) + *extend = 0; + } + } + + return ret; +} + +/* + * Add a new cluster for xattr storage. + * + * If the new cluster is contiguous with the previous one, it will be + * appended to the same extent record, and num_clusters will be updated. + * If not, we will insert a new extent for it and move some xattrs in + * the last cluster into the new allocated one. + * We also need to limit the maximum size of a btree leaf, otherwise we'll + * lose the benefits of hashing because we'll have to search large leaves. + * So now the maximum size is OCFS2_MAX_XATTR_TREE_LEAF_SIZE(or clustersize, + * if it's bigger). + * + * first_bh is the first block of the previous extent rec and header_bh + * indicates the bucket we will insert the new xattrs. They will be updated + * when the header_bh is moved into the new cluster. + */ +static int ocfs2_add_new_xattr_cluster(struct inode *inode, + struct buffer_head *root_bh, + struct buffer_head **first_bh, + struct buffer_head **header_bh, + u32 *num_clusters, + u32 prev_cpos, + u64 prev_blkno, + int *extend) +{ + int ret, credits; + u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + u32 prev_clusters = *num_clusters; + u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0; + u64 block; + handle_t *handle = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_tree et; + + mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, " + "previous xattr blkno = %llu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + prev_cpos, prev_blkno); + + ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); + + ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, + &data_ac, &meta_ac); + if (ret) { + mlog_errno(ret); + goto leave; + } + + credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el, + clusters_to_add); + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto leave; + } + + ret = ocfs2_journal_access(handle, inode, root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + + ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, + clusters_to_add, &bit_off, &num_bits); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto leave; + } + + BUG_ON(num_bits > clusters_to_add); + + block = ocfs2_clusters_to_blocks(osb->sb, bit_off); + mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n", + num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); + + if (prev_blkno + prev_clusters * bpc == block && + (prev_clusters + num_bits) << osb->s_clustersize_bits <= + OCFS2_MAX_XATTR_TREE_LEAF_SIZE) { + /* + * If this cluster is contiguous with the old one and + * adding this new cluster, we don't surpass the limit of + * OCFS2_MAX_XATTR_TREE_LEAF_SIZE, cool. We will let it be + * initialized and used like other buckets in the previous + * cluster. + * So add it as a contiguous one. The caller will handle + * its init process. + */ + v_start = prev_cpos + prev_clusters; + *num_clusters = prev_clusters + num_bits; + mlog(0, "Add contiguous %u clusters to previous extent rec.\n", + num_bits); + } else { + ret = ocfs2_adjust_xattr_cross_cluster(inode, + handle, + first_bh, + header_bh, + block, + prev_blkno, + prev_clusters, + &v_start, + extend); + if (ret) { + mlog_errno(ret); + goto leave; + } + } + + if (handle->h_buffer_credits < credits) { + /* + * The journal has been restarted before, and don't + * have enough space for the insertion, so extend it + * here. + */ + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + goto leave; + } + } + mlog(0, "Insert %u clusters at block %llu for xattr at %u\n", + num_bits, block, v_start); + ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block, + num_bits, 0, meta_ac); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + + ret = ocfs2_journal_dirty(handle, root_bh); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + +leave: + if (handle) + ocfs2_commit_trans(osb, handle); + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + return ret; +} + +/* + * Extend a new xattr bucket and move xattrs to the end one by one until + * We meet with start_bh. Only move half of the xattrs to the bucket after it. + */ +static int ocfs2_extend_xattr_bucket(struct inode *inode, + struct buffer_head *first_bh, + struct buffer_head *start_bh, + u32 num_clusters) +{ + int ret, credits; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + u64 start_blk = start_bh->b_blocknr, end_blk; + u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb); + handle_t *handle; + struct ocfs2_xattr_header *first_xh = + (struct ocfs2_xattr_header *)first_bh->b_data; + u16 bucket = le16_to_cpu(first_xh->xh_num_buckets); + + mlog(0, "extend xattr bucket in %llu, xattr extend rec starting " + "from %llu, len = %u\n", start_blk, + (unsigned long long)first_bh->b_blocknr, num_clusters); + + BUG_ON(bucket >= num_buckets); + + end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket; + + /* + * We will touch all the buckets after the start_bh(include it). + * Add one more bucket and modify the first_bh. + */ + credits = end_blk - start_blk + 2 * blk_per_bucket + 1; + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, first_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto commit; + } + + while (end_blk != start_blk) { + ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk, + end_blk + blk_per_bucket, 0); + if (ret) + goto commit; + end_blk -= blk_per_bucket; + } + + /* Move half of the xattr in start_blk to the next bucket. */ + ret = ocfs2_half_xattr_bucket(inode, handle, start_blk, + start_blk + blk_per_bucket, NULL, 0); + + le16_add_cpu(&first_xh->xh_num_buckets, 1); + ocfs2_journal_dirty(handle, first_bh); + +commit: + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +/* + * Add new xattr bucket in an extent record and adjust the buckets accordingly. + * xb_bh is the ocfs2_xattr_block. + * We will move all the buckets starting from header_bh to the next place. As + * for this one, half num of its xattrs will be moved to the next one. + * + * We will allocate a new cluster if current cluster is full and adjust + * header_bh and first_bh if the insert place is moved to the new cluster. + */ +static int ocfs2_add_new_xattr_bucket(struct inode *inode, + struct buffer_head *xb_bh, + struct buffer_head *header_bh) +{ + struct ocfs2_xattr_header *first_xh = NULL; + struct buffer_head *first_bh = NULL; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xb_bh->b_data; + struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root; + struct ocfs2_extent_list *el = &xb_root->xt_list; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)header_bh->b_data; + u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); + struct super_block *sb = inode->i_sb; + struct ocfs2_super *osb = OCFS2_SB(sb); + int ret, num_buckets, extend = 1; + u64 p_blkno; + u32 e_cpos, num_clusters; + + mlog(0, "Add new xattr bucket starting form %llu\n", + (unsigned long long)header_bh->b_blocknr); + + /* + * Add refrence for header_bh here because it may be + * changed in ocfs2_add_new_xattr_cluster and we need + * to free it in the end. + */ + get_bh(header_bh); + + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos, + &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_block(inode, p_blkno, &first_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters; + first_xh = (struct ocfs2_xattr_header *)first_bh->b_data; + + if (num_buckets == le16_to_cpu(first_xh->xh_num_buckets)) { + ret = ocfs2_add_new_xattr_cluster(inode, + xb_bh, + &first_bh, + &header_bh, + &num_clusters, + e_cpos, + p_blkno, + &extend); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (extend) + ret = ocfs2_extend_xattr_bucket(inode, + first_bh, + header_bh, + num_clusters); + if (ret) + mlog_errno(ret); +out: + brelse(first_bh); + brelse(header_bh); + return ret; +} + +static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + int offs) +{ + int block_off = offs >> inode->i_sb->s_blocksize_bits; + + offs = offs % inode->i_sb->s_blocksize; + return bucket->bhs[block_off]->b_data + offs; +} + +/* + * Handle the normal xattr set, including replace, delete and new. + * + * Note: "local" indicates the real data's locality. So we can't + * just its bucket locality by its length. + */ +static void ocfs2_xattr_set_entry_normal(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + u32 name_hash, + int local) +{ + struct ocfs2_xattr_entry *last, *xe; + int name_len = strlen(xi->name); + struct ocfs2_xattr_header *xh = xs->header; + u16 count = le16_to_cpu(xh->xh_count), start; + size_t blocksize = inode->i_sb->s_blocksize; + char *val; + size_t offs, size, new_size; + + last = &xh->xh_entries[count]; + if (!xs->not_found) { + xe = xs->here; + offs = le16_to_cpu(xe->xe_name_offset); + if (ocfs2_xattr_is_local(xe)) + size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size)); + else + size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE); + + /* + * If the new value will be stored outside, xi->value has been + * initalized as an empty ocfs2_xattr_value_root, and the same + * goes with xi->value_len, so we can set new_size safely here. + * See ocfs2_xattr_set_in_bucket. + */ + new_size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(xi->value_len); + + le16_add_cpu(&xh->xh_name_value_len, -size); + if (xi->value) { + if (new_size > size) + goto set_new_name_value; + + /* Now replace the old value with new one. */ + if (local) + xe->xe_value_size = cpu_to_le64(xi->value_len); + else + xe->xe_value_size = 0; + + val = ocfs2_xattr_bucket_get_val(inode, + &xs->bucket, offs); + memset(val + OCFS2_XATTR_SIZE(name_len), 0, + size - OCFS2_XATTR_SIZE(name_len)); + if (OCFS2_XATTR_SIZE(xi->value_len) > 0) + memcpy(val + OCFS2_XATTR_SIZE(name_len), + xi->value, xi->value_len); + + le16_add_cpu(&xh->xh_name_value_len, new_size); + ocfs2_xattr_set_local(xe, local); + return; + } else { + /* + * Remove the old entry if there is more than one. + * We don't remove the last entry so that we can + * use it to indicate the hash value of the empty + * bucket. + */ + last -= 1; + le16_add_cpu(&xh->xh_count, -1); + if (xh->xh_count) { + memmove(xe, xe + 1, + (void *)last - (void *)xe); + memset(last, 0, + sizeof(struct ocfs2_xattr_entry)); + } else + xh->xh_free_start = + cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE); + + return; + } + } else { + /* find a new entry for insert. */ + int low = 0, high = count - 1, tmp; + struct ocfs2_xattr_entry *tmp_xe; + + while (low <= high && count) { + tmp = (low + high) / 2; + tmp_xe = &xh->xh_entries[tmp]; + + if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash)) + low = tmp + 1; + else if (name_hash < + le32_to_cpu(tmp_xe->xe_name_hash)) + high = tmp - 1; + else { + low = tmp; + break; + } + } + + xe = &xh->xh_entries[low]; + if (low != count) + memmove(xe + 1, xe, (void *)last - (void *)xe); + + le16_add_cpu(&xh->xh_count, 1); + memset(xe, 0, sizeof(struct ocfs2_xattr_entry)); + xe->xe_name_hash = cpu_to_le32(name_hash); + xe->xe_name_len = name_len; + ocfs2_xattr_set_type(xe, xi->name_index); + } + +set_new_name_value: + /* Insert the new name+value. */ + size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len); + + /* + * We must make sure that the name/value pair + * exists in the same block. + */ + offs = le16_to_cpu(xh->xh_free_start); + start = offs - size; + + if (start >> inode->i_sb->s_blocksize_bits != + (offs - 1) >> inode->i_sb->s_blocksize_bits) { + offs = offs - offs % blocksize; + xh->xh_free_start = cpu_to_le16(offs); + } + + val = ocfs2_xattr_bucket_get_val(inode, + &xs->bucket, offs - size); + xe->xe_name_offset = cpu_to_le16(offs - size); + + memset(val, 0, size); + memcpy(val, xi->name, name_len); + memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->value, xi->value_len); + + xe->xe_value_size = cpu_to_le64(xi->value_len); + ocfs2_xattr_set_local(xe, local); + xs->here = xe; + le16_add_cpu(&xh->xh_free_start, -size); + le16_add_cpu(&xh->xh_name_value_len, size); + + return; +} + +static int ocfs2_xattr_bucket_handle_journal(struct inode *inode, + handle_t *handle, + struct ocfs2_xattr_search *xs, + struct buffer_head **bhs, + u16 bh_num) +{ + int ret = 0, off, block_off; + struct ocfs2_xattr_entry *xe = xs->here; + + /* + * First calculate all the blocks we should journal_access + * and journal_dirty. The first block should always be touched. + */ + ret = ocfs2_journal_dirty(handle, bhs[0]); + if (ret) + mlog_errno(ret); + + /* calc the data. */ + off = le16_to_cpu(xe->xe_name_offset); + block_off = off >> inode->i_sb->s_blocksize_bits; + ret = ocfs2_journal_dirty(handle, bhs[block_off]); + if (ret) + mlog_errno(ret); + + return ret; +} + +/* + * Set the xattr entry in the specified bucket. + * The bucket is indicated by xs->bucket and it should have the enough + * space for the xattr insertion. + */ +static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + u32 name_hash, + int local) +{ + int i, ret; + handle_t *handle = NULL; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n", + (unsigned long)xi->value_len, xi->name_index, + (unsigned long long)xs->bucket.bhs[0]->b_blocknr); + + if (!xs->bucket.bhs[1]) { + ret = ocfs2_read_blocks(inode, + xs->bucket.bhs[0]->b_blocknr + 1, + blk_per_bucket - 1, &xs->bucket.bhs[1], + OCFS2_BH_CACHED); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans(osb, blk_per_bucket); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + for (i = 0; i < blk_per_bucket; i++) { + ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local); + + /*Only dirty the blocks we have touched in set xattr. */ + ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs, + xs->bucket.bhs, blk_per_bucket); + if (ret) + mlog_errno(ret); +out: + ocfs2_commit_trans(osb, handle); + + return ret; +} + +static int ocfs2_xattr_value_update_size(struct inode *inode, + struct buffer_head *xe_bh, + struct ocfs2_xattr_entry *xe, + u64 new_size) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle = NULL; + + handle = ocfs2_start_trans(osb, 1); + if (handle == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, xe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + xe->xe_value_size = cpu_to_le64(new_size); + + ret = ocfs2_journal_dirty(handle, xe_bh); + if (ret < 0) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +/* + * Truncate the specified xe_off entry in xattr bucket. + * bucket is indicated by header_bh and len is the new length. + * Both the ocfs2_xattr_value_root and the entry will be updated here. + * + * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed. + */ +static int ocfs2_xattr_bucket_value_truncate(struct inode *inode, + struct buffer_head *header_bh, + int xe_off, + int len) +{ + int ret, offset; + u64 value_blk; + struct buffer_head *value_bh = NULL; + struct ocfs2_xattr_value_root *xv; + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)header_bh->b_data; + size_t blocksize = inode->i_sb->s_blocksize; + + xe = &xh->xh_entries[xe_off]; + + BUG_ON(!xe || ocfs2_xattr_is_local(xe)); + + offset = le16_to_cpu(xe->xe_name_offset) + + OCFS2_XATTR_SIZE(xe->xe_name_len); + + value_blk = offset / blocksize; + + /* We don't allow ocfs2_xattr_value to be stored in different block. */ + BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize); + value_blk += header_bh->b_blocknr; + + ret = ocfs2_read_block(inode, value_blk, &value_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + xv = (struct ocfs2_xattr_value_root *) + (value_bh->b_data + offset % blocksize); + + mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n", + xe_off, (unsigned long long)header_bh->b_blocknr, len); + ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_value_update_size(inode, header_bh, xe, len); + if (ret) { + mlog_errno(ret); + goto out; + } + +out: + brelse(value_bh); + return ret; +} + +static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode, + struct ocfs2_xattr_search *xs, + int len) +{ + int ret, offset; + struct ocfs2_xattr_entry *xe = xs->here; + struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base; + + BUG_ON(!xs->bucket.bhs[0] || !xe || ocfs2_xattr_is_local(xe)); + + offset = xe - xh->xh_entries; + ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bhs[0], + offset, len); + if (ret) + mlog_errno(ret); + + return ret; +} + +static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, + struct ocfs2_xattr_search *xs, + char *val, + int value_len) +{ + int offset; + struct ocfs2_xattr_value_root *xv; + struct ocfs2_xattr_entry *xe = xs->here; + + BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe)); + + offset = le16_to_cpu(xe->xe_name_offset) + + OCFS2_XATTR_SIZE(xe->xe_name_len); + + xv = (struct ocfs2_xattr_value_root *)(xs->base + offset); + + return __ocfs2_xattr_set_value_outside(inode, xv, val, value_len); +} + +static int ocfs2_rm_xattr_cluster(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, + u32 cpos, + u32 len) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + handle_t *handle; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)root_bh->b_data; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_extent_tree et; + + ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); + + ocfs2_init_dealloc_ctxt(&dealloc); + + mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n", + cpos, len, (unsigned long long)blkno); + + ocfs2_remove_xattr_clusters_from_cache(inode, blkno, len); + + ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); + if (handle == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac, + &dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len); + + ret = ocfs2_journal_dirty(handle, root_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_truncate_log_append(osb, handle, blkno, len); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + ocfs2_schedule_truncate_log_flush(osb, 1); + + mutex_unlock(&tl_inode->i_mutex); + + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + ocfs2_run_deallocs(osb, &dealloc); + + return ret; +} + +static void ocfs2_xattr_bucket_remove_xs(struct inode *inode, + struct ocfs2_xattr_search *xs) +{ + handle_t *handle = NULL; + struct ocfs2_xattr_header *xh = xs->bucket.xh; + struct ocfs2_xattr_entry *last = &xh->xh_entries[ + le16_to_cpu(xh->xh_count) - 1]; + int ret = 0; + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + return; + } + + ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[0], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* Remove the old entry. */ + memmove(xs->here, xs->here + 1, + (void *)last - (void *)xs->here); + memset(last, 0, sizeof(struct ocfs2_xattr_entry)); + le16_add_cpu(&xh->xh_count, -1); + + ret = ocfs2_journal_dirty(handle, xs->bucket.bhs[0]); + if (ret < 0) + mlog_errno(ret); +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +} + +/* + * Set the xattr name/value in the bucket specified in xs. + * + * As the new value in xi may be stored in the bucket or in an outside cluster, + * we divide the whole process into 3 steps: + * 1. insert name/value in the bucket(ocfs2_xattr_set_entry_in_bucket) + * 2. truncate of the outside cluster(ocfs2_xattr_bucket_value_truncate_xs) + * 3. Set the value to the outside cluster(ocfs2_xattr_bucket_set_value_outside) + * 4. If the clusters for the new outside value can't be allocated, we need + * to free the xattr we allocated in set. + */ +static int ocfs2_xattr_set_in_bucket(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs) +{ + int ret, local = 1; + size_t value_len; + char *val = (char *)xi->value; + struct ocfs2_xattr_entry *xe = xs->here; + u32 name_hash = ocfs2_xattr_name_hash(inode, xi->name, + strlen(xi->name)); + + if (!xs->not_found && !ocfs2_xattr_is_local(xe)) { + /* + * We need to truncate the xattr storage first. + * + * If both the old and new value are stored to + * outside block, we only need to truncate + * the storage and then set the value outside. + * + * If the new value should be stored within block, + * we should free all the outside block first and + * the modification to the xattr block will be done + * by following steps. + */ + if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) + value_len = xi->value_len; + else + value_len = 0; + + ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, + value_len); + if (ret) + goto out; + + if (value_len) + goto set_value_outside; + } + + value_len = xi->value_len; + /* So we have to handle the inside block change now. */ + if (value_len > OCFS2_XATTR_INLINE_SIZE) { + /* + * If the new value will be stored outside of block, + * initalize a new empty value root and insert it first. + */ + local = 0; + xi->value = &def_xv; + xi->value_len = OCFS2_XATTR_ROOT_SIZE; + } + + ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash, local); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (value_len <= OCFS2_XATTR_INLINE_SIZE) + goto out; + + /* allocate the space now for the outside block storage. */ + ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, + value_len); + if (ret) { + mlog_errno(ret); + + if (xs->not_found) { + /* + * We can't allocate enough clusters for outside + * storage and we have allocated xattr already, + * so need to remove it. + */ + ocfs2_xattr_bucket_remove_xs(inode, xs); + } + goto out; + } + +set_value_outside: + ret = ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len); +out: + return ret; +} + +/* check whether the xattr bucket is filled up with the same hash value. */ +static int ocfs2_check_xattr_bucket_collision(struct inode *inode, + struct ocfs2_xattr_bucket *bucket) +{ + struct ocfs2_xattr_header *xh = bucket->xh; + + if (xh->xh_entries[le16_to_cpu(xh->xh_count) - 1].xe_name_hash == + xh->xh_entries[0].xe_name_hash) { + mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, " + "hash = %u\n", + (unsigned long long)bucket->bhs[0]->b_blocknr, + le32_to_cpu(xh->xh_entries[0].xe_name_hash)); + return -ENOSPC; + } + + return 0; +} + +static int ocfs2_xattr_set_entry_index_block(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_xattr_header *xh; + struct ocfs2_xattr_entry *xe; + u16 count, header_size, xh_free_start; + int i, free, max_free, need, old; + size_t value_size = 0, name_len = strlen(xi->name); + size_t blocksize = inode->i_sb->s_blocksize; + int ret, allocation = 0; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + mlog_entry("Set xattr %s in xattr index block\n", xi->name); + +try_again: + xh = xs->header; + count = le16_to_cpu(xh->xh_count); + xh_free_start = le16_to_cpu(xh->xh_free_start); + header_size = sizeof(struct ocfs2_xattr_header) + + count * sizeof(struct ocfs2_xattr_entry); + max_free = OCFS2_XATTR_BUCKET_SIZE - + le16_to_cpu(xh->xh_name_value_len) - header_size; + + mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size " + "of %u which exceed block size\n", + (unsigned long long)xs->bucket.bhs[0]->b_blocknr, + header_size); + + if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) + value_size = OCFS2_XATTR_ROOT_SIZE; + else if (xi->value) + value_size = OCFS2_XATTR_SIZE(xi->value_len); + + if (xs->not_found) + need = sizeof(struct ocfs2_xattr_entry) + + OCFS2_XATTR_SIZE(name_len) + value_size; + else { + need = value_size + OCFS2_XATTR_SIZE(name_len); + + /* + * We only replace the old value if the new length is smaller + * than the old one. Otherwise we will allocate new space in the + * bucket to store it. + */ + xe = xs->here; + if (ocfs2_xattr_is_local(xe)) + old = OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size)); + else + old = OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE); + + if (old >= value_size) + need = 0; + } + + free = xh_free_start - header_size; + /* + * We need to make sure the new name/value pair + * can exist in the same block. + */ + if (xh_free_start % blocksize < need) + free -= xh_free_start % blocksize; + + mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, " + "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len =" + " %u\n", xs->not_found, + (unsigned long long)xs->bucket.bhs[0]->b_blocknr, + free, need, max_free, le16_to_cpu(xh->xh_free_start), + le16_to_cpu(xh->xh_name_value_len)); + + if (free < need || count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) { + if (need <= max_free && + count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) { + /* + * We can create the space by defragment. Since only the + * name/value will be moved, the xe shouldn't be changed + * in xs. + */ + ret = ocfs2_defrag_xattr_bucket(inode, &xs->bucket); + if (ret) { + mlog_errno(ret); + goto out; + } + + xh_free_start = le16_to_cpu(xh->xh_free_start); + free = xh_free_start - header_size; + if (xh_free_start % blocksize < need) + free -= xh_free_start % blocksize; + + if (free >= need) + goto xattr_set; + + mlog(0, "Can't get enough space for xattr insert by " + "defragment. Need %u bytes, but we have %d, so " + "allocate new bucket for it.\n", need, free); + } + + /* + * We have to add new buckets or clusters and one + * allocation should leave us enough space for insert. + */ + BUG_ON(allocation); + + /* + * We do not allow for overlapping ranges between buckets. And + * the maximum number of collisions we will allow for then is + * one bucket's worth, so check it here whether we need to + * add a new bucket for the insert. + */ + ret = ocfs2_check_xattr_bucket_collision(inode, &xs->bucket); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_add_new_xattr_bucket(inode, + xs->xattr_bh, + xs->bucket.bhs[0]); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = 0; i < blk_per_bucket; i++) + brelse(xs->bucket.bhs[i]); + + memset(&xs->bucket, 0, sizeof(xs->bucket)); + + ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh, + xi->name_index, + xi->name, xs); + if (ret && ret != -ENODATA) + goto out; + xs->not_found = ret; + allocation = 1; + goto try_again; + } + +xattr_set: + ret = ocfs2_xattr_set_in_bucket(inode, xi, xs); +out: + mlog_exit(ret); + return ret; +} + +static int ocfs2_delete_xattr_in_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + int ret = 0; + struct ocfs2_xattr_header *xh = bucket->xh; + u16 i; + struct ocfs2_xattr_entry *xe; + + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + if (ocfs2_xattr_is_local(xe)) + continue; + + ret = ocfs2_xattr_bucket_value_truncate(inode, + bucket->bhs[0], + i, 0); + if (ret) { + mlog_errno(ret); + break; + } + } + + return ret; +} + +static int ocfs2_delete_xattr_index_block(struct inode *inode, + struct buffer_head *xb_bh) +{ + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xb_bh->b_data; + struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list; + int ret = 0; + u32 name_hash = UINT_MAX, e_cpos, num_clusters; + u64 p_blkno; + + if (le16_to_cpu(el->l_next_free_rec) == 0) + return 0; + + while (name_hash > 0) { + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, + &e_cpos, &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters, + ocfs2_delete_xattr_in_bucket, + NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_rm_xattr_cluster(inode, xb_bh, + p_blkno, e_cpos, num_clusters); + if (ret) { + mlog_errno(ret); + break; + } + + if (e_cpos == 0) + break; + + name_hash = e_cpos - 1; + } + +out: + return ret; +} + +/* + * 'trusted' attributes support + */ + +#define XATTR_TRUSTED_PREFIX "trusted." + +static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list, + size_t list_size, const char *name, + size_t name_len) +{ + const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX) - 1; + const size_t total_len = prefix_len + name_len + 1; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); + memcpy(list + prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int ocfs2_xattr_trusted_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED, name, + buffer, size); +} + +static int ocfs2_xattr_trusted_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value, + size, flags); +} + +struct xattr_handler ocfs2_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = ocfs2_xattr_trusted_list, + .get = ocfs2_xattr_trusted_get, + .set = ocfs2_xattr_trusted_set, +}; + + +/* + * 'user' attributes support + */ + +#define XATTR_USER_PREFIX "user." + +static size_t ocfs2_xattr_user_list(struct inode *inode, char *list, + size_t list_size, const char *name, + size_t name_len) +{ + const size_t prefix_len = sizeof(XATTR_USER_PREFIX) - 1; + const size_t total_len = prefix_len + name_len + 1; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_USER_PREFIX, prefix_len); + memcpy(list + prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int ocfs2_xattr_user_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (strcmp(name, "") == 0) + return -EINVAL; + if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return -EOPNOTSUPP; + return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name, + buffer, size); +} + +static int ocfs2_xattr_user_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (strcmp(name, "") == 0) + return -EINVAL; + if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return -EOPNOTSUPP; + + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value, + size, flags); +} + +struct xattr_handler ocfs2_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = ocfs2_xattr_user_list, + .get = ocfs2_xattr_user_get, + .set = ocfs2_xattr_user_set, +}; diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h new file mode 100644 index 000000000000..c25c7c62a059 --- /dev/null +++ b/fs/ocfs2/xattr.h @@ -0,0 +1,68 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * xattr.h + * + * Function prototypes + * + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_XATTR_H +#define OCFS2_XATTR_H + +#include <linux/init.h> +#include <linux/xattr.h> + +enum ocfs2_xattr_type { + OCFS2_XATTR_INDEX_USER = 1, + OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS, + OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT, + OCFS2_XATTR_INDEX_TRUSTED, + OCFS2_XATTR_INDEX_SECURITY, + OCFS2_XATTR_MAX +}; + +extern struct xattr_handler ocfs2_xattr_user_handler; +extern struct xattr_handler ocfs2_xattr_trusted_handler; + +extern ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); +extern int ocfs2_xattr_get(struct inode *, int, const char *, void *, size_t); +extern int ocfs2_xattr_set(struct inode *, int, const char *, const void *, + size_t, int); +extern int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh); +extern struct xattr_handler *ocfs2_xattr_handlers[]; + +static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) +{ + return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE; +} + +static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb) +{ + return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits); +} + +static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb) +{ + u16 len = sb->s_blocksize - + offsetof(struct ocfs2_xattr_header, xh_entries); + + return len / sizeof(struct ocfs2_xattr_entry); +} +#endif /* OCFS2_XATTR_H */ diff --git a/fs/omfs/Makefile b/fs/omfs/Makefile new file mode 100644 index 000000000000..8b82b63f1129 --- /dev/null +++ b/fs/omfs/Makefile @@ -0,0 +1,4 @@ + +obj-$(CONFIG_OMFS_FS) += omfs.o + +omfs-y := bitmap.o dir.o file.o inode.o diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c new file mode 100644 index 000000000000..e1c0ec0ae989 --- /dev/null +++ b/fs/omfs/bitmap.c @@ -0,0 +1,193 @@ +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <asm/div64.h> +#include "omfs.h" + +unsigned long omfs_count_free(struct super_block *sb) +{ + unsigned int i; + unsigned long sum = 0; + struct omfs_sb_info *sbi = OMFS_SB(sb); + int nbits = sb->s_blocksize * 8; + + for (i = 0; i < sbi->s_imap_size; i++) + sum += nbits - bitmap_weight(sbi->s_imap[i], nbits); + + return sum; +} + +/* + * Counts the run of zero bits starting at bit up to max. + * It handles the case where a run might spill over a buffer. + * Called with bitmap lock. + */ +static int count_run(unsigned long **addr, int nbits, + int addrlen, int bit, int max) +{ + int count = 0; + int x; + + for (; addrlen > 0; addrlen--, addr++) { + x = find_next_bit(*addr, nbits, bit); + count += x - bit; + + if (x < nbits || count > max) + return min(count, max); + + bit = 0; + } + return min(count, max); +} + +/* + * Sets or clears the run of count bits starting with bit. + * Called with bitmap lock. + */ +static int set_run(struct super_block *sb, int map, + int nbits, int bit, int count, int set) +{ + int i; + int err; + struct buffer_head *bh; + struct omfs_sb_info *sbi = OMFS_SB(sb); + + err = -ENOMEM; + bh = sb_bread(sb, clus_to_blk(sbi, sbi->s_bitmap_ino) + map); + if (!bh) + goto out; + + for (i = 0; i < count; i++, bit++) { + if (bit >= nbits) { + bit = 0; + map++; + + mark_buffer_dirty(bh); + brelse(bh); + bh = sb_bread(sb, + clus_to_blk(sbi, sbi->s_bitmap_ino) + map); + if (!bh) + goto out; + } + if (set) { + set_bit(bit, sbi->s_imap[map]); + set_bit(bit, (unsigned long *)bh->b_data); + } else { + clear_bit(bit, sbi->s_imap[map]); + clear_bit(bit, (unsigned long *)bh->b_data); + } + } + mark_buffer_dirty(bh); + brelse(bh); + err = 0; +out: + return err; +} + +/* + * Tries to allocate exactly one block. Returns true if sucessful. + */ +int omfs_allocate_block(struct super_block *sb, u64 block) +{ + struct buffer_head *bh; + struct omfs_sb_info *sbi = OMFS_SB(sb); + int bits_per_entry = 8 * sb->s_blocksize; + unsigned int map, bit; + int ret = 0; + u64 tmp; + + tmp = block; + bit = do_div(tmp, bits_per_entry); + map = tmp; + + mutex_lock(&sbi->s_bitmap_lock); + if (map >= sbi->s_imap_size || test_and_set_bit(bit, sbi->s_imap[map])) + goto out; + + if (sbi->s_bitmap_ino > 0) { + bh = sb_bread(sb, clus_to_blk(sbi, sbi->s_bitmap_ino) + map); + if (!bh) + goto out; + + set_bit(bit, (unsigned long *)bh->b_data); + mark_buffer_dirty(bh); + brelse(bh); + } + ret = 1; +out: + mutex_unlock(&sbi->s_bitmap_lock); + return ret; +} + + +/* + * Tries to allocate a set of blocks. The request size depends on the + * type: for inodes, we must allocate sbi->s_mirrors blocks, and for file + * blocks, we try to allocate sbi->s_clustersize, but can always get away + * with just one block. + */ +int omfs_allocate_range(struct super_block *sb, + int min_request, + int max_request, + u64 *return_block, + int *return_size) +{ + struct omfs_sb_info *sbi = OMFS_SB(sb); + int bits_per_entry = 8 * sb->s_blocksize; + int ret = 0; + int i, run, bit; + + mutex_lock(&sbi->s_bitmap_lock); + for (i = 0; i < sbi->s_imap_size; i++) { + bit = 0; + while (bit < bits_per_entry) { + bit = find_next_zero_bit(sbi->s_imap[i], bits_per_entry, + bit); + + if (bit == bits_per_entry) + break; + + run = count_run(&sbi->s_imap[i], bits_per_entry, + sbi->s_imap_size-i, bit, max_request); + + if (run >= min_request) + goto found; + bit += run; + } + } + ret = -ENOSPC; + goto out; + +found: + *return_block = i * bits_per_entry + bit; + *return_size = run; + ret = set_run(sb, i, bits_per_entry, bit, run, 1); + +out: + mutex_unlock(&sbi->s_bitmap_lock); + return ret; +} + +/* + * Clears count bits starting at a given block. + */ +int omfs_clear_range(struct super_block *sb, u64 block, int count) +{ + struct omfs_sb_info *sbi = OMFS_SB(sb); + int bits_per_entry = 8 * sb->s_blocksize; + u64 tmp; + unsigned int map, bit; + int ret; + + tmp = block; + bit = do_div(tmp, bits_per_entry); + map = tmp; + + if (map >= sbi->s_imap_size) + return 0; + + mutex_lock(&sbi->s_bitmap_lock); + ret = set_run(sb, map, bits_per_entry, bit, count, 0); + mutex_unlock(&sbi->s_bitmap_lock); + return ret; +} diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c new file mode 100644 index 000000000000..c0757e998876 --- /dev/null +++ b/fs/omfs/dir.c @@ -0,0 +1,504 @@ +/* + * OMFS (as used by RIO Karma) directory operations. + * Copyright (C) 2005 Bob Copeland <me@bobcopeland.com> + * Released under GPL v2. + */ + +#include <linux/fs.h> +#include <linux/ctype.h> +#include <linux/buffer_head.h> +#include "omfs.h" + +static int omfs_hash(const char *name, int namelen, int mod) +{ + int i, hash = 0; + for (i = 0; i < namelen; i++) + hash ^= tolower(name[i]) << (i % 24); + return hash % mod; +} + +/* + * Finds the bucket for a given name and reads the containing block; + * *ofs is set to the offset of the first list entry. + */ +static struct buffer_head *omfs_get_bucket(struct inode *dir, + const char *name, int namelen, int *ofs) +{ + int nbuckets = (dir->i_size - OMFS_DIR_START)/8; + int block = clus_to_blk(OMFS_SB(dir->i_sb), dir->i_ino); + int bucket = omfs_hash(name, namelen, nbuckets); + + *ofs = OMFS_DIR_START + bucket * 8; + return sb_bread(dir->i_sb, block); +} + +static struct buffer_head *omfs_scan_list(struct inode *dir, u64 block, + const char *name, int namelen, + u64 *prev_block) +{ + struct buffer_head *bh; + struct omfs_inode *oi; + int err = -ENOENT; + *prev_block = ~0; + + while (block != ~0) { + bh = sb_bread(dir->i_sb, + clus_to_blk(OMFS_SB(dir->i_sb), block)); + if (!bh) { + err = -EIO; + goto err; + } + + oi = (struct omfs_inode *) bh->b_data; + if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, block)) { + brelse(bh); + goto err; + } + + if (strncmp(oi->i_name, name, namelen) == 0) + return bh; + + *prev_block = block; + block = be64_to_cpu(oi->i_sibling); + brelse(bh); + } +err: + return ERR_PTR(err); +} + +static struct buffer_head *omfs_find_entry(struct inode *dir, + const char *name, int namelen) +{ + struct buffer_head *bh; + int ofs; + u64 block, dummy; + + bh = omfs_get_bucket(dir, name, namelen, &ofs); + if (!bh) + return ERR_PTR(-EIO); + + block = be64_to_cpu(*((__be64 *) &bh->b_data[ofs])); + brelse(bh); + + return omfs_scan_list(dir, block, name, namelen, &dummy); +} + +int omfs_make_empty(struct inode *inode, struct super_block *sb) +{ + struct omfs_sb_info *sbi = OMFS_SB(sb); + int block = clus_to_blk(sbi, inode->i_ino); + struct buffer_head *bh; + struct omfs_inode *oi; + + bh = sb_bread(sb, block); + if (!bh) + return -ENOMEM; + + memset(bh->b_data, 0, sizeof(struct omfs_inode)); + + if (inode->i_mode & S_IFDIR) { + memset(&bh->b_data[OMFS_DIR_START], 0xff, + sbi->s_sys_blocksize - OMFS_DIR_START); + } else + omfs_make_empty_table(bh, OMFS_EXTENT_START); + + oi = (struct omfs_inode *) bh->b_data; + oi->i_head.h_self = cpu_to_be64(inode->i_ino); + oi->i_sibling = ~cpu_to_be64(0ULL); + + mark_buffer_dirty(bh); + brelse(bh); + return 0; +} + +static int omfs_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct omfs_inode *oi; + struct buffer_head *bh; + u64 block; + __be64 *entry; + int ofs; + + /* just prepend to head of queue in proper bucket */ + bh = omfs_get_bucket(dir, name, namelen, &ofs); + if (!bh) + goto out; + + entry = (__be64 *) &bh->b_data[ofs]; + block = be64_to_cpu(*entry); + *entry = cpu_to_be64(inode->i_ino); + mark_buffer_dirty(bh); + brelse(bh); + + /* now set the sibling and parent pointers on the new inode */ + bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb), inode->i_ino)); + if (!bh) + goto out; + + oi = (struct omfs_inode *) bh->b_data; + memcpy(oi->i_name, name, namelen); + memset(oi->i_name + namelen, 0, OMFS_NAMELEN - namelen); + oi->i_sibling = cpu_to_be64(block); + oi->i_parent = cpu_to_be64(dir->i_ino); + mark_buffer_dirty(bh); + brelse(bh); + + dir->i_ctime = CURRENT_TIME_SEC; + + /* mark affected inodes dirty to rebuild checksums */ + mark_inode_dirty(dir); + mark_inode_dirty(inode); + return 0; +out: + return -ENOMEM; +} + +static int omfs_delete_entry(struct dentry *dentry) +{ + struct inode *dir = dentry->d_parent->d_inode; + struct inode *dirty; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct omfs_inode *oi; + struct buffer_head *bh, *bh2; + __be64 *entry, next; + u64 block, prev; + int ofs; + int err = -ENOMEM; + + /* delete the proper node in the bucket's linked list */ + bh = omfs_get_bucket(dir, name, namelen, &ofs); + if (!bh) + goto out; + + entry = (__be64 *) &bh->b_data[ofs]; + block = be64_to_cpu(*entry); + + bh2 = omfs_scan_list(dir, block, name, namelen, &prev); + if (IS_ERR(bh2)) { + err = PTR_ERR(bh2); + goto out_free_bh; + } + + oi = (struct omfs_inode *) bh2->b_data; + next = oi->i_sibling; + brelse(bh2); + + if (prev != ~0) { + /* found in middle of list, get list ptr */ + brelse(bh); + bh = sb_bread(dir->i_sb, + clus_to_blk(OMFS_SB(dir->i_sb), prev)); + if (!bh) + goto out; + + oi = (struct omfs_inode *) bh->b_data; + entry = &oi->i_sibling; + } + + *entry = next; + mark_buffer_dirty(bh); + + if (prev != ~0) { + dirty = omfs_iget(dir->i_sb, prev); + if (!IS_ERR(dirty)) { + mark_inode_dirty(dirty); + iput(dirty); + } + } + + err = 0; +out_free_bh: + brelse(bh); +out: + return err; +} + +static int omfs_dir_is_empty(struct inode *inode) +{ + int nbuckets = (inode->i_size - OMFS_DIR_START) / 8; + struct buffer_head *bh; + u64 *ptr; + int i; + + bh = sb_bread(inode->i_sb, clus_to_blk(OMFS_SB(inode->i_sb), + inode->i_ino)); + + if (!bh) + return 0; + + ptr = (u64 *) &bh->b_data[OMFS_DIR_START]; + + for (i = 0; i < nbuckets; i++, ptr++) + if (*ptr != ~0) + break; + + brelse(bh); + return *ptr != ~0; +} + +static int omfs_unlink(struct inode *dir, struct dentry *dentry) +{ + int ret; + struct inode *inode = dentry->d_inode; + + ret = omfs_delete_entry(dentry); + if (ret) + goto end_unlink; + + inode_dec_link_count(inode); + mark_inode_dirty(dir); + +end_unlink: + return ret; +} + +static int omfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + int err = -ENOTEMPTY; + struct inode *inode = dentry->d_inode; + + if (omfs_dir_is_empty(inode)) { + err = omfs_unlink(dir, dentry); + if (!err) + inode_dec_link_count(inode); + } + return err; +} + +static int omfs_add_node(struct inode *dir, struct dentry *dentry, int mode) +{ + int err; + struct inode *inode = omfs_new_inode(dir, mode); + + if (IS_ERR(inode)) + return PTR_ERR(inode); + + err = omfs_make_empty(inode, dir->i_sb); + if (err) + goto out_free_inode; + + err = omfs_add_link(dentry, inode); + if (err) + goto out_free_inode; + + d_instantiate(dentry, inode); + return 0; + +out_free_inode: + iput(inode); + return err; +} + +static int omfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + return omfs_add_node(dir, dentry, mode | S_IFDIR); +} + +static int omfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + return omfs_add_node(dir, dentry, mode | S_IFREG); +} + +static struct dentry *omfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct buffer_head *bh; + struct inode *inode = NULL; + + if (dentry->d_name.len > OMFS_NAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + bh = omfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len); + if (!IS_ERR(bh)) { + struct omfs_inode *oi = (struct omfs_inode *)bh->b_data; + ino_t ino = be64_to_cpu(oi->i_head.h_self); + brelse(bh); + inode = omfs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + d_add(dentry, inode); + return NULL; +} + +/* sanity check block's self pointer */ +int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, + u64 fsblock) +{ + int is_bad; + u64 ino = be64_to_cpu(header->h_self); + is_bad = ((ino != fsblock) || (ino < sbi->s_root_ino) || + (ino > sbi->s_num_blocks)); + + if (is_bad) + printk(KERN_WARNING "omfs: bad hash chain detected\n"); + + return is_bad; +} + +static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir, + u64 fsblock, int hindex) +{ + struct inode *dir = filp->f_dentry->d_inode; + struct buffer_head *bh; + struct omfs_inode *oi; + u64 self; + int res = 0; + unsigned char d_type; + + /* follow chain in this bucket */ + while (fsblock != ~0) { + bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb), + fsblock)); + if (!bh) + goto out; + + oi = (struct omfs_inode *) bh->b_data; + if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, fsblock)) { + brelse(bh); + goto out; + } + + self = fsblock; + fsblock = be64_to_cpu(oi->i_sibling); + + /* skip visited nodes */ + if (hindex) { + hindex--; + brelse(bh); + continue; + } + + d_type = (oi->i_type == OMFS_DIR) ? DT_DIR : DT_REG; + + res = filldir(dirent, oi->i_name, strnlen(oi->i_name, + OMFS_NAMELEN), filp->f_pos, self, d_type); + if (res == 0) + filp->f_pos++; + brelse(bh); + } +out: + return res; +} + +static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = old_dentry->d_inode; + struct buffer_head *bh; + int is_dir; + int err; + + is_dir = S_ISDIR(old_inode->i_mode); + + if (new_inode) { + /* overwriting existing file/dir */ + err = -ENOTEMPTY; + if (is_dir && !omfs_dir_is_empty(new_inode)) + goto out; + + err = -ENOENT; + bh = omfs_find_entry(new_dir, new_dentry->d_name.name, + new_dentry->d_name.len); + if (IS_ERR(bh)) + goto out; + brelse(bh); + + err = omfs_unlink(new_dir, new_dentry); + if (err) + goto out; + } + + /* since omfs locates files by name, we need to unlink _before_ + * adding the new link or we won't find the old one */ + inode_inc_link_count(old_inode); + err = omfs_unlink(old_dir, old_dentry); + if (err) { + inode_dec_link_count(old_inode); + goto out; + } + + err = omfs_add_link(new_dentry, old_inode); + if (err) + goto out; + + old_inode->i_ctime = CURRENT_TIME_SEC; +out: + return err; +} + +static int omfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *dir = filp->f_dentry->d_inode; + struct buffer_head *bh; + loff_t offset, res; + unsigned int hchain, hindex; + int nbuckets; + u64 fsblock; + int ret = -EINVAL; + + if (filp->f_pos >> 32) + goto success; + + switch ((unsigned long) filp->f_pos) { + case 0: + if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0) + goto success; + filp->f_pos++; + /* fall through */ + case 1: + if (filldir(dirent, "..", 2, 1, + parent_ino(filp->f_dentry), DT_DIR) < 0) + goto success; + filp->f_pos = 1 << 20; + /* fall through */ + } + + nbuckets = (dir->i_size - OMFS_DIR_START) / 8; + + /* high 12 bits store bucket + 1 and low 20 bits store hash index */ + hchain = (filp->f_pos >> 20) - 1; + hindex = filp->f_pos & 0xfffff; + + bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb), dir->i_ino)); + if (!bh) + goto out; + + offset = OMFS_DIR_START + hchain * 8; + + for (; hchain < nbuckets; hchain++, offset += 8) { + fsblock = be64_to_cpu(*((__be64 *) &bh->b_data[offset])); + + res = omfs_fill_chain(filp, dirent, filldir, fsblock, hindex); + hindex = 0; + if (res < 0) + break; + + filp->f_pos = (hchain+2) << 20; + } + brelse(bh); +success: + ret = 0; +out: + return ret; +} + +struct inode_operations omfs_dir_inops = { + .lookup = omfs_lookup, + .mkdir = omfs_mkdir, + .rename = omfs_rename, + .create = omfs_create, + .unlink = omfs_unlink, + .rmdir = omfs_rmdir, +}; + +struct file_operations omfs_dir_operations = { + .read = generic_read_dir, + .readdir = omfs_readdir, +}; diff --git a/fs/omfs/file.c b/fs/omfs/file.c new file mode 100644 index 000000000000..834b2331f6b3 --- /dev/null +++ b/fs/omfs/file.c @@ -0,0 +1,365 @@ +/* + * OMFS (as used by RIO Karma) file operations. + * Copyright (C) 2005 Bob Copeland <me@bobcopeland.com> + * Released under GPL v2. + */ + +#include <linux/version.h> +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/mpage.h> +#include "omfs.h" + +static int omfs_sync_file(struct file *file, struct dentry *dentry, + int datasync) +{ + struct inode *inode = dentry->d_inode; + int err; + + err = sync_mapping_buffers(inode->i_mapping); + if (!(inode->i_state & I_DIRTY)) + return err; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + return err; + err |= omfs_sync_inode(inode); + return err ? -EIO : 0; +} + +static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset) +{ + return (sbi->s_sys_blocksize - offset - + sizeof(struct omfs_extent)) / + sizeof(struct omfs_extent_entry) + 1; +} + +void omfs_make_empty_table(struct buffer_head *bh, int offset) +{ + struct omfs_extent *oe = (struct omfs_extent *) &bh->b_data[offset]; + + oe->e_next = ~cpu_to_be64(0ULL); + oe->e_extent_count = cpu_to_be32(1), + oe->e_fill = cpu_to_be32(0x22), + oe->e_entry.e_cluster = ~cpu_to_be64(0ULL); + oe->e_entry.e_blocks = ~cpu_to_be64(0ULL); +} + +int omfs_shrink_inode(struct inode *inode) +{ + struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); + struct omfs_extent *oe; + struct omfs_extent_entry *entry; + struct buffer_head *bh; + u64 next, last; + u32 extent_count; + u32 max_extents; + int ret; + + /* traverse extent table, freeing each entry that is greater + * than inode->i_size; + */ + next = inode->i_ino; + + /* only support truncate -> 0 for now */ + ret = -EIO; + if (inode->i_size != 0) + goto out; + + bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next)); + if (!bh) + goto out; + + oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]); + max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START); + + for (;;) { + + if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next)) + goto out_brelse; + + extent_count = be32_to_cpu(oe->e_extent_count); + + if (extent_count > max_extents) + goto out_brelse; + + last = next; + next = be64_to_cpu(oe->e_next); + entry = &oe->e_entry; + + /* ignore last entry as it is the terminator */ + for (; extent_count > 1; extent_count--) { + u64 start, count; + start = be64_to_cpu(entry->e_cluster); + count = be64_to_cpu(entry->e_blocks); + + omfs_clear_range(inode->i_sb, start, (int) count); + entry++; + } + omfs_make_empty_table(bh, (char *) oe - bh->b_data); + mark_buffer_dirty(bh); + brelse(bh); + + if (last != inode->i_ino) + omfs_clear_range(inode->i_sb, last, sbi->s_mirrors); + + if (next == ~0) + break; + + bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next)); + if (!bh) + goto out; + oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]); + max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT); + } + ret = 0; +out: + return ret; +out_brelse: + brelse(bh); + return ret; +} + +static void omfs_truncate(struct inode *inode) +{ + omfs_shrink_inode(inode); + mark_inode_dirty(inode); +} + +/* + * Add new blocks to the current extent, or create new entries/continuations + * as necessary. + */ +static int omfs_grow_extent(struct inode *inode, struct omfs_extent *oe, + u64 *ret_block) +{ + struct omfs_extent_entry *terminator; + struct omfs_extent_entry *entry = &oe->e_entry; + struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); + u32 extent_count = be32_to_cpu(oe->e_extent_count); + u64 new_block = 0; + u32 max_count; + int new_count; + int ret = 0; + + /* reached the end of the extent table with no blocks mapped. + * there are three possibilities for adding: grow last extent, + * add a new extent to the current extent table, and add a + * continuation inode. in last two cases need an allocator for + * sbi->s_cluster_size + */ + + /* TODO: handle holes */ + + /* should always have a terminator */ + if (extent_count < 1) + return -EIO; + + /* trivially grow current extent, if next block is not taken */ + terminator = entry + extent_count - 1; + if (extent_count > 1) { + entry = terminator-1; + new_block = be64_to_cpu(entry->e_cluster) + + be64_to_cpu(entry->e_blocks); + + if (omfs_allocate_block(inode->i_sb, new_block)) { + entry->e_blocks = + cpu_to_be64(be64_to_cpu(entry->e_blocks) + 1); + terminator->e_blocks = ~(cpu_to_be64( + be64_to_cpu(~terminator->e_blocks) + 1)); + goto out; + } + } + max_count = omfs_max_extents(sbi, OMFS_EXTENT_START); + + /* TODO: add a continuation block here */ + if (be32_to_cpu(oe->e_extent_count) > max_count-1) + return -EIO; + + /* try to allocate a new cluster */ + ret = omfs_allocate_range(inode->i_sb, 1, sbi->s_clustersize, + &new_block, &new_count); + if (ret) + goto out_fail; + + /* copy terminator down an entry */ + entry = terminator; + terminator++; + memcpy(terminator, entry, sizeof(struct omfs_extent_entry)); + + entry->e_cluster = cpu_to_be64(new_block); + entry->e_blocks = cpu_to_be64((u64) new_count); + + terminator->e_blocks = ~(cpu_to_be64( + be64_to_cpu(~terminator->e_blocks) + (u64) new_count)); + + /* write in new entry */ + oe->e_extent_count = cpu_to_be32(1 + be32_to_cpu(oe->e_extent_count)); + +out: + *ret_block = new_block; +out_fail: + return ret; +} + +/* + * Scans across the directory table for a given file block number. + * If block not found, return 0. + */ +static sector_t find_block(struct inode *inode, struct omfs_extent_entry *ent, + sector_t block, int count, int *left) +{ + /* count > 1 because of terminator */ + sector_t searched = 0; + for (; count > 1; count--) { + int numblocks = clus_to_blk(OMFS_SB(inode->i_sb), + be64_to_cpu(ent->e_blocks)); + + if (block >= searched && + block < searched + numblocks) { + /* + * found it at cluster + (block - searched) + * numblocks - (block - searched) is remainder + */ + *left = numblocks - (block - searched); + return clus_to_blk(OMFS_SB(inode->i_sb), + be64_to_cpu(ent->e_cluster)) + + block - searched; + } + searched += numblocks; + ent++; + } + return 0; +} + +static int omfs_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + struct buffer_head *bh; + sector_t next, offset; + int ret; + u64 new_block; + u32 max_extents; + int extent_count; + struct omfs_extent *oe; + struct omfs_extent_entry *entry; + struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); + int max_blocks = bh_result->b_size >> inode->i_blkbits; + int remain; + + ret = -EIO; + bh = sb_bread(inode->i_sb, clus_to_blk(sbi, inode->i_ino)); + if (!bh) + goto out; + + oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]); + max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START); + next = inode->i_ino; + + for (;;) { + + if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next)) + goto out_brelse; + + extent_count = be32_to_cpu(oe->e_extent_count); + next = be64_to_cpu(oe->e_next); + entry = &oe->e_entry; + + if (extent_count > max_extents) + goto out_brelse; + + offset = find_block(inode, entry, block, extent_count, &remain); + if (offset > 0) { + ret = 0; + map_bh(bh_result, inode->i_sb, offset); + if (remain > max_blocks) + remain = max_blocks; + bh_result->b_size = (remain << inode->i_blkbits); + goto out_brelse; + } + if (next == ~0) + break; + + brelse(bh); + bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next)); + if (!bh) + goto out; + oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]); + max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT); + } + if (create) { + ret = omfs_grow_extent(inode, oe, &new_block); + if (ret == 0) { + mark_buffer_dirty(bh); + mark_inode_dirty(inode); + map_bh(bh_result, inode->i_sb, + clus_to_blk(sbi, new_block)); + } + } +out_brelse: + brelse(bh); +out: + return ret; +} + +static int omfs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, omfs_get_block); +} + +static int omfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, omfs_get_block); +} + +static int omfs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, omfs_get_block, wbc); +} + +static int +omfs_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, omfs_get_block); +} + +static int omfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + *pagep = NULL; + return block_write_begin(file, mapping, pos, len, flags, + pagep, fsdata, omfs_get_block); +} + +static sector_t omfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, omfs_get_block); +} + +struct file_operations omfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .fsync = omfs_sync_file, + .splice_read = generic_file_splice_read, +}; + +struct inode_operations omfs_file_inops = { + .truncate = omfs_truncate +}; + +struct address_space_operations omfs_aops = { + .readpage = omfs_readpage, + .readpages = omfs_readpages, + .writepage = omfs_writepage, + .writepages = omfs_writepages, + .sync_page = block_sync_page, + .write_begin = omfs_write_begin, + .write_end = generic_write_end, + .bmap = omfs_bmap, +}; + diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c new file mode 100644 index 000000000000..cbf047a847c5 --- /dev/null +++ b/fs/omfs/inode.c @@ -0,0 +1,553 @@ +/* + * Optimized MPEG FS - inode and super operations. + * Copyright (C) 2006 Bob Copeland <me@bobcopeland.com> + * Released under GPL v2. + */ +#include <linux/version.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/parser.h> +#include <linux/buffer_head.h> +#include <linux/vmalloc.h> +#include <linux/crc-itu-t.h> +#include "omfs.h" + +MODULE_AUTHOR("Bob Copeland <me@bobcopeland.com>"); +MODULE_DESCRIPTION("OMFS (ReplayTV/Karma) Filesystem for Linux"); +MODULE_LICENSE("GPL"); + +struct inode *omfs_new_inode(struct inode *dir, int mode) +{ + struct inode *inode; + u64 new_block; + int err; + int len; + struct omfs_sb_info *sbi = OMFS_SB(dir->i_sb); + + inode = new_inode(dir->i_sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + err = omfs_allocate_range(dir->i_sb, sbi->s_mirrors, sbi->s_mirrors, + &new_block, &len); + if (err) + goto fail; + + inode->i_ino = new_block; + inode->i_mode = mode; + inode->i_uid = current->fsuid; + inode->i_gid = current->fsgid; + inode->i_blocks = 0; + inode->i_mapping->a_ops = &omfs_aops; + + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + switch (mode & S_IFMT) { + case S_IFDIR: + inode->i_op = &omfs_dir_inops; + inode->i_fop = &omfs_dir_operations; + inode->i_size = sbi->s_sys_blocksize; + inc_nlink(inode); + break; + case S_IFREG: + inode->i_op = &omfs_file_inops; + inode->i_fop = &omfs_file_operations; + inode->i_size = 0; + break; + } + + insert_inode_hash(inode); + mark_inode_dirty(inode); + return inode; +fail: + make_bad_inode(inode); + iput(inode); + return ERR_PTR(err); +} + +/* + * Update the header checksums for a dirty inode based on its contents. + * Caller is expected to hold the buffer head underlying oi and mark it + * dirty. + */ +static void omfs_update_checksums(struct omfs_inode *oi) +{ + int xor, i, ofs = 0, count; + u16 crc = 0; + unsigned char *ptr = (unsigned char *) oi; + + count = be32_to_cpu(oi->i_head.h_body_size); + ofs = sizeof(struct omfs_header); + + crc = crc_itu_t(crc, ptr + ofs, count); + oi->i_head.h_crc = cpu_to_be16(crc); + + xor = ptr[0]; + for (i = 1; i < OMFS_XOR_COUNT; i++) + xor ^= ptr[i]; + + oi->i_head.h_check_xor = xor; +} + +static int omfs_write_inode(struct inode *inode, int wait) +{ + struct omfs_inode *oi; + struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); + struct buffer_head *bh, *bh2; + unsigned int block; + u64 ctime; + int i; + int ret = -EIO; + int sync_failed = 0; + + /* get current inode since we may have written sibling ptrs etc. */ + block = clus_to_blk(sbi, inode->i_ino); + bh = sb_bread(inode->i_sb, block); + if (!bh) + goto out; + + oi = (struct omfs_inode *) bh->b_data; + + oi->i_head.h_self = cpu_to_be64(inode->i_ino); + if (S_ISDIR(inode->i_mode)) + oi->i_type = OMFS_DIR; + else if (S_ISREG(inode->i_mode)) + oi->i_type = OMFS_FILE; + else { + printk(KERN_WARNING "omfs: unknown file type: %d\n", + inode->i_mode); + goto out_brelse; + } + + oi->i_head.h_body_size = cpu_to_be32(sbi->s_sys_blocksize - + sizeof(struct omfs_header)); + oi->i_head.h_version = 1; + oi->i_head.h_type = OMFS_INODE_NORMAL; + oi->i_head.h_magic = OMFS_IMAGIC; + oi->i_size = cpu_to_be64(inode->i_size); + + ctime = inode->i_ctime.tv_sec * 1000LL + + ((inode->i_ctime.tv_nsec + 999)/1000); + oi->i_ctime = cpu_to_be64(ctime); + + omfs_update_checksums(oi); + + mark_buffer_dirty(bh); + if (wait) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + sync_failed = 1; + } + + /* if mirroring writes, copy to next fsblock */ + for (i = 1; i < sbi->s_mirrors; i++) { + bh2 = sb_bread(inode->i_sb, block + i * + (sbi->s_blocksize / sbi->s_sys_blocksize)); + if (!bh2) + goto out_brelse; + + memcpy(bh2->b_data, bh->b_data, bh->b_size); + mark_buffer_dirty(bh2); + if (wait) { + sync_dirty_buffer(bh2); + if (buffer_req(bh2) && !buffer_uptodate(bh2)) + sync_failed = 1; + } + brelse(bh2); + } + ret = (sync_failed) ? -EIO : 0; +out_brelse: + brelse(bh); +out: + return ret; +} + +int omfs_sync_inode(struct inode *inode) +{ + return omfs_write_inode(inode, 1); +} + +/* + * called when an entry is deleted, need to clear the bits in the + * bitmaps. + */ +static void omfs_delete_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + + if (S_ISREG(inode->i_mode)) { + inode->i_size = 0; + omfs_shrink_inode(inode); + } + + omfs_clear_range(inode->i_sb, inode->i_ino, 2); + clear_inode(inode); +} + +struct inode *omfs_iget(struct super_block *sb, ino_t ino) +{ + struct omfs_sb_info *sbi = OMFS_SB(sb); + struct omfs_inode *oi; + struct buffer_head *bh; + unsigned int block; + u64 ctime; + unsigned long nsecs; + struct inode *inode; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + block = clus_to_blk(sbi, ino); + bh = sb_bread(inode->i_sb, block); + if (!bh) + goto iget_failed; + + oi = (struct omfs_inode *)bh->b_data; + + /* check self */ + if (ino != be64_to_cpu(oi->i_head.h_self)) + goto fail_bh; + + inode->i_uid = sbi->s_uid; + inode->i_gid = sbi->s_gid; + + ctime = be64_to_cpu(oi->i_ctime); + nsecs = do_div(ctime, 1000) * 1000L; + + inode->i_atime.tv_sec = ctime; + inode->i_mtime.tv_sec = ctime; + inode->i_ctime.tv_sec = ctime; + inode->i_atime.tv_nsec = nsecs; + inode->i_mtime.tv_nsec = nsecs; + inode->i_ctime.tv_nsec = nsecs; + + inode->i_mapping->a_ops = &omfs_aops; + + switch (oi->i_type) { + case OMFS_DIR: + inode->i_mode = S_IFDIR | (S_IRWXUGO & ~sbi->s_dmask); + inode->i_op = &omfs_dir_inops; + inode->i_fop = &omfs_dir_operations; + inode->i_size = sbi->s_sys_blocksize; + inc_nlink(inode); + break; + case OMFS_FILE: + inode->i_mode = S_IFREG | (S_IRWXUGO & ~sbi->s_fmask); + inode->i_fop = &omfs_file_operations; + inode->i_size = be64_to_cpu(oi->i_size); + break; + } + brelse(bh); + unlock_new_inode(inode); + return inode; +fail_bh: + brelse(bh); +iget_failed: + iget_failed(inode); + return ERR_PTR(-EIO); +} + +static void omfs_put_super(struct super_block *sb) +{ + struct omfs_sb_info *sbi = OMFS_SB(sb); + kfree(sbi->s_imap); + kfree(sbi); + sb->s_fs_info = NULL; +} + +static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *s = dentry->d_sb; + struct omfs_sb_info *sbi = OMFS_SB(s); + buf->f_type = OMFS_MAGIC; + buf->f_bsize = sbi->s_blocksize; + buf->f_blocks = sbi->s_num_blocks; + buf->f_files = sbi->s_num_blocks; + buf->f_namelen = OMFS_NAMELEN; + + buf->f_bfree = buf->f_bavail = buf->f_ffree = + omfs_count_free(s); + return 0; +} + +static struct super_operations omfs_sops = { + .write_inode = omfs_write_inode, + .delete_inode = omfs_delete_inode, + .put_super = omfs_put_super, + .statfs = omfs_statfs, + .show_options = generic_show_options, +}; + +/* + * For Rio Karma, there is an on-disk free bitmap whose location is + * stored in the root block. For ReplayTV, there is no such free bitmap + * so we have to walk the tree. Both inodes and file data are allocated + * from the same map. This array can be big (300k) so we allocate + * in units of the blocksize. + */ +static int omfs_get_imap(struct super_block *sb) +{ + int bitmap_size; + int array_size; + int count; + struct omfs_sb_info *sbi = OMFS_SB(sb); + struct buffer_head *bh; + unsigned long **ptr; + sector_t block; + + bitmap_size = DIV_ROUND_UP(sbi->s_num_blocks, 8); + array_size = DIV_ROUND_UP(bitmap_size, sb->s_blocksize); + + if (sbi->s_bitmap_ino == ~0ULL) + goto out; + + sbi->s_imap_size = array_size; + sbi->s_imap = kzalloc(array_size * sizeof(unsigned long *), GFP_KERNEL); + if (!sbi->s_imap) + goto nomem; + + block = clus_to_blk(sbi, sbi->s_bitmap_ino); + ptr = sbi->s_imap; + for (count = bitmap_size; count > 0; count -= sb->s_blocksize) { + bh = sb_bread(sb, block++); + if (!bh) + goto nomem_free; + *ptr = kmalloc(sb->s_blocksize, GFP_KERNEL); + if (!*ptr) { + brelse(bh); + goto nomem_free; + } + memcpy(*ptr, bh->b_data, sb->s_blocksize); + if (count < sb->s_blocksize) + memset((void *)*ptr + count, 0xff, + sb->s_blocksize - count); + brelse(bh); + ptr++; + } +out: + return 0; + +nomem_free: + for (count = 0; count < array_size; count++) + kfree(sbi->s_imap[count]); + + kfree(sbi->s_imap); +nomem: + sbi->s_imap = NULL; + sbi->s_imap_size = 0; + return -ENOMEM; +} + +enum { + Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask +}; + +static const match_table_t tokens = { + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_umask, "umask=%o"}, + {Opt_dmask, "dmask=%o"}, + {Opt_fmask, "fmask=%o"}, +}; + +static int parse_options(char *options, struct omfs_sb_info *sbi) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_uid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_uid = option; + break; + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_gid = option; + break; + case Opt_umask: + if (match_octal(&args[0], &option)) + return 0; + sbi->s_fmask = sbi->s_dmask = option; + break; + case Opt_dmask: + if (match_octal(&args[0], &option)) + return 0; + sbi->s_dmask = option; + break; + case Opt_fmask: + if (match_octal(&args[0], &option)) + return 0; + sbi->s_fmask = option; + break; + default: + return 0; + } + } + return 1; +} + +static int omfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct buffer_head *bh, *bh2; + struct omfs_super_block *omfs_sb; + struct omfs_root_block *omfs_rb; + struct omfs_sb_info *sbi; + struct inode *root; + sector_t start; + int ret = -EINVAL; + + save_mount_options(sb, (char *) data); + + sbi = kzalloc(sizeof(struct omfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sb->s_fs_info = sbi; + + sbi->s_uid = current->uid; + sbi->s_gid = current->gid; + sbi->s_dmask = sbi->s_fmask = current->fs->umask; + + if (!parse_options((char *) data, sbi)) + goto end; + + sb->s_maxbytes = 0xffffffff; + + sb_set_blocksize(sb, 0x200); + + bh = sb_bread(sb, 0); + if (!bh) + goto end; + + omfs_sb = (struct omfs_super_block *)bh->b_data; + + if (omfs_sb->s_magic != cpu_to_be32(OMFS_MAGIC)) { + if (!silent) + printk(KERN_ERR "omfs: Invalid superblock (%x)\n", + omfs_sb->s_magic); + goto out_brelse_bh; + } + sb->s_magic = OMFS_MAGIC; + + sbi->s_num_blocks = be64_to_cpu(omfs_sb->s_num_blocks); + sbi->s_blocksize = be32_to_cpu(omfs_sb->s_blocksize); + sbi->s_mirrors = be32_to_cpu(omfs_sb->s_mirrors); + sbi->s_root_ino = be64_to_cpu(omfs_sb->s_root_block); + sbi->s_sys_blocksize = be32_to_cpu(omfs_sb->s_sys_blocksize); + mutex_init(&sbi->s_bitmap_lock); + + if (sbi->s_sys_blocksize > PAGE_SIZE) { + printk(KERN_ERR "omfs: sysblock size (%d) is out of range\n", + sbi->s_sys_blocksize); + goto out_brelse_bh; + } + + if (sbi->s_blocksize < sbi->s_sys_blocksize || + sbi->s_blocksize > OMFS_MAX_BLOCK_SIZE) { + printk(KERN_ERR "omfs: block size (%d) is out of range\n", + sbi->s_blocksize); + goto out_brelse_bh; + } + + /* + * Use sys_blocksize as the fs block since it is smaller than a + * page while the fs blocksize can be larger. + */ + sb_set_blocksize(sb, sbi->s_sys_blocksize); + + /* + * ...and the difference goes into a shift. sys_blocksize is always + * a power of two factor of blocksize. + */ + sbi->s_block_shift = get_bitmask_order(sbi->s_blocksize) - + get_bitmask_order(sbi->s_sys_blocksize); + + start = clus_to_blk(sbi, be64_to_cpu(omfs_sb->s_root_block)); + bh2 = sb_bread(sb, start); + if (!bh2) + goto out_brelse_bh; + + omfs_rb = (struct omfs_root_block *)bh2->b_data; + + sbi->s_bitmap_ino = be64_to_cpu(omfs_rb->r_bitmap); + sbi->s_clustersize = be32_to_cpu(omfs_rb->r_clustersize); + + if (sbi->s_num_blocks != be64_to_cpu(omfs_rb->r_num_blocks)) { + printk(KERN_ERR "omfs: block count discrepancy between " + "super and root blocks (%llx, %llx)\n", + (unsigned long long)sbi->s_num_blocks, + (unsigned long long)be64_to_cpu(omfs_rb->r_num_blocks)); + goto out_brelse_bh2; + } + + ret = omfs_get_imap(sb); + if (ret) + goto out_brelse_bh2; + + sb->s_op = &omfs_sops; + + root = omfs_iget(sb, be64_to_cpu(omfs_rb->r_root_dir)); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out_brelse_bh2; + } + + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + iput(root); + goto out_brelse_bh2; + } + printk(KERN_DEBUG "omfs: Mounted volume %s\n", omfs_rb->r_name); + + ret = 0; +out_brelse_bh2: + brelse(bh2); +out_brelse_bh: + brelse(bh); +end: + return ret; +} + +static int omfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *m) +{ + return get_sb_bdev(fs_type, flags, dev_name, data, omfs_fill_super, m); +} + +static struct file_system_type omfs_fs_type = { + .owner = THIS_MODULE, + .name = "omfs", + .get_sb = omfs_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_omfs_fs(void) +{ + return register_filesystem(&omfs_fs_type); +} + +static void __exit exit_omfs_fs(void) +{ + unregister_filesystem(&omfs_fs_type); +} + +module_init(init_omfs_fs); +module_exit(exit_omfs_fs); diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h new file mode 100644 index 000000000000..2bc0f0670406 --- /dev/null +++ b/fs/omfs/omfs.h @@ -0,0 +1,67 @@ +#ifndef _OMFS_H +#define _OMFS_H + +#include <linux/module.h> +#include <linux/fs.h> + +#include "omfs_fs.h" + +/* In-memory structures */ +struct omfs_sb_info { + u64 s_num_blocks; + u64 s_bitmap_ino; + u64 s_root_ino; + u32 s_blocksize; + u32 s_mirrors; + u32 s_sys_blocksize; + u32 s_clustersize; + int s_block_shift; + unsigned long **s_imap; + int s_imap_size; + struct mutex s_bitmap_lock; + int s_uid; + int s_gid; + int s_dmask; + int s_fmask; +}; + +/* convert a cluster number to a scaled block number */ +static inline sector_t clus_to_blk(struct omfs_sb_info *sbi, sector_t block) +{ + return block << sbi->s_block_shift; +} + +static inline struct omfs_sb_info *OMFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* bitmap.c */ +extern unsigned long omfs_count_free(struct super_block *sb); +extern int omfs_allocate_block(struct super_block *sb, u64 block); +extern int omfs_allocate_range(struct super_block *sb, int min_request, + int max_request, u64 *return_block, int *return_size); +extern int omfs_clear_range(struct super_block *sb, u64 block, int count); + +/* dir.c */ +extern struct file_operations omfs_dir_operations; +extern struct inode_operations omfs_dir_inops; +extern int omfs_make_empty(struct inode *inode, struct super_block *sb); +extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, + u64 fsblock); + +/* file.c */ +extern struct file_operations omfs_file_operations; +extern struct inode_operations omfs_file_inops; +extern struct address_space_operations omfs_aops; +extern void omfs_make_empty_table(struct buffer_head *bh, int offset); +extern int omfs_shrink_inode(struct inode *inode); + +/* inode.c */ +extern struct inode *omfs_iget(struct super_block *sb, ino_t inode); +extern struct inode *omfs_new_inode(struct inode *dir, int mode); +extern int omfs_reserve_block(struct super_block *sb, sector_t block); +extern int omfs_find_empty_block(struct super_block *sb, int mode, ino_t *ino); +extern int omfs_sync_inode(struct inode *inode); + +#endif diff --git a/fs/omfs/omfs_fs.h b/fs/omfs/omfs_fs.h new file mode 100644 index 000000000000..12cca245d6e8 --- /dev/null +++ b/fs/omfs/omfs_fs.h @@ -0,0 +1,80 @@ +#ifndef _OMFS_FS_H +#define _OMFS_FS_H + +/* OMFS On-disk structures */ + +#define OMFS_MAGIC 0xC2993D87 +#define OMFS_IMAGIC 0xD2 + +#define OMFS_DIR 'D' +#define OMFS_FILE 'F' +#define OMFS_INODE_NORMAL 'e' +#define OMFS_INODE_CONTINUATION 'c' +#define OMFS_INODE_SYSTEM 's' +#define OMFS_NAMELEN 256 +#define OMFS_DIR_START 0x1b8 +#define OMFS_EXTENT_START 0x1d0 +#define OMFS_EXTENT_CONT 0x40 +#define OMFS_XOR_COUNT 19 +#define OMFS_MAX_BLOCK_SIZE 8192 + +struct omfs_super_block { + char s_fill1[256]; + __be64 s_root_block; /* block number of omfs_root_block */ + __be64 s_num_blocks; /* total number of FS blocks */ + __be32 s_magic; /* OMFS_MAGIC */ + __be32 s_blocksize; /* size of a block */ + __be32 s_mirrors; /* # of mirrors of system blocks */ + __be32 s_sys_blocksize; /* size of non-data blocks */ +}; + +struct omfs_header { + __be64 h_self; /* FS block where this is located */ + __be32 h_body_size; /* size of useful data after header */ + __be16 h_crc; /* crc-ccitt of body_size bytes */ + char h_fill1[2]; + u8 h_version; /* version, always 1 */ + char h_type; /* OMFS_INODE_X */ + u8 h_magic; /* OMFS_IMAGIC */ + u8 h_check_xor; /* XOR of header bytes before this */ + __be32 h_fill2; +}; + +struct omfs_root_block { + struct omfs_header r_head; /* header */ + __be64 r_fill1; + __be64 r_num_blocks; /* total number of FS blocks */ + __be64 r_root_dir; /* block # of root directory */ + __be64 r_bitmap; /* block # of free space bitmap */ + __be32 r_blocksize; /* size of a block */ + __be32 r_clustersize; /* size allocated for data blocks */ + __be64 r_mirrors; /* # of mirrors of system blocks */ + char r_name[OMFS_NAMELEN]; /* partition label */ +}; + +struct omfs_inode { + struct omfs_header i_head; /* header */ + __be64 i_parent; /* parent containing this inode */ + __be64 i_sibling; /* next inode in hash bucket */ + __be64 i_ctime; /* ctime, in milliseconds */ + char i_fill1[35]; + char i_type; /* OMFS_[DIR,FILE] */ + __be32 i_fill2; + char i_fill3[64]; + char i_name[OMFS_NAMELEN]; /* filename */ + __be64 i_size; /* size of file, in bytes */ +}; + +struct omfs_extent_entry { + __be64 e_cluster; /* start location of a set of blocks */ + __be64 e_blocks; /* number of blocks after e_cluster */ +}; + +struct omfs_extent { + __be64 e_next; /* next extent table location */ + __be32 e_extent_count; /* total # extents in this table */ + __be32 e_fill; + struct omfs_extent_entry e_entry; /* start of extent entries */ +}; + +#endif diff --git a/fs/open.c b/fs/open.c index a1450086e92f..5596049863bf 100644 --- a/fs/open.c +++ b/fs/open.c @@ -16,6 +16,7 @@ #include <linux/namei.h> #include <linux/backing-dev.h> #include <linux/capability.h> +#include <linux/securebits.h> #include <linux/security.h> #include <linux/mount.h> #include <linux/vfs.h> @@ -63,7 +64,8 @@ static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf) memcpy(buf, &st, sizeof(st)); else { if (sizeof buf->f_blocks == 4) { - if ((st.f_blocks | st.f_bfree | st.f_bavail) & + if ((st.f_blocks | st.f_bfree | st.f_bavail | + st.f_bsize | st.f_frsize) & 0xffffffff00000000ULL) return -EOVERFLOW; /* @@ -120,37 +122,37 @@ static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf) return 0; } -asmlinkage long sys_statfs(const char __user * path, struct statfs __user * buf) +asmlinkage long sys_statfs(const char __user *pathname, struct statfs __user * buf) { - struct nameidata nd; + struct path path; int error; - error = user_path_walk(path, &nd); + error = user_path(pathname, &path); if (!error) { struct statfs tmp; - error = vfs_statfs_native(nd.path.dentry, &tmp); + error = vfs_statfs_native(path.dentry, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; - path_put(&nd.path); + path_put(&path); } return error; } -asmlinkage long sys_statfs64(const char __user *path, size_t sz, struct statfs64 __user *buf) +asmlinkage long sys_statfs64(const char __user *pathname, size_t sz, struct statfs64 __user *buf) { - struct nameidata nd; + struct path path; long error; if (sz != sizeof(*buf)) return -EINVAL; - error = user_path_walk(path, &nd); + error = user_path(pathname, &path); if (!error) { struct statfs64 tmp; - error = vfs_statfs64(nd.path.dentry, &tmp); + error = vfs_statfs64(path.dentry, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; - path_put(&nd.path); + path_put(&path); } return error; } @@ -221,20 +223,20 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, return err; } -static long do_sys_truncate(const char __user * path, loff_t length) +static long do_sys_truncate(const char __user *pathname, loff_t length) { - struct nameidata nd; - struct inode * inode; + struct path path; + struct inode *inode; int error; error = -EINVAL; if (length < 0) /* sorry, but loff_t says... */ goto out; - error = user_path_walk(path, &nd); + error = user_path(pathname, &path); if (error) goto out; - inode = nd.path.dentry->d_inode; + inode = path.dentry->d_inode; /* For directories it's -EISDIR, for other non-regulars - -EINVAL */ error = -EISDIR; @@ -245,16 +247,16 @@ static long do_sys_truncate(const char __user * path, loff_t length) if (!S_ISREG(inode->i_mode)) goto dput_and_out; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (error) goto dput_and_out; - error = vfs_permission(&nd, MAY_WRITE); + error = inode_permission(inode, MAY_WRITE); if (error) goto mnt_drop_write_and_out; error = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + if (IS_APPEND(inode)) goto mnt_drop_write_and_out; error = get_write_access(inode); @@ -272,15 +274,15 @@ static long do_sys_truncate(const char __user * path, loff_t length) error = locks_verify_truncate(inode, NULL, length); if (!error) { DQUOT_INIT(inode); - error = do_truncate(nd.path.dentry, length, 0, NULL); + error = do_truncate(path.dentry, length, 0, NULL); } put_write_and_out: put_write_access(inode); mnt_drop_write_and_out: - mnt_drop_write(nd.path.mnt); + mnt_drop_write(path.mnt); dput_and_out: - path_put(&nd.path); + path_put(&path); out: return error; } @@ -423,9 +425,10 @@ out: */ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) { - struct nameidata nd; + struct path path; + struct inode *inode; int old_fsuid, old_fsgid; - kernel_cap_t old_cap; + kernel_cap_t uninitialized_var(old_cap); /* !SECURE_NO_SETUID_FIXUP */ int res; if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ @@ -433,32 +436,47 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) old_fsuid = current->fsuid; old_fsgid = current->fsgid; - old_cap = current->cap_effective; current->fsuid = current->uid; current->fsgid = current->gid; - /* - * Clear the capabilities if we switch to a non-root user - * - * FIXME: There is a race here against sys_capset. The - * capabilities can change yet we will restore the old - * value below. We should hold task_capabilities_lock, - * but we cannot because user_path_walk can sleep. - */ - if (current->uid) - cap_clear(current->cap_effective); - else - current->cap_effective = current->cap_permitted; + if (!issecure(SECURE_NO_SETUID_FIXUP)) { + /* + * Clear the capabilities if we switch to a non-root user + */ +#ifndef CONFIG_SECURITY_FILE_CAPABILITIES + /* + * FIXME: There is a race here against sys_capset. The + * capabilities can change yet we will restore the old + * value below. We should hold task_capabilities_lock, + * but we cannot because user_path_at can sleep. + */ +#endif /* ndef CONFIG_SECURITY_FILE_CAPABILITIES */ + if (current->uid) + old_cap = cap_set_effective(__cap_empty_set); + else + old_cap = cap_set_effective(current->cap_permitted); + } - res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); + res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); if (res) goto out; - res = vfs_permission(&nd, mode); + inode = path.dentry->d_inode; + + if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) { + /* + * MAY_EXEC on regular files is denied if the fs is mounted + * with the "noexec" flag. + */ + res = -EACCES; + if (path.mnt->mnt_flags & MNT_NOEXEC) + goto out_path_release; + } + + res = inode_permission(inode, mode | MAY_ACCESS); /* SuS v2 requires we report a read only fs too */ - if(res || !(mode & S_IWOTH) || - special_file(nd.path.dentry->d_inode->i_mode)) + if (res || !(mode & S_IWOTH) || special_file(inode->i_mode)) goto out_path_release; /* * This is a rare case where using __mnt_is_readonly() @@ -470,15 +488,17 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) * inherently racy and know that the fs may change * state before we even see this result. */ - if (__mnt_is_readonly(nd.path.mnt)) + if (__mnt_is_readonly(path.mnt)) res = -EROFS; out_path_release: - path_put(&nd.path); + path_put(&path); out: current->fsuid = old_fsuid; current->fsgid = old_fsgid; - current->cap_effective = old_cap; + + if (!issecure(SECURE_NO_SETUID_FIXUP)) + cap_set_effective(old_cap); return res; } @@ -490,22 +510,21 @@ asmlinkage long sys_access(const char __user *filename, int mode) asmlinkage long sys_chdir(const char __user * filename) { - struct nameidata nd; + struct path path; int error; - error = __user_walk(filename, - LOOKUP_FOLLOW|LOOKUP_DIRECTORY|LOOKUP_CHDIR, &nd); + error = user_path_dir(filename, &path); if (error) goto out; - error = vfs_permission(&nd, MAY_EXEC); + error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS); if (error) goto dput_and_out; - set_fs_pwd(current->fs, &nd.path); + set_fs_pwd(current->fs, &path); dput_and_out: - path_put(&nd.path); + path_put(&path); out: return error; } @@ -527,7 +546,7 @@ asmlinkage long sys_fchdir(unsigned int fd) if (!S_ISDIR(inode->i_mode)) goto out_putf; - error = file_permission(file, MAY_EXEC); + error = inode_permission(inode, MAY_EXEC | MAY_ACCESS); if (!error) set_fs_pwd(current->fs, &file->f_path); out_putf: @@ -538,14 +557,14 @@ out: asmlinkage long sys_chroot(const char __user * filename) { - struct nameidata nd; + struct path path; int error; - error = __user_walk(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); + error = user_path_dir(filename, &path); if (error) goto out; - error = vfs_permission(&nd, MAY_EXEC); + error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS); if (error) goto dput_and_out; @@ -553,11 +572,10 @@ asmlinkage long sys_chroot(const char __user * filename) if (!capable(CAP_SYS_CHROOT)) goto dput_and_out; - set_fs_root(current->fs, &nd.path); - set_fs_altroot(); + set_fs_root(current->fs, &path); error = 0; dput_and_out: - path_put(&nd.path); + path_put(&path); out: return error; } @@ -582,9 +600,6 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) err = mnt_want_write(file->f_path.mnt); if (err) goto out_putf; - err = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - goto out_drop_write; mutex_lock(&inode->i_mutex); if (mode == (mode_t) -1) mode = inode->i_mode; @@ -592,8 +607,6 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; err = notify_change(dentry, &newattrs); mutex_unlock(&inode->i_mutex); - -out_drop_write: mnt_drop_write(file->f_path.mnt); out_putf: fput(file); @@ -604,36 +617,29 @@ out: asmlinkage long sys_fchmodat(int dfd, const char __user *filename, mode_t mode) { - struct nameidata nd; - struct inode * inode; + struct path path; + struct inode *inode; int error; struct iattr newattrs; - error = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW, &nd); + error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); if (error) goto out; - inode = nd.path.dentry->d_inode; + inode = path.dentry->d_inode; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (error) goto dput_and_out; - - error = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - goto out_drop_write; - mutex_lock(&inode->i_mutex); if (mode == (mode_t) -1) mode = inode->i_mode; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - error = notify_change(nd.path.dentry, &newattrs); + error = notify_change(path.dentry, &newattrs); mutex_unlock(&inode->i_mutex); - -out_drop_write: - mnt_drop_write(nd.path.mnt); + mnt_drop_write(path.mnt); dput_and_out: - path_put(&nd.path); + path_put(&path); out: return error; } @@ -645,18 +651,10 @@ asmlinkage long sys_chmod(const char __user *filename, mode_t mode) static int chown_common(struct dentry * dentry, uid_t user, gid_t group) { - struct inode * inode; + struct inode *inode = dentry->d_inode; int error; struct iattr newattrs; - error = -ENOENT; - if (!(inode = dentry->d_inode)) { - printk(KERN_ERR "chown_common: NULL inode\n"); - goto out; - } - error = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - goto out; newattrs.ia_valid = ATTR_CTIME; if (user != (uid_t) -1) { newattrs.ia_valid |= ATTR_UID; @@ -672,25 +670,25 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group) mutex_lock(&inode->i_mutex); error = notify_change(dentry, &newattrs); mutex_unlock(&inode->i_mutex); -out: + return error; } asmlinkage long sys_chown(const char __user * filename, uid_t user, gid_t group) { - struct nameidata nd; + struct path path; int error; - error = user_path_walk(filename, &nd); + error = user_path(filename, &path); if (error) goto out; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (error) goto out_release; - error = chown_common(nd.path.dentry, user, group); - mnt_drop_write(nd.path.mnt); + error = chown_common(path.dentry, user, group); + mnt_drop_write(path.mnt); out_release: - path_put(&nd.path); + path_put(&path); out: return error; } @@ -698,7 +696,7 @@ out: asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag) { - struct nameidata nd; + struct path path; int error = -EINVAL; int follow; @@ -706,35 +704,35 @@ asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user, goto out; follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; - error = __user_walk_fd(dfd, filename, follow, &nd); + error = user_path_at(dfd, filename, follow, &path); if (error) goto out; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (error) goto out_release; - error = chown_common(nd.path.dentry, user, group); - mnt_drop_write(nd.path.mnt); + error = chown_common(path.dentry, user, group); + mnt_drop_write(path.mnt); out_release: - path_put(&nd.path); + path_put(&path); out: return error; } asmlinkage long sys_lchown(const char __user * filename, uid_t user, gid_t group) { - struct nameidata nd; + struct path path; int error; - error = user_path_walk_link(filename, &nd); + error = user_lpath(filename, &path); if (error) goto out; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (error) goto out_release; - error = chown_common(nd.path.dentry, user, group); - mnt_drop_write(nd.path.mnt); + error = chown_common(path.dentry, user, group); + mnt_drop_write(path.mnt); out_release: - path_put(&nd.path); + path_put(&path); out: return error; } @@ -965,71 +963,6 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) } EXPORT_SYMBOL(dentry_open); -/* - * Find an empty file descriptor entry, and mark it busy. - */ -int get_unused_fd_flags(int flags) -{ - struct files_struct * files = current->files; - int fd, error; - struct fdtable *fdt; - - error = -EMFILE; - spin_lock(&files->file_lock); - -repeat: - fdt = files_fdtable(files); - fd = find_next_zero_bit(fdt->open_fds->fds_bits, fdt->max_fds, - files->next_fd); - - /* - * N.B. For clone tasks sharing a files structure, this test - * will limit the total number of files that can be opened. - */ - if (fd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) - goto out; - - /* Do we need to expand the fd array or fd set? */ - error = expand_files(files, fd); - if (error < 0) - goto out; - - if (error) { - /* - * If we needed to expand the fs array we - * might have blocked - try again. - */ - error = -EMFILE; - goto repeat; - } - - FD_SET(fd, fdt->open_fds); - if (flags & O_CLOEXEC) - FD_SET(fd, fdt->close_on_exec); - else - FD_CLR(fd, fdt->close_on_exec); - files->next_fd = fd + 1; -#if 1 - /* Sanity check */ - if (fdt->fd[fd] != NULL) { - printk(KERN_WARNING "get_unused_fd: slot %d not NULL!\n", fd); - fdt->fd[fd] = NULL; - } -#endif - error = fd; - -out: - spin_unlock(&files->file_lock); - return error; -} - -int get_unused_fd(void) -{ - return get_unused_fd_flags(0); -} - -EXPORT_SYMBOL(get_unused_fd); - static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); @@ -1208,8 +1141,7 @@ EXPORT_SYMBOL(sys_close); asmlinkage long sys_vhangup(void) { if (capable(CAP_SYS_TTY_CONFIG)) { - /* XXX: this needs locking */ - tty_vhangup(current->signal->tty); + tty_vhangup_self(); return 0; } return -EPERM; diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index d17b4fd204e1..9f5b054f06b9 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -430,7 +430,7 @@ static struct file_system_type openprom_fs_type = { .kill_sb = kill_anon_super, }; -static void op_inode_init_once(struct kmem_cache * cachep, void *data) +static void op_inode_init_once(void *data) { struct op_inode_info *oi = (struct op_inode_info *) data; diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 6149e4b58c88..7408227c49c9 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -120,22 +120,21 @@ static int (*check_part[])(struct parsed_partitions *, struct block_device *) = * a pointer to that same buffer (for convenience). */ -char *disk_name(struct gendisk *hd, int part, char *buf) +char *disk_name(struct gendisk *hd, int partno, char *buf) { - if (!part) + if (!partno) snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) - snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, part); + snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno); else - snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, part); + snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno); return buf; } const char *bdevname(struct block_device *bdev, char *buf) { - int part = MINOR(bdev->bd_dev) - bdev->bd_disk->first_minor; - return disk_name(bdev->bd_disk, part, buf); + return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf); } EXPORT_SYMBOL(bdevname); @@ -169,7 +168,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev) if (isdigit(state->name[strlen(state->name)-1])) sprintf(state->name, "p"); - state->limit = hd->minors; + state->limit = disk_max_parts(hd); i = res = err = 0; while (!res && check_part[i]) { memset(&state->parts, 0, sizeof(state->parts)); @@ -204,21 +203,22 @@ static ssize_t part_start_show(struct device *dev, return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); } -static ssize_t part_size_show(struct device *dev, - struct device_attribute *attr, char *buf) +ssize_t part_size_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); } -static ssize_t part_stat_show(struct device *dev, - struct device_attribute *attr, char *buf) +ssize_t part_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); + int cpu; - preempt_disable(); - part_round_stats(p); - preempt_enable(); + cpu = part_stat_lock(); + part_round_stats(cpu, p); + part_stat_unlock(); return sprintf(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " @@ -238,17 +238,17 @@ static ssize_t part_stat_show(struct device *dev, } #ifdef CONFIG_FAIL_MAKE_REQUEST -static ssize_t part_fail_show(struct device *dev, - struct device_attribute *attr, char *buf) +ssize_t part_fail_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); return sprintf(buf, "%d\n", p->make_it_fail); } -static ssize_t part_fail_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +ssize_t part_fail_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { struct hd_struct *p = dev_to_part(dev); int i; @@ -300,40 +300,34 @@ struct device_type part_type = { .release = part_release, }; -static inline void partition_sysfs_add_subdir(struct hd_struct *p) -{ - struct kobject *k; - - k = kobject_get(&p->dev.kobj); - p->holder_dir = kobject_create_and_add("holders", k); - kobject_put(k); -} - -static inline void disk_sysfs_add_subdirs(struct gendisk *disk) +static void delete_partition_rcu_cb(struct rcu_head *head) { - struct kobject *k; + struct hd_struct *part = container_of(head, struct hd_struct, rcu_head); - k = kobject_get(&disk->dev.kobj); - disk->holder_dir = kobject_create_and_add("holders", k); - disk->slave_dir = kobject_create_and_add("slaves", k); - kobject_put(k); + part->start_sect = 0; + part->nr_sects = 0; + part_stat_set_all(part, 0); + put_device(part_to_dev(part)); } -void delete_partition(struct gendisk *disk, int part) +void delete_partition(struct gendisk *disk, int partno) { - struct hd_struct *p = disk->part[part-1]; + struct disk_part_tbl *ptbl = disk->part_tbl; + struct hd_struct *part; - if (!p) + if (partno >= ptbl->len) return; - if (!p->nr_sects) + + part = ptbl->part[partno]; + if (!part) return; - disk->part[part-1] = NULL; - p->start_sect = 0; - p->nr_sects = 0; - part_stat_set_all(p, 0); - kobject_put(p->holder_dir); - device_del(&p->dev); - put_device(&p->dev); + + blk_free_devt(part_devt(part)); + rcu_assign_pointer(ptbl->part[partno], NULL); + kobject_put(part->holder_dir); + device_del(part_to_dev(part)); + + call_rcu(&part->rcu_head, delete_partition_rcu_cb); } static ssize_t whole_disk_show(struct device *dev, @@ -344,86 +338,132 @@ static ssize_t whole_disk_show(struct device *dev, static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH, whole_disk_show, NULL); -void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags) +int add_partition(struct gendisk *disk, int partno, + sector_t start, sector_t len, int flags) { struct hd_struct *p; + dev_t devt = MKDEV(0, 0); + struct device *ddev = disk_to_dev(disk); + struct device *pdev; + struct disk_part_tbl *ptbl; + const char *dname; int err; + err = disk_expand_part_tbl(disk, partno); + if (err) + return err; + ptbl = disk->part_tbl; + + if (ptbl->part[partno]) + return -EBUSY; + p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) - return; + return -ENOMEM; if (!init_part_stats(p)) { - kfree(p); - return; + err = -ENOMEM; + goto out_free; } + pdev = part_to_dev(p); + p->start_sect = start; p->nr_sects = len; - p->partno = part; - p->policy = disk->policy; + p->partno = partno; + p->policy = get_disk_ro(disk); - if (isdigit(disk->dev.bus_id[strlen(disk->dev.bus_id)-1])) - snprintf(p->dev.bus_id, BUS_ID_SIZE, - "%sp%d", disk->dev.bus_id, part); + dname = dev_name(ddev); + if (isdigit(dname[strlen(dname) - 1])) + snprintf(pdev->bus_id, BUS_ID_SIZE, "%sp%d", dname, partno); else - snprintf(p->dev.bus_id, BUS_ID_SIZE, - "%s%d", disk->dev.bus_id, part); + snprintf(pdev->bus_id, BUS_ID_SIZE, "%s%d", dname, partno); - device_initialize(&p->dev); - p->dev.devt = MKDEV(disk->major, disk->first_minor + part); - p->dev.class = &block_class; - p->dev.type = &part_type; - p->dev.parent = &disk->dev; - disk->part[part-1] = p; + device_initialize(pdev); + pdev->class = &block_class; + pdev->type = &part_type; + pdev->parent = ddev; + + err = blk_alloc_devt(p, &devt); + if (err) + goto out_free; + pdev->devt = devt; /* delay uevent until 'holders' subdir is created */ - p->dev.uevent_suppress = 1; - device_add(&p->dev); - partition_sysfs_add_subdir(p); - p->dev.uevent_suppress = 0; - if (flags & ADDPART_FLAG_WHOLEDISK) - err = device_create_file(&p->dev, &dev_attr_whole_disk); + pdev->uevent_suppress = 1; + err = device_add(pdev); + if (err) + goto out_put; + + err = -ENOMEM; + p->holder_dir = kobject_create_and_add("holders", &pdev->kobj); + if (!p->holder_dir) + goto out_del; + + pdev->uevent_suppress = 0; + if (flags & ADDPART_FLAG_WHOLEDISK) { + err = device_create_file(pdev, &dev_attr_whole_disk); + if (err) + goto out_del; + } + + /* everything is up and running, commence */ + INIT_RCU_HEAD(&p->rcu_head); + rcu_assign_pointer(ptbl->part[partno], p); /* suppress uevent if the disk supresses it */ - if (!disk->dev.uevent_suppress) - kobject_uevent(&p->dev.kobj, KOBJ_ADD); + if (!ddev->uevent_suppress) + kobject_uevent(&pdev->kobj, KOBJ_ADD); + + return 0; + +out_free: + kfree(p); + return err; +out_del: + kobject_put(p->holder_dir); + device_del(pdev); +out_put: + put_device(pdev); + blk_free_devt(devt); + return err; } /* Not exported, helper to add_disk(). */ void register_disk(struct gendisk *disk) { + struct device *ddev = disk_to_dev(disk); struct block_device *bdev; + struct disk_part_iter piter; + struct hd_struct *part; char *s; - int i; - struct hd_struct *p; int err; - disk->dev.parent = disk->driverfs_dev; - disk->dev.devt = MKDEV(disk->major, disk->first_minor); + ddev->parent = disk->driverfs_dev; - strlcpy(disk->dev.bus_id, disk->disk_name, KOBJ_NAME_LEN); + strlcpy(ddev->bus_id, disk->disk_name, BUS_ID_SIZE); /* ewww... some of these buggers have / in the name... */ - s = strchr(disk->dev.bus_id, '/'); + s = strchr(ddev->bus_id, '/'); if (s) *s = '!'; /* delay uevents, until we scanned partition table */ - disk->dev.uevent_suppress = 1; + ddev->uevent_suppress = 1; - if (device_add(&disk->dev)) + if (device_add(ddev)) return; #ifndef CONFIG_SYSFS_DEPRECATED - err = sysfs_create_link(block_depr, &disk->dev.kobj, - kobject_name(&disk->dev.kobj)); + err = sysfs_create_link(block_depr, &ddev->kobj, + kobject_name(&ddev->kobj)); if (err) { - device_del(&disk->dev); + device_del(ddev); return; } #endif - disk_sysfs_add_subdirs(disk); + disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); + disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); /* No minors to use for partitions */ - if (disk->minors == 1) + if (!disk_partitionable(disk)) goto exit; /* No such device (e.g., media were just removed) */ @@ -442,51 +482,73 @@ void register_disk(struct gendisk *disk) exit: /* announce disk after possible partitions are created */ - disk->dev.uevent_suppress = 0; - kobject_uevent(&disk->dev.kobj, KOBJ_ADD); + ddev->uevent_suppress = 0; + kobject_uevent(&ddev->kobj, KOBJ_ADD); /* announce possible partitions */ - for (i = 1; i < disk->minors; i++) { - p = disk->part[i-1]; - if (!p || !p->nr_sects) - continue; - kobject_uevent(&p->dev.kobj, KOBJ_ADD); - } + disk_part_iter_init(&piter, disk, 0); + while ((part = disk_part_iter_next(&piter))) + kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); + disk_part_iter_exit(&piter); } int rescan_partitions(struct gendisk *disk, struct block_device *bdev) { + struct disk_part_iter piter; + struct hd_struct *part; struct parsed_partitions *state; - int p, res; + int p, highest, res; if (bdev->bd_part_count) return -EBUSY; res = invalidate_partition(disk, 0); if (res) return res; - bdev->bd_invalidated = 0; - for (p = 1; p < disk->minors; p++) - delete_partition(disk, p); + + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); + while ((part = disk_part_iter_next(&piter))) + delete_partition(disk, part->partno); + disk_part_iter_exit(&piter); + if (disk->fops->revalidate_disk) disk->fops->revalidate_disk(disk); + check_disk_size_change(disk, bdev); + bdev->bd_invalidated = 0; if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) return 0; if (IS_ERR(state)) /* I/O error reading the partition table */ return -EIO; /* tell userspace that the media / partition table may have changed */ - kobject_uevent(&disk->dev.kobj, KOBJ_CHANGE); + kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); + + /* Detect the highest partition number and preallocate + * disk->part_tbl. This is an optimization and not strictly + * necessary. + */ + for (p = 1, highest = 0; p < state->limit; p++) + if (state->parts[p].size) + highest = p; + + disk_expand_part_tbl(disk, highest); + /* add partitions */ for (p = 1; p < state->limit; p++) { sector_t size = state->parts[p].size; sector_t from = state->parts[p].from; if (!size) continue; if (from + size > get_capacity(disk)) { - printk(" %s: p%d exceeds device capacity\n", + printk(KERN_WARNING + "%s: p%d exceeds device capacity\n", disk->disk_name, p); } - add_partition(disk, p, from, size, state->parts[p].flags); + res = add_partition(disk, p, from, size, state->parts[p].flags); + if (res) { + printk(KERN_ERR " %s: p%d could not be added: %d\n", + disk->disk_name, p, -res); + continue; + } #ifdef CONFIG_BLK_DEV_MD if (state->parts[p].flags & ADDPART_FLAG_RAID) md_autodetect_dev(bdev->bd_dev+p); @@ -519,25 +581,31 @@ EXPORT_SYMBOL(read_dev_sector); void del_gendisk(struct gendisk *disk) { - int p; + struct disk_part_iter piter; + struct hd_struct *part; /* invalidate stuff */ - for (p = disk->minors - 1; p > 0; p--) { - invalidate_partition(disk, p); - delete_partition(disk, p); + disk_part_iter_init(&piter, disk, + DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); + while ((part = disk_part_iter_next(&piter))) { + invalidate_partition(disk, part->partno); + delete_partition(disk, part->partno); } + disk_part_iter_exit(&piter); + invalidate_partition(disk, 0); - disk->capacity = 0; + blk_free_devt(disk_to_dev(disk)->devt); + set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; unlink_gendisk(disk); - disk_stat_set_all(disk, 0); - disk->stamp = 0; + part_stat_set_all(&disk->part0, 0); + disk->part0.stamp = 0; - kobject_put(disk->holder_dir); + kobject_put(disk->part0.holder_dir); kobject_put(disk->slave_dir); disk->driverfs_dev = NULL; #ifndef CONFIG_SYSFS_DEPRECATED - sysfs_remove_link(block_depr, disk->dev.bus_id); + sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); #endif - device_del(&disk->dev); + device_del(disk_to_dev(disk)); } diff --git a/fs/partitions/check.h b/fs/partitions/check.h index 17ae8ecd9e8b..98dbe1a84528 100644 --- a/fs/partitions/check.h +++ b/fs/partitions/check.h @@ -5,15 +5,13 @@ * add_gd_partition adds a partitions details to the devices partition * description. */ -enum { MAX_PART = 256 }; - struct parsed_partitions { char name[BDEVNAME_SIZE]; struct { sector_t from; sector_t size; int flags; - } parts[MAX_PART]; + } parts[DISK_MAX_PARTS]; int next; int limit; }; diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c index e7b07006bc41..038a6022152f 100644 --- a/fs/partitions/efi.c +++ b/fs/partitions/efi.c @@ -95,13 +95,6 @@ #include "check.h" #include "efi.h" -#undef EFI_DEBUG -#ifdef EFI_DEBUG -#define Dprintk(x...) printk(KERN_DEBUG x) -#else -#define Dprintk(x...) -#endif - /* This allows a kernel command line option 'gpt' to override * the test for invalid PMBR. Not __initdata because reloading * the partition tables happens after init too. @@ -305,10 +298,10 @@ is_gpt_valid(struct block_device *bdev, u64 lba, /* Check the GUID Partition Table signature */ if (le64_to_cpu((*gpt)->signature) != GPT_HEADER_SIGNATURE) { - Dprintk("GUID Partition Table Header signature is wrong:" - "%lld != %lld\n", - (unsigned long long)le64_to_cpu((*gpt)->signature), - (unsigned long long)GPT_HEADER_SIGNATURE); + pr_debug("GUID Partition Table Header signature is wrong:" + "%lld != %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->signature), + (unsigned long long)GPT_HEADER_SIGNATURE); goto fail; } @@ -318,9 +311,8 @@ is_gpt_valid(struct block_device *bdev, u64 lba, crc = efi_crc32((const unsigned char *) (*gpt), le32_to_cpu((*gpt)->header_size)); if (crc != origcrc) { - Dprintk - ("GUID Partition Table Header CRC is wrong: %x != %x\n", - crc, origcrc); + pr_debug("GUID Partition Table Header CRC is wrong: %x != %x\n", + crc, origcrc); goto fail; } (*gpt)->header_crc32 = cpu_to_le32(origcrc); @@ -328,9 +320,9 @@ is_gpt_valid(struct block_device *bdev, u64 lba, /* Check that the my_lba entry points to the LBA that contains * the GUID Partition Table */ if (le64_to_cpu((*gpt)->my_lba) != lba) { - Dprintk("GPT my_lba incorrect: %lld != %lld\n", - (unsigned long long)le64_to_cpu((*gpt)->my_lba), - (unsigned long long)lba); + pr_debug("GPT my_lba incorrect: %lld != %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->my_lba), + (unsigned long long)lba); goto fail; } @@ -339,15 +331,15 @@ is_gpt_valid(struct block_device *bdev, u64 lba, */ lastlba = last_lba(bdev); if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { - Dprintk("GPT: first_usable_lba incorrect: %lld > %lld\n", - (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), - (unsigned long long)lastlba); + pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), + (unsigned long long)lastlba); goto fail; } if (le64_to_cpu((*gpt)->last_usable_lba) > lastlba) { - Dprintk("GPT: last_usable_lba incorrect: %lld > %lld\n", - (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), - (unsigned long long)lastlba); + pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), + (unsigned long long)lastlba); goto fail; } @@ -360,7 +352,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba, le32_to_cpu((*gpt)->sizeof_partition_entry)); if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) { - Dprintk("GUID Partitition Entry Array CRC check failed.\n"); + pr_debug("GUID Partitition Entry Array CRC check failed.\n"); goto fail_ptes; } @@ -616,7 +608,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev) return 0; } - Dprintk("GUID Partition Table is valid! Yea!\n"); + pr_debug("GUID Partition Table is valid! Yea!\n"); for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { if (!is_pte_valid(&ptes[i], last_lba(bdev))) diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c index 0fdda2e8a4cc..8652fb99e962 100644 --- a/fs/partitions/ldm.c +++ b/fs/partitions/ldm.c @@ -133,17 +133,17 @@ static bool ldm_parse_privhead(const u8 *data, struct privhead *ph) bool is_vista = false; BUG_ON(!data || !ph); - if (MAGIC_PRIVHEAD != BE64(data)) { + if (MAGIC_PRIVHEAD != get_unaligned_be64(data)) { ldm_error("Cannot find PRIVHEAD structure. LDM database is" " corrupt. Aborting."); return false; } - ph->ver_major = BE16(data + 0x000C); - ph->ver_minor = BE16(data + 0x000E); - ph->logical_disk_start = BE64(data + 0x011B); - ph->logical_disk_size = BE64(data + 0x0123); - ph->config_start = BE64(data + 0x012B); - ph->config_size = BE64(data + 0x0133); + ph->ver_major = get_unaligned_be16(data + 0x000C); + ph->ver_minor = get_unaligned_be16(data + 0x000E); + ph->logical_disk_start = get_unaligned_be64(data + 0x011B); + ph->logical_disk_size = get_unaligned_be64(data + 0x0123); + ph->config_start = get_unaligned_be64(data + 0x012B); + ph->config_size = get_unaligned_be64(data + 0x0133); /* Version 2.11 is Win2k/XP and version 2.12 is Vista. */ if (ph->ver_major == 2 && ph->ver_minor == 12) is_vista = true; @@ -191,14 +191,14 @@ static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc) { BUG_ON (!data || !toc); - if (MAGIC_TOCBLOCK != BE64 (data)) { + if (MAGIC_TOCBLOCK != get_unaligned_be64(data)) { ldm_crit ("Cannot find TOCBLOCK, database may be corrupt."); return false; } strncpy (toc->bitmap1_name, data + 0x24, sizeof (toc->bitmap1_name)); toc->bitmap1_name[sizeof (toc->bitmap1_name) - 1] = 0; - toc->bitmap1_start = BE64 (data + 0x2E); - toc->bitmap1_size = BE64 (data + 0x36); + toc->bitmap1_start = get_unaligned_be64(data + 0x2E); + toc->bitmap1_size = get_unaligned_be64(data + 0x36); if (strncmp (toc->bitmap1_name, TOC_BITMAP1, sizeof (toc->bitmap1_name)) != 0) { @@ -208,8 +208,8 @@ static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc) } strncpy (toc->bitmap2_name, data + 0x46, sizeof (toc->bitmap2_name)); toc->bitmap2_name[sizeof (toc->bitmap2_name) - 1] = 0; - toc->bitmap2_start = BE64 (data + 0x50); - toc->bitmap2_size = BE64 (data + 0x58); + toc->bitmap2_start = get_unaligned_be64(data + 0x50); + toc->bitmap2_size = get_unaligned_be64(data + 0x58); if (strncmp (toc->bitmap2_name, TOC_BITMAP2, sizeof (toc->bitmap2_name)) != 0) { ldm_crit ("TOCBLOCK's second bitmap is '%s', should be '%s'.", @@ -237,22 +237,22 @@ static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm) { BUG_ON (!data || !vm); - if (MAGIC_VMDB != BE32 (data)) { + if (MAGIC_VMDB != get_unaligned_be32(data)) { ldm_crit ("Cannot find the VMDB, database may be corrupt."); return false; } - vm->ver_major = BE16 (data + 0x12); - vm->ver_minor = BE16 (data + 0x14); + vm->ver_major = get_unaligned_be16(data + 0x12); + vm->ver_minor = get_unaligned_be16(data + 0x14); if ((vm->ver_major != 4) || (vm->ver_minor != 10)) { ldm_error ("Expected VMDB version %d.%d, got %d.%d. " "Aborting.", 4, 10, vm->ver_major, vm->ver_minor); return false; } - vm->vblk_size = BE32 (data + 0x08); - vm->vblk_offset = BE32 (data + 0x0C); - vm->last_vblk_seq = BE32 (data + 0x04); + vm->vblk_size = get_unaligned_be32(data + 0x08); + vm->vblk_offset = get_unaligned_be32(data + 0x0C); + vm->last_vblk_seq = get_unaligned_be32(data + 0x04); ldm_debug ("Parsed VMDB successfully."); return true; @@ -507,7 +507,7 @@ static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base, goto out; /* Already logged */ /* Are there uncommitted transactions? */ - if (BE16(data + 0x10) != 0x01) { + if (get_unaligned_be16(data + 0x10) != 0x01) { ldm_crit ("Database is not in a consistent state. Aborting."); goto out; } @@ -802,7 +802,7 @@ static bool ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb) return false; len += VBLK_SIZE_CMP3; - if (len != BE32 (buffer + 0x14)) + if (len != get_unaligned_be32(buffer + 0x14)) return false; comp = &vb->vblk.comp; @@ -851,7 +851,7 @@ static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb) return false; len += VBLK_SIZE_DGR3; - if (len != BE32 (buffer + 0x14)) + if (len != get_unaligned_be32(buffer + 0x14)) return false; dgrp = &vb->vblk.dgrp; @@ -895,7 +895,7 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb) return false; len += VBLK_SIZE_DGR4; - if (len != BE32 (buffer + 0x14)) + if (len != get_unaligned_be32(buffer + 0x14)) return false; dgrp = &vb->vblk.dgrp; @@ -931,7 +931,7 @@ static bool ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb) return false; len += VBLK_SIZE_DSK3; - if (len != BE32 (buffer + 0x14)) + if (len != get_unaligned_be32(buffer + 0x14)) return false; disk = &vb->vblk.disk; @@ -968,7 +968,7 @@ static bool ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb) return false; len += VBLK_SIZE_DSK4; - if (len != BE32 (buffer + 0x14)) + if (len != get_unaligned_be32(buffer + 0x14)) return false; disk = &vb->vblk.disk; @@ -1034,14 +1034,14 @@ static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb) return false; } len += VBLK_SIZE_PRT3; - if (len > BE32(buffer + 0x14)) { + if (len > get_unaligned_be32(buffer + 0x14)) { ldm_error("len %d > BE32(buffer + 0x14) %d", len, - BE32(buffer + 0x14)); + get_unaligned_be32(buffer + 0x14)); return false; } part = &vb->vblk.part; - part->start = BE64(buffer + 0x24 + r_name); - part->volume_offset = BE64(buffer + 0x2C + r_name); + part->start = get_unaligned_be64(buffer + 0x24 + r_name); + part->volume_offset = get_unaligned_be64(buffer + 0x2C + r_name); part->size = ldm_get_vnum(buffer + 0x34 + r_name); part->parent_id = ldm_get_vnum(buffer + 0x34 + r_size); part->disk_id = ldm_get_vnum(buffer + 0x34 + r_parent); @@ -1139,9 +1139,9 @@ static bool ldm_parse_vol5(const u8 *buffer, int buflen, struct vblk *vb) return false; } len += VBLK_SIZE_VOL5; - if (len > BE32(buffer + 0x14)) { + if (len > get_unaligned_be32(buffer + 0x14)) { ldm_error("len %d > BE32(buffer + 0x14) %d", len, - BE32(buffer + 0x14)); + get_unaligned_be32(buffer + 0x14)); return false; } volu = &vb->vblk.volu; @@ -1294,9 +1294,9 @@ static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags) BUG_ON (!data || !frags); - group = BE32 (data + 0x08); - rec = BE16 (data + 0x0C); - num = BE16 (data + 0x0E); + group = get_unaligned_be32(data + 0x08); + rec = get_unaligned_be16(data + 0x0C); + num = get_unaligned_be16(data + 0x0E); if ((num < 1) || (num > 4)) { ldm_error ("A VBLK claims to have %d parts.", num); return false; @@ -1425,12 +1425,12 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, } for (v = 0; v < perbuf; v++, data+=size) { /* For each vblk */ - if (MAGIC_VBLK != BE32 (data)) { + if (MAGIC_VBLK != get_unaligned_be32(data)) { ldm_error ("Expected to find a VBLK."); goto out; } - recs = BE16 (data + 0x0E); /* Number of records */ + recs = get_unaligned_be16(data + 0x0E); /* Number of records */ if (recs == 1) { if (!ldm_ldmdb_add (data, size, ldb)) goto out; /* Already logged */ diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h index 80f63b5fdd9f..30e08e809c1d 100644 --- a/fs/partitions/ldm.h +++ b/fs/partitions/ldm.h @@ -98,11 +98,6 @@ struct parsed_partitions; #define TOC_BITMAP1 "config" /* Names of the two defined */ #define TOC_BITMAP2 "log" /* bitmaps in the TOCBLOCK. */ -/* Most numbers we deal with are big-endian and won't be aligned. */ -#define BE16(x) ((u16)be16_to_cpu(get_unaligned((__be16*)(x)))) -#define BE32(x) ((u32)be32_to_cpu(get_unaligned((__be32*)(x)))) -#define BE64(x) ((u64)be64_to_cpu(get_unaligned((__be64*)(x)))) - /* Borrowed from msdos.c */ #define SYS_IND(p) (get_unaligned(&(p)->sys_ind)) diff --git a/fs/pipe.c b/fs/pipe.c index ec228bc9f882..fcba6542b8d0 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -777,45 +777,10 @@ pipe_rdwr_open(struct inode *inode, struct file *filp) /* * The file_operations structs are not static because they * are also used in linux/fs/fifo.c to do operations on FIFOs. + * + * Pipes reuse fifos' file_operations structs. */ -const struct file_operations read_fifo_fops = { - .llseek = no_llseek, - .read = do_sync_read, - .aio_read = pipe_read, - .write = bad_pipe_w, - .poll = pipe_poll, - .unlocked_ioctl = pipe_ioctl, - .open = pipe_read_open, - .release = pipe_read_release, - .fasync = pipe_read_fasync, -}; - -const struct file_operations write_fifo_fops = { - .llseek = no_llseek, - .read = bad_pipe_r, - .write = do_sync_write, - .aio_write = pipe_write, - .poll = pipe_poll, - .unlocked_ioctl = pipe_ioctl, - .open = pipe_write_open, - .release = pipe_write_release, - .fasync = pipe_write_fasync, -}; - -const struct file_operations rdwr_fifo_fops = { - .llseek = no_llseek, - .read = do_sync_read, - .aio_read = pipe_read, - .write = do_sync_write, - .aio_write = pipe_write, - .poll = pipe_poll, - .unlocked_ioctl = pipe_ioctl, - .open = pipe_rdwr_open, - .release = pipe_rdwr_release, - .fasync = pipe_rdwr_fasync, -}; - -static const struct file_operations read_pipe_fops = { +const struct file_operations read_pipefifo_fops = { .llseek = no_llseek, .read = do_sync_read, .aio_read = pipe_read, @@ -827,7 +792,7 @@ static const struct file_operations read_pipe_fops = { .fasync = pipe_read_fasync, }; -static const struct file_operations write_pipe_fops = { +const struct file_operations write_pipefifo_fops = { .llseek = no_llseek, .read = bad_pipe_r, .write = do_sync_write, @@ -839,7 +804,7 @@ static const struct file_operations write_pipe_fops = { .fasync = pipe_write_fasync, }; -static const struct file_operations rdwr_pipe_fops = { +const struct file_operations rdwr_pipefifo_fops = { .llseek = no_llseek, .read = do_sync_read, .aio_read = pipe_read, @@ -927,7 +892,7 @@ static struct inode * get_pipe_inode(void) inode->i_pipe = pipe; pipe->readers = pipe->writers = 1; - inode->i_fop = &rdwr_pipe_fops; + inode->i_fop = &rdwr_pipefifo_fops; /* * Mark the inode dirty from the very beginning, @@ -950,7 +915,7 @@ fail_inode: return NULL; } -struct file *create_write_pipe(void) +struct file *create_write_pipe(int flags) { int err; struct inode *inode; @@ -978,12 +943,12 @@ struct file *create_write_pipe(void) d_instantiate(dentry, inode); err = -ENFILE; - f = alloc_file(pipe_mnt, dentry, FMODE_WRITE, &write_pipe_fops); + f = alloc_file(pipe_mnt, dentry, FMODE_WRITE, &write_pipefifo_fops); if (!f) goto err_dentry; f->f_mapping = inode->i_mapping; - f->f_flags = O_WRONLY; + f->f_flags = O_WRONLY | (flags & O_NONBLOCK); f->f_version = 0; return f; @@ -1003,51 +968,53 @@ struct file *create_write_pipe(void) void free_write_pipe(struct file *f) { free_pipe_info(f->f_dentry->d_inode); - dput(f->f_path.dentry); - mntput(f->f_path.mnt); + path_put(&f->f_path); put_filp(f); } -struct file *create_read_pipe(struct file *wrf) +struct file *create_read_pipe(struct file *wrf, int flags) { struct file *f = get_empty_filp(); if (!f) return ERR_PTR(-ENFILE); /* Grab pipe from the writer */ - f->f_path.mnt = mntget(wrf->f_path.mnt); - f->f_path.dentry = dget(wrf->f_path.dentry); + f->f_path = wrf->f_path; + path_get(&wrf->f_path); f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping; f->f_pos = 0; - f->f_flags = O_RDONLY; - f->f_op = &read_pipe_fops; + f->f_flags = O_RDONLY | (flags & O_NONBLOCK); + f->f_op = &read_pipefifo_fops; f->f_mode = FMODE_READ; f->f_version = 0; return f; } -int do_pipe(int *fd) +int do_pipe_flags(int *fd, int flags) { struct file *fw, *fr; int error; int fdw, fdr; - fw = create_write_pipe(); + if (flags & ~(O_CLOEXEC | O_NONBLOCK)) + return -EINVAL; + + fw = create_write_pipe(flags); if (IS_ERR(fw)) return PTR_ERR(fw); - fr = create_read_pipe(fw); + fr = create_read_pipe(fw, flags); error = PTR_ERR(fr); if (IS_ERR(fr)) goto err_write_pipe; - error = get_unused_fd(); + error = get_unused_fd_flags(flags); if (error < 0) goto err_read_pipe; fdr = error; - error = get_unused_fd(); + error = get_unused_fd_flags(flags); if (error < 0) goto err_fdr; fdw = error; @@ -1068,24 +1035,28 @@ int do_pipe(int *fd) err_fdr: put_unused_fd(fdr); err_read_pipe: - dput(fr->f_dentry); - mntput(fr->f_vfsmnt); + path_put(&fr->f_path); put_filp(fr); err_write_pipe: free_write_pipe(fw); return error; } +int do_pipe(int *fd) +{ + return do_pipe_flags(fd, 0); +} + /* * sys_pipe() is the normal C calling standard for creating * a pipe. It's not the way Unix traditionally does this, though. */ -asmlinkage long __weak sys_pipe(int __user *fildes) +asmlinkage long __weak sys_pipe2(int __user *fildes, int flags) { int fd[2]; int error; - error = do_pipe(fd); + error = do_pipe_flags(fd, flags); if (!error) { if (copy_to_user(fildes, fd, sizeof(fd))) { sys_close(fd[0]); @@ -1096,6 +1067,11 @@ asmlinkage long __weak sys_pipe(int __user *fildes) return error; } +asmlinkage long __weak sys_pipe(int __user *fildes) +{ + return sys_pipe2(fildes, 0); +} + /* * pipefs should _never_ be mounted by userland - too much of security hassle, * no real gain from having the whole whorehouse mounted. So we don't need diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig new file mode 100644 index 000000000000..50f8f0600f06 --- /dev/null +++ b/fs/proc/Kconfig @@ -0,0 +1,69 @@ +config PROC_FS + bool "/proc file system support" if EMBEDDED + default y + help + This is a virtual file system providing information about the status + of the system. "Virtual" means that it doesn't take up any space on + your hard disk: the files are created on the fly by the kernel when + you try to access them. Also, you cannot read the files with older + version of the program less: you need to use more or cat. + + It's totally cool; for example, "cat /proc/interrupts" gives + information about what the different IRQs are used for at the moment + (there is a small number of Interrupt ReQuest lines in your computer + that are used by the attached devices to gain the CPU's attention -- + often a source of trouble if two devices are mistakenly configured + to use the same IRQ). The program procinfo to display some + information about your system gathered from the /proc file system. + + Before you can use the /proc file system, it has to be mounted, + meaning it has to be given a location in the directory hierarchy. + That location should be /proc. A command such as "mount -t proc proc + /proc" or the equivalent line in /etc/fstab does the job. + + The /proc file system is explained in the file + <file:Documentation/filesystems/proc.txt> and on the proc(5) manpage + ("man 5 proc"). + + This option will enlarge your kernel by about 67 KB. Several + programs depend on this, so everyone should say Y here. + +config PROC_KCORE + bool "/proc/kcore support" if !ARM + depends on PROC_FS && MMU + +config PROC_VMCORE + bool "/proc/vmcore support (EXPERIMENTAL)" + depends on PROC_FS && CRASH_DUMP + default y + help + Exports the dump image of crashed kernel in ELF format. + +config PROC_SYSCTL + bool "Sysctl support (/proc/sys)" if EMBEDDED + depends on PROC_FS + select SYSCTL + default y + ---help--- + The sysctl interface provides a means of dynamically changing + certain kernel parameters and variables on the fly without requiring + a recompile of the kernel or reboot of the system. The primary + interface is through /proc/sys. If you say Y here a tree of + modifiable sysctl entries will be generated beneath the + /proc/sys directory. They are explained in the files + in <file:Documentation/sysctl/>. Note that enabling this + option will enlarge the kernel by at least 8 KB. + + As it is generally a good thing, you should say Y here unless + building a kernel for install/rescue disks or your system is very + limited in memory. + +config PROC_PAGE_MONITOR + default y + depends on PROC_FS && MMU + bool "Enable /proc page monitoring" if EMBEDDED + help + Various /proc files exist to monitor process memory utilization: + /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap, + /proc/kpagecount, and /proc/kpageflags. Disabling these + interfaces will reduce the size of the kernel by approximately 4kb. diff --git a/fs/proc/array.c b/fs/proc/array.c index 797d775e0354..f4bc0e789539 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -80,16 +80,12 @@ #include <linux/delayacct.h> #include <linux/seq_file.h> #include <linux/pid_namespace.h> +#include <linux/tracehook.h> #include <asm/pgtable.h> #include <asm/processor.h> #include "internal.h" -/* Gcc optimizes away "strlen(x)" for constant x */ -#define ADDBUF(buffer, string) \ -do { memcpy(buffer, string, strlen(string)); \ - buffer += strlen(string); } while (0) - static inline void task_name(struct seq_file *m, struct task_struct *p) { int i; @@ -168,8 +164,12 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, rcu_read_lock(); ppid = pid_alive(p) ? task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; - tpid = pid_alive(p) && p->ptrace ? - task_pid_nr_ns(rcu_dereference(p->parent), ns) : 0; + tpid = 0; + if (pid_alive(p)) { + struct task_struct *tracer = tracehook_tracer_task(p); + if (tracer) + tpid = task_pid_nr_ns(tracer, ns); + } seq_printf(m, "State:\t%s\n" "Tgid:\t%d\n" @@ -256,7 +256,6 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) sigemptyset(&ignored); sigemptyset(&caught); - rcu_read_lock(); if (lock_task_sighand(p, &flags)) { pending = p->pending.signal; shpending = p->signal->shared_pending.signal; @@ -267,7 +266,6 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; unlock_task_sighand(p, &flags); } - rcu_read_unlock(); seq_printf(m, "Threads:\t%d\n", num_threads); seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim); @@ -332,65 +330,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, return 0; } -/* - * Use precise platform statistics if available: - */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING -static cputime_t task_utime(struct task_struct *p) -{ - return p->utime; -} - -static cputime_t task_stime(struct task_struct *p) -{ - return p->stime; -} -#else -static cputime_t task_utime(struct task_struct *p) -{ - clock_t utime = cputime_to_clock_t(p->utime), - total = utime + cputime_to_clock_t(p->stime); - u64 temp; - - /* - * Use CFS's precise accounting: - */ - temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); - - if (total) { - temp *= utime; - do_div(temp, total); - } - utime = (clock_t)temp; - - p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); - return p->prev_utime; -} - -static cputime_t task_stime(struct task_struct *p) -{ - clock_t stime; - - /* - * Use CFS's precise accounting. (we subtract utime from - * the total, to make sure the total observed by userspace - * grows monotonically - apps rely on that): - */ - stime = nsec_to_clock_t(p->se.sum_exec_runtime) - - cputime_to_clock_t(task_utime(p)); - - if (stime >= 0) - p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); - - return p->prev_stime; -} -#endif - -static cputime_t task_gtime(struct task_struct *p) -{ - return p->gtime; -} - static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task, int whole) { diff --git a/fs/proc/base.c b/fs/proc/base.c index 3b455371e7ff..b5918ae8ca79 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -53,6 +53,7 @@ #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/stat.h> +#include <linux/task_io_accounting_ops.h> #include <linux/init.h> #include <linux/capability.h> #include <linux/file.h> @@ -69,6 +70,7 @@ #include <linux/mount.h> #include <linux/security.h> #include <linux/ptrace.h> +#include <linux/tracehook.h> #include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/audit.h> @@ -146,9 +148,6 @@ static unsigned int pid_entry_count_dirs(const struct pid_entry *entries, return count; } -int maps_protect; -EXPORT_SYMBOL(maps_protect); - static struct fs_struct *get_fs_struct(struct task_struct *task) { struct fs_struct *fs; @@ -162,7 +161,6 @@ static struct fs_struct *get_fs_struct(struct task_struct *task) static int get_nr_threads(struct task_struct *tsk) { - /* Must be called with the rcu_read_lock held */ unsigned long flags; int count = 0; @@ -231,10 +229,14 @@ static int check_mem_permission(struct task_struct *task) * If current is actively ptrace'ing, and would also be * permitted to freshly attach with ptrace now, permit it. */ - if (task->parent == current && (task->ptrace & PT_PTRACED) && - task_is_stopped_or_traced(task) && - ptrace_may_attach(task)) - return 0; + if (task_is_stopped_or_traced(task)) { + int match; + rcu_read_lock(); + match = (tracehook_tracer_task(task) == current); + rcu_read_unlock(); + if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH)) + return 0; + } /* * Noone else is allowed. @@ -251,7 +253,8 @@ struct mm_struct *mm_for_maps(struct task_struct *task) task_lock(task); if (task->mm != mm) goto out; - if (task->mm != current->mm && __ptrace_may_attach(task) < 0) + if (task->mm != current->mm && + __ptrace_may_access(task, PTRACE_MODE_READ) < 0) goto out; task_unlock(task); return mm; @@ -464,14 +467,10 @@ static int proc_pid_limits(struct task_struct *task, char *buffer) struct rlimit rlim[RLIM_NLIMITS]; - rcu_read_lock(); - if (!lock_task_sighand(task,&flags)) { - rcu_read_unlock(); + if (!lock_task_sighand(task, &flags)) return 0; - } memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS); unlock_task_sighand(task, &flags); - rcu_read_unlock(); /* * print the file header @@ -503,6 +502,26 @@ static int proc_pid_limits(struct task_struct *task, char *buffer) return count; } +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK +static int proc_pid_syscall(struct task_struct *task, char *buffer) +{ + long nr; + unsigned long args[6], sp, pc; + + if (task_current_syscall(task, &nr, args, 6, &sp, &pc)) + return sprintf(buffer, "running\n"); + + if (nr < 0) + return sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc); + + return sprintf(buffer, + "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", + nr, + args[0], args[1], args[2], args[3], args[4], args[5], + sp, pc); +} +#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ + /************************************************************************/ /* Here the fs part begins */ /************************************************************************/ @@ -518,7 +537,7 @@ static int proc_fd_access_allowed(struct inode *inode) */ task = get_proc_task(inode); if (task) { - allowed = ptrace_may_attach(task); + allowed = ptrace_may_access(task, PTRACE_MODE_READ); put_task_struct(task); } return allowed; @@ -904,7 +923,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, if (!task) goto out_no_task; - if (!ptrace_may_attach(task)) + if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out; ret = -ENOMEM; @@ -1833,8 +1852,7 @@ static const struct file_operations proc_fd_operations = { * /proc/pid/fd needs a special permission handler so that a process can still * access /proc/self/fd after it has executed a setuid(). */ -static int proc_fd_permission(struct inode *inode, int mask, - struct nameidata *nd) +static int proc_fd_permission(struct inode *inode, int mask) { int rv; @@ -2375,29 +2393,54 @@ static int proc_base_fill_cache(struct file *filp, void *dirent, } #ifdef CONFIG_TASK_IO_ACCOUNTING -static int proc_pid_io_accounting(struct task_struct *task, char *buffer) +static int do_io_accounting(struct task_struct *task, char *buffer, int whole) { + struct task_io_accounting acct = task->ioac; + unsigned long flags; + + if (whole && lock_task_sighand(task, &flags)) { + struct task_struct *t = task; + + task_io_accounting_add(&acct, &task->signal->ioac); + while_each_thread(task, t) + task_io_accounting_add(&acct, &t->ioac); + + unlock_task_sighand(task, &flags); + } return sprintf(buffer, -#ifdef CONFIG_TASK_XACCT "rchar: %llu\n" "wchar: %llu\n" "syscr: %llu\n" "syscw: %llu\n" -#endif "read_bytes: %llu\n" "write_bytes: %llu\n" "cancelled_write_bytes: %llu\n", -#ifdef CONFIG_TASK_XACCT - (unsigned long long)task->rchar, - (unsigned long long)task->wchar, - (unsigned long long)task->syscr, - (unsigned long long)task->syscw, -#endif - (unsigned long long)task->ioac.read_bytes, - (unsigned long long)task->ioac.write_bytes, - (unsigned long long)task->ioac.cancelled_write_bytes); + (unsigned long long)acct.rchar, + (unsigned long long)acct.wchar, + (unsigned long long)acct.syscr, + (unsigned long long)acct.syscw, + (unsigned long long)acct.read_bytes, + (unsigned long long)acct.write_bytes, + (unsigned long long)acct.cancelled_write_bytes); +} + +static int proc_tid_io_accounting(struct task_struct *task, char *buffer) +{ + return do_io_accounting(task, buffer, 0); +} + +static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) +{ + return do_io_accounting(task, buffer, 1); +} +#endif /* CONFIG_TASK_IO_ACCOUNTING */ + +static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + seq_printf(m, "%08x\n", task->personality); + return 0; } -#endif /* * Thread groups @@ -2415,10 +2458,14 @@ static const struct pid_entry tgid_base_stuff[] = { REG("environ", S_IRUSR, environ), INF("auxv", S_IRUSR, pid_auxv), ONE("status", S_IRUGO, pid_status), + ONE("personality", S_IRUSR, pid_personality), INF("limits", S_IRUSR, pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, pid_sched), #endif +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK + INF("syscall", S_IRUSR, pid_syscall), +#endif INF("cmdline", S_IRUGO, pid_cmdline), ONE("stat", S_IRUGO, tgid_stat), ONE("statm", S_IRUGO, pid_statm), @@ -2469,7 +2516,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUGO, pid_io_accounting), + INF("io", S_IRUGO, tgid_io_accounting), #endif }; @@ -2747,10 +2794,14 @@ static const struct pid_entry tid_base_stuff[] = { REG("environ", S_IRUSR, environ), INF("auxv", S_IRUSR, pid_auxv), ONE("status", S_IRUGO, pid_status), + ONE("personality", S_IRUSR, pid_personality), INF("limits", S_IRUSR, pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, pid_sched), #endif +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK + INF("syscall", S_IRUSR, pid_syscall), +#endif INF("cmdline", S_IRUGO, pid_cmdline), ONE("stat", S_IRUGO, tid_stat), ONE("statm", S_IRUGO, pid_statm), @@ -2796,6 +2847,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), #endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + INF("io", S_IRUGO, tid_io_accounting), +#endif }; static int proc_tid_base_readdir(struct file * filp, @@ -3035,9 +3089,7 @@ static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct generic_fillattr(inode, stat); if (p) { - rcu_read_lock(); stat->nlink += get_nr_threads(p); - rcu_read_unlock(); put_task_struct(p); } diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 43e54e86cefd..7821589a17d5 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -300,10 +300,10 @@ out: return rtn; } -static DEFINE_IDR(proc_inum_idr); +static DEFINE_IDA(proc_inum_ida); static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ -#define PROC_DYNAMIC_FIRST 0xF0000000UL +#define PROC_DYNAMIC_FIRST 0xF0000000U /* * Return an inode number between PROC_DYNAMIC_FIRST and @@ -311,36 +311,34 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ */ static unsigned int get_inode_number(void) { - int i, inum = 0; + unsigned int i; int error; retry: - if (idr_pre_get(&proc_inum_idr, GFP_KERNEL) == 0) + if (ida_pre_get(&proc_inum_ida, GFP_KERNEL) == 0) return 0; spin_lock(&proc_inum_lock); - error = idr_get_new(&proc_inum_idr, NULL, &i); + error = ida_get_new(&proc_inum_ida, &i); spin_unlock(&proc_inum_lock); if (error == -EAGAIN) goto retry; else if (error) return 0; - inum = (i & MAX_ID_MASK) + PROC_DYNAMIC_FIRST; - - /* inum will never be more than 0xf0ffffff, so no check - * for overflow. - */ - - return inum; + if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { + spin_lock(&proc_inum_lock); + ida_remove(&proc_inum_ida, i); + spin_unlock(&proc_inum_lock); + return 0; + } + return PROC_DYNAMIC_FIRST + i; } static void release_inode_number(unsigned int inum) { - int id = (inum - PROC_DYNAMIC_FIRST) | ~MAX_ID_MASK; - spin_lock(&proc_inum_lock); - idr_remove(&proc_inum_idr, id); + ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); spin_unlock(&proc_inum_lock); } @@ -549,8 +547,8 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp for (tmp = dir->subdir; tmp; tmp = tmp->next) if (strcmp(tmp->name, dp->name) == 0) { - printk(KERN_WARNING "proc_dir_entry '%s' already " - "registered\n", dp->name); + printk(KERN_WARNING "proc_dir_entry '%s/%s' already registered\n", + dir->name, dp->name); dump_stack(); break; } @@ -597,6 +595,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, ent->pde_users = 0; spin_lock_init(&ent->pde_unload_lock); ent->pde_unload_completion = NULL; + INIT_LIST_HEAD(&ent->pde_openers); out: return ent; } @@ -789,15 +788,25 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) spin_unlock(&de->pde_unload_lock); continue_removing: + spin_lock(&de->pde_unload_lock); + while (!list_empty(&de->pde_openers)) { + struct pde_opener *pdeo; + + pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); + list_del(&pdeo->lh); + spin_unlock(&de->pde_unload_lock); + pdeo->release(pdeo->inode, pdeo->file); + kfree(pdeo); + spin_lock(&de->pde_unload_lock); + } + spin_unlock(&de->pde_unload_lock); + if (S_ISDIR(de->mode)) parent->nlink--; de->nlink = 0; - if (de->subdir) { - printk(KERN_WARNING "%s: removing non-empty directory " + WARN(de->subdir, KERN_WARNING "%s: removing non-empty directory " "'%s/%s', leaking at least '%s'\n", __func__, de->parent->name, de->name, de->subdir->name); - WARN_ON(1); - } if (atomic_dec_and_test(&de->count)) free_proc_entry(de); } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index b08d10017911..c6b4fa7e3b49 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -17,6 +17,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/smp_lock.h> +#include <linux/sysctl.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -65,6 +66,8 @@ static void proc_delete_inode(struct inode *inode) module_put(de->owner); de_put(de); } + if (PROC_I(inode)->sysctl) + sysctl_head_put(PROC_I(inode)->sysctl); clear_inode(inode); } @@ -84,6 +87,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb) ei->fd = 0; ei->op.proc_get_link = NULL; ei->pde = NULL; + ei->sysctl = NULL; + ei->sysctl_entry = NULL; inode = &ei->vfs_inode; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; return inode; @@ -94,7 +99,7 @@ static void proc_destroy_inode(struct inode *inode) kmem_cache_free(proc_inode_cachep, PROC_I(inode)); } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct proc_inode *ei = (struct proc_inode *) foo; @@ -111,27 +116,25 @@ int __init proc_init_inodecache(void) return 0; } -static int proc_remount(struct super_block *sb, int *flags, char *data) -{ - *flags |= MS_NODIRATIME; - return 0; -} - static const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .destroy_inode = proc_destroy_inode, .drop_inode = generic_delete_inode, .delete_inode = proc_delete_inode, .statfs = simple_statfs, - .remount_fs = proc_remount, }; -static void pde_users_dec(struct proc_dir_entry *pde) +static void __pde_users_dec(struct proc_dir_entry *pde) { - spin_lock(&pde->pde_unload_lock); pde->pde_users--; if (pde->pde_unload_completion && pde->pde_users == 0) complete(pde->pde_unload_completion); +} + +static void pde_users_dec(struct proc_dir_entry *pde) +{ + spin_lock(&pde->pde_unload_lock); + __pde_users_dec(pde); spin_unlock(&pde->pde_unload_lock); } @@ -318,36 +321,97 @@ static int proc_reg_open(struct inode *inode, struct file *file) struct proc_dir_entry *pde = PDE(inode); int rv = 0; int (*open)(struct inode *, struct file *); + int (*release)(struct inode *, struct file *); + struct pde_opener *pdeo; + + /* + * What for, you ask? Well, we can have open, rmmod, remove_proc_entry + * sequence. ->release won't be called because ->proc_fops will be + * cleared. Depending on complexity of ->release, consequences vary. + * + * We can't wait for mercy when close will be done for real, it's + * deadlockable: rmmod foo </proc/foo . So, we're going to do ->release + * by hand in remove_proc_entry(). For this, save opener's credentials + * for later. + */ + pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); + if (!pdeo) + return -ENOMEM; spin_lock(&pde->pde_unload_lock); if (!pde->proc_fops) { spin_unlock(&pde->pde_unload_lock); - return rv; + kfree(pdeo); + return -EINVAL; } pde->pde_users++; open = pde->proc_fops->open; + release = pde->proc_fops->release; spin_unlock(&pde->pde_unload_lock); if (open) rv = open(inode, file); - pde_users_dec(pde); + spin_lock(&pde->pde_unload_lock); + if (rv == 0 && release) { + /* To know what to release. */ + pdeo->inode = inode; + pdeo->file = file; + /* Strictly for "too late" ->release in proc_reg_release(). */ + pdeo->release = release; + list_add(&pdeo->lh, &pde->pde_openers); + } else + kfree(pdeo); + __pde_users_dec(pde); + spin_unlock(&pde->pde_unload_lock); return rv; } +static struct pde_opener *find_pde_opener(struct proc_dir_entry *pde, + struct inode *inode, struct file *file) +{ + struct pde_opener *pdeo; + + list_for_each_entry(pdeo, &pde->pde_openers, lh) { + if (pdeo->inode == inode && pdeo->file == file) + return pdeo; + } + return NULL; +} + static int proc_reg_release(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); int rv = 0; int (*release)(struct inode *, struct file *); + struct pde_opener *pdeo; spin_lock(&pde->pde_unload_lock); + pdeo = find_pde_opener(pde, inode, file); if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); + /* + * Can't simply exit, __fput() will think that everything is OK, + * and move on to freeing struct file. remove_proc_entry() will + * find slacker in opener's list and will try to do non-trivial + * things with struct file. Therefore, remove opener from list. + * + * But if opener is removed from list, who will ->release it? + */ + if (pdeo) { + list_del(&pdeo->lh); + spin_unlock(&pde->pde_unload_lock); + rv = pdeo->release(inode, file); + kfree(pdeo); + } else + spin_unlock(&pde->pde_unload_lock); return rv; } pde->pde_users++; release = pde->proc_fops->release; + if (pdeo) { + list_del(&pdeo->lh); + kfree(pdeo); + } spin_unlock(&pde->pde_unload_lock); if (release) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 28cbca805905..3bfb7b8747b3 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -45,8 +45,6 @@ do { \ extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *); #endif -extern int maps_protect; - extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, @@ -63,6 +61,7 @@ extern const struct file_operations proc_smaps_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; extern const struct file_operations proc_net_operations; +extern const struct file_operations proc_kmsg_operations; extern const struct inode_operations proc_net_inode_operations; void free_proc_entry(struct proc_dir_entry *de); @@ -88,3 +87,10 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino, struct dentry *dentry); int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, filldir_t filldir); + +struct pde_opener { + struct inode *inode; + struct file *file; + int (*release)(struct inode *, struct file *); + struct list_head lh; +}; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index e78c81fcf547..c2370c76fb71 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -23,6 +23,10 @@ #define CORE_STR "CORE" +#ifndef ELF_CORE_EFLAGS +#define ELF_CORE_EFLAGS 0 +#endif + static int open_kcore(struct inode * inode, struct file * filp) { return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; @@ -164,11 +168,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) elf->e_entry = 0; elf->e_phoff = sizeof(struct elfhdr); elf->e_shoff = 0; -#if defined(CONFIG_H8300) - elf->e_flags = ELF_FLAGS; -#else - elf->e_flags = 0; -#endif + elf->e_flags = ELF_CORE_EFLAGS; elf->e_ehsize = sizeof(struct elfhdr); elf->e_phentsize= sizeof(struct elf_phdr); elf->e_phnum = nphdr; diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index ff3b90b56e9d..9fd5df3f40ce 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -15,6 +15,8 @@ #include <asm/uaccess.h> #include <asm/io.h> +#include "internal.h" + extern wait_queue_head_t log_wait; extern int do_syslog(int type, char __user *bug, int count); diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 79ecd281d2cb..3f87d2632947 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -52,14 +52,14 @@ int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) } seq_printf(m, - "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", vma->vm_start, vma->vm_end, flags & VM_READ ? 'r' : '-', flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', - vma->vm_pgoff << PAGE_SHIFT, + ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, MAJOR(dev), MINOR(dev), ino, &len); if (file) { diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 7e277f2ad466..b675a49c1823 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -24,6 +24,7 @@ #include <linux/tty.h> #include <linux/string.h> #include <linux/mman.h> +#include <linux/quicklist.h> #include <linux/proc_fs.h> #include <linux/ioport.h> #include <linux/mm.h> @@ -67,7 +68,6 @@ extern int get_hardware_list(char *); extern int get_stram_list(char *); extern int get_exec_domain_list(char *); -extern int get_dma_list(char *); static int proc_calc_metrics(char *page, char **start, off_t off, int count, int *eof, int len) @@ -123,6 +123,11 @@ static int uptime_read_proc(char *page, char **start, off_t off, return proc_calc_metrics(page, start, off, count, eof, len); } +int __attribute__((weak)) arch_report_meminfo(char *page) +{ + return 0; +} + static int meminfo_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -177,6 +182,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off, "SReclaimable: %8lu kB\n" "SUnreclaim: %8lu kB\n" "PageTables: %8lu kB\n" +#ifdef CONFIG_QUICKLIST + "Quicklists: %8lu kB\n" +#endif "NFS_Unstable: %8lu kB\n" "Bounce: %8lu kB\n" "WritebackTmp: %8lu kB\n" @@ -209,6 +217,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off, K(global_page_state(NR_SLAB_RECLAIMABLE)), K(global_page_state(NR_SLAB_UNRECLAIMABLE)), K(global_page_state(NR_PAGETABLE)), +#ifdef CONFIG_QUICKLIST + K(quicklist_total_size()), +#endif K(global_page_state(NR_UNSTABLE_NFS)), K(global_page_state(NR_BOUNCE)), K(global_page_state(NR_WRITEBACK_TEMP)), @@ -221,11 +232,12 @@ static int meminfo_read_proc(char *page, char **start, off_t off, len += hugetlb_report_meminfo(page + len); + len += arch_report_meminfo(page + len); + return proc_calc_metrics(page, start, off, count, eof, len); #undef K } -extern const struct seq_operations fragmentation_op; static int fragmentation_open(struct inode *inode, struct file *file) { (void)inode; @@ -239,7 +251,6 @@ static const struct file_operations fragmentation_file_operations = { .release = seq_release, }; -extern const struct seq_operations pagetypeinfo_op; static int pagetypeinfo_open(struct inode *inode, struct file *file) { return seq_open(file, &pagetypeinfo_op); @@ -252,7 +263,6 @@ static const struct file_operations pagetypeinfo_file_ops = { .release = seq_release, }; -extern const struct seq_operations zoneinfo_op; static int zoneinfo_open(struct inode *inode, struct file *file) { return seq_open(file, &zoneinfo_op); @@ -349,7 +359,6 @@ static const struct file_operations proc_devinfo_operations = { .release = seq_release, }; -extern const struct seq_operations vmstat_op; static int vmstat_open(struct inode *inode, struct file *file) { return seq_open(file, &vmstat_op); @@ -461,17 +470,35 @@ static const struct file_operations proc_slabstats_operations = { #ifdef CONFIG_MMU static int vmalloc_open(struct inode *inode, struct file *file) { - return seq_open(file, &vmalloc_op); + unsigned int *ptr = NULL; + int ret; + + if (NUMA_BUILD) + ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); + ret = seq_open(file, &vmalloc_op); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = ptr; + } else + kfree(ptr); + return ret; } static const struct file_operations proc_vmalloc_operations = { .open = vmalloc_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_private, }; #endif +#ifndef arch_irq_stat_cpu +#define arch_irq_stat_cpu(cpu) 0 +#endif +#ifndef arch_irq_stat +#define arch_irq_stat() 0 +#endif + static int show_stat(struct seq_file *p, void *v) { int i; @@ -509,7 +536,9 @@ static int show_stat(struct seq_file *p, void *v) sum += temp; per_irq_sum[j] += temp; } + sum += arch_irq_stat_cpu(i); } + sum += arch_irq_stat(); seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", (unsigned long long)cputime64_to_clock_t(user), @@ -654,6 +683,7 @@ static int cmdline_read_proc(char *page, char **start, off_t off, return proc_calc_metrics(page, start, off, count, eof, len); } +#ifdef CONFIG_FILE_LOCKING static int locks_open(struct inode *inode, struct file *filp) { return seq_open(filp, &locks_seq_operations); @@ -665,6 +695,7 @@ static const struct file_operations proc_locks_operations = { .llseek = seq_lseek, .release = seq_release, }; +#endif /* CONFIG_FILE_LOCKING */ static int execdomains_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) @@ -858,7 +889,9 @@ void __init proc_misc_init(void) #ifdef CONFIG_PRINTK proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); #endif +#ifdef CONFIG_FILE_LOCKING proc_create("locks", 0, NULL, &proc_locks_operations); +#endif proc_create("devices", 0, NULL, &proc_devinfo_operations); proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); #ifdef CONFIG_BLOCK diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 83f357b30d71..7bc296f424ae 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -27,6 +27,11 @@ #include "internal.h" +static struct net *get_proc_net(const struct inode *inode) +{ + return maybe_get_net(PDE_NET(PDE(inode))); +} + int seq_open_net(struct inode *ino, struct file *f, const struct seq_operations *ops, int size) { @@ -51,6 +56,30 @@ int seq_open_net(struct inode *ino, struct file *f, } EXPORT_SYMBOL_GPL(seq_open_net); +int single_open_net(struct inode *inode, struct file *file, + int (*show)(struct seq_file *, void *)) +{ + int err; + struct net *net; + + err = -ENXIO; + net = get_proc_net(inode); + if (net == NULL) + goto err_net; + + err = single_open(file, show, net); + if (err < 0) + goto err_open; + + return 0; + +err_open: + put_net(net); +err_net: + return err; +} +EXPORT_SYMBOL_GPL(single_open_net); + int seq_release_net(struct inode *ino, struct file *f) { struct seq_file *seq; @@ -63,6 +92,14 @@ int seq_release_net(struct inode *ino, struct file *f) } EXPORT_SYMBOL_GPL(seq_release_net); +int single_release_net(struct inode *ino, struct file *f) +{ + struct seq_file *seq = f->private_data; + put_net(seq->private); + return single_release(ino, f); +} +EXPORT_SYMBOL_GPL(single_release_net); + static struct net *get_proc_task_net(struct inode *dir) { struct task_struct *task; @@ -153,12 +190,6 @@ void proc_net_remove(struct net *net, const char *name) } EXPORT_SYMBOL_GPL(proc_net_remove); -struct net *get_proc_net(const struct inode *inode) -{ - return maybe_get_net(PDE_NET(PDE(inode))); -} -EXPORT_SYMBOL_GPL(get_proc_net); - static __net_init int proc_net_ns_init(struct net *net) { struct proc_dir_entry *netd, *net_statd; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 5acc001d49f6..945a81043ba2 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -10,149 +10,110 @@ static struct dentry_operations proc_sys_dentry_operations; static const struct file_operations proc_sys_file_operations; static const struct inode_operations proc_sys_inode_operations; +static const struct file_operations proc_sys_dir_file_operations; +static const struct inode_operations proc_sys_dir_operations; -static void proc_sys_refresh_inode(struct inode *inode, struct ctl_table *table) -{ - /* Refresh the cached information bits in the inode */ - if (table) { - inode->i_uid = 0; - inode->i_gid = 0; - inode->i_mode = table->mode; - if (table->proc_handler) { - inode->i_mode |= S_IFREG; - inode->i_nlink = 1; - } else { - inode->i_mode |= S_IFDIR; - inode->i_nlink = 0; /* It is too hard to figure out */ - } - } -} - -static struct inode *proc_sys_make_inode(struct inode *dir, struct ctl_table *table) +static struct inode *proc_sys_make_inode(struct super_block *sb, + struct ctl_table_header *head, struct ctl_table *table) { struct inode *inode; - struct proc_inode *dir_ei, *ei; - int depth; + struct proc_inode *ei; - inode = new_inode(dir->i_sb); + inode = new_inode(sb); if (!inode) goto out; - /* A directory is always one deeper than it's parent */ - dir_ei = PROC_I(dir); - depth = dir_ei->fd + 1; - + sysctl_head_get(head); ei = PROC_I(inode); - ei->fd = depth; + ei->sysctl = head; + ei->sysctl_entry = table; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - inode->i_op = &proc_sys_inode_operations; - inode->i_fop = &proc_sys_file_operations; inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */ - proc_sys_refresh_inode(inode, table); + inode->i_mode = table->mode; + if (!table->child) { + inode->i_mode |= S_IFREG; + inode->i_op = &proc_sys_inode_operations; + inode->i_fop = &proc_sys_file_operations; + } else { + inode->i_mode |= S_IFDIR; + inode->i_nlink = 0; + inode->i_op = &proc_sys_dir_operations; + inode->i_fop = &proc_sys_dir_file_operations; + } out: return inode; } -static struct dentry *proc_sys_ancestor(struct dentry *dentry, int depth) -{ - for (;;) { - struct proc_inode *ei; - - ei = PROC_I(dentry->d_inode); - if (ei->fd == depth) - break; /* found */ - - dentry = dentry->d_parent; - } - return dentry; -} - -static struct ctl_table *proc_sys_lookup_table_one(struct ctl_table *table, - struct qstr *name) +static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name) { int len; - for ( ; table->ctl_name || table->procname; table++) { + for ( ; p->ctl_name || p->procname; p++) { - if (!table->procname) + if (!p->procname) continue; - len = strlen(table->procname); + len = strlen(p->procname); if (len != name->len) continue; - if (memcmp(table->procname, name->name, len) != 0) + if (memcmp(p->procname, name->name, len) != 0) continue; /* I have a match */ - return table; + return p; } return NULL; } -static struct ctl_table *proc_sys_lookup_table(struct dentry *dentry, - struct ctl_table *table) +static struct ctl_table_header *grab_header(struct inode *inode) { - struct dentry *ancestor; - struct proc_inode *ei; - int depth, i; + if (PROC_I(inode)->sysctl) + return sysctl_head_grab(PROC_I(inode)->sysctl); + else + return sysctl_head_next(NULL); +} - ei = PROC_I(dentry->d_inode); - depth = ei->fd; +static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct ctl_table_header *head = grab_header(dir); + struct ctl_table *table = PROC_I(dir)->sysctl_entry; + struct ctl_table_header *h = NULL; + struct qstr *name = &dentry->d_name; + struct ctl_table *p; + struct inode *inode; + struct dentry *err = ERR_PTR(-ENOENT); - if (depth == 0) - return table; + if (IS_ERR(head)) + return ERR_CAST(head); - for (i = 1; table && (i <= depth); i++) { - ancestor = proc_sys_ancestor(dentry, i); - table = proc_sys_lookup_table_one(table, &ancestor->d_name); - if (table) - table = table->child; + if (table && !table->child) { + WARN_ON(1); + goto out; } - return table; -} -static struct ctl_table *proc_sys_lookup_entry(struct dentry *dparent, - struct qstr *name, - struct ctl_table *table) -{ - table = proc_sys_lookup_table(dparent, table); - if (table) - table = proc_sys_lookup_table_one(table, name); - return table; -} + table = table ? table->child : head->ctl_table; -static struct ctl_table *do_proc_sys_lookup(struct dentry *parent, - struct qstr *name, - struct ctl_table_header **ptr) -{ - struct ctl_table_header *head; - struct ctl_table *table = NULL; - - for (head = sysctl_head_next(NULL); head; - head = sysctl_head_next(head)) { - table = proc_sys_lookup_entry(parent, name, head->ctl_table); - if (table) - break; + p = find_in_table(table, name); + if (!p) { + for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) { + if (h->attached_to != table) + continue; + p = find_in_table(h->attached_by, name); + if (p) + break; + } } - *ptr = head; - return table; -} - -static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) -{ - struct ctl_table_header *head; - struct inode *inode; - struct dentry *err; - struct ctl_table *table; - err = ERR_PTR(-ENOENT); - table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head); - if (!table) + if (!p) goto out; err = ERR_PTR(-ENOMEM); - inode = proc_sys_make_inode(dir, table); + inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); + if (h) + sysctl_head_finish(h); + if (!inode) goto out; @@ -168,22 +129,14 @@ out: static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, size_t count, loff_t *ppos, int write) { - struct dentry *dentry = filp->f_dentry; - struct ctl_table_header *head; - struct ctl_table *table; + struct inode *inode = filp->f_path.dentry->d_inode; + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; ssize_t error; size_t res; - table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head); - /* Has the sysctl entry disappeared on us? */ - error = -ENOENT; - if (!table) - goto out; - - /* Has the sysctl entry been replaced by a directory? */ - error = -EISDIR; - if (!table->proc_handler) - goto out; + if (IS_ERR(head)) + return PTR_ERR(head); /* * At this point we know that the sysctl was not unregistered @@ -193,6 +146,11 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, if (sysctl_perm(head->root, table, write ? MAY_WRITE : MAY_READ)) goto out; + /* if that can happen at all, it should be -EINVAL, not -EISDIR */ + error = -EINVAL; + if (!table->proc_handler) + goto out; + /* careful: calling conventions are nasty here */ res = count; error = table->proc_handler(table, write, filp, buf, &res, ppos); @@ -218,82 +176,86 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf, static int proc_sys_fill_cache(struct file *filp, void *dirent, - filldir_t filldir, struct ctl_table *table) + filldir_t filldir, + struct ctl_table_header *head, + struct ctl_table *table) { - struct ctl_table_header *head; - struct ctl_table *child_table = NULL; struct dentry *child, *dir = filp->f_path.dentry; struct inode *inode; struct qstr qname; ino_t ino = 0; unsigned type = DT_UNKNOWN; - int ret; qname.name = table->procname; qname.len = strlen(table->procname); qname.hash = full_name_hash(qname.name, qname.len); - /* Suppress duplicates. - * Only fill a directory entry if it is the value that - * an ordinary lookup of that name returns. Hide all - * others. - * - * If we ever cache this translation in the dcache - * I should do a dcache lookup first. But for now - * it is just simpler not to. - */ - ret = 0; - child_table = do_proc_sys_lookup(dir, &qname, &head); - sysctl_head_finish(head); - if (child_table != table) - return 0; - child = d_lookup(dir, &qname); if (!child) { - struct dentry *new; - new = d_alloc(dir, &qname); - if (new) { - inode = proc_sys_make_inode(dir->d_inode, table); - if (!inode) - child = ERR_PTR(-ENOMEM); - else { - new->d_op = &proc_sys_dentry_operations; - d_add(new, inode); + child = d_alloc(dir, &qname); + if (child) { + inode = proc_sys_make_inode(dir->d_sb, head, table); + if (!inode) { + dput(child); + return -ENOMEM; + } else { + child->d_op = &proc_sys_dentry_operations; + d_add(child, inode); } - if (child) - dput(new); - else - child = new; + } else { + return -ENOMEM; } } - if (!child || IS_ERR(child) || !child->d_inode) - goto end_instantiate; inode = child->d_inode; - if (inode) { - ino = inode->i_ino; - type = inode->i_mode >> 12; - } + ino = inode->i_ino; + type = inode->i_mode >> 12; dput(child); -end_instantiate: - if (!ino) - ino= find_inode_number(dir, &qname); - if (!ino) - ino = 1; - return filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type); + return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type); +} + +static int scan(struct ctl_table_header *head, ctl_table *table, + unsigned long *pos, struct file *file, + void *dirent, filldir_t filldir) +{ + + for (; table->ctl_name || table->procname; table++, (*pos)++) { + int res; + + /* Can't do anything without a proc name */ + if (!table->procname) + continue; + + if (*pos < file->f_pos) + continue; + + res = proc_sys_fill_cache(file, dirent, filldir, head, table); + if (res) + return res; + + file->f_pos = *pos + 1; + } + return 0; } static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct dentry *dentry = filp->f_dentry; + struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; - struct ctl_table_header *head = NULL; - struct ctl_table *table; + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + struct ctl_table_header *h = NULL; unsigned long pos; - int ret; + int ret = -EINVAL; - ret = -ENOTDIR; - if (!S_ISDIR(inode->i_mode)) + if (IS_ERR(head)) + return PTR_ERR(head); + + if (table && !table->child) { + WARN_ON(1); goto out; + } + + table = table ? table->child : head->ctl_table; ret = 0; /* Avoid a switch here: arm builds fail with missing __cmpdi2 */ @@ -311,30 +273,17 @@ static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) } pos = 2; - /* - Find each instance of the directory - * - Read all entries in each instance - * - Before returning an entry to user space lookup the entry - * by name and if I find a different entry don't return - * this one because it means it is a buried dup. - * For sysctl this should only happen for directory entries. - */ - for (head = sysctl_head_next(NULL); head; head = sysctl_head_next(head)) { - table = proc_sys_lookup_table(dentry, head->ctl_table); + ret = scan(head, table, &pos, filp, dirent, filldir); + if (ret) + goto out; - if (!table) + for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) { + if (h->attached_to != table) continue; - - for (; table->ctl_name || table->procname; table++, pos++) { - /* Can't do anything without a proc name */ - if (!table->procname) - continue; - - if (pos < filp->f_pos) - continue; - - if (proc_sys_fill_cache(filp, dirent, filldir, table) < 0) - goto out; - filp->f_pos = pos + 1; + ret = scan(h, h->attached_by, &pos, filp, dirent, filldir); + if (ret) { + sysctl_head_finish(h); + break; } } ret = 1; @@ -343,53 +292,24 @@ out: return ret; } -static int proc_sys_permission(struct inode *inode, int mask, struct nameidata *nd) +static int proc_sys_permission(struct inode *inode, int mask) { /* * sysctl entries that are not writeable, * are _NOT_ writeable, capabilities or not. */ - struct ctl_table_header *head; - struct ctl_table *table; - struct dentry *dentry; - int mode; - int depth; + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; int error; - head = NULL; - depth = PROC_I(inode)->fd; - - /* First check the cached permissions, in case we don't have - * enough information to lookup the sysctl table entry. - */ - error = -EACCES; - mode = inode->i_mode; - - if (current->euid == 0) - mode >>= 6; - else if (in_group_p(0)) - mode >>= 3; - - if ((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask) - error = 0; - - /* If we can't get a sysctl table entry the permission - * checks on the cached mode will have to be enough. - */ - if (!nd || !depth) - goto out; - - dentry = nd->path.dentry; - table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head); + if (IS_ERR(head)) + return PTR_ERR(head); - /* If the entry does not exist deny permission */ - error = -EACCES; - if (!table) - goto out; + if (!table) /* global root - r-xr-xr-x */ + error = mask & MAY_WRITE ? -EACCES : 0; + else /* Use the permissions on the sysctl table entry */ + error = sysctl_perm(head->root, table, mask); - /* Use the permissions on the sysctl table entry */ - error = sysctl_perm(head->root, table, mask); -out: sysctl_head_finish(head); return error; } @@ -409,42 +329,79 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr) return error; } -/* I'm lazy and don't distinguish between files and directories, - * until access time. - */ +static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + + if (IS_ERR(head)) + return PTR_ERR(head); + + generic_fillattr(inode, stat); + if (table) + stat->mode = (stat->mode & S_IFMT) | table->mode; + + sysctl_head_finish(head); + return 0; +} + static const struct file_operations proc_sys_file_operations = { .read = proc_sys_read, .write = proc_sys_write, +}; + +static const struct file_operations proc_sys_dir_file_operations = { .readdir = proc_sys_readdir, }; static const struct inode_operations proc_sys_inode_operations = { + .permission = proc_sys_permission, + .setattr = proc_sys_setattr, + .getattr = proc_sys_getattr, +}; + +static const struct inode_operations proc_sys_dir_operations = { .lookup = proc_sys_lookup, .permission = proc_sys_permission, .setattr = proc_sys_setattr, + .getattr = proc_sys_getattr, }; static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct ctl_table_header *head; - struct ctl_table *table; - table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head); - proc_sys_refresh_inode(dentry->d_inode, table); - sysctl_head_finish(head); - return !!table; + return !PROC_I(dentry->d_inode)->sysctl->unregistering; +} + +static int proc_sys_delete(struct dentry *dentry) +{ + return !!PROC_I(dentry->d_inode)->sysctl->unregistering; +} + +static int proc_sys_compare(struct dentry *dir, struct qstr *qstr, + struct qstr *name) +{ + struct dentry *dentry = container_of(qstr, struct dentry, d_name); + if (qstr->len != name->len) + return 1; + if (memcmp(qstr->name, name->name, name->len)) + return 1; + return !sysctl_is_seen(PROC_I(dentry->d_inode)->sysctl); } static struct dentry_operations proc_sys_dentry_operations = { .d_revalidate = proc_sys_revalidate, + .d_delete = proc_sys_delete, + .d_compare = proc_sys_compare, }; -static struct proc_dir_entry *proc_sys_root; - int proc_sys_init(void) { + struct proc_dir_entry *proc_sys_root; + proc_sys_root = proc_mkdir("sys", NULL); - proc_sys_root->proc_iops = &proc_sys_inode_operations; - proc_sys_root->proc_fops = &proc_sys_file_operations; + proc_sys_root->proc_iops = &proc_sys_dir_operations; + proc_sys_root->proc_fops = &proc_sys_dir_file_operations; proc_sys_root->nlink = 0; return 0; } diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index 21f490f5d65c..d153946d6d15 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -136,54 +136,6 @@ static const struct file_operations proc_tty_drivers_operations = { .release = seq_release, }; -static void * tty_ldiscs_seq_start(struct seq_file *m, loff_t *pos) -{ - return (*pos < NR_LDISCS) ? pos : NULL; -} - -static void * tty_ldiscs_seq_next(struct seq_file *m, void *v, loff_t *pos) -{ - (*pos)++; - return (*pos < NR_LDISCS) ? pos : NULL; -} - -static void tty_ldiscs_seq_stop(struct seq_file *m, void *v) -{ -} - -static int tty_ldiscs_seq_show(struct seq_file *m, void *v) -{ - int i = *(loff_t *)v; - struct tty_ldisc *ld; - - ld = tty_ldisc_get(i); - if (ld == NULL) - return 0; - seq_printf(m, "%-10s %2d\n", ld->name ? ld->name : "???", i); - tty_ldisc_put(i); - return 0; -} - -static const struct seq_operations tty_ldiscs_seq_ops = { - .start = tty_ldiscs_seq_start, - .next = tty_ldiscs_seq_next, - .stop = tty_ldiscs_seq_stop, - .show = tty_ldiscs_seq_show, -}; - -static int proc_tty_ldiscs_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &tty_ldiscs_seq_ops); -} - -static const struct file_operations tty_ldiscs_proc_fops = { - .owner = THIS_MODULE, - .open = proc_tty_ldiscs_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - /* * This function is called by tty_register_driver() to handle * registering the driver's /proc handler into /proc/tty/driver/<foo> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ab8ccc9d14ff..4806830ea2a1 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -210,23 +210,20 @@ static int show_map(struct seq_file *m, void *v) dev_t dev = 0; int len; - if (maps_protect && !ptrace_may_attach(task)) - return -EACCES; - if (file) { struct inode *inode = vma->vm_file->f_path.dentry->d_inode; dev = inode->i_sb->s_dev; ino = inode->i_ino; } - seq_printf(m, "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", + seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", vma->vm_start, vma->vm_end, flags & VM_READ ? 'r' : '-', flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? 's' : 'p', - vma->vm_pgoff << PAGE_SHIFT, + ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, MAJOR(dev), MINOR(dev), ino, &len); /* @@ -476,10 +473,10 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { - static struct mm_walk clear_refs_walk; - memset(&clear_refs_walk, 0, sizeof(clear_refs_walk)); - clear_refs_walk.pmd_entry = clear_refs_pte_range; - clear_refs_walk.mm = mm; + struct mm_walk clear_refs_walk = { + .pmd_entry = clear_refs_pte_range, + .mm = mm, + }; down_read(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) { clear_refs_walk.private = vma; @@ -602,11 +599,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return err; } -static struct mm_walk pagemap_walk = { - .pmd_entry = pagemap_pte_range, - .pte_hole = pagemap_pte_hole -}; - /* * /proc/pid/pagemap - an array mapping virtual pages to pfns * @@ -641,12 +633,17 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, struct pagemapread pm; int pagecount; int ret = -ESRCH; + struct mm_walk pagemap_walk = {}; + unsigned long src; + unsigned long svpfn; + unsigned long start_vaddr; + unsigned long end_vaddr; if (!task) goto out; ret = -EACCES; - if (!ptrace_may_attach(task)) + if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out_task; ret = -EINVAL; @@ -659,11 +656,15 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!mm) goto out_task; - ret = -ENOMEM; + uaddr = (unsigned long)buf & PAGE_MASK; uend = (unsigned long)(buf + count); pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE; - pages = kmalloc(pagecount * sizeof(struct page *), GFP_KERNEL); + ret = 0; + if (pagecount == 0) + goto out_mm; + pages = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); + ret = -ENOMEM; if (!pages) goto out_mm; @@ -684,33 +685,33 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, pm.out = (u64 *)buf; pm.end = (u64 *)(buf + count); - if (!ptrace_may_attach(task)) { - ret = -EIO; - } else { - unsigned long src = *ppos; - unsigned long svpfn = src / PM_ENTRY_BYTES; - unsigned long start_vaddr = svpfn << PAGE_SHIFT; - unsigned long end_vaddr = TASK_SIZE_OF(task); - - /* watch out for wraparound */ - if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) - start_vaddr = end_vaddr; - - /* - * The odds are that this will stop walking way - * before end_vaddr, because the length of the - * user buffer is tracked in "pm", and the walk - * will stop when we hit the end of the buffer. - */ - ret = walk_page_range(start_vaddr, end_vaddr, - &pagemap_walk); - if (ret == PM_END_OF_BUFFER) - ret = 0; - /* don't need mmap_sem for these, but this looks cleaner */ - *ppos += (char *)pm.out - buf; - if (!ret) - ret = (char *)pm.out - buf; - } + pagemap_walk.pmd_entry = pagemap_pte_range; + pagemap_walk.pte_hole = pagemap_pte_hole; + pagemap_walk.mm = mm; + pagemap_walk.private = ± + + src = *ppos; + svpfn = src / PM_ENTRY_BYTES; + start_vaddr = svpfn << PAGE_SHIFT; + end_vaddr = TASK_SIZE_OF(task); + + /* watch out for wraparound */ + if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) + start_vaddr = end_vaddr; + + /* + * The odds are that this will stop walking way + * before end_vaddr, because the length of the + * user buffer is tracked in "pm", and the walk + * will stop when we hit the end of the buffer. + */ + ret = walk_page_range(start_vaddr, end_vaddr, &pagemap_walk); + if (ret == PM_END_OF_BUFFER) + ret = 0; + /* don't need mmap_sem for these, but this looks cleaner */ + *ppos += (char *)pm.out - buf; + if (!ret) + ret = (char *)pm.out - buf; out_pages: for (; pagecount; pagecount--) { @@ -738,22 +739,11 @@ const struct file_operations proc_pagemap_operations = { #ifdef CONFIG_NUMA extern int show_numa_map(struct seq_file *m, void *v); -static int show_numa_map_checked(struct seq_file *m, void *v) -{ - struct proc_maps_private *priv = m->private; - struct task_struct *task = priv->task; - - if (maps_protect && !ptrace_may_attach(task)) - return -EACCES; - - return show_numa_map(m, v); -} - static const struct seq_operations proc_pid_numa_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, - .show = show_numa_map_checked + .show = show_numa_map, }; static int numa_maps_open(struct inode *inode, struct file *file) diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 4b4f9cc2f186..219bd79ea894 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -110,11 +110,6 @@ int task_statm(struct mm_struct *mm, int *shared, int *text, static int show_map(struct seq_file *m, void *_vml) { struct vm_list_struct *vml = _vml; - struct proc_maps_private *priv = m->private; - struct task_struct *task = priv->task; - - if (maps_protect && !ptrace_may_attach(task)) - return -EACCES; return nommu_vma_show(m, vml->vma); } diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 9ac0f5e064e0..841368b87a29 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -165,14 +165,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, return acc; } -static int open_vmcore(struct inode *inode, struct file *filp) -{ - return 0; -} - const struct file_operations proc_vmcore_operations = { .read = read_vmcore, - .open = open_vmcore, }; static struct vmcore* __init get_new_element(void) diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index b31ab78052b3..2aad1044b84c 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -553,7 +553,7 @@ static void qnx4_destroy_inode(struct inode *inode) kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo; diff --git a/fs/quota.c b/fs/quota.c index db1cc9f3c7aa..7f4386ebc23a 100644 --- a/fs/quota.c +++ b/fs/quota.c @@ -186,7 +186,7 @@ static void quota_sync_sb(struct super_block *sb, int type) void sync_dquots(struct super_block *sb, int type) { - int cnt, dirty; + int cnt; if (sb) { if (sb->s_qcop->quota_sync) @@ -198,11 +198,17 @@ void sync_dquots(struct super_block *sb, int type) restart: list_for_each_entry(sb, &super_blocks, s_list) { /* This test just improves performance so it needn't be reliable... */ - for (cnt = 0, dirty = 0; cnt < MAXQUOTAS; cnt++) - if ((type == cnt || type == -1) && sb_has_quota_enabled(sb, cnt) - && info_any_dirty(&sb_dqopt(sb)->info[cnt])) - dirty = 1; - if (!dirty) + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && type != cnt) + continue; + if (!sb_has_quota_enabled(sb, cnt)) + continue; + if (!info_dirty(&sb_dqopt(sb)->info[cnt]) && + list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list)) + continue; + break; + } + if (cnt == MAXQUOTAS) continue; sb->s_count++; spin_unlock(&sb_lock); diff --git a/fs/quota_v1.c b/fs/quota_v1.c index a6cf9269105c..5ae15b13eeb0 100644 --- a/fs/quota_v1.c +++ b/fs/quota_v1.c @@ -1,6 +1,7 @@ #include <linux/errno.h> #include <linux/fs.h> #include <linux/quota.h> +#include <linux/quotaops.h> #include <linux/dqblk_v1.h> #include <linux/quotaio_v1.h> #include <linux/kernel.h> diff --git a/fs/quota_v2.c b/fs/quota_v2.c index 234ada903633..b53827dc02d9 100644 --- a/fs/quota_v2.c +++ b/fs/quota_v2.c @@ -11,6 +11,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/quotaops.h> #include <asm/byteorder.h> diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index 9590b9024300..78f613cb9c76 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -45,6 +45,7 @@ const struct file_operations ramfs_file_operations = { .mmap = generic_file_mmap, .fsync = simple_sync_file, .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, .llseek = generic_file_llseek, }; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 0989bc2c2f69..5145cb9125af 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -43,6 +43,7 @@ const struct file_operations ramfs_file_operations = { .aio_write = generic_file_aio_write, .fsync = simple_sync_file, .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, .llseek = generic_file_llseek, }; @@ -57,7 +58,7 @@ const struct inode_operations ramfs_file_inode_operations = { * size 0 on the assumption that it's going to be used for an mmap of shared * memory */ -static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) +int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) { struct pagevec lru_pvec; unsigned long npages, xpages, loop, limit; diff --git a/fs/read_write.c b/fs/read_write.c index f0d1240a5c69..9ba495d5a29b 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -31,12 +31,12 @@ const struct file_operations generic_ro_fops = { EXPORT_SYMBOL(generic_ro_fops); -loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) +loff_t +generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) { loff_t retval; struct inode *inode = file->f_mapping->host; - mutex_lock(&inode->i_mutex); switch (origin) { case SEEK_END: offset += inode->i_size; @@ -46,42 +46,26 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) } retval = -EINVAL; if (offset>=0 && offset<=inode->i_sb->s_maxbytes) { + /* Special lock needed here? */ if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; } retval = offset; } - mutex_unlock(&inode->i_mutex); return retval; } +EXPORT_SYMBOL(generic_file_llseek_unlocked); -EXPORT_SYMBOL(generic_file_llseek); - -loff_t remote_llseek(struct file *file, loff_t offset, int origin) +loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) { - loff_t retval; - - lock_kernel(); - switch (origin) { - case SEEK_END: - offset += i_size_read(file->f_path.dentry->d_inode); - break; - case SEEK_CUR: - offset += file->f_pos; - } - retval = -EINVAL; - if (offset>=0 && offset<=file->f_path.dentry->d_inode->i_sb->s_maxbytes) { - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } - retval = offset; - } - unlock_kernel(); - return retval; + loff_t n; + mutex_lock(&file->f_dentry->d_inode->i_mutex); + n = generic_file_llseek_unlocked(file, offset, origin); + mutex_unlock(&file->f_dentry->d_inode->i_mutex); + return n; } -EXPORT_SYMBOL(remote_llseek); +EXPORT_SYMBOL(generic_file_llseek); loff_t no_llseek(struct file *file, loff_t offset, int origin) { diff --git a/fs/readdir.c b/fs/readdir.c index 4e026e5407fb..93a7559bbfd8 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -80,8 +80,10 @@ static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset if (buf->result) return -EINVAL; d_ino = ino; - if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->result = -EOVERFLOW; return -EOVERFLOW; + } buf->result++; dirent = buf->dirent; if (!access_ok(VERIFY_WRITE, dirent, @@ -155,8 +157,10 @@ static int filldir(void * __buf, const char * name, int namlen, loff_t offset, if (reclen > buf->count) return -EINVAL; d_ino = ino; - if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->error = -EOVERFLOW; return -EOVERFLOW; + } dirent = buf->previous; if (dirent) { if (__put_user(offset, &dirent->d_off)) diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 57917932212e..5699171212ae 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -45,6 +45,8 @@ void reiserfs_delete_inode(struct inode *inode) goto out; reiserfs_update_inode_transaction(inode); + reiserfs_discard_prealloc(&th, inode); + err = reiserfs_delete_object(&th, inode); /* Do quota update inside a transaction for journaled quotas. We must do that @@ -2433,7 +2435,7 @@ static int reiserfs_write_full_page(struct page *page, if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { lock_buffer(bh); } else { - if (test_set_buffer_locked(bh)) { + if (!trylock_buffer(bh)) { redirty_page_for_writepage(wbc, page); continue; } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index e396b2fa4743..c21df71943a6 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -34,15 +34,10 @@ ** from within kupdate, it will ignore the immediate flag */ -#include <asm/uaccess.h> -#include <asm/system.h> - #include <linux/time.h> #include <linux/semaphore.h> - #include <linux/vmalloc.h> #include <linux/reiserfs_fs.h> - #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fcntl.h> @@ -54,6 +49,9 @@ #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> +#include <linux/uaccess.h> + +#include <asm/system.h> /* gets a struct reiserfs_journal_list * from a list head */ #define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ @@ -558,13 +556,13 @@ static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, static inline void lock_journal(struct super_block *p_s_sb) { PROC_INFO_INC(p_s_sb, journal.lock_journal); - down(&SB_JOURNAL(p_s_sb)->j_lock); + mutex_lock(&SB_JOURNAL(p_s_sb)->j_mutex); } /* unlock the current transaction */ static inline void unlock_journal(struct super_block *p_s_sb) { - up(&SB_JOURNAL(p_s_sb)->j_lock); + mutex_unlock(&SB_JOURNAL(p_s_sb)->j_mutex); } static inline void get_journal_list(struct reiserfs_journal_list *jl) @@ -629,7 +627,7 @@ static int journal_list_still_alive(struct super_block *s, static void release_buffer_page(struct buffer_head *bh) { struct page *page = bh->b_page; - if (!page->mapping && !TestSetPageLocked(page)) { + if (!page->mapping && trylock_page(page)) { page_cache_get(page); put_bh(bh); if (!page->mapping) @@ -857,7 +855,7 @@ static int write_ordered_buffers(spinlock_t * lock, jh = JH_ENTRY(list->next); bh = jh->bh; get_bh(bh); - if (test_set_buffer_locked(bh)) { + if (!trylock_buffer(bh)) { if (!buffer_dirty(bh)) { list_move(&jh->list, &tmp); goto loop_next; @@ -1045,9 +1043,9 @@ static int flush_commit_list(struct super_block *s, } /* make sure nobody is trying to flush this one at the same time */ - down(&jl->j_commit_lock); + mutex_lock(&jl->j_commit_mutex); if (!journal_list_still_alive(s, trans_id)) { - up(&jl->j_commit_lock); + mutex_unlock(&jl->j_commit_mutex); goto put_jl; } BUG_ON(jl->j_trans_id == 0); @@ -1057,7 +1055,7 @@ static int flush_commit_list(struct super_block *s, if (flushall) { atomic_set(&(jl->j_older_commits_done), 1); } - up(&jl->j_commit_lock); + mutex_unlock(&jl->j_commit_mutex); goto put_jl; } @@ -1181,7 +1179,7 @@ static int flush_commit_list(struct super_block *s, if (flushall) { atomic_set(&(jl->j_older_commits_done), 1); } - up(&jl->j_commit_lock); + mutex_unlock(&jl->j_commit_mutex); put_jl: put_journal_list(s, jl); @@ -1411,8 +1409,8 @@ static int flush_journal_list(struct super_block *s, /* if flushall == 0, the lock is already held */ if (flushall) { - down(&journal->j_flush_sem); - } else if (!down_trylock(&journal->j_flush_sem)) { + mutex_lock(&journal->j_flush_mutex); + } else if (mutex_trylock(&journal->j_flush_mutex)) { BUG(); } @@ -1642,7 +1640,7 @@ static int flush_journal_list(struct super_block *s, jl->j_state = 0; put_journal_list(s, jl); if (flushall) - up(&journal->j_flush_sem); + mutex_unlock(&journal->j_flush_mutex); put_fs_excl(); return err; } @@ -1772,12 +1770,12 @@ static int kupdate_transactions(struct super_block *s, struct reiserfs_journal *journal = SB_JOURNAL(s); chunk.nr = 0; - down(&journal->j_flush_sem); + mutex_lock(&journal->j_flush_mutex); if (!journal_list_still_alive(s, orig_trans_id)) { goto done; } - /* we've got j_flush_sem held, nobody is going to delete any + /* we've got j_flush_mutex held, nobody is going to delete any * of these lists out from underneath us */ while ((num_trans && transactions_flushed < num_trans) || @@ -1812,7 +1810,7 @@ static int kupdate_transactions(struct super_block *s, } done: - up(&journal->j_flush_sem); + mutex_unlock(&journal->j_flush_mutex); return ret; } @@ -2556,7 +2554,7 @@ static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) INIT_LIST_HEAD(&jl->j_working_list); INIT_LIST_HEAD(&jl->j_tail_bh_list); INIT_LIST_HEAD(&jl->j_bh_list); - sema_init(&jl->j_commit_lock, 1); + mutex_init(&jl->j_commit_mutex); SB_JOURNAL(s)->j_num_lists++; get_journal_list(jl); return jl; @@ -2837,8 +2835,8 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name, journal->j_last = NULL; journal->j_first = NULL; init_waitqueue_head(&(journal->j_join_wait)); - sema_init(&journal->j_lock, 1); - sema_init(&journal->j_flush_sem, 1); + mutex_init(&journal->j_mutex); + mutex_init(&journal->j_flush_mutex); journal->j_trans_id = 10; journal->j_mount_id = 10; @@ -3873,7 +3871,7 @@ int reiserfs_prepare_for_journal(struct super_block *p_s_sb, { PROC_INFO_INC(p_s_sb, journal.prepare); - if (test_set_buffer_locked(bh)) { + if (!trylock_buffer(bh)) { if (!wait) return 0; lock_buffer(bh); @@ -4030,7 +4028,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, * the new transaction is fully setup, and we've already flushed the * ordered bh list */ - down(&jl->j_commit_lock); + mutex_lock(&jl->j_commit_mutex); /* save the transaction id in case we need to commit it later */ commit_trans_id = jl->j_trans_id; @@ -4196,7 +4194,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, lock_kernel(); } BUG_ON(!list_empty(&jl->j_tail_bh_list)); - up(&jl->j_commit_lock); + mutex_unlock(&jl->j_commit_mutex); /* honor the flush wishes from the caller, simple commits can ** be done outside the journal lock, they are done below diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index ed424d708e69..d318c7e663fa 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -22,11 +22,11 @@ #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> +#include <linux/quotaops.h> #include <linux/vfs.h> #include <linux/mnt_namespace.h> #include <linux/mount.h> #include <linux/namei.h> -#include <linux/quotaops.h> struct file_system_type reiserfs_fs_type; @@ -182,7 +182,7 @@ static int finish_unfinished(struct super_block *s) int ret = reiserfs_quota_on_mount(s, i); if (ret < 0) reiserfs_warning(s, - "reiserfs: cannot turn on journalled quota: error %d", + "reiserfs: cannot turn on journaled quota: error %d", ret); } } @@ -520,7 +520,7 @@ static void reiserfs_destroy_inode(struct inode *inode) kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode)); } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo; @@ -876,7 +876,9 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin mount options were selected. */ unsigned long *blocks, /* strtol-ed from NNN of resize=NNN */ char **jdev_name, - unsigned int *commit_max_age) + unsigned int *commit_max_age, + char **qf_names, + unsigned int *qfmt) { int c; char *arg = NULL; @@ -992,9 +994,11 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin if (c == 'u' || c == 'g') { int qtype = c == 'u' ? USRQUOTA : GRPQUOTA; - if (sb_any_quota_enabled(s)) { + if ((sb_any_quota_enabled(s) || + sb_any_quota_suspended(s)) && + (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) { reiserfs_warning(s, - "reiserfs_parse_options: cannot change journalled quota options when quota turned on."); + "reiserfs_parse_options: cannot change journaled quota options when quota turned on."); return 0; } if (*arg) { /* Some filename specified? */ @@ -1011,46 +1015,54 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin "reiserfs_parse_options: quotafile must be on filesystem root."); return 0; } - REISERFS_SB(s)->s_qf_names[qtype] = + qf_names[qtype] = kmalloc(strlen(arg) + 1, GFP_KERNEL); - if (!REISERFS_SB(s)->s_qf_names[qtype]) { + if (!qf_names[qtype]) { reiserfs_warning(s, "reiserfs_parse_options: not enough memory for storing quotafile name."); return 0; } - strcpy(REISERFS_SB(s)->s_qf_names[qtype], arg); + strcpy(qf_names[qtype], arg); *mount_options |= 1 << REISERFS_QUOTA; } else { - kfree(REISERFS_SB(s)->s_qf_names[qtype]); - REISERFS_SB(s)->s_qf_names[qtype] = NULL; + if (qf_names[qtype] != + REISERFS_SB(s)->s_qf_names[qtype]) + kfree(qf_names[qtype]); + qf_names[qtype] = NULL; } } if (c == 'f') { if (!strcmp(arg, "vfsold")) - REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_OLD; + *qfmt = QFMT_VFS_OLD; else if (!strcmp(arg, "vfsv0")) - REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_V0; + *qfmt = QFMT_VFS_V0; else { reiserfs_warning(s, "reiserfs_parse_options: unknown quota format specified."); return 0; } + if ((sb_any_quota_enabled(s) || + sb_any_quota_suspended(s)) && + *qfmt != REISERFS_SB(s)->s_jquota_fmt) { + reiserfs_warning(s, + "reiserfs_parse_options: cannot change journaled quota options when quota turned on."); + return 0; + } } #else if (c == 'u' || c == 'g' || c == 'f') { reiserfs_warning(s, - "reiserfs_parse_options: journalled quota options not supported."); + "reiserfs_parse_options: journaled quota options not supported."); return 0; } #endif } #ifdef CONFIG_QUOTA - if (!REISERFS_SB(s)->s_jquota_fmt - && (REISERFS_SB(s)->s_qf_names[USRQUOTA] - || REISERFS_SB(s)->s_qf_names[GRPQUOTA])) { + if (!REISERFS_SB(s)->s_jquota_fmt && !*qfmt + && (qf_names[USRQUOTA] || qf_names[GRPQUOTA])) { reiserfs_warning(s, - "reiserfs_parse_options: journalled quota format not specified."); + "reiserfs_parse_options: journaled quota format not specified."); return 0; } /* This checking is not precise wrt the quota type but for our purposes it is sufficient */ @@ -1130,6 +1142,21 @@ static void handle_attrs(struct super_block *s) } } +#ifdef CONFIG_QUOTA +static void handle_quota_files(struct super_block *s, char **qf_names, + unsigned int *qfmt) +{ + int i; + + for (i = 0; i < MAXQUOTAS; i++) { + if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i]) + kfree(REISERFS_SB(s)->s_qf_names[i]); + REISERFS_SB(s)->s_qf_names[i] = qf_names[i]; + } + REISERFS_SB(s)->s_jquota_fmt = *qfmt; +} +#endif + static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) { struct reiserfs_super_block *rs; @@ -1141,23 +1168,30 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) struct reiserfs_journal *journal = SB_JOURNAL(s); char *new_opts = kstrdup(arg, GFP_KERNEL); int err; + char *qf_names[MAXQUOTAS]; + unsigned int qfmt = 0; #ifdef CONFIG_QUOTA int i; + + memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names)); #endif rs = SB_DISK_SUPER_BLOCK(s); if (!reiserfs_parse_options - (s, arg, &mount_options, &blocks, NULL, &commit_max_age)) { + (s, arg, &mount_options, &blocks, NULL, &commit_max_age, + qf_names, &qfmt)) { #ifdef CONFIG_QUOTA - for (i = 0; i < MAXQUOTAS; i++) { - kfree(REISERFS_SB(s)->s_qf_names[i]); - REISERFS_SB(s)->s_qf_names[i] = NULL; - } + for (i = 0; i < MAXQUOTAS; i++) + if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i]) + kfree(qf_names[i]); #endif err = -EINVAL; goto out_err; } +#ifdef CONFIG_QUOTA + handle_quota_files(s, qf_names, &qfmt); +#endif handle_attrs(s); @@ -1570,6 +1604,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) char *jdev_name; struct reiserfs_sb_info *sbi; int errval = -EINVAL; + char *qf_names[MAXQUOTAS] = {}; + unsigned int qfmt = 0; save_mount_options(s, data); @@ -1597,9 +1633,12 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) jdev_name = NULL; if (reiserfs_parse_options (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name, - &commit_max_age) == 0) { + &commit_max_age, qf_names, &qfmt) == 0) { goto error; } +#ifdef CONFIG_QUOTA + handle_quota_files(s, qf_names, &qfmt); +#endif if (blocks) { SWARN(silent, s, "jmacd-7: reiserfs_fill_super: resize option " @@ -1819,7 +1858,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) return (0); - error: +error: if (jinit_done) { /* kill the commit thread, free journal ram */ journal_release_error(NULL, s); } @@ -1830,10 +1869,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) #ifdef CONFIG_QUOTA { int j; - for (j = 0; j < MAXQUOTAS; j++) { - kfree(sbi->s_qf_names[j]); - sbi->s_qf_names[j] = NULL; - } + for (j = 0; j < MAXQUOTAS; j++) + kfree(qf_names[j]); } #endif kfree(sbi); @@ -1980,7 +2017,7 @@ static int reiserfs_release_dquot(struct dquot *dquot) static int reiserfs_mark_dquot_dirty(struct dquot *dquot) { - /* Are we journalling quotas? */ + /* Are we journaling quotas? */ if (REISERFS_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || REISERFS_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { dquot_mark_dquot_dirty(dquot); @@ -2026,6 +2063,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, int err; struct nameidata nd; struct inode *inode; + struct reiserfs_transaction_handle th; if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA))) return -EINVAL; @@ -2037,8 +2075,8 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, return err; /* Quotafile not on the same filesystem? */ if (nd.path.mnt->mnt_sb != sb) { - path_put(&nd.path); - return -EXDEV; + err = -EXDEV; + goto out; } inode = nd.path.dentry->d_inode; /* We must not pack tails for quota files on reiserfs for quota IO to work */ @@ -2048,24 +2086,37 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, reiserfs_warning(sb, "reiserfs: Unpacking tail of quota file failed" " (%d). Cannot turn on quotas.", err); - path_put(&nd.path); - return -EINVAL; + err = -EINVAL; + goto out; } mark_inode_dirty(inode); } - /* Not journalling quota? No more tests needed... */ - if (!REISERFS_SB(sb)->s_qf_names[USRQUOTA] && - !REISERFS_SB(sb)->s_qf_names[GRPQUOTA]) { - path_put(&nd.path); - return vfs_quota_on(sb, type, format_id, path, 0); - } - /* Quotafile not of fs root? */ - if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) - reiserfs_warning(sb, + /* Journaling quota? */ + if (REISERFS_SB(sb)->s_qf_names[type]) { + /* Quotafile not of fs root? */ + if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) + reiserfs_warning(sb, "reiserfs: Quota file not on filesystem root. " "Journalled quota will not work."); + } + + /* + * When we journal data on quota file, we have to flush journal to see + * all updates to the file when we bypass pagecache... + */ + if (reiserfs_file_data_log(inode)) { + /* Just start temporary transaction and finish it */ + err = journal_begin(&th, sb, 1); + if (err) + goto out; + err = journal_end_sync(&th, sb, 1); + if (err) + goto out; + } + err = vfs_quota_on_path(sb, type, format_id, &nd.path); +out: path_put(&nd.path); - return vfs_quota_on(sb, type, format_id, path, 0); + return err; } /* Read data from quotafile - avoid pagecache and such because we cannot afford @@ -2165,8 +2216,10 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, blk++; } out: - if (len == towrite) + if (len == towrite) { + mutex_unlock(&inode->i_mutex); return err; + } if (inode->i_size < off + len - towrite) i_size_write(inode, off + len - towrite); inode->i_version++; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index d7c4935c1034..bb3cb5b7cdb2 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -1250,7 +1250,7 @@ static int reiserfs_check_acl(struct inode *inode, int mask) return error; } -int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd) +int reiserfs_permission(struct inode *inode, int mask) { /* * We don't do permission checks on the internal objects. diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 5e90a95ad60b..056008db1377 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -6,8 +6,6 @@ #include <linux/reiserfs_xattr.h> #include <asm/uaccess.h> -#define XATTR_SECURITY_PREFIX "security." - static int security_get(struct inode *inode, const char *name, void *buffer, size_t size) { diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index 024a938ca60f..60abe2bb1f98 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -7,8 +7,6 @@ #include <linux/reiserfs_xattr.h> #include <asm/uaccess.h> -#define XATTR_TRUSTED_PREFIX "trusted." - static int trusted_get(struct inode *inode, const char *name, void *buffer, size_t size) { diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index 073f39364b11..1384efcb938e 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -10,8 +10,6 @@ # include <linux/reiserfs_acl.h> #endif -#define XATTR_USER_PREFIX "user." - static int user_get(struct inode *inode, const char *name, void *buffer, size_t size) { diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c index 3f13d491c7c7..60d2f822e87b 100644 --- a/fs/romfs/inode.c +++ b/fs/romfs/inode.c @@ -418,7 +418,8 @@ static int romfs_readpage(struct file *file, struct page * page) { struct inode *inode = page->mapping->host; - loff_t offset, avail, readlen; + loff_t offset, size; + unsigned long filled; void *buf; int result = -EIO; @@ -430,21 +431,29 @@ romfs_readpage(struct file *file, struct page * page) /* 32 bit warning -- but not for us :) */ offset = page_offset(page); - if (offset < i_size_read(inode)) { - avail = inode->i_size-offset; - readlen = min_t(unsigned long, avail, PAGE_SIZE); - if (romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen) == readlen) { - if (readlen < PAGE_SIZE) { - memset(buf + readlen,0,PAGE_SIZE-readlen); - } - SetPageUptodate(page); - result = 0; + size = i_size_read(inode); + filled = 0; + result = 0; + if (offset < size) { + unsigned long readlen; + + size -= offset; + readlen = size > PAGE_SIZE ? PAGE_SIZE : size; + + filled = romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen); + + if (filled != readlen) { + SetPageError(page); + filled = 0; + result = -EIO; } } - if (result) { - memset(buf, 0, PAGE_SIZE); - SetPageError(page); - } + + if (filled < PAGE_SIZE) + memset(buf + filled, 0, PAGE_SIZE-filled); + + if (!result) + SetPageUptodate(page); flush_dcache_page(page); unlock_page(page); @@ -577,7 +586,7 @@ static void romfs_destroy_inode(struct inode *inode) kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct romfs_inode_info *ei = foo; diff --git a/fs/seq_file.c b/fs/seq_file.c index 3f54dbd6c49b..bd20f7f5a933 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -108,9 +108,9 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) goto Done; } /* we need at least one record in buffer */ + pos = m->index; + p = m->op->start(m, &pos); while (1) { - pos = m->index; - p = m->op->start(m, &pos); err = PTR_ERR(p); if (!p || IS_ERR(p)) break; @@ -119,6 +119,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) break; if (unlikely(err)) m->count = 0; + if (unlikely(!m->count)) { + p = m->op->next(m, p, &pos); + m->index = pos; + continue; + } if (m->count < m->size) goto Fill; m->op->stop(m, p); @@ -128,6 +133,8 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) goto Enomem; m->count = 0; m->version = 0; + pos = m->index; + p = m->op->start(m, &pos); } m->op->stop(m, p); m->count = 0; @@ -443,6 +450,20 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc) return -1; } +int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits) +{ + size_t len = bitmap_scnprintf_len(nr_bits); + + if (m->count + len < m->size) { + bitmap_scnprintf(m->buf + m->count, m->size - m->count, + bits, nr_bits); + m->count += len; + return 0; + } + m->count = m->size; + return -1; +} + static void *single_start(struct seq_file *p, loff_t *pos) { return NULL + (*pos == 0); diff --git a/fs/signalfd.c b/fs/signalfd.c index 619725644c75..9c39bc7f8431 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -205,11 +205,19 @@ static const struct file_operations signalfd_fops = { .read = signalfd_read, }; -asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask) +asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask, + size_t sizemask, int flags) { sigset_t sigmask; struct signalfd_ctx *ctx; + /* Check the SFD_* constants for consistency. */ + BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(SFD_NONBLOCK != O_NONBLOCK); + + if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK)) + return -EINVAL; + if (sizemask != sizeof(sigset_t) || copy_from_user(&sigmask, user_mask, sizeof(sigmask))) return -EINVAL; @@ -227,7 +235,8 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. */ - ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx); + ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx, + flags & (O_CLOEXEC | O_NONBLOCK)); if (ufd < 0) kfree(ctx); } else { @@ -249,3 +258,9 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas return ufd; } + +asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, + size_t sizemask) +{ + return sys_signalfd4(ufd, user_mask, sizemask, 0); +} diff --git a/fs/smbfs/cache.c b/fs/smbfs/cache.c index 8182f0542a21..8c177eb7e344 100644 --- a/fs/smbfs/cache.c +++ b/fs/smbfs/cache.c @@ -13,7 +13,6 @@ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/mm.h> -#include <linux/dirent.h> #include <linux/smb_fs.h> #include <linux/pagemap.h> #include <linux/net.h> diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index efbe29af3d7a..e4f8d51a5553 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -408,7 +408,7 @@ smb_file_release(struct inode *inode, struct file * file) * privileges, so we need our own check for this. */ static int -smb_file_permission(struct inode *inode, int mask, struct nameidata *nd) +smb_file_permission(struct inode *inode, int mask) { int mode = inode->i_mode; int error = 0; @@ -417,14 +417,23 @@ smb_file_permission(struct inode *inode, int mask, struct nameidata *nd) /* Look at user permissions */ mode >>= 6; - if ((mode & 7 & mask) != mask) + if (mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) error = -EACCES; return error; } +static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t ret; + lock_kernel(); + ret = generic_file_llseek_unlocked(file, offset, origin); + unlock_kernel(); + return ret; +} + const struct file_operations smb_file_operations = { - .llseek = remote_llseek, + .llseek = smb_remote_llseek, .read = do_sync_read, .aio_read = smb_file_aio_read, .write = do_sync_write, diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 376ef3ee6ed7..3528f40ffb0f 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -67,7 +67,7 @@ static void smb_destroy_inode(struct inode *inode) kmem_cache_free(smb_inode_cachep, SMB_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct smb_inode_info *ei = (struct smb_inode_info *) foo; diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c index d517a27b7f4b..ee536e8a649a 100644 --- a/fs/smbfs/proc.c +++ b/fs/smbfs/proc.c @@ -16,7 +16,6 @@ #include <linux/stat.h> #include <linux/fcntl.h> #include <linux/dcache.h> -#include <linux/dirent.h> #include <linux/nls.h> #include <linux/smp_lock.h> #include <linux/net.h> diff --git a/fs/splice.c b/fs/splice.c index aa5f6f60b305..a1e701c27156 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -371,7 +371,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, * for an in-flight io page */ if (flags & SPLICE_F_NONBLOCK) { - if (TestSetPageLocked(page)) { + if (!trylock_page(page)) { error = -EAGAIN; break; } @@ -379,13 +379,22 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, lock_page(page); /* - * page was truncated, stop here. if this isn't the - * first page, we'll just complete what we already - * added + * Page was truncated, or invalidated by the + * filesystem. Redo the find/create, but this time the + * page is kept locked, so there's no chance of another + * race with truncate/invalidate. */ if (!page->mapping) { unlock_page(page); - break; + page = find_or_create_page(mapping, index, + mapping_gfp_mask(mapping)); + + if (!page) { + error = -ENOMEM; + break; + } + page_cache_release(pages[page_nr]); + pages[page_nr] = page; } /* * page was already under io and is now done, great @@ -763,7 +772,7 @@ generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, ssize_t ret; int err; - err = remove_suid(out->f_path.dentry); + err = file_remove_suid(out); if (unlikely(err)) return err; @@ -821,7 +830,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, ssize_t ret; inode_double_lock(inode, pipe->inode); - ret = remove_suid(out->f_path.dentry); + ret = file_remove_suid(out); if (likely(!ret)) ret = __splice_from_pipe(pipe, &sd, pipe_to_file); inode_double_unlock(inode, pipe->inode); @@ -889,6 +898,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; + if (unlikely(out->f_flags & O_APPEND)) + return -EINVAL; + ret = rw_verify_area(WRITE, out, ppos, len); if (unlikely(ret < 0)) return ret; @@ -1152,36 +1164,6 @@ static long do_splice(struct file *in, loff_t __user *off_in, } /* - * Do a copy-from-user while holding the mmap_semaphore for reading, in a - * manner safe from deadlocking with simultaneous mmap() (grabbing mmap_sem - * for writing) and page faulting on the user memory pointed to by src. - * This assumes that we will very rarely hit the partial != 0 path, or this - * will not be a win. - */ -static int copy_from_user_mmap_sem(void *dst, const void __user *src, size_t n) -{ - int partial; - - if (!access_ok(VERIFY_READ, src, n)) - return -EFAULT; - - pagefault_disable(); - partial = __copy_from_user_inatomic(dst, src, n); - pagefault_enable(); - - /* - * Didn't copy everything, drop the mmap_sem and do a faulting copy - */ - if (unlikely(partial)) { - up_read(¤t->mm->mmap_sem); - partial = copy_from_user(dst, src, n); - down_read(¤t->mm->mmap_sem); - } - - return partial; -} - -/* * Map an iov into an array of pages and offset/length tupples. With the * partial_page structure, we can map several non-contiguous ranges into * our ones pages[] map instead of splitting that operation into pieces. @@ -1194,8 +1176,6 @@ static int get_iovec_page_array(const struct iovec __user *iov, { int buffers = 0, error = 0; - down_read(¤t->mm->mmap_sem); - while (nr_vecs) { unsigned long off, npages; struct iovec entry; @@ -1204,7 +1184,7 @@ static int get_iovec_page_array(const struct iovec __user *iov, int i; error = -EFAULT; - if (copy_from_user_mmap_sem(&entry, iov, sizeof(entry))) + if (copy_from_user(&entry, iov, sizeof(entry))) break; base = entry.iov_base; @@ -1238,9 +1218,8 @@ static int get_iovec_page_array(const struct iovec __user *iov, if (npages > PIPE_BUFFERS - buffers) npages = PIPE_BUFFERS - buffers; - error = get_user_pages(current, current->mm, - (unsigned long) base, npages, 0, 0, - &pages[buffers], NULL); + error = get_user_pages_fast((unsigned long)base, npages, + 0, &pages[buffers]); if (unlikely(error <= 0)) break; @@ -1279,8 +1258,6 @@ static int get_iovec_page_array(const struct iovec __user *iov, iov++; } - up_read(¤t->mm->mmap_sem); - if (buffers) return buffers; diff --git a/fs/stat.c b/fs/stat.c index 9cf41f719d50..7c46fbeb8b76 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -57,13 +57,13 @@ EXPORT_SYMBOL(vfs_getattr); int vfs_stat_fd(int dfd, char __user *name, struct kstat *stat) { - struct nameidata nd; + struct path path; int error; - error = __user_walk_fd(dfd, name, LOOKUP_FOLLOW, &nd); + error = user_path_at(dfd, name, LOOKUP_FOLLOW, &path); if (!error) { - error = vfs_getattr(nd.path.mnt, nd.path.dentry, stat); - path_put(&nd.path); + error = vfs_getattr(path.mnt, path.dentry, stat); + path_put(&path); } return error; } @@ -77,13 +77,13 @@ EXPORT_SYMBOL(vfs_stat); int vfs_lstat_fd(int dfd, char __user *name, struct kstat *stat) { - struct nameidata nd; + struct path path; int error; - error = __user_walk_fd(dfd, name, 0, &nd); + error = user_path_at(dfd, name, 0, &path); if (!error) { - error = vfs_getattr(nd.path.mnt, nd.path.dentry, stat); - path_put(&nd.path); + error = vfs_getattr(path.mnt, path.dentry, stat); + path_put(&path); } return error; } @@ -291,29 +291,29 @@ asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf) return error; } -asmlinkage long sys_readlinkat(int dfd, const char __user *path, +asmlinkage long sys_readlinkat(int dfd, const char __user *pathname, char __user *buf, int bufsiz) { - struct nameidata nd; + struct path path; int error; if (bufsiz <= 0) return -EINVAL; - error = __user_walk_fd(dfd, path, 0, &nd); + error = user_path_at(dfd, pathname, 0, &path); if (!error) { - struct inode *inode = nd.path.dentry->d_inode; + struct inode *inode = path.dentry->d_inode; error = -EINVAL; if (inode->i_op && inode->i_op->readlink) { - error = security_inode_readlink(nd.path.dentry); + error = security_inode_readlink(path.dentry); if (!error) { - touch_atime(nd.path.mnt, nd.path.dentry); - error = inode->i_op->readlink(nd.path.dentry, + touch_atime(path.mnt, path.dentry); + error = inode->i_op->readlink(path.dentry, buf, bufsiz); } } - path_put(&nd.path); + path_put(&path); } return error; } diff --git a/fs/super.c b/fs/super.c index 453877c5697b..e931ae9511fe 100644 --- a/fs/super.c +++ b/fs/super.c @@ -70,6 +70,7 @@ static struct super_block *alloc_super(struct file_system_type *type) INIT_LIST_HEAD(&s->s_instances); INIT_HLIST_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); + INIT_LIST_HEAD(&s->s_dentry_lru); init_rwsem(&s->s_umount); mutex_init(&s->s_lock); lockdep_set_class(&s->s_umount, &type->s_umount_key); diff --git a/fs/sync.c b/fs/sync.c index 228e17b5e9ee..2967562d416f 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -139,7 +139,8 @@ asmlinkage long sys_fdatasync(unsigned int fd) * before performing the write. * * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the - * range which are not presently under writeback. + * range which are not presently under writeback. Note that this may block for + * significant periods due to exhaustion of disk request structures. * * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range * after performing the write. diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 8c0e4b92574f..aedaeba82ae5 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -398,7 +398,7 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, } /** - * sysfs_add_one - add sysfs_dirent to parent + * __sysfs_add_one - add sysfs_dirent to parent without warning * @acxt: addrm context to use * @sd: sysfs_dirent to be added * @@ -417,7 +417,7 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, * 0 on success, -EEXIST if entry with the given name already * exists. */ -int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) +int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) { if (sysfs_find_dirent(acxt->parent_sd, sd->s_name)) return -EEXIST; @@ -435,6 +435,36 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) } /** + * sysfs_add_one - add sysfs_dirent to parent + * @acxt: addrm context to use + * @sd: sysfs_dirent to be added + * + * Get @acxt->parent_sd and set sd->s_parent to it and increment + * nlink of parent inode if @sd is a directory and link into the + * children list of the parent. + * + * This function should be called between calls to + * sysfs_addrm_start() and sysfs_addrm_finish() and should be + * passed the same @acxt as passed to sysfs_addrm_start(). + * + * LOCKING: + * Determined by sysfs_addrm_start(). + * + * RETURNS: + * 0 on success, -EEXIST if entry with the given name already + * exists. + */ +int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) +{ + int ret; + + ret = __sysfs_add_one(acxt, sd); + WARN(ret == -EEXIST, KERN_WARNING "sysfs: duplicate filename '%s' " + "can not be created\n", sd->s_name); + return ret; +} + +/** * sysfs_remove_one - remove sysfs_dirent from parent * @acxt: addrm context to use * @sd: sysfs_dirent to be removed diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index e7735f643cd1..c9e4e5091da1 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -14,6 +14,7 @@ #include <linux/kobject.h> #include <linux/kallsyms.h> #include <linux/slab.h> +#include <linux/fsnotify.h> #include <linux/namei.h> #include <linux/poll.h> #include <linux/list.h> @@ -336,9 +337,8 @@ static int sysfs_open_file(struct inode *inode, struct file *file) if (kobj->ktype && kobj->ktype->sysfs_ops) ops = kobj->ktype->sysfs_ops; else { - printk(KERN_ERR "missing sysfs attribute operations for " + WARN(1, KERN_ERR "missing sysfs attribute operations for " "kobject: %s\n", kobject_name(kobj)); - WARN_ON(1); goto err_out; } @@ -585,9 +585,11 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode) newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - rc = notify_change(victim, &newattrs); + newattrs.ia_ctime = current_fs_time(inode->i_sb); + rc = sysfs_setattr(victim, &newattrs); if (rc == 0) { + fsnotify_change(victim, newattrs.ia_valid); mutex_lock(&sysfs_mutex); victim_sd->s_mode = newattrs.ia_mode; mutex_unlock(&sysfs_mutex); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index eeba38417b1d..fe611949a7f7 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -134,9 +134,8 @@ void sysfs_remove_group(struct kobject * kobj, if (grp->name) { sd = sysfs_get_dirent(dir_sd, grp->name); if (!sd) { - printk(KERN_WARNING "sysfs group %p not found for " + WARN(!sd, KERN_WARNING "sysfs group %p not found for " "kobject '%s'\n", grp, kobject_name(kobj)); - WARN_ON(!sd); return; } } else diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 817f5966edca..a3ba217fbe74 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -19,13 +19,8 @@ #include "sysfs.h" -/** - * sysfs_create_link - create symlink between two objects. - * @kobj: object whose directory we're creating the link in. - * @target: object we're pointing to. - * @name: name of the symlink. - */ -int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char * name) +static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, + const char *name, int warn) { struct sysfs_dirent *parent_sd = NULL; struct sysfs_dirent *target_sd = NULL; @@ -65,7 +60,10 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char target_sd = NULL; /* reference is now owned by the symlink */ sysfs_addrm_start(&acxt, parent_sd); - error = sysfs_add_one(&acxt, sd); + if (warn) + error = sysfs_add_one(&acxt, sd); + else + error = __sysfs_add_one(&acxt, sd); sysfs_addrm_finish(&acxt); if (error) @@ -80,6 +78,33 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char } /** + * sysfs_create_link - create symlink between two objects. + * @kobj: object whose directory we're creating the link in. + * @target: object we're pointing to. + * @name: name of the symlink. + */ +int sysfs_create_link(struct kobject *kobj, struct kobject *target, + const char *name) +{ + return sysfs_do_create_link(kobj, target, name, 1); +} + +/** + * sysfs_create_link_nowarn - create symlink between two objects. + * @kobj: object whose directory we're creating the link in. + * @target: object we're pointing to. + * @name: name of the symlink. + * + * This function does the same as sysf_create_link(), but it + * doesn't warn if the link already exists. + */ +int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, + const char *name) +{ + return sysfs_do_create_link(kobj, target, name, 0); +} + +/** * sysfs_remove_link - remove symlink in object's directory. * @kobj: object we're acting for. * @name: name of the symlink to remove. diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index ce4e15f8aaeb..a5db496f71c7 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -107,6 +107,7 @@ struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd); void sysfs_put_active_two(struct sysfs_dirent *sd); void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *parent_sd); +int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index c5d60de0658f..df0d435baa48 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -326,7 +326,7 @@ static void sysv_destroy_inode(struct inode *inode) kmem_cache_free(sysv_inode_cachep, SYSV_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *p) +static void init_once(void *p) { struct sysv_inode_info *si = (struct sysv_inode_info *)p; diff --git a/fs/timerfd.c b/fs/timerfd.c index d87d354ec424..c502c60e4f54 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -184,7 +184,11 @@ asmlinkage long sys_timerfd_create(int clockid, int flags) int ufd; struct timerfd_ctx *ctx; - if (flags) + /* Check the TFD_* constants for consistency. */ + BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK); + + if (flags & ~(TFD_CLOEXEC | TFD_NONBLOCK)) return -EINVAL; if (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME) @@ -198,7 +202,8 @@ asmlinkage long sys_timerfd_create(int clockid, int flags) ctx->clockid = clockid; hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); - ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx); + ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, + flags & (O_CLOEXEC | O_NONBLOCK)); if (ufd < 0) kfree(ctx); diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig new file mode 100644 index 000000000000..91ceeda7e5bf --- /dev/null +++ b/fs/ubifs/Kconfig @@ -0,0 +1,72 @@ +config UBIFS_FS + tristate "UBIFS file system support" + select CRC16 + select CRC32 + select CRYPTO if UBIFS_FS_ADVANCED_COMPR + select CRYPTO if UBIFS_FS_LZO + select CRYPTO if UBIFS_FS_ZLIB + select CRYPTO_LZO if UBIFS_FS_LZO + select CRYPTO_DEFLATE if UBIFS_FS_ZLIB + depends on MTD_UBI + help + UBIFS is a file system for flash devices which works on top of UBI. + +config UBIFS_FS_XATTR + bool "Extended attributes support" + depends on UBIFS_FS + help + This option enables support of extended attributes. + +config UBIFS_FS_ADVANCED_COMPR + bool "Advanced compression options" + depends on UBIFS_FS + help + This option allows to explicitly choose which compressions, if any, + are enabled in UBIFS. Removing compressors means inbility to read + existing file systems. + + If unsure, say 'N'. + +config UBIFS_FS_LZO + bool "LZO compression support" if UBIFS_FS_ADVANCED_COMPR + depends on UBIFS_FS + default y + help + LZO compressor is generally faster then zlib but compresses worse. + Say 'Y' if unsure. + +config UBIFS_FS_ZLIB + bool "ZLIB compression support" if UBIFS_FS_ADVANCED_COMPR + depends on UBIFS_FS + default y + help + Zlib copresses better then LZO but it is slower. Say 'Y' if unsure. + +# Debugging-related stuff +config UBIFS_FS_DEBUG + bool "Enable debugging" + depends on UBIFS_FS + select DEBUG_FS + select KALLSYMS_ALL + help + This option enables UBIFS debugging. + +config UBIFS_FS_DEBUG_MSG_LVL + int "Default message level (0 = no extra messages, 3 = lots)" + depends on UBIFS_FS_DEBUG + default "0" + help + This controls the amount of debugging messages produced by UBIFS. + If reporting bugs, please try to have available a full dump of the + messages at level 1 while the misbehaviour was occurring. Level 2 + may become necessary if level 1 messages were not enough to find the + bug. Generally Level 3 should be avoided. + +config UBIFS_FS_DEBUG_CHKS + bool "Enable extra checks" + depends on UBIFS_FS_DEBUG + help + If extra checks are enabled UBIFS will check the consistency of its + internal data structures during operation. However, UBIFS performance + is dramatically slower when this option is selected especially if the + file system is large. diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile new file mode 100644 index 000000000000..80e93c35e496 --- /dev/null +++ b/fs/ubifs/Makefile @@ -0,0 +1,9 @@ +obj-$(CONFIG_UBIFS_FS) += ubifs.o + +ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o +ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o +ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o +ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o + +ubifs-$(CONFIG_UBIFS_FS_DEBUG) += debug.o +ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c new file mode 100644 index 000000000000..73db464cd08b --- /dev/null +++ b/fs/ubifs/budget.c @@ -0,0 +1,812 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements the budgeting sub-system which is responsible for UBIFS + * space management. + * + * Factors such as compression, wasted space at the ends of LEBs, space in other + * journal heads, the effect of updates on the index, and so on, make it + * impossible to accurately predict the amount of space needed. Consequently + * approximations are used. + */ + +#include "ubifs.h" +#include <linux/writeback.h> +#include <asm/div64.h> + +/* + * When pessimistic budget calculations say that there is no enough space, + * UBIFS starts writing back dirty inodes and pages, doing garbage collection, + * or committing. The below constants define maximum number of times UBIFS + * repeats the operations. + */ +#define MAX_SHRINK_RETRIES 8 +#define MAX_GC_RETRIES 4 +#define MAX_CMT_RETRIES 2 +#define MAX_NOSPC_RETRIES 1 + +/* + * The below constant defines amount of dirty pages which should be written + * back at when trying to shrink the liability. + */ +#define NR_TO_WRITE 16 + +/** + * struct retries_info - information about re-tries while making free space. + * @prev_liability: previous liability + * @shrink_cnt: how many times the liability was shrinked + * @shrink_retries: count of liability shrink re-tries (increased when + * liability does not shrink) + * @try_gc: GC should be tried first + * @gc_retries: how many times GC was run + * @cmt_retries: how many times commit has been done + * @nospc_retries: how many times GC returned %-ENOSPC + * + * Since we consider budgeting to be the fast-path, and this structure has to + * be allocated on stack and zeroed out, we make it smaller using bit-fields. + */ +struct retries_info { + long long prev_liability; + unsigned int shrink_cnt; + unsigned int shrink_retries:5; + unsigned int try_gc:1; + unsigned int gc_retries:4; + unsigned int cmt_retries:3; + unsigned int nospc_retries:1; +}; + +/** + * shrink_liability - write-back some dirty pages/inodes. + * @c: UBIFS file-system description object + * @nr_to_write: how many dirty pages to write-back + * + * This function shrinks UBIFS liability by means of writing back some amount + * of dirty inodes and their pages. Returns the amount of pages which were + * written back. The returned value does not include dirty inodes which were + * synchronized. + * + * Note, this function synchronizes even VFS inodes which are locked + * (@i_mutex) by the caller of the budgeting function, because write-back does + * not touch @i_mutex. + */ +static int shrink_liability(struct ubifs_info *c, int nr_to_write) +{ + int nr_written; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .range_end = LLONG_MAX, + .nr_to_write = nr_to_write, + }; + + generic_sync_sb_inodes(c->vfs_sb, &wbc); + nr_written = nr_to_write - wbc.nr_to_write; + + if (!nr_written) { + /* + * Re-try again but wait on pages/inodes which are being + * written-back concurrently (e.g., by pdflush). + */ + memset(&wbc, 0, sizeof(struct writeback_control)); + wbc.sync_mode = WB_SYNC_ALL; + wbc.range_end = LLONG_MAX; + wbc.nr_to_write = nr_to_write; + generic_sync_sb_inodes(c->vfs_sb, &wbc); + nr_written = nr_to_write - wbc.nr_to_write; + } + + dbg_budg("%d pages were written back", nr_written); + return nr_written; +} + + +/** + * run_gc - run garbage collector. + * @c: UBIFS file-system description object + * + * This function runs garbage collector to make some more free space. Returns + * zero if a free LEB has been produced, %-EAGAIN if commit is required, and a + * negative error code in case of failure. + */ +static int run_gc(struct ubifs_info *c) +{ + int err, lnum; + + /* Make some free space by garbage-collecting dirty space */ + down_read(&c->commit_sem); + lnum = ubifs_garbage_collect(c, 1); + up_read(&c->commit_sem); + if (lnum < 0) + return lnum; + + /* GC freed one LEB, return it to lprops */ + dbg_budg("GC freed LEB %d", lnum); + err = ubifs_return_leb(c, lnum); + if (err) + return err; + return 0; +} + +/** + * make_free_space - make more free space on the file-system. + * @c: UBIFS file-system description object + * @ri: information about previous invocations of this function + * + * This function is called when an operation cannot be budgeted because there + * is supposedly no free space. But in most cases there is some free space: + * o budgeting is pessimistic, so it always budgets more then it is actually + * needed, so shrinking the liability is one way to make free space - the + * cached data will take less space then it was budgeted for; + * o GC may turn some dark space into free space (budgeting treats dark space + * as not available); + * o commit may free some LEB, i.e., turn freeable LEBs into free LEBs. + * + * So this function tries to do the above. Returns %-EAGAIN if some free space + * was presumably made and the caller has to re-try budgeting the operation. + * Returns %-ENOSPC if it couldn't do more free space, and other negative error + * codes on failures. + */ +static int make_free_space(struct ubifs_info *c, struct retries_info *ri) +{ + int err; + + /* + * If we have some dirty pages and inodes (liability), try to write + * them back unless this was tried too many times without effect + * already. + */ + if (ri->shrink_retries < MAX_SHRINK_RETRIES && !ri->try_gc) { + long long liability; + + spin_lock(&c->space_lock); + liability = c->budg_idx_growth + c->budg_data_growth + + c->budg_dd_growth; + spin_unlock(&c->space_lock); + + if (ri->prev_liability >= liability) { + /* Liability does not shrink, next time try GC then */ + ri->shrink_retries += 1; + if (ri->gc_retries < MAX_GC_RETRIES) + ri->try_gc = 1; + dbg_budg("liability did not shrink: retries %d of %d", + ri->shrink_retries, MAX_SHRINK_RETRIES); + } + + dbg_budg("force write-back (count %d)", ri->shrink_cnt); + shrink_liability(c, NR_TO_WRITE + ri->shrink_cnt); + + ri->prev_liability = liability; + ri->shrink_cnt += 1; + return -EAGAIN; + } + + /* + * Try to run garbage collector unless it was already tried too many + * times. + */ + if (ri->gc_retries < MAX_GC_RETRIES) { + ri->gc_retries += 1; + dbg_budg("run GC, retries %d of %d", + ri->gc_retries, MAX_GC_RETRIES); + + ri->try_gc = 0; + err = run_gc(c); + if (!err) + return -EAGAIN; + + if (err == -EAGAIN) { + dbg_budg("GC asked to commit"); + err = ubifs_run_commit(c); + if (err) + return err; + return -EAGAIN; + } + + if (err != -ENOSPC) + return err; + + /* + * GC could not make any progress. If this is the first time, + * then it makes sense to try to commit, because it might make + * some dirty space. + */ + dbg_budg("GC returned -ENOSPC, retries %d", + ri->nospc_retries); + if (ri->nospc_retries >= MAX_NOSPC_RETRIES) + return err; + ri->nospc_retries += 1; + } + + /* Neither GC nor write-back helped, try to commit */ + if (ri->cmt_retries < MAX_CMT_RETRIES) { + ri->cmt_retries += 1; + dbg_budg("run commit, retries %d of %d", + ri->cmt_retries, MAX_CMT_RETRIES); + err = ubifs_run_commit(c); + if (err) + return err; + return -EAGAIN; + } + return -ENOSPC; +} + +/** + * ubifs_calc_min_idx_lebs - calculate amount of eraseblocks for the index. + * @c: UBIFS file-system description object + * + * This function calculates and returns the number of eraseblocks which should + * be kept for index usage. + */ +int ubifs_calc_min_idx_lebs(struct ubifs_info *c) +{ + int ret; + uint64_t idx_size; + + idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; + + /* And make sure we have thrice the index size of space reserved */ + idx_size = idx_size + (idx_size << 1); + + /* + * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' + * pair, nor similarly the two variables for the new index size, so we + * have to do this costly 64-bit division on fast-path. + */ + if (do_div(idx_size, c->leb_size - c->max_idx_node_sz)) + ret = idx_size + 1; + else + ret = idx_size; + /* + * The index head is not available for the in-the-gaps method, so add an + * extra LEB to compensate. + */ + ret += 1; + /* + * At present the index needs at least 2 LEBs: one for the index head + * and one for in-the-gaps method (which currently does not cater for + * the index head and so excludes it from consideration). + */ + if (ret < 2) + ret = 2; + return ret; +} + +/** + * ubifs_calc_available - calculate available FS space. + * @c: UBIFS file-system description object + * @min_idx_lebs: minimum number of LEBs reserved for the index + * + * This function calculates and returns amount of FS space available for use. + */ +long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs) +{ + int subtract_lebs; + long long available; + + available = c->main_bytes - c->lst.total_used; + + /* + * Now 'available' contains theoretically available flash space + * assuming there is no index, so we have to subtract the space which + * is reserved for the index. + */ + subtract_lebs = min_idx_lebs; + + /* Take into account that GC reserves one LEB for its own needs */ + subtract_lebs += 1; + + /* + * The GC journal head LEB is not really accessible. And since + * different write types go to different heads, we may count only on + * one head's space. + */ + subtract_lebs += c->jhead_cnt - 1; + + /* We also reserve one LEB for deletions, which bypass budgeting */ + subtract_lebs += 1; + + available -= (long long)subtract_lebs * c->leb_size; + + /* Subtract the dead space which is not available for use */ + available -= c->lst.total_dead; + + /* + * Subtract dark space, which might or might not be usable - it depends + * on the data which we have on the media and which will be written. If + * this is a lot of uncompressed or not-compressible data, the dark + * space cannot be used. + */ + available -= c->lst.total_dark; + + /* + * However, there is more dark space. The index may be bigger than + * @min_idx_lebs. Those extra LEBs are assumed to be available, but + * their dark space is not included in total_dark, so it is subtracted + * here. + */ + if (c->lst.idx_lebs > min_idx_lebs) { + subtract_lebs = c->lst.idx_lebs - min_idx_lebs; + available -= subtract_lebs * c->dark_wm; + } + + /* The calculations are rough and may end up with a negative number */ + return available > 0 ? available : 0; +} + +/** + * can_use_rp - check whether the user is allowed to use reserved pool. + * @c: UBIFS file-system description object + * + * UBIFS has so-called "reserved pool" which is flash space reserved + * for the superuser and for uses whose UID/GID is recorded in UBIFS superblock. + * This function checks whether current user is allowed to use reserved pool. + * Returns %1 current user is allowed to use reserved pool and %0 otherwise. + */ +static int can_use_rp(struct ubifs_info *c) +{ + if (current->fsuid == c->rp_uid || capable(CAP_SYS_RESOURCE) || + (c->rp_gid != 0 && in_group_p(c->rp_gid))) + return 1; + return 0; +} + +/** + * do_budget_space - reserve flash space for index and data growth. + * @c: UBIFS file-system description object + * + * This function makes sure UBIFS has enough free eraseblocks for index growth + * and data. + * + * When budgeting index space, UBIFS reserves thrice as many LEBs as the index + * would take if it was consolidated and written to the flash. This guarantees + * that the "in-the-gaps" commit method always succeeds and UBIFS will always + * be able to commit dirty index. So this function basically adds amount of + * budgeted index space to the size of the current index, multiplies this by 3, + * and makes sure this does not exceed the amount of free eraseblocks. + * + * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: + * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might + * be large, because UBIFS does not do any index consolidation as long as + * there is free space. IOW, the index may take a lot of LEBs, but the LEBs + * will contain a lot of dirt. + * o @c->min_idx_lebs is the the index presumably takes. IOW, the index may be + * consolidated to take up to @c->min_idx_lebs LEBs. + * + * This function returns zero in case of success, and %-ENOSPC in case of + * failure. + */ +static int do_budget_space(struct ubifs_info *c) +{ + long long outstanding, available; + int lebs, rsvd_idx_lebs, min_idx_lebs; + + /* First budget index space */ + min_idx_lebs = ubifs_calc_min_idx_lebs(c); + + /* Now 'min_idx_lebs' contains number of LEBs to reserve */ + if (min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; + else + rsvd_idx_lebs = 0; + + /* + * The number of LEBs that are available to be used by the index is: + * + * @c->lst.empty_lebs + @c->freeable_cnt + @c->idx_gc_cnt - + * @c->lst.taken_empty_lebs + * + * @empty_lebs are available because they are empty. @freeable_cnt are + * available because they contain only free and dirty space and the + * index allocation always occurs after wbufs are synch'ed. + * @idx_gc_cnt are available because they are index LEBs that have been + * garbage collected (including trivial GC) and are awaiting the commit + * before they can be unmapped - note that the in-the-gaps method will + * grab these if it needs them. @taken_empty_lebs are empty_lebs that + * have already been allocated for some purpose (also includes those + * LEBs on the @idx_gc list). + * + * Note, @taken_empty_lebs may temporarily be higher by one because of + * the way we serialize LEB allocations and budgeting. See a comment in + * 'ubifs_find_free_space()'. + */ + lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - + c->lst.taken_empty_lebs; + if (unlikely(rsvd_idx_lebs > lebs)) { + dbg_budg("out of indexing space: min_idx_lebs %d (old %d), " + "rsvd_idx_lebs %d", min_idx_lebs, c->min_idx_lebs, + rsvd_idx_lebs); + return -ENOSPC; + } + + available = ubifs_calc_available(c, min_idx_lebs); + outstanding = c->budg_data_growth + c->budg_dd_growth; + + if (unlikely(available < outstanding)) { + dbg_budg("out of data space: available %lld, outstanding %lld", + available, outstanding); + return -ENOSPC; + } + + if (available - outstanding <= c->rp_size && !can_use_rp(c)) + return -ENOSPC; + + c->min_idx_lebs = min_idx_lebs; + return 0; +} + +/** + * calc_idx_growth - calculate approximate index growth from budgeting request. + * @c: UBIFS file-system description object + * @req: budgeting request + * + * For now we assume each new node adds one znode. But this is rather poor + * approximation, though. + */ +static int calc_idx_growth(const struct ubifs_info *c, + const struct ubifs_budget_req *req) +{ + int znodes; + + znodes = req->new_ino + (req->new_page << UBIFS_BLOCKS_PER_PAGE_SHIFT) + + req->new_dent; + return znodes * c->max_idx_node_sz; +} + +/** + * calc_data_growth - calculate approximate amount of new data from budgeting + * request. + * @c: UBIFS file-system description object + * @req: budgeting request + */ +static int calc_data_growth(const struct ubifs_info *c, + const struct ubifs_budget_req *req) +{ + int data_growth; + + data_growth = req->new_ino ? c->inode_budget : 0; + if (req->new_page) + data_growth += c->page_budget; + if (req->new_dent) + data_growth += c->dent_budget; + data_growth += req->new_ino_d; + return data_growth; +} + +/** + * calc_dd_growth - calculate approximate amount of data which makes other data + * dirty from budgeting request. + * @c: UBIFS file-system description object + * @req: budgeting request + */ +static int calc_dd_growth(const struct ubifs_info *c, + const struct ubifs_budget_req *req) +{ + int dd_growth; + + dd_growth = req->dirtied_page ? c->page_budget : 0; + + if (req->dirtied_ino) + dd_growth += c->inode_budget << (req->dirtied_ino - 1); + if (req->mod_dent) + dd_growth += c->dent_budget; + dd_growth += req->dirtied_ino_d; + return dd_growth; +} + +/** + * ubifs_budget_space - ensure there is enough space to complete an operation. + * @c: UBIFS file-system description object + * @req: budget request + * + * This function allocates budget for an operation. It uses pessimistic + * approximation of how much flash space the operation needs. The goal of this + * function is to make sure UBIFS always has flash space to flush all dirty + * pages, dirty inodes, and dirty znodes (liability). This function may force + * commit, garbage-collection or write-back. Returns zero in case of success, + * %-ENOSPC if there is no free space and other negative error codes in case of + * failures. + */ +int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req) +{ + int uninitialized_var(cmt_retries), uninitialized_var(wb_retries); + int err, idx_growth, data_growth, dd_growth; + struct retries_info ri; + + ubifs_assert(req->new_page <= 1); + ubifs_assert(req->dirtied_page <= 1); + ubifs_assert(req->new_dent <= 1); + ubifs_assert(req->mod_dent <= 1); + ubifs_assert(req->new_ino <= 1); + ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA); + ubifs_assert(req->dirtied_ino <= 4); + ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); + ubifs_assert(!(req->new_ino_d & 7)); + ubifs_assert(!(req->dirtied_ino_d & 7)); + + data_growth = calc_data_growth(c, req); + dd_growth = calc_dd_growth(c, req); + if (!data_growth && !dd_growth) + return 0; + idx_growth = calc_idx_growth(c, req); + memset(&ri, 0, sizeof(struct retries_info)); + +again: + spin_lock(&c->space_lock); + ubifs_assert(c->budg_idx_growth >= 0); + ubifs_assert(c->budg_data_growth >= 0); + ubifs_assert(c->budg_dd_growth >= 0); + + if (unlikely(c->nospace) && (c->nospace_rp || !can_use_rp(c))) { + dbg_budg("no space"); + spin_unlock(&c->space_lock); + return -ENOSPC; + } + + c->budg_idx_growth += idx_growth; + c->budg_data_growth += data_growth; + c->budg_dd_growth += dd_growth; + + err = do_budget_space(c); + if (likely(!err)) { + req->idx_growth = idx_growth; + req->data_growth = data_growth; + req->dd_growth = dd_growth; + spin_unlock(&c->space_lock); + return 0; + } + + /* Restore the old values */ + c->budg_idx_growth -= idx_growth; + c->budg_data_growth -= data_growth; + c->budg_dd_growth -= dd_growth; + spin_unlock(&c->space_lock); + + if (req->fast) { + dbg_budg("no space for fast budgeting"); + return err; + } + + err = make_free_space(c, &ri); + if (err == -EAGAIN) { + dbg_budg("try again"); + cond_resched(); + goto again; + } else if (err == -ENOSPC) { + dbg_budg("FS is full, -ENOSPC"); + c->nospace = 1; + if (can_use_rp(c) || c->rp_size == 0) + c->nospace_rp = 1; + smp_wmb(); + } else + ubifs_err("cannot budget space, error %d", err); + return err; +} + +/** + * ubifs_release_budget - release budgeted free space. + * @c: UBIFS file-system description object + * @req: budget request + * + * This function releases the space budgeted by 'ubifs_budget_space()'. Note, + * since the index changes (which were budgeted for in @req->idx_growth) will + * only be written to the media on commit, this function moves the index budget + * from @c->budg_idx_growth to @c->budg_uncommitted_idx. The latter will be + * zeroed by the commit operation. + */ +void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) +{ + ubifs_assert(req->new_page <= 1); + ubifs_assert(req->dirtied_page <= 1); + ubifs_assert(req->new_dent <= 1); + ubifs_assert(req->mod_dent <= 1); + ubifs_assert(req->new_ino <= 1); + ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA); + ubifs_assert(req->dirtied_ino <= 4); + ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); + ubifs_assert(!(req->new_ino_d & 7)); + ubifs_assert(!(req->dirtied_ino_d & 7)); + if (!req->recalculate) { + ubifs_assert(req->idx_growth >= 0); + ubifs_assert(req->data_growth >= 0); + ubifs_assert(req->dd_growth >= 0); + } + + if (req->recalculate) { + req->data_growth = calc_data_growth(c, req); + req->dd_growth = calc_dd_growth(c, req); + req->idx_growth = calc_idx_growth(c, req); + } + + if (!req->data_growth && !req->dd_growth) + return; + + c->nospace = c->nospace_rp = 0; + smp_wmb(); + + spin_lock(&c->space_lock); + c->budg_idx_growth -= req->idx_growth; + c->budg_uncommitted_idx += req->idx_growth; + c->budg_data_growth -= req->data_growth; + c->budg_dd_growth -= req->dd_growth; + c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); + + ubifs_assert(c->budg_idx_growth >= 0); + ubifs_assert(c->budg_data_growth >= 0); + ubifs_assert(c->budg_dd_growth >= 0); + ubifs_assert(c->min_idx_lebs < c->main_lebs); + ubifs_assert(!(c->budg_idx_growth & 7)); + ubifs_assert(!(c->budg_data_growth & 7)); + ubifs_assert(!(c->budg_dd_growth & 7)); + spin_unlock(&c->space_lock); +} + +/** + * ubifs_convert_page_budget - convert budget of a new page. + * @c: UBIFS file-system description object + * + * This function converts budget which was allocated for a new page of data to + * the budget of changing an existing page of data. The latter is smaller then + * the former, so this function only does simple re-calculation and does not + * involve any write-back. + */ +void ubifs_convert_page_budget(struct ubifs_info *c) +{ + spin_lock(&c->space_lock); + /* Release the index growth reservation */ + c->budg_idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT; + /* Release the data growth reservation */ + c->budg_data_growth -= c->page_budget; + /* Increase the dirty data growth reservation instead */ + c->budg_dd_growth += c->page_budget; + /* And re-calculate the indexing space reservation */ + c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); + spin_unlock(&c->space_lock); +} + +/** + * ubifs_release_dirty_inode_budget - release dirty inode budget. + * @c: UBIFS file-system description object + * @ui: UBIFS inode to release the budget for + * + * This function releases budget corresponding to a dirty inode. It is usually + * called when after the inode has been written to the media and marked as + * clean. + */ +void ubifs_release_dirty_inode_budget(struct ubifs_info *c, + struct ubifs_inode *ui) +{ + struct ubifs_budget_req req; + + memset(&req, 0, sizeof(struct ubifs_budget_req)); + req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8); + ubifs_release_budget(c, &req); +} + +/** + * ubifs_reported_space - calculate reported free space. + * @c: the UBIFS file-system description object + * @free: amount of free space + * + * This function calculates amount of free space which will be reported to + * user-space. User-space application tend to expect that if the file-system + * (e.g., via the 'statfs()' call) reports that it has N bytes available, they + * are able to write a file of size N. UBIFS attaches node headers to each data + * node and it has to write indexind nodes as well. This introduces additional + * overhead, and UBIFS it has to report sligtly less free space to meet the + * above expectetion. + * + * This function assumes free space is made up of uncompressed data nodes and + * full index nodes (one per data node, tripled because we always allow enough + * space to write the index thrice). + * + * Note, the calculation is pessimistic, which means that most of the time + * UBIFS reports less space than it actually has. + */ +long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free) +{ + int divisor, factor, f; + + /* + * Reported space size is @free * X, where X is UBIFS block size + * divided by UBIFS block size + all overhead one data block + * introduces. The overhead is the node header + indexing overhead. + * + * Indexing overhead calculations are based on the following formula: + * I = N/(f - 1) + 1, where I - number of indexing nodes, N - number + * of data nodes, f - fanout. Because effective UBIFS fanout is twice + * as less than maximum fanout, we assume that each data node + * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes. + * Note, the multiplier 3 is because UBIFS reseves thrice as more space + * for the index. + */ + f = c->fanout > 3 ? c->fanout >> 1 : 2; + factor = UBIFS_BLOCK_SIZE; + divisor = UBIFS_MAX_DATA_NODE_SZ; + divisor += (c->max_idx_node_sz * 3) / (f - 1); + free *= factor; + do_div(free, divisor); + return free; +} + +/** + * ubifs_get_free_space - return amount of free space. + * @c: UBIFS file-system description object + * + * This function calculates amount of free space to report to user-space. + * + * Because UBIFS may introduce substantial overhead (the index, node headers, + * alighment, wastage at the end of eraseblocks, etc), it cannot report real + * amount of free flash space it has (well, because not all dirty space is + * reclamable, UBIFS does not actually know the real amount). If UBIFS did so, + * it would bread user expectetion about what free space is. Users seem to + * accustomed to assume that if the file-system reports N bytes of free space, + * they would be able to fit a file of N bytes to the FS. This almost works for + * traditional file-systems, because they have way less overhead than UBIFS. + * So, to keep users happy, UBIFS tries to take the overhead into account. + */ +long long ubifs_get_free_space(struct ubifs_info *c) +{ + int min_idx_lebs, rsvd_idx_lebs, lebs; + long long available, outstanding, free; + + spin_lock(&c->space_lock); + min_idx_lebs = ubifs_calc_min_idx_lebs(c); + outstanding = c->budg_data_growth + c->budg_dd_growth; + + /* + * Force the amount available to the total size reported if the used + * space is zero. + */ + if (c->lst.total_used <= UBIFS_INO_NODE_SZ && !outstanding) { + spin_unlock(&c->space_lock); + return (long long)c->block_cnt << UBIFS_BLOCK_SHIFT; + } + + available = ubifs_calc_available(c, min_idx_lebs); + + /* + * When reporting free space to user-space, UBIFS guarantees that it is + * possible to write a file of free space size. This means that for + * empty LEBs we may use more precise calculations than + * 'ubifs_calc_available()' is using. Namely, we know that in empty + * LEBs we would waste only @c->leb_overhead bytes, not @c->dark_wm. + * Thus, amend the available space. + * + * Note, the calculations below are similar to what we have in + * 'do_budget_space()', so refer there for comments. + */ + if (min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; + else + rsvd_idx_lebs = 0; + lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - + c->lst.taken_empty_lebs; + lebs -= rsvd_idx_lebs; + available += lebs * (c->dark_wm - c->leb_overhead); + spin_unlock(&c->space_lock); + + if (available > outstanding) + free = ubifs_reported_space(c, available - outstanding); + else + free = 0; + return free; +} diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c new file mode 100644 index 000000000000..0a6aa2cc78f0 --- /dev/null +++ b/fs/ubifs/commit.c @@ -0,0 +1,678 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements functions that manage the running of the commit process. + * Each affected module has its own functions to accomplish their part in the + * commit and those functions are called here. + * + * The commit is the process whereby all updates to the index and LEB properties + * are written out together and the journal becomes empty. This keeps the + * file system consistent - at all times the state can be recreated by reading + * the index and LEB properties and then replaying the journal. + * + * The commit is split into two parts named "commit start" and "commit end". + * During commit start, the commit process has exclusive access to the journal + * by holding the commit semaphore down for writing. As few I/O operations as + * possible are performed during commit start, instead the nodes that are to be + * written are merely identified. During commit end, the commit semaphore is no + * longer held and the journal is again in operation, allowing users to continue + * to use the file system while the bulk of the commit I/O is performed. The + * purpose of this two-step approach is to prevent the commit from causing any + * latency blips. Note that in any case, the commit does not prevent lookups + * (as permitted by the TNC mutex), or access to VFS data structures e.g. page + * cache. + */ + +#include <linux/freezer.h> +#include <linux/kthread.h> +#include "ubifs.h" + +/** + * do_commit - commit the journal. + * @c: UBIFS file-system description object + * + * This function implements UBIFS commit. It has to be called with commit lock + * locked. Returns zero in case of success and a negative error code in case of + * failure. + */ +static int do_commit(struct ubifs_info *c) +{ + int err, new_ltail_lnum, old_ltail_lnum, i; + struct ubifs_zbranch zroot; + struct ubifs_lp_stats lst; + + dbg_cmt("start"); + if (c->ro_media) { + err = -EROFS; + goto out_up; + } + + /* Sync all write buffers (necessary for recovery) */ + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + goto out_up; + } + + c->cmt_no += 1; + err = ubifs_gc_start_commit(c); + if (err) + goto out_up; + err = dbg_check_lprops(c); + if (err) + goto out_up; + err = ubifs_log_start_commit(c, &new_ltail_lnum); + if (err) + goto out_up; + err = ubifs_tnc_start_commit(c, &zroot); + if (err) + goto out_up; + err = ubifs_lpt_start_commit(c); + if (err) + goto out_up; + err = ubifs_orphan_start_commit(c); + if (err) + goto out_up; + + ubifs_get_lp_stats(c, &lst); + + up_write(&c->commit_sem); + + err = ubifs_tnc_end_commit(c); + if (err) + goto out; + err = ubifs_lpt_end_commit(c); + if (err) + goto out; + err = ubifs_orphan_end_commit(c); + if (err) + goto out; + old_ltail_lnum = c->ltail_lnum; + err = ubifs_log_end_commit(c, new_ltail_lnum); + if (err) + goto out; + err = dbg_check_old_index(c, &zroot); + if (err) + goto out; + + mutex_lock(&c->mst_mutex); + c->mst_node->cmt_no = cpu_to_le64(c->cmt_no); + c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum); + c->mst_node->root_lnum = cpu_to_le32(zroot.lnum); + c->mst_node->root_offs = cpu_to_le32(zroot.offs); + c->mst_node->root_len = cpu_to_le32(zroot.len); + c->mst_node->ihead_lnum = cpu_to_le32(c->ihead_lnum); + c->mst_node->ihead_offs = cpu_to_le32(c->ihead_offs); + c->mst_node->index_size = cpu_to_le64(c->old_idx_sz); + c->mst_node->lpt_lnum = cpu_to_le32(c->lpt_lnum); + c->mst_node->lpt_offs = cpu_to_le32(c->lpt_offs); + c->mst_node->nhead_lnum = cpu_to_le32(c->nhead_lnum); + c->mst_node->nhead_offs = cpu_to_le32(c->nhead_offs); + c->mst_node->ltab_lnum = cpu_to_le32(c->ltab_lnum); + c->mst_node->ltab_offs = cpu_to_le32(c->ltab_offs); + c->mst_node->lsave_lnum = cpu_to_le32(c->lsave_lnum); + c->mst_node->lsave_offs = cpu_to_le32(c->lsave_offs); + c->mst_node->lscan_lnum = cpu_to_le32(c->lscan_lnum); + c->mst_node->empty_lebs = cpu_to_le32(lst.empty_lebs); + c->mst_node->idx_lebs = cpu_to_le32(lst.idx_lebs); + c->mst_node->total_free = cpu_to_le64(lst.total_free); + c->mst_node->total_dirty = cpu_to_le64(lst.total_dirty); + c->mst_node->total_used = cpu_to_le64(lst.total_used); + c->mst_node->total_dead = cpu_to_le64(lst.total_dead); + c->mst_node->total_dark = cpu_to_le64(lst.total_dark); + if (c->no_orphs) + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); + else + c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_NO_ORPHS); + err = ubifs_write_master(c); + mutex_unlock(&c->mst_mutex); + if (err) + goto out; + + err = ubifs_log_post_commit(c, old_ltail_lnum); + if (err) + goto out; + err = ubifs_gc_end_commit(c); + if (err) + goto out; + err = ubifs_lpt_post_commit(c); + if (err) + goto out; + + spin_lock(&c->cs_lock); + c->cmt_state = COMMIT_RESTING; + wake_up(&c->cmt_wq); + dbg_cmt("commit end"); + spin_unlock(&c->cs_lock); + + return 0; + +out_up: + up_write(&c->commit_sem); +out: + ubifs_err("commit failed, error %d", err); + spin_lock(&c->cs_lock); + c->cmt_state = COMMIT_BROKEN; + wake_up(&c->cmt_wq); + spin_unlock(&c->cs_lock); + ubifs_ro_mode(c, err); + return err; +} + +/** + * run_bg_commit - run background commit if it is needed. + * @c: UBIFS file-system description object + * + * This function runs background commit if it is needed. Returns zero in case + * of success and a negative error code in case of failure. + */ +static int run_bg_commit(struct ubifs_info *c) +{ + spin_lock(&c->cs_lock); + /* + * Run background commit only if background commit was requested or if + * commit is required. + */ + if (c->cmt_state != COMMIT_BACKGROUND && + c->cmt_state != COMMIT_REQUIRED) + goto out; + spin_unlock(&c->cs_lock); + + down_write(&c->commit_sem); + spin_lock(&c->cs_lock); + if (c->cmt_state == COMMIT_REQUIRED) + c->cmt_state = COMMIT_RUNNING_REQUIRED; + else if (c->cmt_state == COMMIT_BACKGROUND) + c->cmt_state = COMMIT_RUNNING_BACKGROUND; + else + goto out_cmt_unlock; + spin_unlock(&c->cs_lock); + + return do_commit(c); + +out_cmt_unlock: + up_write(&c->commit_sem); +out: + spin_unlock(&c->cs_lock); + return 0; +} + +/** + * ubifs_bg_thread - UBIFS background thread function. + * @info: points to the file-system description object + * + * This function implements various file-system background activities: + * o when a write-buffer timer expires it synchronizes the appropriate + * write-buffer; + * o when the journal is about to be full, it starts in-advance commit. + * + * Note, other stuff like background garbage collection may be added here in + * future. + */ +int ubifs_bg_thread(void *info) +{ + int err; + struct ubifs_info *c = info; + + ubifs_msg("background thread \"%s\" started, PID %d", + c->bgt_name, current->pid); + set_freezable(); + + while (1) { + if (kthread_should_stop()) + break; + + if (try_to_freeze()) + continue; + + set_current_state(TASK_INTERRUPTIBLE); + /* Check if there is something to do */ + if (!c->need_bgt) { + /* + * Nothing prevents us from going sleep now and + * be never woken up and block the task which + * could wait in 'kthread_stop()' forever. + */ + if (kthread_should_stop()) + break; + schedule(); + continue; + } else + __set_current_state(TASK_RUNNING); + + c->need_bgt = 0; + err = ubifs_bg_wbufs_sync(c); + if (err) + ubifs_ro_mode(c, err); + + run_bg_commit(c); + cond_resched(); + } + + dbg_msg("background thread \"%s\" stops", c->bgt_name); + return 0; +} + +/** + * ubifs_commit_required - set commit state to "required". + * @c: UBIFS file-system description object + * + * This function is called if a commit is required but cannot be done from the + * calling function, so it is just flagged instead. + */ +void ubifs_commit_required(struct ubifs_info *c) +{ + spin_lock(&c->cs_lock); + switch (c->cmt_state) { + case COMMIT_RESTING: + case COMMIT_BACKGROUND: + dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state), + dbg_cstate(COMMIT_REQUIRED)); + c->cmt_state = COMMIT_REQUIRED; + break; + case COMMIT_RUNNING_BACKGROUND: + dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state), + dbg_cstate(COMMIT_RUNNING_REQUIRED)); + c->cmt_state = COMMIT_RUNNING_REQUIRED; + break; + case COMMIT_REQUIRED: + case COMMIT_RUNNING_REQUIRED: + case COMMIT_BROKEN: + break; + } + spin_unlock(&c->cs_lock); +} + +/** + * ubifs_request_bg_commit - notify the background thread to do a commit. + * @c: UBIFS file-system description object + * + * This function is called if the journal is full enough to make a commit + * worthwhile, so background thread is kicked to start it. + */ +void ubifs_request_bg_commit(struct ubifs_info *c) +{ + spin_lock(&c->cs_lock); + if (c->cmt_state == COMMIT_RESTING) { + dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state), + dbg_cstate(COMMIT_BACKGROUND)); + c->cmt_state = COMMIT_BACKGROUND; + spin_unlock(&c->cs_lock); + ubifs_wake_up_bgt(c); + } else + spin_unlock(&c->cs_lock); +} + +/** + * wait_for_commit - wait for commit. + * @c: UBIFS file-system description object + * + * This function sleeps until the commit operation is no longer running. + */ +static int wait_for_commit(struct ubifs_info *c) +{ + dbg_cmt("pid %d goes sleep", current->pid); + + /* + * The following sleeps if the condition is false, and will be woken + * when the commit ends. It is possible, although very unlikely, that we + * will wake up and see the subsequent commit running, rather than the + * one we were waiting for, and go back to sleep. However, we will be + * woken again, so there is no danger of sleeping forever. + */ + wait_event(c->cmt_wq, c->cmt_state != COMMIT_RUNNING_BACKGROUND && + c->cmt_state != COMMIT_RUNNING_REQUIRED); + dbg_cmt("commit finished, pid %d woke up", current->pid); + return 0; +} + +/** + * ubifs_run_commit - run or wait for commit. + * @c: UBIFS file-system description object + * + * This function runs commit and returns zero in case of success and a negative + * error code in case of failure. + */ +int ubifs_run_commit(struct ubifs_info *c) +{ + int err = 0; + + spin_lock(&c->cs_lock); + if (c->cmt_state == COMMIT_BROKEN) { + err = -EINVAL; + goto out; + } + + if (c->cmt_state == COMMIT_RUNNING_BACKGROUND) + /* + * We set the commit state to 'running required' to indicate + * that we want it to complete as quickly as possible. + */ + c->cmt_state = COMMIT_RUNNING_REQUIRED; + + if (c->cmt_state == COMMIT_RUNNING_REQUIRED) { + spin_unlock(&c->cs_lock); + return wait_for_commit(c); + } + spin_unlock(&c->cs_lock); + + /* Ok, the commit is indeed needed */ + + down_write(&c->commit_sem); + spin_lock(&c->cs_lock); + /* + * Since we unlocked 'c->cs_lock', the state may have changed, so + * re-check it. + */ + if (c->cmt_state == COMMIT_BROKEN) { + err = -EINVAL; + goto out_cmt_unlock; + } + + if (c->cmt_state == COMMIT_RUNNING_BACKGROUND) + c->cmt_state = COMMIT_RUNNING_REQUIRED; + + if (c->cmt_state == COMMIT_RUNNING_REQUIRED) { + up_write(&c->commit_sem); + spin_unlock(&c->cs_lock); + return wait_for_commit(c); + } + c->cmt_state = COMMIT_RUNNING_REQUIRED; + spin_unlock(&c->cs_lock); + + err = do_commit(c); + return err; + +out_cmt_unlock: + up_write(&c->commit_sem); +out: + spin_unlock(&c->cs_lock); + return err; +} + +/** + * ubifs_gc_should_commit - determine if it is time for GC to run commit. + * @c: UBIFS file-system description object + * + * This function is called by garbage collection to determine if commit should + * be run. If commit state is @COMMIT_BACKGROUND, which means that the journal + * is full enough to start commit, this function returns true. It is not + * absolutely necessary to commit yet, but it feels like this should be better + * then to keep doing GC. This function returns %1 if GC has to initiate commit + * and %0 if not. + */ +int ubifs_gc_should_commit(struct ubifs_info *c) +{ + int ret = 0; + + spin_lock(&c->cs_lock); + if (c->cmt_state == COMMIT_BACKGROUND) { + dbg_cmt("commit required now"); + c->cmt_state = COMMIT_REQUIRED; + } else + dbg_cmt("commit not requested"); + if (c->cmt_state == COMMIT_REQUIRED) + ret = 1; + spin_unlock(&c->cs_lock); + return ret; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * struct idx_node - hold index nodes during index tree traversal. + * @list: list + * @iip: index in parent (slot number of this indexing node in the parent + * indexing node) + * @upper_key: all keys in this indexing node have to be less or equivalent to + * this key + * @idx: index node (8-byte aligned because all node structures must be 8-byte + * aligned) + */ +struct idx_node { + struct list_head list; + int iip; + union ubifs_key upper_key; + struct ubifs_idx_node idx __attribute__((aligned(8))); +}; + +/** + * dbg_old_index_check_init - get information for the next old index check. + * @c: UBIFS file-system description object + * @zroot: root of the index + * + * This function records information about the index that will be needed for the + * next old index check i.e. 'dbg_check_old_index()'. + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot) +{ + struct ubifs_idx_node *idx; + int lnum, offs, len, err = 0; + + c->old_zroot = *zroot; + + lnum = c->old_zroot.lnum; + offs = c->old_zroot.offs; + len = c->old_zroot.len; + + idx = kmalloc(c->max_idx_node_sz, GFP_NOFS); + if (!idx) + return -ENOMEM; + + err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs); + if (err) + goto out; + + c->old_zroot_level = le16_to_cpu(idx->level); + c->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum); +out: + kfree(idx); + return err; +} + +/** + * dbg_check_old_index - check the old copy of the index. + * @c: UBIFS file-system description object + * @zroot: root of the new index + * + * In order to be able to recover from an unclean unmount, a complete copy of + * the index must exist on flash. This is the "old" index. The commit process + * must write the "new" index to flash without overwriting or destroying any + * part of the old index. This function is run at commit end in order to check + * that the old index does indeed exist completely intact. + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot) +{ + int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt; + int first = 1, iip; + union ubifs_key lower_key, upper_key, l_key, u_key; + unsigned long long uninitialized_var(last_sqnum); + struct ubifs_idx_node *idx; + struct list_head list; + struct idx_node *i; + size_t sz; + + if (!(ubifs_chk_flags & UBIFS_CHK_OLD_IDX)) + goto out; + + INIT_LIST_HEAD(&list); + + sz = sizeof(struct idx_node) + ubifs_idx_node_sz(c, c->fanout) - + UBIFS_IDX_NODE_SZ; + + /* Start at the old zroot */ + lnum = c->old_zroot.lnum; + offs = c->old_zroot.offs; + len = c->old_zroot.len; + iip = 0; + + /* + * Traverse the index tree preorder depth-first i.e. do a node and then + * its subtrees from left to right. + */ + while (1) { + struct ubifs_branch *br; + + /* Get the next index node */ + i = kmalloc(sz, GFP_NOFS); + if (!i) { + err = -ENOMEM; + goto out_free; + } + i->iip = iip; + /* Keep the index nodes on our path in a linked list */ + list_add_tail(&i->list, &list); + /* Read the index node */ + idx = &i->idx; + err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs); + if (err) + goto out_free; + /* Validate index node */ + child_cnt = le16_to_cpu(idx->child_cnt); + if (child_cnt < 1 || child_cnt > c->fanout) { + err = 1; + goto out_dump; + } + if (first) { + first = 0; + /* Check root level and sqnum */ + if (le16_to_cpu(idx->level) != c->old_zroot_level) { + err = 2; + goto out_dump; + } + if (le64_to_cpu(idx->ch.sqnum) != c->old_zroot_sqnum) { + err = 3; + goto out_dump; + } + /* Set last values as though root had a parent */ + last_level = le16_to_cpu(idx->level) + 1; + last_sqnum = le64_to_cpu(idx->ch.sqnum) + 1; + key_read(c, ubifs_idx_key(c, idx), &lower_key); + highest_ino_key(c, &upper_key, INUM_WATERMARK); + } + key_copy(c, &upper_key, &i->upper_key); + if (le16_to_cpu(idx->level) != last_level - 1) { + err = 3; + goto out_dump; + } + /* + * The index is always written bottom up hence a child's sqnum + * is always less than the parents. + */ + if (le64_to_cpu(idx->ch.sqnum) >= last_sqnum) { + err = 4; + goto out_dump; + } + /* Check key range */ + key_read(c, ubifs_idx_key(c, idx), &l_key); + br = ubifs_idx_branch(c, idx, child_cnt - 1); + key_read(c, &br->key, &u_key); + if (keys_cmp(c, &lower_key, &l_key) > 0) { + err = 5; + goto out_dump; + } + if (keys_cmp(c, &upper_key, &u_key) < 0) { + err = 6; + goto out_dump; + } + if (keys_cmp(c, &upper_key, &u_key) == 0) + if (!is_hash_key(c, &u_key)) { + err = 7; + goto out_dump; + } + /* Go to next index node */ + if (le16_to_cpu(idx->level) == 0) { + /* At the bottom, so go up until can go right */ + while (1) { + /* Drop the bottom of the list */ + list_del(&i->list); + kfree(i); + /* No more list means we are done */ + if (list_empty(&list)) + goto out; + /* Look at the new bottom */ + i = list_entry(list.prev, struct idx_node, + list); + idx = &i->idx; + /* Can we go right */ + if (iip + 1 < le16_to_cpu(idx->child_cnt)) { + iip = iip + 1; + break; + } else + /* Nope, so go up again */ + iip = i->iip; + } + } else + /* Go down left */ + iip = 0; + /* + * We have the parent in 'idx' and now we set up for reading the + * child pointed to by slot 'iip'. + */ + last_level = le16_to_cpu(idx->level); + last_sqnum = le64_to_cpu(idx->ch.sqnum); + br = ubifs_idx_branch(c, idx, iip); + lnum = le32_to_cpu(br->lnum); + offs = le32_to_cpu(br->offs); + len = le32_to_cpu(br->len); + key_read(c, &br->key, &lower_key); + if (iip + 1 < le16_to_cpu(idx->child_cnt)) { + br = ubifs_idx_branch(c, idx, iip + 1); + key_read(c, &br->key, &upper_key); + } else + key_copy(c, &i->upper_key, &upper_key); + } +out: + err = dbg_old_index_check_init(c, zroot); + if (err) + goto out_free; + + return 0; + +out_dump: + dbg_err("dumping index node (iip=%d)", i->iip); + dbg_dump_node(c, idx); + list_del(&i->list); + kfree(i); + if (!list_empty(&list)) { + i = list_entry(list.prev, struct idx_node, list); + dbg_err("dumping parent index node"); + dbg_dump_node(c, &i->idx); + } +out_free: + while (!list_empty(&list)) { + i = list_entry(list.next, struct idx_node, list); + list_del(&i->list); + kfree(i); + } + ubifs_err("failed, error %d", err); + if (err > 0) + err = -EINVAL; + return err; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c new file mode 100644 index 000000000000..5bb51dac3c16 --- /dev/null +++ b/fs/ubifs/compress.c @@ -0,0 +1,253 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * Copyright (C) 2006, 2007 University of Szeged, Hungary + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + * Zoltan Sogor + */ + +/* + * This file provides a single place to access to compression and + * decompression. + */ + +#include <linux/crypto.h> +#include "ubifs.h" + +/* Fake description object for the "none" compressor */ +static struct ubifs_compressor none_compr = { + .compr_type = UBIFS_COMPR_NONE, + .name = "no compression", + .capi_name = "", +}; + +#ifdef CONFIG_UBIFS_FS_LZO +static DEFINE_MUTEX(lzo_mutex); + +static struct ubifs_compressor lzo_compr = { + .compr_type = UBIFS_COMPR_LZO, + .comp_mutex = &lzo_mutex, + .name = "LZO", + .capi_name = "lzo", +}; +#else +static struct ubifs_compressor lzo_compr = { + .compr_type = UBIFS_COMPR_LZO, + .name = "LZO", +}; +#endif + +#ifdef CONFIG_UBIFS_FS_ZLIB +static DEFINE_MUTEX(deflate_mutex); +static DEFINE_MUTEX(inflate_mutex); + +static struct ubifs_compressor zlib_compr = { + .compr_type = UBIFS_COMPR_ZLIB, + .comp_mutex = &deflate_mutex, + .decomp_mutex = &inflate_mutex, + .name = "zlib", + .capi_name = "deflate", +}; +#else +static struct ubifs_compressor zlib_compr = { + .compr_type = UBIFS_COMPR_ZLIB, + .name = "zlib", +}; +#endif + +/* All UBIFS compressors */ +struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; + +/** + * ubifs_compress - compress data. + * @in_buf: data to compress + * @in_len: length of the data to compress + * @out_buf: output buffer where compressed data should be stored + * @out_len: output buffer length is returned here + * @compr_type: type of compression to use on enter, actually used compression + * type on exit + * + * This function compresses input buffer @in_buf of length @in_len and stores + * the result in the output buffer @out_buf and the resulting length in + * @out_len. If the input buffer does not compress, it is just copied to the + * @out_buf. The same happens if @compr_type is %UBIFS_COMPR_NONE or if + * compression error occurred. + * + * Note, if the input buffer was not compressed, it is copied to the output + * buffer and %UBIFS_COMPR_NONE is returned in @compr_type. + * + * This functions returns %0 on success or a negative error code on failure. + */ +void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, + int *compr_type) +{ + int err; + struct ubifs_compressor *compr = ubifs_compressors[*compr_type]; + + if (*compr_type == UBIFS_COMPR_NONE) + goto no_compr; + + /* If the input data is small, do not even try to compress it */ + if (in_len < UBIFS_MIN_COMPR_LEN) + goto no_compr; + + if (compr->comp_mutex) + mutex_lock(compr->comp_mutex); + err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf, + out_len); + if (compr->comp_mutex) + mutex_unlock(compr->comp_mutex); + if (unlikely(err)) { + ubifs_warn("cannot compress %d bytes, compressor %s, " + "error %d, leave data uncompressed", + in_len, compr->name, err); + goto no_compr; + } + + /* + * Presently, we just require that compression results in less data, + * rather than any defined minimum compression ratio or amount. + */ + if (ALIGN(*out_len, 8) >= ALIGN(in_len, 8)) + goto no_compr; + + return; + +no_compr: + memcpy(out_buf, in_buf, in_len); + *out_len = in_len; + *compr_type = UBIFS_COMPR_NONE; +} + +/** + * ubifs_decompress - decompress data. + * @in_buf: data to decompress + * @in_len: length of the data to decompress + * @out_buf: output buffer where decompressed data should + * @out_len: output length is returned here + * @compr_type: type of compression + * + * This function decompresses data from buffer @in_buf into buffer @out_buf. + * The length of the uncompressed data is returned in @out_len. This functions + * returns %0 on success or a negative error code on failure. + */ +int ubifs_decompress(const void *in_buf, int in_len, void *out_buf, + int *out_len, int compr_type) +{ + int err; + struct ubifs_compressor *compr; + + if (unlikely(compr_type < 0 || compr_type >= UBIFS_COMPR_TYPES_CNT)) { + ubifs_err("invalid compression type %d", compr_type); + return -EINVAL; + } + + compr = ubifs_compressors[compr_type]; + + if (unlikely(!compr->capi_name)) { + ubifs_err("%s compression is not compiled in", compr->name); + return -EINVAL; + } + + if (compr_type == UBIFS_COMPR_NONE) { + memcpy(out_buf, in_buf, in_len); + *out_len = in_len; + return 0; + } + + if (compr->decomp_mutex) + mutex_lock(compr->decomp_mutex); + err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf, + out_len); + if (compr->decomp_mutex) + mutex_unlock(compr->decomp_mutex); + if (err) + ubifs_err("cannot decompress %d bytes, compressor %s, " + "error %d", in_len, compr->name, err); + + return err; +} + +/** + * compr_init - initialize a compressor. + * @compr: compressor description object + * + * This function initializes the requested compressor and returns zero in case + * of success or a negative error code in case of failure. + */ +static int __init compr_init(struct ubifs_compressor *compr) +{ + if (compr->capi_name) { + compr->cc = crypto_alloc_comp(compr->capi_name, 0, 0); + if (IS_ERR(compr->cc)) { + ubifs_err("cannot initialize compressor %s, error %ld", + compr->name, PTR_ERR(compr->cc)); + return PTR_ERR(compr->cc); + } + } + + ubifs_compressors[compr->compr_type] = compr; + return 0; +} + +/** + * compr_exit - de-initialize a compressor. + * @compr: compressor description object + */ +static void compr_exit(struct ubifs_compressor *compr) +{ + if (compr->capi_name) + crypto_free_comp(compr->cc); + return; +} + +/** + * ubifs_compressors_init - initialize UBIFS compressors. + * + * This function initializes the compressor which were compiled in. Returns + * zero in case of success and a negative error code in case of failure. + */ +int __init ubifs_compressors_init(void) +{ + int err; + + err = compr_init(&lzo_compr); + if (err) + return err; + + err = compr_init(&zlib_compr); + if (err) + goto out_lzo; + + ubifs_compressors[UBIFS_COMPR_NONE] = &none_compr; + return 0; + +out_lzo: + compr_exit(&lzo_compr); + return err; +} + +/** + * ubifs_compressors_exit - de-initialize UBIFS compressors. + */ +void __exit ubifs_compressors_exit(void) +{ + compr_exit(&lzo_compr); + compr_exit(&zlib_compr); +} diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c new file mode 100644 index 000000000000..d7f7645779f2 --- /dev/null +++ b/fs/ubifs/debug.c @@ -0,0 +1,2290 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements most of the debugging stuff which is compiled in only + * when it is enabled. But some debugging check functions are implemented in + * corresponding subsystem, just because they are closely related and utilize + * various local functions of those subsystems. + */ + +#define UBIFS_DBG_PRESERVE_UBI + +#include "ubifs.h" +#include <linux/module.h> +#include <linux/moduleparam.h> + +#ifdef CONFIG_UBIFS_FS_DEBUG + +DEFINE_SPINLOCK(dbg_lock); + +static char dbg_key_buf0[128]; +static char dbg_key_buf1[128]; + +unsigned int ubifs_msg_flags = UBIFS_MSG_FLAGS_DEFAULT; +unsigned int ubifs_chk_flags = UBIFS_CHK_FLAGS_DEFAULT; +unsigned int ubifs_tst_flags; + +module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR); +module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR); +module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR); + +MODULE_PARM_DESC(debug_msgs, "Debug message type flags"); +MODULE_PARM_DESC(debug_chks, "Debug check flags"); +MODULE_PARM_DESC(debug_tsts, "Debug special test flags"); + +static const char *get_key_fmt(int fmt) +{ + switch (fmt) { + case UBIFS_SIMPLE_KEY_FMT: + return "simple"; + default: + return "unknown/invalid format"; + } +} + +static const char *get_key_hash(int hash) +{ + switch (hash) { + case UBIFS_KEY_HASH_R5: + return "R5"; + case UBIFS_KEY_HASH_TEST: + return "test"; + default: + return "unknown/invalid name hash"; + } +} + +static const char *get_key_type(int type) +{ + switch (type) { + case UBIFS_INO_KEY: + return "inode"; + case UBIFS_DENT_KEY: + return "direntry"; + case UBIFS_XENT_KEY: + return "xentry"; + case UBIFS_DATA_KEY: + return "data"; + case UBIFS_TRUN_KEY: + return "truncate"; + default: + return "unknown/invalid key"; + } +} + +static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key, + char *buffer) +{ + char *p = buffer; + int type = key_type(c, key); + + if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) { + switch (type) { + case UBIFS_INO_KEY: + sprintf(p, "(%lu, %s)", key_inum(c, key), + get_key_type(type)); + break; + case UBIFS_DENT_KEY: + case UBIFS_XENT_KEY: + sprintf(p, "(%lu, %s, %#08x)", key_inum(c, key), + get_key_type(type), key_hash(c, key)); + break; + case UBIFS_DATA_KEY: + sprintf(p, "(%lu, %s, %u)", key_inum(c, key), + get_key_type(type), key_block(c, key)); + break; + case UBIFS_TRUN_KEY: + sprintf(p, "(%lu, %s)", + key_inum(c, key), get_key_type(type)); + break; + default: + sprintf(p, "(bad key type: %#08x, %#08x)", + key->u32[0], key->u32[1]); + } + } else + sprintf(p, "bad key format %d", c->key_fmt); +} + +const char *dbg_key_str0(const struct ubifs_info *c, const union ubifs_key *key) +{ + /* dbg_lock must be held */ + sprintf_key(c, key, dbg_key_buf0); + return dbg_key_buf0; +} + +const char *dbg_key_str1(const struct ubifs_info *c, const union ubifs_key *key) +{ + /* dbg_lock must be held */ + sprintf_key(c, key, dbg_key_buf1); + return dbg_key_buf1; +} + +const char *dbg_ntype(int type) +{ + switch (type) { + case UBIFS_PAD_NODE: + return "padding node"; + case UBIFS_SB_NODE: + return "superblock node"; + case UBIFS_MST_NODE: + return "master node"; + case UBIFS_REF_NODE: + return "reference node"; + case UBIFS_INO_NODE: + return "inode node"; + case UBIFS_DENT_NODE: + return "direntry node"; + case UBIFS_XENT_NODE: + return "xentry node"; + case UBIFS_DATA_NODE: + return "data node"; + case UBIFS_TRUN_NODE: + return "truncate node"; + case UBIFS_IDX_NODE: + return "indexing node"; + case UBIFS_CS_NODE: + return "commit start node"; + case UBIFS_ORPH_NODE: + return "orphan node"; + default: + return "unknown node"; + } +} + +static const char *dbg_gtype(int type) +{ + switch (type) { + case UBIFS_NO_NODE_GROUP: + return "no node group"; + case UBIFS_IN_NODE_GROUP: + return "in node group"; + case UBIFS_LAST_OF_NODE_GROUP: + return "last of node group"; + default: + return "unknown"; + } +} + +const char *dbg_cstate(int cmt_state) +{ + switch (cmt_state) { + case COMMIT_RESTING: + return "commit resting"; + case COMMIT_BACKGROUND: + return "background commit requested"; + case COMMIT_REQUIRED: + return "commit required"; + case COMMIT_RUNNING_BACKGROUND: + return "BACKGROUND commit running"; + case COMMIT_RUNNING_REQUIRED: + return "commit running and required"; + case COMMIT_BROKEN: + return "broken commit"; + default: + return "unknown commit state"; + } +} + +static void dump_ch(const struct ubifs_ch *ch) +{ + printk(KERN_DEBUG "\tmagic %#x\n", le32_to_cpu(ch->magic)); + printk(KERN_DEBUG "\tcrc %#x\n", le32_to_cpu(ch->crc)); + printk(KERN_DEBUG "\tnode_type %d (%s)\n", ch->node_type, + dbg_ntype(ch->node_type)); + printk(KERN_DEBUG "\tgroup_type %d (%s)\n", ch->group_type, + dbg_gtype(ch->group_type)); + printk(KERN_DEBUG "\tsqnum %llu\n", + (unsigned long long)le64_to_cpu(ch->sqnum)); + printk(KERN_DEBUG "\tlen %u\n", le32_to_cpu(ch->len)); +} + +void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode) +{ + const struct ubifs_inode *ui = ubifs_inode(inode); + + printk(KERN_DEBUG "inode %lu\n", inode->i_ino); + printk(KERN_DEBUG "size %llu\n", + (unsigned long long)i_size_read(inode)); + printk(KERN_DEBUG "nlink %u\n", inode->i_nlink); + printk(KERN_DEBUG "uid %u\n", (unsigned int)inode->i_uid); + printk(KERN_DEBUG "gid %u\n", (unsigned int)inode->i_gid); + printk(KERN_DEBUG "atime %u.%u\n", + (unsigned int)inode->i_atime.tv_sec, + (unsigned int)inode->i_atime.tv_nsec); + printk(KERN_DEBUG "mtime %u.%u\n", + (unsigned int)inode->i_mtime.tv_sec, + (unsigned int)inode->i_mtime.tv_nsec); + printk(KERN_DEBUG "ctime %u.%u\n", + (unsigned int)inode->i_ctime.tv_sec, + (unsigned int)inode->i_ctime.tv_nsec); + printk(KERN_DEBUG "creat_sqnum %llu\n", ui->creat_sqnum); + printk(KERN_DEBUG "xattr_size %u\n", ui->xattr_size); + printk(KERN_DEBUG "xattr_cnt %u\n", ui->xattr_cnt); + printk(KERN_DEBUG "xattr_names %u\n", ui->xattr_names); + printk(KERN_DEBUG "dirty %u\n", ui->dirty); + printk(KERN_DEBUG "xattr %u\n", ui->xattr); + printk(KERN_DEBUG "flags %d\n", ui->flags); + printk(KERN_DEBUG "compr_type %d\n", ui->compr_type); + printk(KERN_DEBUG "data_len %d\n", ui->data_len); +} + +void dbg_dump_node(const struct ubifs_info *c, const void *node) +{ + int i, n; + union ubifs_key key; + const struct ubifs_ch *ch = node; + + if (dbg_failure_mode) + return; + + /* If the magic is incorrect, just hexdump the first bytes */ + if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) { + printk(KERN_DEBUG "Not a node, first %zu bytes:", UBIFS_CH_SZ); + print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1, + (void *)node, UBIFS_CH_SZ, 1); + return; + } + + spin_lock(&dbg_lock); + dump_ch(node); + + switch (ch->node_type) { + case UBIFS_PAD_NODE: + { + const struct ubifs_pad_node *pad = node; + + printk(KERN_DEBUG "\tpad_len %u\n", + le32_to_cpu(pad->pad_len)); + break; + } + case UBIFS_SB_NODE: + { + const struct ubifs_sb_node *sup = node; + unsigned int sup_flags = le32_to_cpu(sup->flags); + + printk(KERN_DEBUG "\tkey_hash %d (%s)\n", + (int)sup->key_hash, get_key_hash(sup->key_hash)); + printk(KERN_DEBUG "\tkey_fmt %d (%s)\n", + (int)sup->key_fmt, get_key_fmt(sup->key_fmt)); + printk(KERN_DEBUG "\tflags %#x\n", sup_flags); + printk(KERN_DEBUG "\t big_lpt %u\n", + !!(sup_flags & UBIFS_FLG_BIGLPT)); + printk(KERN_DEBUG "\tmin_io_size %u\n", + le32_to_cpu(sup->min_io_size)); + printk(KERN_DEBUG "\tleb_size %u\n", + le32_to_cpu(sup->leb_size)); + printk(KERN_DEBUG "\tleb_cnt %u\n", + le32_to_cpu(sup->leb_cnt)); + printk(KERN_DEBUG "\tmax_leb_cnt %u\n", + le32_to_cpu(sup->max_leb_cnt)); + printk(KERN_DEBUG "\tmax_bud_bytes %llu\n", + (unsigned long long)le64_to_cpu(sup->max_bud_bytes)); + printk(KERN_DEBUG "\tlog_lebs %u\n", + le32_to_cpu(sup->log_lebs)); + printk(KERN_DEBUG "\tlpt_lebs %u\n", + le32_to_cpu(sup->lpt_lebs)); + printk(KERN_DEBUG "\torph_lebs %u\n", + le32_to_cpu(sup->orph_lebs)); + printk(KERN_DEBUG "\tjhead_cnt %u\n", + le32_to_cpu(sup->jhead_cnt)); + printk(KERN_DEBUG "\tfanout %u\n", + le32_to_cpu(sup->fanout)); + printk(KERN_DEBUG "\tlsave_cnt %u\n", + le32_to_cpu(sup->lsave_cnt)); + printk(KERN_DEBUG "\tdefault_compr %u\n", + (int)le16_to_cpu(sup->default_compr)); + printk(KERN_DEBUG "\trp_size %llu\n", + (unsigned long long)le64_to_cpu(sup->rp_size)); + printk(KERN_DEBUG "\trp_uid %u\n", + le32_to_cpu(sup->rp_uid)); + printk(KERN_DEBUG "\trp_gid %u\n", + le32_to_cpu(sup->rp_gid)); + printk(KERN_DEBUG "\tfmt_version %u\n", + le32_to_cpu(sup->fmt_version)); + printk(KERN_DEBUG "\ttime_gran %u\n", + le32_to_cpu(sup->time_gran)); + printk(KERN_DEBUG "\tUUID %02X%02X%02X%02X-%02X%02X" + "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X\n", + sup->uuid[0], sup->uuid[1], sup->uuid[2], sup->uuid[3], + sup->uuid[4], sup->uuid[5], sup->uuid[6], sup->uuid[7], + sup->uuid[8], sup->uuid[9], sup->uuid[10], sup->uuid[11], + sup->uuid[12], sup->uuid[13], sup->uuid[14], + sup->uuid[15]); + break; + } + case UBIFS_MST_NODE: + { + const struct ubifs_mst_node *mst = node; + + printk(KERN_DEBUG "\thighest_inum %llu\n", + (unsigned long long)le64_to_cpu(mst->highest_inum)); + printk(KERN_DEBUG "\tcommit number %llu\n", + (unsigned long long)le64_to_cpu(mst->cmt_no)); + printk(KERN_DEBUG "\tflags %#x\n", + le32_to_cpu(mst->flags)); + printk(KERN_DEBUG "\tlog_lnum %u\n", + le32_to_cpu(mst->log_lnum)); + printk(KERN_DEBUG "\troot_lnum %u\n", + le32_to_cpu(mst->root_lnum)); + printk(KERN_DEBUG "\troot_offs %u\n", + le32_to_cpu(mst->root_offs)); + printk(KERN_DEBUG "\troot_len %u\n", + le32_to_cpu(mst->root_len)); + printk(KERN_DEBUG "\tgc_lnum %u\n", + le32_to_cpu(mst->gc_lnum)); + printk(KERN_DEBUG "\tihead_lnum %u\n", + le32_to_cpu(mst->ihead_lnum)); + printk(KERN_DEBUG "\tihead_offs %u\n", + le32_to_cpu(mst->ihead_offs)); + printk(KERN_DEBUG "\tindex_size %u\n", + le32_to_cpu(mst->index_size)); + printk(KERN_DEBUG "\tlpt_lnum %u\n", + le32_to_cpu(mst->lpt_lnum)); + printk(KERN_DEBUG "\tlpt_offs %u\n", + le32_to_cpu(mst->lpt_offs)); + printk(KERN_DEBUG "\tnhead_lnum %u\n", + le32_to_cpu(mst->nhead_lnum)); + printk(KERN_DEBUG "\tnhead_offs %u\n", + le32_to_cpu(mst->nhead_offs)); + printk(KERN_DEBUG "\tltab_lnum %u\n", + le32_to_cpu(mst->ltab_lnum)); + printk(KERN_DEBUG "\tltab_offs %u\n", + le32_to_cpu(mst->ltab_offs)); + printk(KERN_DEBUG "\tlsave_lnum %u\n", + le32_to_cpu(mst->lsave_lnum)); + printk(KERN_DEBUG "\tlsave_offs %u\n", + le32_to_cpu(mst->lsave_offs)); + printk(KERN_DEBUG "\tlscan_lnum %u\n", + le32_to_cpu(mst->lscan_lnum)); + printk(KERN_DEBUG "\tleb_cnt %u\n", + le32_to_cpu(mst->leb_cnt)); + printk(KERN_DEBUG "\tempty_lebs %u\n", + le32_to_cpu(mst->empty_lebs)); + printk(KERN_DEBUG "\tidx_lebs %u\n", + le32_to_cpu(mst->idx_lebs)); + printk(KERN_DEBUG "\ttotal_free %llu\n", + (unsigned long long)le64_to_cpu(mst->total_free)); + printk(KERN_DEBUG "\ttotal_dirty %llu\n", + (unsigned long long)le64_to_cpu(mst->total_dirty)); + printk(KERN_DEBUG "\ttotal_used %llu\n", + (unsigned long long)le64_to_cpu(mst->total_used)); + printk(KERN_DEBUG "\ttotal_dead %llu\n", + (unsigned long long)le64_to_cpu(mst->total_dead)); + printk(KERN_DEBUG "\ttotal_dark %llu\n", + (unsigned long long)le64_to_cpu(mst->total_dark)); + break; + } + case UBIFS_REF_NODE: + { + const struct ubifs_ref_node *ref = node; + + printk(KERN_DEBUG "\tlnum %u\n", + le32_to_cpu(ref->lnum)); + printk(KERN_DEBUG "\toffs %u\n", + le32_to_cpu(ref->offs)); + printk(KERN_DEBUG "\tjhead %u\n", + le32_to_cpu(ref->jhead)); + break; + } + case UBIFS_INO_NODE: + { + const struct ubifs_ino_node *ino = node; + + key_read(c, &ino->key, &key); + printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); + printk(KERN_DEBUG "\tcreat_sqnum %llu\n", + (unsigned long long)le64_to_cpu(ino->creat_sqnum)); + printk(KERN_DEBUG "\tsize %llu\n", + (unsigned long long)le64_to_cpu(ino->size)); + printk(KERN_DEBUG "\tnlink %u\n", + le32_to_cpu(ino->nlink)); + printk(KERN_DEBUG "\tatime %lld.%u\n", + (long long)le64_to_cpu(ino->atime_sec), + le32_to_cpu(ino->atime_nsec)); + printk(KERN_DEBUG "\tmtime %lld.%u\n", + (long long)le64_to_cpu(ino->mtime_sec), + le32_to_cpu(ino->mtime_nsec)); + printk(KERN_DEBUG "\tctime %lld.%u\n", + (long long)le64_to_cpu(ino->ctime_sec), + le32_to_cpu(ino->ctime_nsec)); + printk(KERN_DEBUG "\tuid %u\n", + le32_to_cpu(ino->uid)); + printk(KERN_DEBUG "\tgid %u\n", + le32_to_cpu(ino->gid)); + printk(KERN_DEBUG "\tmode %u\n", + le32_to_cpu(ino->mode)); + printk(KERN_DEBUG "\tflags %#x\n", + le32_to_cpu(ino->flags)); + printk(KERN_DEBUG "\txattr_cnt %u\n", + le32_to_cpu(ino->xattr_cnt)); + printk(KERN_DEBUG "\txattr_size %u\n", + le32_to_cpu(ino->xattr_size)); + printk(KERN_DEBUG "\txattr_names %u\n", + le32_to_cpu(ino->xattr_names)); + printk(KERN_DEBUG "\tcompr_type %#x\n", + (int)le16_to_cpu(ino->compr_type)); + printk(KERN_DEBUG "\tdata len %u\n", + le32_to_cpu(ino->data_len)); + break; + } + case UBIFS_DENT_NODE: + case UBIFS_XENT_NODE: + { + const struct ubifs_dent_node *dent = node; + int nlen = le16_to_cpu(dent->nlen); + + key_read(c, &dent->key, &key); + printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); + printk(KERN_DEBUG "\tinum %llu\n", + (unsigned long long)le64_to_cpu(dent->inum)); + printk(KERN_DEBUG "\ttype %d\n", (int)dent->type); + printk(KERN_DEBUG "\tnlen %d\n", nlen); + printk(KERN_DEBUG "\tname "); + + if (nlen > UBIFS_MAX_NLEN) + printk(KERN_DEBUG "(bad name length, not printing, " + "bad or corrupted node)"); + else { + for (i = 0; i < nlen && dent->name[i]; i++) + printk("%c", dent->name[i]); + } + printk("\n"); + + break; + } + case UBIFS_DATA_NODE: + { + const struct ubifs_data_node *dn = node; + int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ; + + key_read(c, &dn->key, &key); + printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); + printk(KERN_DEBUG "\tsize %u\n", + le32_to_cpu(dn->size)); + printk(KERN_DEBUG "\tcompr_typ %d\n", + (int)le16_to_cpu(dn->compr_type)); + printk(KERN_DEBUG "\tdata size %d\n", + dlen); + printk(KERN_DEBUG "\tdata:\n"); + print_hex_dump(KERN_DEBUG, "\t", DUMP_PREFIX_OFFSET, 32, 1, + (void *)&dn->data, dlen, 0); + break; + } + case UBIFS_TRUN_NODE: + { + const struct ubifs_trun_node *trun = node; + + printk(KERN_DEBUG "\tinum %u\n", + le32_to_cpu(trun->inum)); + printk(KERN_DEBUG "\told_size %llu\n", + (unsigned long long)le64_to_cpu(trun->old_size)); + printk(KERN_DEBUG "\tnew_size %llu\n", + (unsigned long long)le64_to_cpu(trun->new_size)); + break; + } + case UBIFS_IDX_NODE: + { + const struct ubifs_idx_node *idx = node; + + n = le16_to_cpu(idx->child_cnt); + printk(KERN_DEBUG "\tchild_cnt %d\n", n); + printk(KERN_DEBUG "\tlevel %d\n", + (int)le16_to_cpu(idx->level)); + printk(KERN_DEBUG "\tBranches:\n"); + + for (i = 0; i < n && i < c->fanout - 1; i++) { + const struct ubifs_branch *br; + + br = ubifs_idx_branch(c, idx, i); + key_read(c, &br->key, &key); + printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n", + i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs), + le32_to_cpu(br->len), DBGKEY(&key)); + } + break; + } + case UBIFS_CS_NODE: + break; + case UBIFS_ORPH_NODE: + { + const struct ubifs_orph_node *orph = node; + + printk(KERN_DEBUG "\tcommit number %llu\n", + (unsigned long long) + le64_to_cpu(orph->cmt_no) & LLONG_MAX); + printk(KERN_DEBUG "\tlast node flag %llu\n", + (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63); + n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3; + printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n); + for (i = 0; i < n; i++) + printk(KERN_DEBUG "\t ino %llu\n", + (unsigned long long)le64_to_cpu(orph->inos[i])); + break; + } + default: + printk(KERN_DEBUG "node type %d was not recognized\n", + (int)ch->node_type); + } + spin_unlock(&dbg_lock); +} + +void dbg_dump_budget_req(const struct ubifs_budget_req *req) +{ + spin_lock(&dbg_lock); + printk(KERN_DEBUG "Budgeting request: new_ino %d, dirtied_ino %d\n", + req->new_ino, req->dirtied_ino); + printk(KERN_DEBUG "\tnew_ino_d %d, dirtied_ino_d %d\n", + req->new_ino_d, req->dirtied_ino_d); + printk(KERN_DEBUG "\tnew_page %d, dirtied_page %d\n", + req->new_page, req->dirtied_page); + printk(KERN_DEBUG "\tnew_dent %d, mod_dent %d\n", + req->new_dent, req->mod_dent); + printk(KERN_DEBUG "\tidx_growth %d\n", req->idx_growth); + printk(KERN_DEBUG "\tdata_growth %d dd_growth %d\n", + req->data_growth, req->dd_growth); + spin_unlock(&dbg_lock); +} + +void dbg_dump_lstats(const struct ubifs_lp_stats *lst) +{ + spin_lock(&dbg_lock); + printk(KERN_DEBUG "(pid %d) Lprops statistics: empty_lebs %d, " + "idx_lebs %d\n", current->pid, lst->empty_lebs, lst->idx_lebs); + printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, " + "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free, + lst->total_dirty); + printk(KERN_DEBUG "\ttotal_used %lld, total_dark %lld, " + "total_dead %lld\n", lst->total_used, lst->total_dark, + lst->total_dead); + spin_unlock(&dbg_lock); +} + +void dbg_dump_budg(struct ubifs_info *c) +{ + int i; + struct rb_node *rb; + struct ubifs_bud *bud; + struct ubifs_gced_idx_leb *idx_gc; + + spin_lock(&dbg_lock); + printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, " + "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid, + c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth); + printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, " + "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth, + c->budg_data_growth + c->budg_dd_growth + c->budg_idx_growth, + c->freeable_cnt); + printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %lld, " + "calc_idx_sz %lld, idx_gc_cnt %d\n", c->min_idx_lebs, + c->old_idx_sz, c->calc_idx_sz, c->idx_gc_cnt); + printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, " + "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt), + atomic_long_read(&c->dirty_zn_cnt), + atomic_long_read(&c->clean_zn_cnt)); + printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n", + c->dark_wm, c->dead_wm, c->max_idx_node_sz); + printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n", + c->gc_lnum, c->ihead_lnum); + for (i = 0; i < c->jhead_cnt; i++) + printk(KERN_DEBUG "\tjhead %d\t LEB %d\n", + c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum); + for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) { + bud = rb_entry(rb, struct ubifs_bud, rb); + printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum); + } + list_for_each_entry(bud, &c->old_buds, list) + printk(KERN_DEBUG "\told bud LEB %d\n", bud->lnum); + list_for_each_entry(idx_gc, &c->idx_gc, list) + printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n", + idx_gc->lnum, idx_gc->unmap); + printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state); + spin_unlock(&dbg_lock); +} + +void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) +{ + printk(KERN_DEBUG "LEB %d lprops: free %d, dirty %d (used %d), " + "flags %#x\n", lp->lnum, lp->free, lp->dirty, + c->leb_size - lp->free - lp->dirty, lp->flags); +} + +void dbg_dump_lprops(struct ubifs_info *c) +{ + int lnum, err; + struct ubifs_lprops lp; + struct ubifs_lp_stats lst; + + printk(KERN_DEBUG "(pid %d) Dumping LEB properties\n", current->pid); + ubifs_get_lp_stats(c, &lst); + dbg_dump_lstats(&lst); + + for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) { + err = ubifs_read_one_lp(c, lnum, &lp); + if (err) + ubifs_err("cannot read lprops for LEB %d", lnum); + + dbg_dump_lprop(c, &lp); + } +} + +void dbg_dump_leb(const struct ubifs_info *c, int lnum) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + + if (dbg_failure_mode) + return; + + printk(KERN_DEBUG "(pid %d) Dumping LEB %d\n", current->pid, lnum); + + sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); + if (IS_ERR(sleb)) { + ubifs_err("scan error %d", (int)PTR_ERR(sleb)); + return; + } + + printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum, + sleb->nodes_cnt, sleb->endpt); + + list_for_each_entry(snod, &sleb->nodes, list) { + cond_resched(); + printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", lnum, + snod->offs, snod->len); + dbg_dump_node(c, snod->node); + } + + ubifs_scan_destroy(sleb); + return; +} + +void dbg_dump_znode(const struct ubifs_info *c, + const struct ubifs_znode *znode) +{ + int n; + const struct ubifs_zbranch *zbr; + + spin_lock(&dbg_lock); + if (znode->parent) + zbr = &znode->parent->zbranch[znode->iip]; + else + zbr = &c->zroot; + + printk(KERN_DEBUG "znode %p, LEB %d:%d len %d parent %p iip %d level %d" + " child_cnt %d flags %lx\n", znode, zbr->lnum, zbr->offs, + zbr->len, znode->parent, znode->iip, znode->level, + znode->child_cnt, znode->flags); + + if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) { + spin_unlock(&dbg_lock); + return; + } + + printk(KERN_DEBUG "zbranches:\n"); + for (n = 0; n < znode->child_cnt; n++) { + zbr = &znode->zbranch[n]; + if (znode->level > 0) + printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key " + "%s\n", n, zbr->znode, zbr->lnum, + zbr->offs, zbr->len, + DBGKEY(&zbr->key)); + else + printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key " + "%s\n", n, zbr->znode, zbr->lnum, + zbr->offs, zbr->len, + DBGKEY(&zbr->key)); + } + spin_unlock(&dbg_lock); +} + +void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat) +{ + int i; + + printk(KERN_DEBUG "(pid %d) Dumping heap cat %d (%d elements)\n", + current->pid, cat, heap->cnt); + for (i = 0; i < heap->cnt; i++) { + struct ubifs_lprops *lprops = heap->arr[i]; + + printk(KERN_DEBUG "\t%d. LEB %d hpos %d free %d dirty %d " + "flags %d\n", i, lprops->lnum, lprops->hpos, + lprops->free, lprops->dirty, lprops->flags); + } +} + +void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, int iip) +{ + int i; + + printk(KERN_DEBUG "(pid %d) Dumping pnode:\n", current->pid); + printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n", + (size_t)pnode, (size_t)parent, (size_t)pnode->cnext); + printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n", + pnode->flags, iip, pnode->level, pnode->num); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops *lp = &pnode->lprops[i]; + + printk(KERN_DEBUG "\t%d: free %d dirty %d flags %d lnum %d\n", + i, lp->free, lp->dirty, lp->flags, lp->lnum); + } +} + +void dbg_dump_tnc(struct ubifs_info *c) +{ + struct ubifs_znode *znode; + int level; + + printk(KERN_DEBUG "\n"); + printk(KERN_DEBUG "(pid %d) Dumping the TNC tree\n", current->pid); + znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); + level = znode->level; + printk(KERN_DEBUG "== Level %d ==\n", level); + while (znode) { + if (level != znode->level) { + level = znode->level; + printk(KERN_DEBUG "== Level %d ==\n", level); + } + dbg_dump_znode(c, znode); + znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); + } + + printk(KERN_DEBUG "\n"); +} + +static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode, + void *priv) +{ + dbg_dump_znode(c, znode); + return 0; +} + +/** + * dbg_dump_index - dump the on-flash index. + * @c: UBIFS file-system description object + * + * This function dumps whole UBIFS indexing B-tree, unlike 'dbg_dump_tnc()' + * which dumps only in-memory znodes and does not read znodes which from flash. + */ +void dbg_dump_index(struct ubifs_info *c) +{ + dbg_walk_index(c, NULL, dump_znode, NULL); +} + +/** + * dbg_check_synced_i_size - check synchronized inode size. + * @inode: inode to check + * + * If inode is clean, synchronized inode size has to be equivalent to current + * inode size. This function has to be called only for locked inodes (@i_mutex + * has to be locked). Returns %0 if synchronized inode size if correct, and + * %-EINVAL if not. + */ +int dbg_check_synced_i_size(struct inode *inode) +{ + int err = 0; + struct ubifs_inode *ui = ubifs_inode(inode); + + if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + return 0; + if (!S_ISREG(inode->i_mode)) + return 0; + + mutex_lock(&ui->ui_mutex); + spin_lock(&ui->ui_lock); + if (ui->ui_size != ui->synced_i_size && !ui->dirty) { + ubifs_err("ui_size is %lld, synced_i_size is %lld, but inode " + "is clean", ui->ui_size, ui->synced_i_size); + ubifs_err("i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino, + inode->i_mode, i_size_read(inode)); + dbg_dump_stack(); + err = -EINVAL; + } + spin_unlock(&ui->ui_lock); + mutex_unlock(&ui->ui_mutex); + return err; +} + +/* + * dbg_check_dir - check directory inode size and link count. + * @c: UBIFS file-system description object + * @dir: the directory to calculate size for + * @size: the result is returned here + * + * This function makes sure that directory size and link count are correct. + * Returns zero in case of success and a negative error code in case of + * failure. + * + * Note, it is good idea to make sure the @dir->i_mutex is locked before + * calling this function. + */ +int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir) +{ + unsigned int nlink = 2; + union ubifs_key key; + struct ubifs_dent_node *dent, *pdent = NULL; + struct qstr nm = { .name = NULL }; + loff_t size = UBIFS_INO_NODE_SZ; + + if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + return 0; + + if (!S_ISDIR(dir->i_mode)) + return 0; + + lowest_dent_key(c, &key, dir->i_ino); + while (1) { + int err; + + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + err = PTR_ERR(dent); + if (err == -ENOENT) + break; + return err; + } + + nm.name = dent->name; + nm.len = le16_to_cpu(dent->nlen); + size += CALC_DENT_SIZE(nm.len); + if (dent->type == UBIFS_ITYPE_DIR) + nlink += 1; + kfree(pdent); + pdent = dent; + key_read(c, &dent->key, &key); + } + kfree(pdent); + + if (i_size_read(dir) != size) { + ubifs_err("directory inode %lu has size %llu, " + "but calculated size is %llu", dir->i_ino, + (unsigned long long)i_size_read(dir), + (unsigned long long)size); + dump_stack(); + return -EINVAL; + } + if (dir->i_nlink != nlink) { + ubifs_err("directory inode %lu has nlink %u, but calculated " + "nlink is %u", dir->i_ino, dir->i_nlink, nlink); + dump_stack(); + return -EINVAL; + } + + return 0; +} + +/** + * dbg_check_key_order - make sure that colliding keys are properly ordered. + * @c: UBIFS file-system description object + * @zbr1: first zbranch + * @zbr2: following zbranch + * + * In UBIFS indexing B-tree colliding keys has to be sorted in binary order of + * names of the direntries/xentries which are referred by the keys. This + * function reads direntries/xentries referred by @zbr1 and @zbr2 and makes + * sure the name of direntry/xentry referred by @zbr1 is less than + * direntry/xentry referred by @zbr2. Returns zero if this is true, %1 if not, + * and a negative error code in case of failure. + */ +static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, + struct ubifs_zbranch *zbr2) +{ + int err, nlen1, nlen2, cmp; + struct ubifs_dent_node *dent1, *dent2; + union ubifs_key key; + + ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key)); + dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS); + if (!dent1) + return -ENOMEM; + dent2 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS); + if (!dent2) { + err = -ENOMEM; + goto out_free; + } + + err = ubifs_tnc_read_node(c, zbr1, dent1); + if (err) + goto out_free; + err = ubifs_validate_entry(c, dent1); + if (err) + goto out_free; + + err = ubifs_tnc_read_node(c, zbr2, dent2); + if (err) + goto out_free; + err = ubifs_validate_entry(c, dent2); + if (err) + goto out_free; + + /* Make sure node keys are the same as in zbranch */ + err = 1; + key_read(c, &dent1->key, &key); + if (keys_cmp(c, &zbr1->key, &key)) { + dbg_err("1st entry at %d:%d has key %s", zbr1->lnum, + zbr1->offs, DBGKEY(&key)); + dbg_err("but it should have key %s according to tnc", + DBGKEY(&zbr1->key)); + dbg_dump_node(c, dent1); + goto out_free; + } + + key_read(c, &dent2->key, &key); + if (keys_cmp(c, &zbr2->key, &key)) { + dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum, + zbr1->offs, DBGKEY(&key)); + dbg_err("but it should have key %s according to tnc", + DBGKEY(&zbr2->key)); + dbg_dump_node(c, dent2); + goto out_free; + } + + nlen1 = le16_to_cpu(dent1->nlen); + nlen2 = le16_to_cpu(dent2->nlen); + + cmp = memcmp(dent1->name, dent2->name, min_t(int, nlen1, nlen2)); + if (cmp < 0 || (cmp == 0 && nlen1 < nlen2)) { + err = 0; + goto out_free; + } + if (cmp == 0 && nlen1 == nlen2) + dbg_err("2 xent/dent nodes with the same name"); + else + dbg_err("bad order of colliding key %s", + DBGKEY(&key)); + + dbg_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs); + dbg_dump_node(c, dent1); + dbg_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs); + dbg_dump_node(c, dent2); + +out_free: + kfree(dent2); + kfree(dent1); + return err; +} + +/** + * dbg_check_znode - check if znode is all right. + * @c: UBIFS file-system description object + * @zbr: zbranch which points to this znode + * + * This function makes sure that znode referred to by @zbr is all right. + * Returns zero if it is, and %-EINVAL if it is not. + */ +static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr) +{ + struct ubifs_znode *znode = zbr->znode; + struct ubifs_znode *zp = znode->parent; + int n, err, cmp; + + if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) { + err = 1; + goto out; + } + if (znode->level < 0) { + err = 2; + goto out; + } + if (znode->iip < 0 || znode->iip >= c->fanout) { + err = 3; + goto out; + } + + if (zbr->len == 0) + /* Only dirty zbranch may have no on-flash nodes */ + if (!ubifs_zn_dirty(znode)) { + err = 4; + goto out; + } + + if (ubifs_zn_dirty(znode)) { + /* + * If znode is dirty, its parent has to be dirty as well. The + * order of the operation is important, so we have to have + * memory barriers. + */ + smp_mb(); + if (zp && !ubifs_zn_dirty(zp)) { + /* + * The dirty flag is atomic and is cleared outside the + * TNC mutex, so znode's dirty flag may now have + * been cleared. The child is always cleared before the + * parent, so we just need to check again. + */ + smp_mb(); + if (ubifs_zn_dirty(znode)) { + err = 5; + goto out; + } + } + } + + if (zp) { + const union ubifs_key *min, *max; + + if (znode->level != zp->level - 1) { + err = 6; + goto out; + } + + /* Make sure the 'parent' pointer in our znode is correct */ + err = ubifs_search_zbranch(c, zp, &zbr->key, &n); + if (!err) { + /* This zbranch does not exist in the parent */ + err = 7; + goto out; + } + + if (znode->iip >= zp->child_cnt) { + err = 8; + goto out; + } + + if (znode->iip != n) { + /* This may happen only in case of collisions */ + if (keys_cmp(c, &zp->zbranch[n].key, + &zp->zbranch[znode->iip].key)) { + err = 9; + goto out; + } + n = znode->iip; + } + + /* + * Make sure that the first key in our znode is greater than or + * equal to the key in the pointing zbranch. + */ + min = &zbr->key; + cmp = keys_cmp(c, min, &znode->zbranch[0].key); + if (cmp == 1) { + err = 10; + goto out; + } + + if (n + 1 < zp->child_cnt) { + max = &zp->zbranch[n + 1].key; + + /* + * Make sure the last key in our znode is less or + * equivalent than the the key in zbranch which goes + * after our pointing zbranch. + */ + cmp = keys_cmp(c, max, + &znode->zbranch[znode->child_cnt - 1].key); + if (cmp == -1) { + err = 11; + goto out; + } + } + } else { + /* This may only be root znode */ + if (zbr != &c->zroot) { + err = 12; + goto out; + } + } + + /* + * Make sure that next key is greater or equivalent then the previous + * one. + */ + for (n = 1; n < znode->child_cnt; n++) { + cmp = keys_cmp(c, &znode->zbranch[n - 1].key, + &znode->zbranch[n].key); + if (cmp > 0) { + err = 13; + goto out; + } + if (cmp == 0) { + /* This can only be keys with colliding hash */ + if (!is_hash_key(c, &znode->zbranch[n].key)) { + err = 14; + goto out; + } + + if (znode->level != 0 || c->replaying) + continue; + + /* + * Colliding keys should follow binary order of + * corresponding xentry/dentry names. + */ + err = dbg_check_key_order(c, &znode->zbranch[n - 1], + &znode->zbranch[n]); + if (err < 0) + return err; + if (err) { + err = 15; + goto out; + } + } + } + + for (n = 0; n < znode->child_cnt; n++) { + if (!znode->zbranch[n].znode && + (znode->zbranch[n].lnum == 0 || + znode->zbranch[n].len == 0)) { + err = 16; + goto out; + } + + if (znode->zbranch[n].lnum != 0 && + znode->zbranch[n].len == 0) { + err = 17; + goto out; + } + + if (znode->zbranch[n].lnum == 0 && + znode->zbranch[n].len != 0) { + err = 18; + goto out; + } + + if (znode->zbranch[n].lnum == 0 && + znode->zbranch[n].offs != 0) { + err = 19; + goto out; + } + + if (znode->level != 0 && znode->zbranch[n].znode) + if (znode->zbranch[n].znode->parent != znode) { + err = 20; + goto out; + } + } + + return 0; + +out: + ubifs_err("failed, error %d", err); + ubifs_msg("dump of the znode"); + dbg_dump_znode(c, znode); + if (zp) { + ubifs_msg("dump of the parent znode"); + dbg_dump_znode(c, zp); + } + dump_stack(); + return -EINVAL; +} + +/** + * dbg_check_tnc - check TNC tree. + * @c: UBIFS file-system description object + * @extra: do extra checks that are possible at start commit + * + * This function traverses whole TNC tree and checks every znode. Returns zero + * if everything is all right and %-EINVAL if something is wrong with TNC. + */ +int dbg_check_tnc(struct ubifs_info *c, int extra) +{ + struct ubifs_znode *znode; + long clean_cnt = 0, dirty_cnt = 0; + int err, last; + + if (!(ubifs_chk_flags & UBIFS_CHK_TNC)) + return 0; + + ubifs_assert(mutex_is_locked(&c->tnc_mutex)); + if (!c->zroot.znode) + return 0; + + znode = ubifs_tnc_postorder_first(c->zroot.znode); + while (1) { + struct ubifs_znode *prev; + struct ubifs_zbranch *zbr; + + if (!znode->parent) + zbr = &c->zroot; + else + zbr = &znode->parent->zbranch[znode->iip]; + + err = dbg_check_znode(c, zbr); + if (err) + return err; + + if (extra) { + if (ubifs_zn_dirty(znode)) + dirty_cnt += 1; + else + clean_cnt += 1; + } + + prev = znode; + znode = ubifs_tnc_postorder_next(znode); + if (!znode) + break; + + /* + * If the last key of this znode is equivalent to the first key + * of the next znode (collision), then check order of the keys. + */ + last = prev->child_cnt - 1; + if (prev->level == 0 && znode->level == 0 && !c->replaying && + !keys_cmp(c, &prev->zbranch[last].key, + &znode->zbranch[0].key)) { + err = dbg_check_key_order(c, &prev->zbranch[last], + &znode->zbranch[0]); + if (err < 0) + return err; + if (err) { + ubifs_msg("first znode"); + dbg_dump_znode(c, prev); + ubifs_msg("second znode"); + dbg_dump_znode(c, znode); + return -EINVAL; + } + } + } + + if (extra) { + if (clean_cnt != atomic_long_read(&c->clean_zn_cnt)) { + ubifs_err("incorrect clean_zn_cnt %ld, calculated %ld", + atomic_long_read(&c->clean_zn_cnt), + clean_cnt); + return -EINVAL; + } + if (dirty_cnt != atomic_long_read(&c->dirty_zn_cnt)) { + ubifs_err("incorrect dirty_zn_cnt %ld, calculated %ld", + atomic_long_read(&c->dirty_zn_cnt), + dirty_cnt); + return -EINVAL; + } + } + + return 0; +} + +/** + * dbg_walk_index - walk the on-flash index. + * @c: UBIFS file-system description object + * @leaf_cb: called for each leaf node + * @znode_cb: called for each indexing node + * @priv: private date which is passed to callbacks + * + * This function walks the UBIFS index and calls the @leaf_cb for each leaf + * node and @znode_cb for each indexing node. Returns zero in case of success + * and a negative error code in case of failure. + * + * It would be better if this function removed every znode it pulled to into + * the TNC, so that the behavior more closely matched the non-debugging + * behavior. + */ +int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, + dbg_znode_callback znode_cb, void *priv) +{ + int err; + struct ubifs_zbranch *zbr; + struct ubifs_znode *znode, *child; + + mutex_lock(&c->tnc_mutex); + /* If the root indexing node is not in TNC - pull it */ + if (!c->zroot.znode) { + c->zroot.znode = ubifs_load_znode(c, &c->zroot, NULL, 0); + if (IS_ERR(c->zroot.znode)) { + err = PTR_ERR(c->zroot.znode); + c->zroot.znode = NULL; + goto out_unlock; + } + } + + /* + * We are going to traverse the indexing tree in the postorder manner. + * Go down and find the leftmost indexing node where we are going to + * start from. + */ + znode = c->zroot.znode; + while (znode->level > 0) { + zbr = &znode->zbranch[0]; + child = zbr->znode; + if (!child) { + child = ubifs_load_znode(c, zbr, znode, 0); + if (IS_ERR(child)) { + err = PTR_ERR(child); + goto out_unlock; + } + zbr->znode = child; + } + + znode = child; + } + + /* Iterate over all indexing nodes */ + while (1) { + int idx; + + cond_resched(); + + if (znode_cb) { + err = znode_cb(c, znode, priv); + if (err) { + ubifs_err("znode checking function returned " + "error %d", err); + dbg_dump_znode(c, znode); + goto out_dump; + } + } + if (leaf_cb && znode->level == 0) { + for (idx = 0; idx < znode->child_cnt; idx++) { + zbr = &znode->zbranch[idx]; + err = leaf_cb(c, zbr, priv); + if (err) { + ubifs_err("leaf checking function " + "returned error %d, for leaf " + "at LEB %d:%d", + err, zbr->lnum, zbr->offs); + goto out_dump; + } + } + } + + if (!znode->parent) + break; + + idx = znode->iip + 1; + znode = znode->parent; + if (idx < znode->child_cnt) { + /* Switch to the next index in the parent */ + zbr = &znode->zbranch[idx]; + child = zbr->znode; + if (!child) { + child = ubifs_load_znode(c, zbr, znode, idx); + if (IS_ERR(child)) { + err = PTR_ERR(child); + goto out_unlock; + } + zbr->znode = child; + } + znode = child; + } else + /* + * This is the last child, switch to the parent and + * continue. + */ + continue; + + /* Go to the lowest leftmost znode in the new sub-tree */ + while (znode->level > 0) { + zbr = &znode->zbranch[0]; + child = zbr->znode; + if (!child) { + child = ubifs_load_znode(c, zbr, znode, 0); + if (IS_ERR(child)) { + err = PTR_ERR(child); + goto out_unlock; + } + zbr->znode = child; + } + znode = child; + } + } + + mutex_unlock(&c->tnc_mutex); + return 0; + +out_dump: + if (znode->parent) + zbr = &znode->parent->zbranch[znode->iip]; + else + zbr = &c->zroot; + ubifs_msg("dump of znode at LEB %d:%d", zbr->lnum, zbr->offs); + dbg_dump_znode(c, znode); +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * add_size - add znode size to partially calculated index size. + * @c: UBIFS file-system description object + * @znode: znode to add size for + * @priv: partially calculated index size + * + * This is a helper function for 'dbg_check_idx_size()' which is called for + * every indexing node and adds its size to the 'long long' variable pointed to + * by @priv. + */ +static int add_size(struct ubifs_info *c, struct ubifs_znode *znode, void *priv) +{ + long long *idx_size = priv; + int add; + + add = ubifs_idx_node_sz(c, znode->child_cnt); + add = ALIGN(add, 8); + *idx_size += add; + return 0; +} + +/** + * dbg_check_idx_size - check index size. + * @c: UBIFS file-system description object + * @idx_size: size to check + * + * This function walks the UBIFS index, calculates its size and checks that the + * size is equivalent to @idx_size. Returns zero in case of success and a + * negative error code in case of failure. + */ +int dbg_check_idx_size(struct ubifs_info *c, long long idx_size) +{ + int err; + long long calc = 0; + + if (!(ubifs_chk_flags & UBIFS_CHK_IDX_SZ)) + return 0; + + err = dbg_walk_index(c, NULL, add_size, &calc); + if (err) { + ubifs_err("error %d while walking the index", err); + return err; + } + + if (calc != idx_size) { + ubifs_err("index size check failed: calculated size is %lld, " + "should be %lld", calc, idx_size); + dump_stack(); + return -EINVAL; + } + + return 0; +} + +/** + * struct fsck_inode - information about an inode used when checking the file-system. + * @rb: link in the RB-tree of inodes + * @inum: inode number + * @mode: inode type, permissions, etc + * @nlink: inode link count + * @xattr_cnt: count of extended attributes + * @references: how many directory/xattr entries refer this inode (calculated + * while walking the index) + * @calc_cnt: for directory inode count of child directories + * @size: inode size (read from on-flash inode) + * @xattr_sz: summary size of all extended attributes (read from on-flash + * inode) + * @calc_sz: for directories calculated directory size + * @calc_xcnt: count of extended attributes + * @calc_xsz: calculated summary size of all extended attributes + * @xattr_nms: sum of lengths of all extended attribute names belonging to this + * inode (read from on-flash inode) + * @calc_xnms: calculated sum of lengths of all extended attribute names + */ +struct fsck_inode { + struct rb_node rb; + ino_t inum; + umode_t mode; + unsigned int nlink; + unsigned int xattr_cnt; + int references; + int calc_cnt; + long long size; + unsigned int xattr_sz; + long long calc_sz; + long long calc_xcnt; + long long calc_xsz; + unsigned int xattr_nms; + long long calc_xnms; +}; + +/** + * struct fsck_data - private FS checking information. + * @inodes: RB-tree of all inodes (contains @struct fsck_inode objects) + */ +struct fsck_data { + struct rb_root inodes; +}; + +/** + * add_inode - add inode information to RB-tree of inodes. + * @c: UBIFS file-system description object + * @fsckd: FS checking information + * @ino: raw UBIFS inode to add + * + * This is a helper function for 'check_leaf()' which adds information about + * inode @ino to the RB-tree of inodes. Returns inode information pointer in + * case of success and a negative error code in case of failure. + */ +static struct fsck_inode *add_inode(struct ubifs_info *c, + struct fsck_data *fsckd, + struct ubifs_ino_node *ino) +{ + struct rb_node **p, *parent = NULL; + struct fsck_inode *fscki; + ino_t inum = key_inum_flash(c, &ino->key); + + p = &fsckd->inodes.rb_node; + while (*p) { + parent = *p; + fscki = rb_entry(parent, struct fsck_inode, rb); + if (inum < fscki->inum) + p = &(*p)->rb_left; + else if (inum > fscki->inum) + p = &(*p)->rb_right; + else + return fscki; + } + + if (inum > c->highest_inum) { + ubifs_err("too high inode number, max. is %lu", + c->highest_inum); + return ERR_PTR(-EINVAL); + } + + fscki = kzalloc(sizeof(struct fsck_inode), GFP_NOFS); + if (!fscki) + return ERR_PTR(-ENOMEM); + + fscki->inum = inum; + fscki->nlink = le32_to_cpu(ino->nlink); + fscki->size = le64_to_cpu(ino->size); + fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt); + fscki->xattr_sz = le32_to_cpu(ino->xattr_size); + fscki->xattr_nms = le32_to_cpu(ino->xattr_names); + fscki->mode = le32_to_cpu(ino->mode); + if (S_ISDIR(fscki->mode)) { + fscki->calc_sz = UBIFS_INO_NODE_SZ; + fscki->calc_cnt = 2; + } + rb_link_node(&fscki->rb, parent, p); + rb_insert_color(&fscki->rb, &fsckd->inodes); + return fscki; +} + +/** + * search_inode - search inode in the RB-tree of inodes. + * @fsckd: FS checking information + * @inum: inode number to search + * + * This is a helper function for 'check_leaf()' which searches inode @inum in + * the RB-tree of inodes and returns an inode information pointer or %NULL if + * the inode was not found. + */ +static struct fsck_inode *search_inode(struct fsck_data *fsckd, ino_t inum) +{ + struct rb_node *p; + struct fsck_inode *fscki; + + p = fsckd->inodes.rb_node; + while (p) { + fscki = rb_entry(p, struct fsck_inode, rb); + if (inum < fscki->inum) + p = p->rb_left; + else if (inum > fscki->inum) + p = p->rb_right; + else + return fscki; + } + return NULL; +} + +/** + * read_add_inode - read inode node and add it to RB-tree of inodes. + * @c: UBIFS file-system description object + * @fsckd: FS checking information + * @inum: inode number to read + * + * This is a helper function for 'check_leaf()' which finds inode node @inum in + * the index, reads it, and adds it to the RB-tree of inodes. Returns inode + * information pointer in case of success and a negative error code in case of + * failure. + */ +static struct fsck_inode *read_add_inode(struct ubifs_info *c, + struct fsck_data *fsckd, ino_t inum) +{ + int n, err; + union ubifs_key key; + struct ubifs_znode *znode; + struct ubifs_zbranch *zbr; + struct ubifs_ino_node *ino; + struct fsck_inode *fscki; + + fscki = search_inode(fsckd, inum); + if (fscki) + return fscki; + + ino_key_init(c, &key, inum); + err = ubifs_lookup_level0(c, &key, &znode, &n); + if (!err) { + ubifs_err("inode %lu not found in index", inum); + return ERR_PTR(-ENOENT); + } else if (err < 0) { + ubifs_err("error %d while looking up inode %lu", err, inum); + return ERR_PTR(err); + } + + zbr = &znode->zbranch[n]; + if (zbr->len < UBIFS_INO_NODE_SZ) { + ubifs_err("bad node %lu node length %d", inum, zbr->len); + return ERR_PTR(-EINVAL); + } + + ino = kmalloc(zbr->len, GFP_NOFS); + if (!ino) + return ERR_PTR(-ENOMEM); + + err = ubifs_tnc_read_node(c, zbr, ino); + if (err) { + ubifs_err("cannot read inode node at LEB %d:%d, error %d", + zbr->lnum, zbr->offs, err); + kfree(ino); + return ERR_PTR(err); + } + + fscki = add_inode(c, fsckd, ino); + kfree(ino); + if (IS_ERR(fscki)) { + ubifs_err("error %ld while adding inode %lu node", + PTR_ERR(fscki), inum); + return fscki; + } + + return fscki; +} + +/** + * check_leaf - check leaf node. + * @c: UBIFS file-system description object + * @zbr: zbranch of the leaf node to check + * @priv: FS checking information + * + * This is a helper function for 'dbg_check_filesystem()' which is called for + * every single leaf node while walking the indexing tree. It checks that the + * leaf node referred from the indexing tree exists, has correct CRC, and does + * some other basic validation. This function is also responsible for building + * an RB-tree of inodes - it adds all inodes into the RB-tree. It also + * calculates reference count, size, etc for each inode in order to later + * compare them to the information stored inside the inodes and detect possible + * inconsistencies. Returns zero in case of success and a negative error code + * in case of failure. + */ +static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *priv) +{ + ino_t inum; + void *node; + struct ubifs_ch *ch; + int err, type = key_type(c, &zbr->key); + struct fsck_inode *fscki; + + if (zbr->len < UBIFS_CH_SZ) { + ubifs_err("bad leaf length %d (LEB %d:%d)", + zbr->len, zbr->lnum, zbr->offs); + return -EINVAL; + } + + node = kmalloc(zbr->len, GFP_NOFS); + if (!node) + return -ENOMEM; + + err = ubifs_tnc_read_node(c, zbr, node); + if (err) { + ubifs_err("cannot read leaf node at LEB %d:%d, error %d", + zbr->lnum, zbr->offs, err); + goto out_free; + } + + /* If this is an inode node, add it to RB-tree of inodes */ + if (type == UBIFS_INO_KEY) { + fscki = add_inode(c, priv, node); + if (IS_ERR(fscki)) { + err = PTR_ERR(fscki); + ubifs_err("error %d while adding inode node", err); + goto out_dump; + } + goto out; + } + + if (type != UBIFS_DENT_KEY && type != UBIFS_XENT_KEY && + type != UBIFS_DATA_KEY) { + ubifs_err("unexpected node type %d at LEB %d:%d", + type, zbr->lnum, zbr->offs); + err = -EINVAL; + goto out_free; + } + + ch = node; + if (le64_to_cpu(ch->sqnum) > c->max_sqnum) { + ubifs_err("too high sequence number, max. is %llu", + c->max_sqnum); + err = -EINVAL; + goto out_dump; + } + + if (type == UBIFS_DATA_KEY) { + long long blk_offs; + struct ubifs_data_node *dn = node; + + /* + * Search the inode node this data node belongs to and insert + * it to the RB-tree of inodes. + */ + inum = key_inum_flash(c, &dn->key); + fscki = read_add_inode(c, priv, inum); + if (IS_ERR(fscki)) { + err = PTR_ERR(fscki); + ubifs_err("error %d while processing data node and " + "trying to find inode node %lu", err, inum); + goto out_dump; + } + + /* Make sure the data node is within inode size */ + blk_offs = key_block_flash(c, &dn->key); + blk_offs <<= UBIFS_BLOCK_SHIFT; + blk_offs += le32_to_cpu(dn->size); + if (blk_offs > fscki->size) { + ubifs_err("data node at LEB %d:%d is not within inode " + "size %lld", zbr->lnum, zbr->offs, + fscki->size); + err = -EINVAL; + goto out_dump; + } + } else { + int nlen; + struct ubifs_dent_node *dent = node; + struct fsck_inode *fscki1; + + err = ubifs_validate_entry(c, dent); + if (err) + goto out_dump; + + /* + * Search the inode node this entry refers to and the parent + * inode node and insert them to the RB-tree of inodes. + */ + inum = le64_to_cpu(dent->inum); + fscki = read_add_inode(c, priv, inum); + if (IS_ERR(fscki)) { + err = PTR_ERR(fscki); + ubifs_err("error %d while processing entry node and " + "trying to find inode node %lu", err, inum); + goto out_dump; + } + + /* Count how many direntries or xentries refers this inode */ + fscki->references += 1; + + inum = key_inum_flash(c, &dent->key); + fscki1 = read_add_inode(c, priv, inum); + if (IS_ERR(fscki1)) { + err = PTR_ERR(fscki); + ubifs_err("error %d while processing entry node and " + "trying to find parent inode node %lu", + err, inum); + goto out_dump; + } + + nlen = le16_to_cpu(dent->nlen); + if (type == UBIFS_XENT_KEY) { + fscki1->calc_xcnt += 1; + fscki1->calc_xsz += CALC_DENT_SIZE(nlen); + fscki1->calc_xsz += CALC_XATTR_BYTES(fscki->size); + fscki1->calc_xnms += nlen; + } else { + fscki1->calc_sz += CALC_DENT_SIZE(nlen); + if (dent->type == UBIFS_ITYPE_DIR) + fscki1->calc_cnt += 1; + } + } + +out: + kfree(node); + return 0; + +out_dump: + ubifs_msg("dump of node at LEB %d:%d", zbr->lnum, zbr->offs); + dbg_dump_node(c, node); +out_free: + kfree(node); + return err; +} + +/** + * free_inodes - free RB-tree of inodes. + * @fsckd: FS checking information + */ +static void free_inodes(struct fsck_data *fsckd) +{ + struct rb_node *this = fsckd->inodes.rb_node; + struct fsck_inode *fscki; + + while (this) { + if (this->rb_left) + this = this->rb_left; + else if (this->rb_right) + this = this->rb_right; + else { + fscki = rb_entry(this, struct fsck_inode, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &fscki->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(fscki); + } + } +} + +/** + * check_inodes - checks all inodes. + * @c: UBIFS file-system description object + * @fsckd: FS checking information + * + * This is a helper function for 'dbg_check_filesystem()' which walks the + * RB-tree of inodes after the index scan has been finished, and checks that + * inode nlink, size, etc are correct. Returns zero if inodes are fine, + * %-EINVAL if not, and a negative error code in case of failure. + */ +static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd) +{ + int n, err; + union ubifs_key key; + struct ubifs_znode *znode; + struct ubifs_zbranch *zbr; + struct ubifs_ino_node *ino; + struct fsck_inode *fscki; + struct rb_node *this = rb_first(&fsckd->inodes); + + while (this) { + fscki = rb_entry(this, struct fsck_inode, rb); + this = rb_next(this); + + if (S_ISDIR(fscki->mode)) { + /* + * Directories have to have exactly one reference (they + * cannot have hardlinks), although root inode is an + * exception. + */ + if (fscki->inum != UBIFS_ROOT_INO && + fscki->references != 1) { + ubifs_err("directory inode %lu has %d " + "direntries which refer it, but " + "should be 1", fscki->inum, + fscki->references); + goto out_dump; + } + if (fscki->inum == UBIFS_ROOT_INO && + fscki->references != 0) { + ubifs_err("root inode %lu has non-zero (%d) " + "direntries which refer it", + fscki->inum, fscki->references); + goto out_dump; + } + if (fscki->calc_sz != fscki->size) { + ubifs_err("directory inode %lu size is %lld, " + "but calculated size is %lld", + fscki->inum, fscki->size, + fscki->calc_sz); + goto out_dump; + } + if (fscki->calc_cnt != fscki->nlink) { + ubifs_err("directory inode %lu nlink is %d, " + "but calculated nlink is %d", + fscki->inum, fscki->nlink, + fscki->calc_cnt); + goto out_dump; + } + } else { + if (fscki->references != fscki->nlink) { + ubifs_err("inode %lu nlink is %d, but " + "calculated nlink is %d", fscki->inum, + fscki->nlink, fscki->references); + goto out_dump; + } + } + if (fscki->xattr_sz != fscki->calc_xsz) { + ubifs_err("inode %lu has xattr size %u, but " + "calculated size is %lld", + fscki->inum, fscki->xattr_sz, + fscki->calc_xsz); + goto out_dump; + } + if (fscki->xattr_cnt != fscki->calc_xcnt) { + ubifs_err("inode %lu has %u xattrs, but " + "calculated count is %lld", fscki->inum, + fscki->xattr_cnt, fscki->calc_xcnt); + goto out_dump; + } + if (fscki->xattr_nms != fscki->calc_xnms) { + ubifs_err("inode %lu has xattr names' size %u, but " + "calculated names' size is %lld", + fscki->inum, fscki->xattr_nms, + fscki->calc_xnms); + goto out_dump; + } + } + + return 0; + +out_dump: + /* Read the bad inode and dump it */ + ino_key_init(c, &key, fscki->inum); + err = ubifs_lookup_level0(c, &key, &znode, &n); + if (!err) { + ubifs_err("inode %lu not found in index", fscki->inum); + return -ENOENT; + } else if (err < 0) { + ubifs_err("error %d while looking up inode %lu", + err, fscki->inum); + return err; + } + + zbr = &znode->zbranch[n]; + ino = kmalloc(zbr->len, GFP_NOFS); + if (!ino) + return -ENOMEM; + + err = ubifs_tnc_read_node(c, zbr, ino); + if (err) { + ubifs_err("cannot read inode node at LEB %d:%d, error %d", + zbr->lnum, zbr->offs, err); + kfree(ino); + return err; + } + + ubifs_msg("dump of the inode %lu sitting in LEB %d:%d", + fscki->inum, zbr->lnum, zbr->offs); + dbg_dump_node(c, ino); + kfree(ino); + return -EINVAL; +} + +/** + * dbg_check_filesystem - check the file-system. + * @c: UBIFS file-system description object + * + * This function checks the file system, namely: + * o makes sure that all leaf nodes exist and their CRCs are correct; + * o makes sure inode nlink, size, xattr size/count are correct (for all + * inodes). + * + * The function reads whole indexing tree and all nodes, so it is pretty + * heavy-weight. Returns zero if the file-system is consistent, %-EINVAL if + * not, and a negative error code in case of failure. + */ +int dbg_check_filesystem(struct ubifs_info *c) +{ + int err; + struct fsck_data fsckd; + + if (!(ubifs_chk_flags & UBIFS_CHK_FS)) + return 0; + + fsckd.inodes = RB_ROOT; + err = dbg_walk_index(c, check_leaf, NULL, &fsckd); + if (err) + goto out_free; + + err = check_inodes(c, &fsckd); + if (err) + goto out_free; + + free_inodes(&fsckd); + return 0; + +out_free: + ubifs_err("file-system check failed with error %d", err); + dump_stack(); + free_inodes(&fsckd); + return err; +} + +static int invocation_cnt; + +int dbg_force_in_the_gaps(void) +{ + if (!dbg_force_in_the_gaps_enabled) + return 0; + /* Force in-the-gaps every 8th commit */ + return !((invocation_cnt++) & 0x7); +} + +/* Failure mode for recovery testing */ + +#define chance(n, d) (simple_rand() <= (n) * 32768LL / (d)) + +struct failure_mode_info { + struct list_head list; + struct ubifs_info *c; +}; + +static LIST_HEAD(fmi_list); +static DEFINE_SPINLOCK(fmi_lock); + +static unsigned int next; + +static int simple_rand(void) +{ + if (next == 0) + next = current->pid; + next = next * 1103515245 + 12345; + return (next >> 16) & 32767; +} + +void dbg_failure_mode_registration(struct ubifs_info *c) +{ + struct failure_mode_info *fmi; + + fmi = kmalloc(sizeof(struct failure_mode_info), GFP_NOFS); + if (!fmi) { + dbg_err("Failed to register failure mode - no memory"); + return; + } + fmi->c = c; + spin_lock(&fmi_lock); + list_add_tail(&fmi->list, &fmi_list); + spin_unlock(&fmi_lock); +} + +void dbg_failure_mode_deregistration(struct ubifs_info *c) +{ + struct failure_mode_info *fmi, *tmp; + + spin_lock(&fmi_lock); + list_for_each_entry_safe(fmi, tmp, &fmi_list, list) + if (fmi->c == c) { + list_del(&fmi->list); + kfree(fmi); + } + spin_unlock(&fmi_lock); +} + +static struct ubifs_info *dbg_find_info(struct ubi_volume_desc *desc) +{ + struct failure_mode_info *fmi; + + spin_lock(&fmi_lock); + list_for_each_entry(fmi, &fmi_list, list) + if (fmi->c->ubi == desc) { + struct ubifs_info *c = fmi->c; + + spin_unlock(&fmi_lock); + return c; + } + spin_unlock(&fmi_lock); + return NULL; +} + +static int in_failure_mode(struct ubi_volume_desc *desc) +{ + struct ubifs_info *c = dbg_find_info(desc); + + if (c && dbg_failure_mode) + return c->failure_mode; + return 0; +} + +static int do_fail(struct ubi_volume_desc *desc, int lnum, int write) +{ + struct ubifs_info *c = dbg_find_info(desc); + + if (!c || !dbg_failure_mode) + return 0; + if (c->failure_mode) + return 1; + if (!c->fail_cnt) { + /* First call - decide delay to failure */ + if (chance(1, 2)) { + unsigned int delay = 1 << (simple_rand() >> 11); + + if (chance(1, 2)) { + c->fail_delay = 1; + c->fail_timeout = jiffies + + msecs_to_jiffies(delay); + dbg_rcvry("failing after %ums", delay); + } else { + c->fail_delay = 2; + c->fail_cnt_max = delay; + dbg_rcvry("failing after %u calls", delay); + } + } + c->fail_cnt += 1; + } + /* Determine if failure delay has expired */ + if (c->fail_delay == 1) { + if (time_before(jiffies, c->fail_timeout)) + return 0; + } else if (c->fail_delay == 2) + if (c->fail_cnt++ < c->fail_cnt_max) + return 0; + if (lnum == UBIFS_SB_LNUM) { + if (write) { + if (chance(1, 2)) + return 0; + } else if (chance(19, 20)) + return 0; + dbg_rcvry("failing in super block LEB %d", lnum); + } else if (lnum == UBIFS_MST_LNUM || lnum == UBIFS_MST_LNUM + 1) { + if (chance(19, 20)) + return 0; + dbg_rcvry("failing in master LEB %d", lnum); + } else if (lnum >= UBIFS_LOG_LNUM && lnum <= c->log_last) { + if (write) { + if (chance(99, 100)) + return 0; + } else if (chance(399, 400)) + return 0; + dbg_rcvry("failing in log LEB %d", lnum); + } else if (lnum >= c->lpt_first && lnum <= c->lpt_last) { + if (write) { + if (chance(7, 8)) + return 0; + } else if (chance(19, 20)) + return 0; + dbg_rcvry("failing in LPT LEB %d", lnum); + } else if (lnum >= c->orph_first && lnum <= c->orph_last) { + if (write) { + if (chance(1, 2)) + return 0; + } else if (chance(9, 10)) + return 0; + dbg_rcvry("failing in orphan LEB %d", lnum); + } else if (lnum == c->ihead_lnum) { + if (chance(99, 100)) + return 0; + dbg_rcvry("failing in index head LEB %d", lnum); + } else if (c->jheads && lnum == c->jheads[GCHD].wbuf.lnum) { + if (chance(9, 10)) + return 0; + dbg_rcvry("failing in GC head LEB %d", lnum); + } else if (write && !RB_EMPTY_ROOT(&c->buds) && + !ubifs_search_bud(c, lnum)) { + if (chance(19, 20)) + return 0; + dbg_rcvry("failing in non-bud LEB %d", lnum); + } else if (c->cmt_state == COMMIT_RUNNING_BACKGROUND || + c->cmt_state == COMMIT_RUNNING_REQUIRED) { + if (chance(999, 1000)) + return 0; + dbg_rcvry("failing in bud LEB %d commit running", lnum); + } else { + if (chance(9999, 10000)) + return 0; + dbg_rcvry("failing in bud LEB %d commit not running", lnum); + } + ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum); + c->failure_mode = 1; + dump_stack(); + return 1; +} + +static void cut_data(const void *buf, int len) +{ + int flen, i; + unsigned char *p = (void *)buf; + + flen = (len * (long long)simple_rand()) >> 15; + for (i = flen; i < len; i++) + p[i] = 0xff; +} + +int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, + int len, int check) +{ + if (in_failure_mode(desc)) + return -EIO; + return ubi_leb_read(desc, lnum, buf, offset, len, check); +} + +int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, + int offset, int len, int dtype) +{ + int err, failing; + + if (in_failure_mode(desc)) + return -EIO; + failing = do_fail(desc, lnum, 1); + if (failing) + cut_data(buf, len); + err = ubi_leb_write(desc, lnum, buf, offset, len, dtype); + if (err) + return err; + if (failing) + return -EIO; + return 0; +} + +int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf, + int len, int dtype) +{ + int err; + + if (do_fail(desc, lnum, 1)) + return -EIO; + err = ubi_leb_change(desc, lnum, buf, len, dtype); + if (err) + return err; + if (do_fail(desc, lnum, 1)) + return -EIO; + return 0; +} + +int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum) +{ + int err; + + if (do_fail(desc, lnum, 0)) + return -EIO; + err = ubi_leb_erase(desc, lnum); + if (err) + return err; + if (do_fail(desc, lnum, 0)) + return -EIO; + return 0; +} + +int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum) +{ + int err; + + if (do_fail(desc, lnum, 0)) + return -EIO; + err = ubi_leb_unmap(desc, lnum); + if (err) + return err; + if (do_fail(desc, lnum, 0)) + return -EIO; + return 0; +} + +int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum) +{ + if (in_failure_mode(desc)) + return -EIO; + return ubi_is_mapped(desc, lnum); +} + +int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype) +{ + int err; + + if (do_fail(desc, lnum, 0)) + return -EIO; + err = ubi_leb_map(desc, lnum, dtype); + if (err) + return err; + if (do_fail(desc, lnum, 0)) + return -EIO; + return 0; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h new file mode 100644 index 000000000000..50315fc57185 --- /dev/null +++ b/fs/ubifs/debug.h @@ -0,0 +1,398 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +#ifndef __UBIFS_DEBUG_H__ +#define __UBIFS_DEBUG_H__ + +#ifdef CONFIG_UBIFS_FS_DEBUG + +#define UBIFS_DBG(op) op + +#define ubifs_assert(expr) do { \ + if (unlikely(!(expr))) { \ + printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ + __func__, __LINE__, current->pid); \ + dbg_dump_stack(); \ + } \ +} while (0) + +#define ubifs_assert_cmt_locked(c) do { \ + if (unlikely(down_write_trylock(&(c)->commit_sem))) { \ + up_write(&(c)->commit_sem); \ + printk(KERN_CRIT "commit lock is not locked!\n"); \ + ubifs_assert(0); \ + } \ +} while (0) + +#define dbg_dump_stack() do { \ + if (!dbg_failure_mode) \ + dump_stack(); \ +} while (0) + +/* Generic debugging messages */ +#define dbg_msg(fmt, ...) do { \ + spin_lock(&dbg_lock); \ + printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \ + __func__, ##__VA_ARGS__); \ + spin_unlock(&dbg_lock); \ +} while (0) + +#define dbg_do_msg(typ, fmt, ...) do { \ + if (ubifs_msg_flags & typ) \ + dbg_msg(fmt, ##__VA_ARGS__); \ +} while (0) + +#define dbg_err(fmt, ...) do { \ + spin_lock(&dbg_lock); \ + ubifs_err(fmt, ##__VA_ARGS__); \ + spin_unlock(&dbg_lock); \ +} while (0) + +const char *dbg_key_str0(const struct ubifs_info *c, + const union ubifs_key *key); +const char *dbg_key_str1(const struct ubifs_info *c, + const union ubifs_key *key); + +/* + * DBGKEY macros require @dbg_lock to be held, which it is in the dbg message + * macros. + */ +#define DBGKEY(key) dbg_key_str0(c, (key)) +#define DBGKEY1(key) dbg_key_str1(c, (key)) + +/* General messages */ +#define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__) + +/* Additional journal messages */ +#define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__) + +/* Additional TNC messages */ +#define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__) + +/* Additional lprops messages */ +#define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__) + +/* Additional LEB find messages */ +#define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__) + +/* Additional mount messages */ +#define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__) + +/* Additional I/O messages */ +#define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__) + +/* Additional commit messages */ +#define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__) + +/* Additional budgeting messages */ +#define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__) + +/* Additional log messages */ +#define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__) + +/* Additional gc messages */ +#define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__) + +/* Additional scan messages */ +#define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__) + +/* Additional recovery messages */ +#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) + +/* + * Debugging message type flags (must match msg_type_names in debug.c). + * + * UBIFS_MSG_GEN: general messages + * UBIFS_MSG_JNL: journal messages + * UBIFS_MSG_MNT: mount messages + * UBIFS_MSG_CMT: commit messages + * UBIFS_MSG_FIND: LEB find messages + * UBIFS_MSG_BUDG: budgeting messages + * UBIFS_MSG_GC: garbage collection messages + * UBIFS_MSG_TNC: TNC messages + * UBIFS_MSG_LP: lprops messages + * UBIFS_MSG_IO: I/O messages + * UBIFS_MSG_LOG: log messages + * UBIFS_MSG_SCAN: scan messages + * UBIFS_MSG_RCVRY: recovery messages + */ +enum { + UBIFS_MSG_GEN = 0x1, + UBIFS_MSG_JNL = 0x2, + UBIFS_MSG_MNT = 0x4, + UBIFS_MSG_CMT = 0x8, + UBIFS_MSG_FIND = 0x10, + UBIFS_MSG_BUDG = 0x20, + UBIFS_MSG_GC = 0x40, + UBIFS_MSG_TNC = 0x80, + UBIFS_MSG_LP = 0x100, + UBIFS_MSG_IO = 0x200, + UBIFS_MSG_LOG = 0x400, + UBIFS_MSG_SCAN = 0x800, + UBIFS_MSG_RCVRY = 0x1000, +}; + +/* Debugging message type flags for each default debug message level */ +#define UBIFS_MSG_LVL_0 0 +#define UBIFS_MSG_LVL_1 0x1 +#define UBIFS_MSG_LVL_2 0x7f +#define UBIFS_MSG_LVL_3 0xffff + +/* + * Debugging check flags (must match chk_names in debug.c). + * + * UBIFS_CHK_GEN: general checks + * UBIFS_CHK_TNC: check TNC + * UBIFS_CHK_IDX_SZ: check index size + * UBIFS_CHK_ORPH: check orphans + * UBIFS_CHK_OLD_IDX: check the old index + * UBIFS_CHK_LPROPS: check lprops + * UBIFS_CHK_FS: check the file-system + */ +enum { + UBIFS_CHK_GEN = 0x1, + UBIFS_CHK_TNC = 0x2, + UBIFS_CHK_IDX_SZ = 0x4, + UBIFS_CHK_ORPH = 0x8, + UBIFS_CHK_OLD_IDX = 0x10, + UBIFS_CHK_LPROPS = 0x20, + UBIFS_CHK_FS = 0x40, +}; + +/* + * Special testing flags (must match tst_names in debug.c). + * + * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method + * UBIFS_TST_RCVRY: failure mode for recovery testing + */ +enum { + UBIFS_TST_FORCE_IN_THE_GAPS = 0x2, + UBIFS_TST_RCVRY = 0x4, +}; + +#if CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 1 +#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_1 +#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 2 +#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_2 +#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 3 +#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_3 +#else +#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_0 +#endif + +#ifdef CONFIG_UBIFS_FS_DEBUG_CHKS +#define UBIFS_CHK_FLAGS_DEFAULT 0xffffffff +#else +#define UBIFS_CHK_FLAGS_DEFAULT 0 +#endif + +extern spinlock_t dbg_lock; + +extern unsigned int ubifs_msg_flags; +extern unsigned int ubifs_chk_flags; +extern unsigned int ubifs_tst_flags; + +/* Dump functions */ + +const char *dbg_ntype(int type); +const char *dbg_cstate(int cmt_state); +const char *dbg_get_key_dump(const struct ubifs_info *c, + const union ubifs_key *key); +void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode); +void dbg_dump_node(const struct ubifs_info *c, const void *node); +void dbg_dump_budget_req(const struct ubifs_budget_req *req); +void dbg_dump_lstats(const struct ubifs_lp_stats *lst); +void dbg_dump_budg(struct ubifs_info *c); +void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); +void dbg_dump_lprops(struct ubifs_info *c); +void dbg_dump_leb(const struct ubifs_info *c, int lnum); +void dbg_dump_znode(const struct ubifs_info *c, + const struct ubifs_znode *znode); +void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat); +void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, int iip); +void dbg_dump_tnc(struct ubifs_info *c); +void dbg_dump_index(struct ubifs_info *c); + +/* Checking helper functions */ + +typedef int (*dbg_leaf_callback)(struct ubifs_info *c, + struct ubifs_zbranch *zbr, void *priv); +typedef int (*dbg_znode_callback)(struct ubifs_info *c, + struct ubifs_znode *znode, void *priv); +int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, + dbg_znode_callback znode_cb, void *priv); + +/* Checking functions */ + +int dbg_check_lprops(struct ubifs_info *c); +int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot); +int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot); +int dbg_check_cats(struct ubifs_info *c); +int dbg_check_ltab(struct ubifs_info *c); +int dbg_check_synced_i_size(struct inode *inode); +int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir); +int dbg_check_tnc(struct ubifs_info *c, int extra); +int dbg_check_idx_size(struct ubifs_info *c, long long idx_size); +int dbg_check_filesystem(struct ubifs_info *c); +void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, + int add_pos); +int dbg_check_lprops(struct ubifs_info *c); +int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, + int row, int col); + +/* Force the use of in-the-gaps method for testing */ + +#define dbg_force_in_the_gaps_enabled \ + (ubifs_tst_flags & UBIFS_TST_FORCE_IN_THE_GAPS) + +int dbg_force_in_the_gaps(void); + +/* Failure mode for recovery testing */ + +#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY) + +void dbg_failure_mode_registration(struct ubifs_info *c); +void dbg_failure_mode_deregistration(struct ubifs_info *c); + +#ifndef UBIFS_DBG_PRESERVE_UBI + +#define ubi_leb_read dbg_leb_read +#define ubi_leb_write dbg_leb_write +#define ubi_leb_change dbg_leb_change +#define ubi_leb_erase dbg_leb_erase +#define ubi_leb_unmap dbg_leb_unmap +#define ubi_is_mapped dbg_is_mapped +#define ubi_leb_map dbg_leb_map + +#endif + +int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, + int len, int check); +int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, + int offset, int len, int dtype); +int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf, + int len, int dtype); +int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum); +int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum); +int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum); +int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype); + +static inline int dbg_read(struct ubi_volume_desc *desc, int lnum, char *buf, + int offset, int len) +{ + return dbg_leb_read(desc, lnum, buf, offset, len, 0); +} + +static inline int dbg_write(struct ubi_volume_desc *desc, int lnum, + const void *buf, int offset, int len) +{ + return dbg_leb_write(desc, lnum, buf, offset, len, UBI_UNKNOWN); +} + +static inline int dbg_change(struct ubi_volume_desc *desc, int lnum, + const void *buf, int len) +{ + return dbg_leb_change(desc, lnum, buf, len, UBI_UNKNOWN); +} + +#else /* !CONFIG_UBIFS_FS_DEBUG */ + +#define UBIFS_DBG(op) + +/* Use "if (0)" to make compiler check arguments even if debugging is off */ +#define ubifs_assert(expr) do { \ + if (0 && (expr)) \ + printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ + __func__, __LINE__, current->pid); \ +} while (0) + +#define dbg_err(fmt, ...) do { \ + if (0) \ + ubifs_err(fmt, ##__VA_ARGS__); \ +} while (0) + +#define dbg_msg(fmt, ...) do { \ + if (0) \ + printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", \ + current->pid, __func__, ##__VA_ARGS__); \ +} while (0) + +#define dbg_dump_stack() +#define ubifs_assert_cmt_locked(c) + +#define dbg_gen(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_jnl(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_tnc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_lp(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_find(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_mnt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_io(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_cmt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_budg(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_log(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_gc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_scan(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_rcvry(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) + +#define DBGKEY(key) ((char *)(key)) +#define DBGKEY1(key) ((char *)(key)) + +#define dbg_ntype(type) "" +#define dbg_cstate(cmt_state) "" +#define dbg_get_key_dump(c, key) ({}) +#define dbg_dump_inode(c, inode) ({}) +#define dbg_dump_node(c, node) ({}) +#define dbg_dump_budget_req(req) ({}) +#define dbg_dump_lstats(lst) ({}) +#define dbg_dump_budg(c) ({}) +#define dbg_dump_lprop(c, lp) ({}) +#define dbg_dump_lprops(c) ({}) +#define dbg_dump_leb(c, lnum) ({}) +#define dbg_dump_znode(c, znode) ({}) +#define dbg_dump_heap(c, heap, cat) ({}) +#define dbg_dump_pnode(c, pnode, parent, iip) ({}) +#define dbg_dump_tnc(c) ({}) +#define dbg_dump_index(c) ({}) + +#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0 +#define dbg_old_index_check_init(c, zroot) 0 +#define dbg_check_old_index(c, zroot) 0 +#define dbg_check_cats(c) 0 +#define dbg_check_ltab(c) 0 +#define dbg_check_synced_i_size(inode) 0 +#define dbg_check_dir_size(c, dir) 0 +#define dbg_check_tnc(c, x) 0 +#define dbg_check_idx_size(c, idx_size) 0 +#define dbg_check_filesystem(c) 0 +#define dbg_check_heap(c, heap, cat, add_pos) ({}) +#define dbg_check_lprops(c) 0 +#define dbg_check_lpt_nodes(c, cnode, row, col) 0 +#define dbg_force_in_the_gaps_enabled 0 +#define dbg_force_in_the_gaps() 0 +#define dbg_failure_mode 0 +#define dbg_failure_mode_registration(c) ({}) +#define dbg_failure_mode_deregistration(c) ({}) + +#endif /* !CONFIG_UBIFS_FS_DEBUG */ + +#endif /* !__UBIFS_DEBUG_H__ */ diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c new file mode 100644 index 000000000000..526c01ec8003 --- /dev/null +++ b/fs/ubifs/dir.c @@ -0,0 +1,1231 @@ +/* * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * Copyright (C) 2006, 2007 University of Szeged, Hungary + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + * Zoltan Sogor + */ + +/* + * This file implements directory operations. + * + * All FS operations in this file allocate budget before writing anything to the + * media. If they fail to allocate it, the error is returned. The only + * exceptions are 'ubifs_unlink()' and 'ubifs_rmdir()' which keep working even + * if they unable to allocate the budget, because deletion %-ENOSPC failure is + * not what users are usually ready to get. UBIFS budgeting subsystem has some + * space reserved for these purposes. + * + * All operations in this file write all inodes which they change straight + * away, instead of marking them dirty. For example, 'ubifs_link()' changes + * @i_size of the parent inode and writes the parent inode together with the + * target inode. This was done to simplify file-system recovery which would + * otherwise be very difficult to do. The only exception is rename which marks + * the re-named inode dirty (because its @i_ctime is updated) but does not + * write it, but just marks it as dirty. + */ + +#include "ubifs.h" + +/** + * inherit_flags - inherit flags of the parent inode. + * @dir: parent inode + * @mode: new inode mode flags + * + * This is a helper function for 'ubifs_new_inode()' which inherits flag of the + * parent directory inode @dir. UBIFS inodes inherit the following flags: + * o %UBIFS_COMPR_FL, which is useful to switch compression on/of on + * sub-directory basis; + * o %UBIFS_SYNC_FL - useful for the same reasons; + * o %UBIFS_DIRSYNC_FL - similar, but relevant only to directories. + * + * This function returns the inherited flags. + */ +static int inherit_flags(const struct inode *dir, int mode) +{ + int flags; + const struct ubifs_inode *ui = ubifs_inode(dir); + + if (!S_ISDIR(dir->i_mode)) + /* + * The parent is not a directory, which means that an extended + * attribute inode is being created. No flags. + */ + return 0; + + flags = ui->flags & (UBIFS_COMPR_FL | UBIFS_SYNC_FL | UBIFS_DIRSYNC_FL); + if (!S_ISDIR(mode)) + /* The "DIRSYNC" flag only applies to directories */ + flags &= ~UBIFS_DIRSYNC_FL; + return flags; +} + +/** + * ubifs_new_inode - allocate new UBIFS inode object. + * @c: UBIFS file-system description object + * @dir: parent directory inode + * @mode: inode mode flags + * + * This function finds an unused inode number, allocates new inode and + * initializes it. Returns new inode in case of success and an error code in + * case of failure. + */ +struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, + int mode) +{ + struct inode *inode; + struct ubifs_inode *ui; + + inode = new_inode(c->vfs_sb); + ui = ubifs_inode(inode); + if (!inode) + return ERR_PTR(-ENOMEM); + + /* + * Set 'S_NOCMTIME' to prevent VFS form updating [mc]time of inodes and + * marking them dirty in file write path (see 'file_update_time()'). + * UBIFS has to fully control "clean <-> dirty" transitions of inodes + * to make budgeting work. + */ + inode->i_flags |= (S_NOCMTIME); + + inode->i_uid = current->fsuid; + if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current->fsgid; + inode->i_mode = mode; + inode->i_mtime = inode->i_atime = inode->i_ctime = + ubifs_current_time(inode); + inode->i_mapping->nrpages = 0; + /* Disable readahead */ + inode->i_mapping->backing_dev_info = &c->bdi; + + switch (mode & S_IFMT) { + case S_IFREG: + inode->i_mapping->a_ops = &ubifs_file_address_operations; + inode->i_op = &ubifs_file_inode_operations; + inode->i_fop = &ubifs_file_operations; + break; + case S_IFDIR: + inode->i_op = &ubifs_dir_inode_operations; + inode->i_fop = &ubifs_dir_operations; + inode->i_size = ui->ui_size = UBIFS_INO_NODE_SZ; + break; + case S_IFLNK: + inode->i_op = &ubifs_symlink_inode_operations; + break; + case S_IFSOCK: + case S_IFIFO: + case S_IFBLK: + case S_IFCHR: + inode->i_op = &ubifs_file_inode_operations; + break; + default: + BUG(); + } + + ui->flags = inherit_flags(dir, mode); + ubifs_set_inode_flags(inode); + if (S_ISREG(mode)) + ui->compr_type = c->default_compr; + else + ui->compr_type = UBIFS_COMPR_NONE; + ui->synced_i_size = 0; + + spin_lock(&c->cnt_lock); + /* Inode number overflow is currently not supported */ + if (c->highest_inum >= INUM_WARN_WATERMARK) { + if (c->highest_inum >= INUM_WATERMARK) { + spin_unlock(&c->cnt_lock); + ubifs_err("out of inode numbers"); + make_bad_inode(inode); + iput(inode); + return ERR_PTR(-EINVAL); + } + ubifs_warn("running out of inode numbers (current %lu, max %d)", + c->highest_inum, INUM_WATERMARK); + } + + inode->i_ino = ++c->highest_inum; + /* + * The creation sequence number remains with this inode for its + * lifetime. All nodes for this inode have a greater sequence number, + * and so it is possible to distinguish obsolete nodes belonging to a + * previous incarnation of the same inode number - for example, for the + * purpose of rebuilding the index. + */ + ui->creat_sqnum = ++c->max_sqnum; + spin_unlock(&c->cnt_lock); + return inode; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +static int dbg_check_name(struct ubifs_dent_node *dent, struct qstr *nm) +{ + if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + return 0; + if (le16_to_cpu(dent->nlen) != nm->len) + return -EINVAL; + if (memcmp(dent->name, nm->name, nm->len)) + return -EINVAL; + return 0; +} + +#else + +#define dbg_check_name(dent, nm) 0 + +#endif + +static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + int err; + union ubifs_key key; + struct inode *inode = NULL; + struct ubifs_dent_node *dent; + struct ubifs_info *c = dir->i_sb->s_fs_info; + + dbg_gen("'%.*s' in dir ino %lu", + dentry->d_name.len, dentry->d_name.name, dir->i_ino); + + if (dentry->d_name.len > UBIFS_MAX_NLEN) + return ERR_PTR(-ENAMETOOLONG); + + dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS); + if (!dent) + return ERR_PTR(-ENOMEM); + + dent_key_init(c, &key, dir->i_ino, &dentry->d_name); + + err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name); + if (err) { + if (err == -ENOENT) { + dbg_gen("not found"); + goto done; + } + goto out; + } + + if (dbg_check_name(dent, &dentry->d_name)) { + err = -EINVAL; + goto out; + } + + inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum)); + if (IS_ERR(inode)) { + /* + * This should not happen. Probably the file-system needs + * checking. + */ + err = PTR_ERR(inode); + ubifs_err("dead directory entry '%.*s', error %d", + dentry->d_name.len, dentry->d_name.name, err); + ubifs_ro_mode(c, err); + goto out; + } + +done: + kfree(dent); + /* + * Note, d_splice_alias() would be required instead if we supported + * NFS. + */ + d_add(dentry, inode); + return NULL; + +out: + kfree(dent); + return ERR_PTR(err); +} + +static int ubifs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode; + struct ubifs_info *c = dir->i_sb->s_fs_info; + int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .dirtied_ino = 1 }; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + + /* + * Budget request settings: new inode, new direntry, changing the + * parent directory inode. + */ + + dbg_gen("dent '%.*s', mode %#x in dir ino %lu", + dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino); + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + inode = ubifs_new_inode(c, dir, mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_budg; + } + + mutex_lock(&dir_ui->ui_mutex); + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + if (err) + goto out_cancel; + mutex_unlock(&dir_ui->ui_mutex); + + ubifs_release_budget(c, &req); + insert_inode_hash(inode); + d_instantiate(dentry, inode); + return 0; + +out_cancel: + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + mutex_unlock(&dir_ui->ui_mutex); + make_bad_inode(inode); + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + ubifs_err("cannot create regular file, error %d", err); + return err; +} + +/** + * vfs_dent_type - get VFS directory entry type. + * @type: UBIFS directory entry type + * + * This function converts UBIFS directory entry type into VFS directory entry + * type. + */ +static unsigned int vfs_dent_type(uint8_t type) +{ + switch (type) { + case UBIFS_ITYPE_REG: + return DT_REG; + case UBIFS_ITYPE_DIR: + return DT_DIR; + case UBIFS_ITYPE_LNK: + return DT_LNK; + case UBIFS_ITYPE_BLK: + return DT_BLK; + case UBIFS_ITYPE_CHR: + return DT_CHR; + case UBIFS_ITYPE_FIFO: + return DT_FIFO; + case UBIFS_ITYPE_SOCK: + return DT_SOCK; + default: + BUG(); + } + return 0; +} + +/* + * The classical Unix view for directory is that it is a linear array of + * (name, inode number) entries. Linux/VFS assumes this model as well. + * Particularly, 'readdir()' call wants us to return a directory entry offset + * which later may be used to continue 'readdir()'ing the directory or to + * 'seek()' to that specific direntry. Obviously UBIFS does not really fit this + * model because directory entries are identified by keys, which may collide. + * + * UBIFS uses directory entry hash value for directory offsets, so + * 'seekdir()'/'telldir()' may not always work because of possible key + * collisions. But UBIFS guarantees that consecutive 'readdir()' calls work + * properly by means of saving full directory entry name in the private field + * of the file description object. + * + * This means that UBIFS cannot support NFS which requires full + * 'seekdir()'/'telldir()' support. + */ +static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + int err, over = 0; + struct qstr nm; + union ubifs_key key; + struct ubifs_dent_node *dent; + struct inode *dir = file->f_path.dentry->d_inode; + struct ubifs_info *c = dir->i_sb->s_fs_info; + + dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos); + + if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2) + /* + * The directory was seek'ed to a senseless position or there + * are no more entries. + */ + return 0; + + /* File positions 0 and 1 correspond to "." and ".." */ + if (file->f_pos == 0) { + ubifs_assert(!file->private_data); + over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR); + if (over) + return 0; + file->f_pos = 1; + } + + if (file->f_pos == 1) { + ubifs_assert(!file->private_data); + over = filldir(dirent, "..", 2, 1, + parent_ino(file->f_path.dentry), DT_DIR); + if (over) + return 0; + + /* Find the first entry in TNC and save it */ + lowest_dent_key(c, &key, dir->i_ino); + nm.name = NULL; + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + err = PTR_ERR(dent); + goto out; + } + + file->f_pos = key_hash_flash(c, &dent->key); + file->private_data = dent; + } + + dent = file->private_data; + if (!dent) { + /* + * The directory was seek'ed to and is now readdir'ed. + * Find the entry corresponding to @file->f_pos or the + * closest one. + */ + dent_key_init_hash(c, &key, dir->i_ino, file->f_pos); + nm.name = NULL; + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + err = PTR_ERR(dent); + goto out; + } + file->f_pos = key_hash_flash(c, &dent->key); + file->private_data = dent; + } + + while (1) { + dbg_gen("feed '%s', ino %llu, new f_pos %#x", + dent->name, (unsigned long long)le64_to_cpu(dent->inum), + key_hash_flash(c, &dent->key)); + ubifs_assert(dent->ch.sqnum > ubifs_inode(dir)->creat_sqnum); + + nm.len = le16_to_cpu(dent->nlen); + over = filldir(dirent, dent->name, nm.len, file->f_pos, + le64_to_cpu(dent->inum), + vfs_dent_type(dent->type)); + if (over) + return 0; + + /* Switch to the next entry */ + key_read(c, &dent->key, &key); + nm.name = dent->name; + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + err = PTR_ERR(dent); + goto out; + } + + kfree(file->private_data); + file->f_pos = key_hash_flash(c, &dent->key); + file->private_data = dent; + cond_resched(); + } + +out: + if (err != -ENOENT) { + ubifs_err("cannot find next direntry, error %d", err); + return err; + } + + kfree(file->private_data); + file->private_data = NULL; + file->f_pos = 2; + return 0; +} + +/* If a directory is seeked, we have to free saved readdir() state */ +static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int origin) +{ + kfree(file->private_data); + file->private_data = NULL; + return generic_file_llseek(file, offset, origin); +} + +/* Free saved readdir() state when the directory is closed */ +static int ubifs_dir_release(struct inode *dir, struct file *file) +{ + kfree(file->private_data); + file->private_data = NULL; + return 0; +} + +/** + * lock_2_inodes - lock two UBIFS inodes. + * @inode1: first inode + * @inode2: second inode + */ +static void lock_2_inodes(struct inode *inode1, struct inode *inode2) +{ + if (inode1->i_ino < inode2->i_ino) { + mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_2); + mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_3); + } else { + mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); + mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_3); + } +} + +/** + * unlock_2_inodes - unlock two UBIFS inodes inodes. + * @inode1: first inode + * @inode2: second inode + */ +static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) +{ + mutex_unlock(&ubifs_inode(inode1)->ui_mutex); + mutex_unlock(&ubifs_inode(inode2)->ui_mutex); +} + +static int ubifs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct inode *inode = old_dentry->d_inode; + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_inode *dir_ui = ubifs_inode(dir); + int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); + struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2, + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; + + /* + * Budget request settings: new direntry, changing the target inode, + * changing the parent inode. + */ + + dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu", + dentry->d_name.len, dentry->d_name.name, inode->i_ino, + inode->i_nlink, dir->i_ino); + err = dbg_check_synced_i_size(inode); + if (err) + return err; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + lock_2_inodes(dir, inode); + inc_nlink(inode); + atomic_inc(&inode->i_count); + inode->i_ctime = ubifs_current_time(inode); + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + if (err) + goto out_cancel; + unlock_2_inodes(dir, inode); + + ubifs_release_budget(c, &req); + d_instantiate(dentry, inode); + return 0; + +out_cancel: + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + drop_nlink(inode); + unlock_2_inodes(dir, inode); + ubifs_release_budget(c, &req); + iput(inode); + return err; +} + +static int ubifs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct inode *inode = dentry->d_inode; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int err, budgeted = 1; + struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; + + /* + * Budget request settings: deletion direntry, deletion inode (+1 for + * @dirtied_ino), changing the parent directory inode. If budgeting + * fails, go ahead anyway because we have extra space reserved for + * deletions. + */ + + dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu", + dentry->d_name.len, dentry->d_name.name, inode->i_ino, + inode->i_nlink, dir->i_ino); + err = dbg_check_synced_i_size(inode); + if (err) + return err; + + err = ubifs_budget_space(c, &req); + if (err) { + if (err != -ENOSPC) + return err; + budgeted = 0; + } + + lock_2_inodes(dir, inode); + inode->i_ctime = ubifs_current_time(dir); + drop_nlink(inode); + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0); + if (err) + goto out_cancel; + unlock_2_inodes(dir, inode); + + if (budgeted) + ubifs_release_budget(c, &req); + else { + /* We've deleted something - clean the "no space" flags */ + c->nospace = c->nospace_rp = 0; + smp_wmb(); + } + return 0; + +out_cancel: + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + inc_nlink(inode); + unlock_2_inodes(dir, inode); + if (budgeted) + ubifs_release_budget(c, &req); + return err; +} + +/** + * check_dir_empty - check if a directory is empty or not. + * @c: UBIFS file-system description object + * @dir: VFS inode object of the directory to check + * + * This function checks if directory @dir is empty. Returns zero if the + * directory is empty, %-ENOTEMPTY if it is not, and other negative error codes + * in case of of errors. + */ +static int check_dir_empty(struct ubifs_info *c, struct inode *dir) +{ + struct qstr nm = { .name = NULL }; + struct ubifs_dent_node *dent; + union ubifs_key key; + int err; + + lowest_dent_key(c, &key, dir->i_ino); + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + err = PTR_ERR(dent); + if (err == -ENOENT) + err = 0; + } else { + kfree(dent); + err = -ENOTEMPTY; + } + return err; +} + +static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct inode *inode = dentry->d_inode; + int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int err, budgeted = 1; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; + + /* + * Budget request settings: deletion direntry, deletion inode and + * changing the parent inode. If budgeting fails, go ahead anyway + * because we have extra space reserved for deletions. + */ + + dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len, + dentry->d_name.name, inode->i_ino, dir->i_ino); + + err = check_dir_empty(c, dentry->d_inode); + if (err) + return err; + + err = ubifs_budget_space(c, &req); + if (err) { + if (err != -ENOSPC) + return err; + budgeted = 0; + } + + lock_2_inodes(dir, inode); + inode->i_ctime = ubifs_current_time(dir); + clear_nlink(inode); + drop_nlink(dir); + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0); + if (err) + goto out_cancel; + unlock_2_inodes(dir, inode); + + if (budgeted) + ubifs_release_budget(c, &req); + else { + /* We've deleted something - clean the "no space" flags */ + c->nospace = c->nospace_rp = 0; + smp_wmb(); + } + return 0; + +out_cancel: + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + inc_nlink(dir); + inc_nlink(inode); + inc_nlink(inode); + unlock_2_inodes(dir, inode); + if (budgeted) + ubifs_release_budget(c, &req); + return err; +} + +static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_info *c = dir->i_sb->s_fs_info; + int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 }; + + /* + * Budget request settings: new inode, new direntry and changing parent + * directory inode. + */ + + dbg_gen("dent '%.*s', mode %#x in dir ino %lu", + dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino); + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + inode = ubifs_new_inode(c, dir, S_IFDIR | mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_budg; + } + + mutex_lock(&dir_ui->ui_mutex); + insert_inode_hash(inode); + inc_nlink(inode); + inc_nlink(dir); + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + if (err) { + ubifs_err("cannot create directory, error %d", err); + goto out_cancel; + } + mutex_unlock(&dir_ui->ui_mutex); + + ubifs_release_budget(c, &req); + d_instantiate(dentry, inode); + return 0; + +out_cancel: + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + drop_nlink(dir); + mutex_unlock(&dir_ui->ui_mutex); + make_bad_inode(inode); + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + return err; +} + +static int ubifs_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + struct inode *inode; + struct ubifs_inode *ui; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_info *c = dir->i_sb->s_fs_info; + union ubifs_dev_desc *dev = NULL; + int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int err, devlen = 0; + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .new_ino_d = ALIGN(devlen, 8), + .dirtied_ino = 1 }; + + /* + * Budget request settings: new inode, new direntry and changing parent + * directory inode. + */ + + dbg_gen("dent '%.*s' in dir ino %lu", + dentry->d_name.len, dentry->d_name.name, dir->i_ino); + + if (!new_valid_dev(rdev)) + return -EINVAL; + + if (S_ISBLK(mode) || S_ISCHR(mode)) { + dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); + if (!dev) + return -ENOMEM; + devlen = ubifs_encode_dev(dev, rdev); + } + + err = ubifs_budget_space(c, &req); + if (err) { + kfree(dev); + return err; + } + + inode = ubifs_new_inode(c, dir, mode); + if (IS_ERR(inode)) { + kfree(dev); + err = PTR_ERR(inode); + goto out_budg; + } + + init_special_inode(inode, inode->i_mode, rdev); + inode->i_size = ubifs_inode(inode)->ui_size = devlen; + ui = ubifs_inode(inode); + ui->data = dev; + ui->data_len = devlen; + + mutex_lock(&dir_ui->ui_mutex); + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + if (err) + goto out_cancel; + mutex_unlock(&dir_ui->ui_mutex); + + ubifs_release_budget(c, &req); + insert_inode_hash(inode); + d_instantiate(dentry, inode); + return 0; + +out_cancel: + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + mutex_unlock(&dir_ui->ui_mutex); + make_bad_inode(inode); + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + return err; +} + +static int ubifs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct inode *inode; + struct ubifs_inode *ui; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_info *c = dir->i_sb->s_fs_info; + int err, len = strlen(symname); + int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .new_ino_d = ALIGN(len, 8), + .dirtied_ino = 1 }; + + /* + * Budget request settings: new inode, new direntry and changing parent + * directory inode. + */ + + dbg_gen("dent '%.*s', target '%s' in dir ino %lu", dentry->d_name.len, + dentry->d_name.name, symname, dir->i_ino); + + if (len > UBIFS_MAX_INO_DATA) + return -ENAMETOOLONG; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_budg; + } + + ui = ubifs_inode(inode); + ui->data = kmalloc(len + 1, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_inode; + } + + memcpy(ui->data, symname, len); + ((char *)ui->data)[len] = '\0'; + /* + * The terminating zero byte is not written to the flash media and it + * is put just to make later in-memory string processing simpler. Thus, + * data length is @len, not @len + %1. + */ + ui->data_len = len; + inode->i_size = ubifs_inode(inode)->ui_size = len; + + mutex_lock(&dir_ui->ui_mutex); + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + if (err) + goto out_cancel; + mutex_unlock(&dir_ui->ui_mutex); + + ubifs_release_budget(c, &req); + insert_inode_hash(inode); + d_instantiate(dentry, inode); + return 0; + +out_cancel: + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + mutex_unlock(&dir_ui->ui_mutex); +out_inode: + make_bad_inode(inode); + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + return err; +} + +/** + * lock_3_inodes - lock three UBIFS inodes for rename. + * @inode1: first inode + * @inode2: second inode + * @inode3: third inode + * + * For 'ubifs_rename()', @inode1 may be the same as @inode2 whereas @inode3 may + * be null. + */ +static void lock_3_inodes(struct inode *inode1, struct inode *inode2, + struct inode *inode3) +{ + struct inode *i1, *i2, *i3; + + if (!inode3) { + if (inode1 != inode2) { + lock_2_inodes(inode1, inode2); + return; + } + mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); + return; + } + + if (inode1 == inode2) { + lock_2_inodes(inode1, inode3); + return; + } + + /* 3 different inodes */ + if (inode1 < inode2) { + i3 = inode2; + if (inode1 < inode3) { + i1 = inode1; + i2 = inode3; + } else { + i1 = inode3; + i2 = inode1; + } + } else { + i3 = inode1; + if (inode2 < inode3) { + i1 = inode2; + i2 = inode3; + } else { + i1 = inode3; + i2 = inode2; + } + } + mutex_lock_nested(&ubifs_inode(i1)->ui_mutex, WB_MUTEX_1); + lock_2_inodes(i2, i3); +} + +/** + * unlock_3_inodes - unlock three UBIFS inodes for rename. + * @inode1: first inode + * @inode2: second inode + * @inode3: third inode + */ +static void unlock_3_inodes(struct inode *inode1, struct inode *inode2, + struct inode *inode3) +{ + mutex_unlock(&ubifs_inode(inode1)->ui_mutex); + if (inode1 != inode2) + mutex_unlock(&ubifs_inode(inode2)->ui_mutex); + if (inode3) + mutex_unlock(&ubifs_inode(inode3)->ui_mutex); +} + +static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct ubifs_info *c = old_dir->i_sb->s_fs_info; + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct ubifs_inode *old_inode_ui = ubifs_inode(old_inode); + int err, release, sync = 0, move = (new_dir != old_dir); + int is_dir = S_ISDIR(old_inode->i_mode); + int unlink = !!new_inode; + int new_sz = CALC_DENT_SIZE(new_dentry->d_name.len); + int old_sz = CALC_DENT_SIZE(old_dentry->d_name.len); + struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1, + .dirtied_ino = 3 }; + struct ubifs_budget_req ino_req = { .dirtied_ino = 1, + .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; + struct timespec time; + + /* + * Budget request settings: deletion direntry, new direntry, removing + * the old inode, and changing old and new parent directory inodes. + * + * However, this operation also marks the target inode as dirty and + * does not write it, so we allocate budget for the target inode + * separately. + */ + + dbg_gen("dent '%.*s' ino %lu in dir ino %lu to dent '%.*s' in " + "dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name, + old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len, + new_dentry->d_name.name, new_dir->i_ino); + + if (unlink && is_dir) { + err = check_dir_empty(c, new_inode); + if (err) + return err; + } + + err = ubifs_budget_space(c, &req); + if (err) + return err; + err = ubifs_budget_space(c, &ino_req); + if (err) { + ubifs_release_budget(c, &req); + return err; + } + + lock_3_inodes(old_dir, new_dir, new_inode); + + /* + * Like most other Unix systems, set the @i_ctime for inodes on a + * rename. + */ + time = ubifs_current_time(old_dir); + old_inode->i_ctime = time; + + /* We must adjust parent link count when renaming directories */ + if (is_dir) { + if (move) { + /* + * @old_dir loses a link because we are moving + * @old_inode to a different directory. + */ + drop_nlink(old_dir); + /* + * @new_dir only gains a link if we are not also + * overwriting an existing directory. + */ + if (!unlink) + inc_nlink(new_dir); + } else { + /* + * @old_inode is not moving to a different directory, + * but @old_dir still loses a link if we are + * overwriting an existing directory. + */ + if (unlink) + drop_nlink(old_dir); + } + } + + old_dir->i_size -= old_sz; + ubifs_inode(old_dir)->ui_size = old_dir->i_size; + old_dir->i_mtime = old_dir->i_ctime = time; + new_dir->i_mtime = new_dir->i_ctime = time; + + /* + * And finally, if we unlinked a direntry which happened to have the + * same name as the moved direntry, we have to decrement @i_nlink of + * the unlinked inode and change its ctime. + */ + if (unlink) { + /* + * Directories cannot have hard-links, so if this is a + * directory, decrement its @i_nlink twice because an empty + * directory has @i_nlink 2. + */ + if (is_dir) + drop_nlink(new_inode); + new_inode->i_ctime = time; + drop_nlink(new_inode); + } else { + new_dir->i_size += new_sz; + ubifs_inode(new_dir)->ui_size = new_dir->i_size; + } + + /* + * Do not ask 'ubifs_jnl_rename()' to flush write-buffer if @old_inode + * is dirty, because this will be done later on at the end of + * 'ubifs_rename()'. + */ + if (IS_SYNC(old_inode)) { + sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir); + if (unlink && IS_SYNC(new_inode)) + sync = 1; + } + err = ubifs_jnl_rename(c, old_dir, old_dentry, new_dir, new_dentry, + sync); + if (err) + goto out_cancel; + + unlock_3_inodes(old_dir, new_dir, new_inode); + ubifs_release_budget(c, &req); + + mutex_lock(&old_inode_ui->ui_mutex); + release = old_inode_ui->dirty; + mark_inode_dirty_sync(old_inode); + mutex_unlock(&old_inode_ui->ui_mutex); + + if (release) + ubifs_release_budget(c, &ino_req); + if (IS_SYNC(old_inode)) + err = old_inode->i_sb->s_op->write_inode(old_inode, 1); + return err; + +out_cancel: + if (unlink) { + if (is_dir) + inc_nlink(new_inode); + inc_nlink(new_inode); + } else { + new_dir->i_size -= new_sz; + ubifs_inode(new_dir)->ui_size = new_dir->i_size; + } + old_dir->i_size += old_sz; + ubifs_inode(old_dir)->ui_size = old_dir->i_size; + if (is_dir) { + if (move) { + inc_nlink(old_dir); + if (!unlink) + drop_nlink(new_dir); + } else { + if (unlink) + inc_nlink(old_dir); + } + } + unlock_3_inodes(old_dir, new_dir, new_inode); + ubifs_release_budget(c, &ino_req); + ubifs_release_budget(c, &req); + return err; +} + +int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + loff_t size; + struct inode *inode = dentry->d_inode; + struct ubifs_inode *ui = ubifs_inode(inode); + + mutex_lock(&ui->ui_mutex); + stat->dev = inode->i_sb->s_dev; + stat->ino = inode->i_ino; + stat->mode = inode->i_mode; + stat->nlink = inode->i_nlink; + stat->uid = inode->i_uid; + stat->gid = inode->i_gid; + stat->rdev = inode->i_rdev; + stat->atime = inode->i_atime; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; + stat->blksize = UBIFS_BLOCK_SIZE; + stat->size = ui->ui_size; + + /* + * Unfortunately, the 'stat()' system call was designed for block + * device based file systems, and it is not appropriate for UBIFS, + * because UBIFS does not have notion of "block". For example, it is + * difficult to tell how many block a directory takes - it actually + * takes less than 300 bytes, but we have to round it to block size, + * which introduces large mistake. This makes utilities like 'du' to + * report completely senseless numbers. This is the reason why UBIFS + * goes the same way as JFFS2 - it reports zero blocks for everything + * but regular files, which makes more sense than reporting completely + * wrong sizes. + */ + if (S_ISREG(inode->i_mode)) { + size = ui->xattr_size; + size += stat->size; + size = ALIGN(size, UBIFS_BLOCK_SIZE); + /* + * Note, user-space expects 512-byte blocks count irrespectively + * of what was reported in @stat->size. + */ + stat->blocks = size >> 9; + } else + stat->blocks = 0; + mutex_unlock(&ui->ui_mutex); + return 0; +} + +struct inode_operations ubifs_dir_inode_operations = { + .lookup = ubifs_lookup, + .create = ubifs_create, + .link = ubifs_link, + .symlink = ubifs_symlink, + .unlink = ubifs_unlink, + .mkdir = ubifs_mkdir, + .rmdir = ubifs_rmdir, + .mknod = ubifs_mknod, + .rename = ubifs_rename, + .setattr = ubifs_setattr, + .getattr = ubifs_getattr, +#ifdef CONFIG_UBIFS_FS_XATTR + .setxattr = ubifs_setxattr, + .getxattr = ubifs_getxattr, + .listxattr = ubifs_listxattr, + .removexattr = ubifs_removexattr, +#endif +}; + +struct file_operations ubifs_dir_operations = { + .llseek = ubifs_dir_llseek, + .release = ubifs_dir_release, + .read = generic_read_dir, + .readdir = ubifs_readdir, + .fsync = ubifs_fsync, + .unlocked_ioctl = ubifs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ubifs_compat_ioctl, +#endif +}; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c new file mode 100644 index 000000000000..3d698e2022b1 --- /dev/null +++ b/fs/ubifs/file.c @@ -0,0 +1,1290 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements VFS file and inode operations of regular files, device + * nodes and symlinks as well as address space operations. + * + * UBIFS uses 2 page flags: PG_private and PG_checked. PG_private is set if the + * page is dirty and is used for budgeting purposes - dirty pages should not be + * budgeted. The PG_checked flag is set if full budgeting is required for the + * page e.g., when it corresponds to a file hole or it is just beyond the file + * size. The budgeting is done in 'ubifs_write_begin()', because it is OK to + * fail in this function, and the budget is released in 'ubifs_write_end()'. So + * the PG_private and PG_checked flags carry the information about how the page + * was budgeted, to make it possible to release the budget properly. + * + * A thing to keep in mind: inode's 'i_mutex' is locked in most VFS operations + * we implement. However, this is not true for '->writepage()', which might be + * called with 'i_mutex' unlocked. For example, when pdflush is performing + * write-back, it calls 'writepage()' with unlocked 'i_mutex', although the + * inode has 'I_LOCK' flag in this case. At "normal" work-paths 'i_mutex' is + * locked in '->writepage', e.g. in "sys_write -> alloc_pages -> direct reclaim + * path'. So, in '->writepage()' we are only guaranteed that the page is + * locked. + * + * Similarly, 'i_mutex' does not have to be locked in readpage(), e.g., + * readahead path does not have it locked ("sys_read -> generic_file_aio_read + * -> ondemand_readahead -> readpage"). In case of readahead, 'I_LOCK' flag is + * not set as well. However, UBIFS disables readahead. + * + * This, for example means that there might be 2 concurrent '->writepage()' + * calls for the same inode, but different inode dirty pages. + */ + +#include "ubifs.h" +#include <linux/mount.h> +#include <linux/namei.h> + +static int read_block(struct inode *inode, void *addr, unsigned int block, + struct ubifs_data_node *dn) +{ + struct ubifs_info *c = inode->i_sb->s_fs_info; + int err, len, out_len; + union ubifs_key key; + unsigned int dlen; + + data_key_init(c, &key, inode->i_ino, block); + err = ubifs_tnc_lookup(c, &key, dn); + if (err) { + if (err == -ENOENT) + /* Not found, so it must be a hole */ + memset(addr, 0, UBIFS_BLOCK_SIZE); + return err; + } + + ubifs_assert(dn->ch.sqnum > ubifs_inode(inode)->creat_sqnum); + + len = le32_to_cpu(dn->size); + if (len <= 0 || len > UBIFS_BLOCK_SIZE) + goto dump; + + dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + out_len = UBIFS_BLOCK_SIZE; + err = ubifs_decompress(&dn->data, dlen, addr, &out_len, + le16_to_cpu(dn->compr_type)); + if (err || len != out_len) + goto dump; + + /* + * Data length can be less than a full block, even for blocks that are + * not the last in the file (e.g., as a result of making a hole and + * appending data). Ensure that the remainder is zeroed out. + */ + if (len < UBIFS_BLOCK_SIZE) + memset(addr + len, 0, UBIFS_BLOCK_SIZE - len); + + return 0; + +dump: + ubifs_err("bad data node (block %u, inode %lu)", + block, inode->i_ino); + dbg_dump_node(c, dn); + return -EINVAL; +} + +static int do_readpage(struct page *page) +{ + void *addr; + int err = 0, i; + unsigned int block, beyond; + struct ubifs_data_node *dn; + struct inode *inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + + dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", + inode->i_ino, page->index, i_size, page->flags); + ubifs_assert(!PageChecked(page)); + ubifs_assert(!PagePrivate(page)); + + addr = kmap(page); + + block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + beyond = (i_size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT; + if (block >= beyond) { + /* Reading beyond inode */ + SetPageChecked(page); + memset(addr, 0, PAGE_CACHE_SIZE); + goto out; + } + + dn = kmalloc(UBIFS_MAX_DATA_NODE_SZ, GFP_NOFS); + if (!dn) { + err = -ENOMEM; + goto error; + } + + i = 0; + while (1) { + int ret; + + if (block >= beyond) { + /* Reading beyond inode */ + err = -ENOENT; + memset(addr, 0, UBIFS_BLOCK_SIZE); + } else { + ret = read_block(inode, addr, block, dn); + if (ret) { + err = ret; + if (err != -ENOENT) + break; + } + } + if (++i >= UBIFS_BLOCKS_PER_PAGE) + break; + block += 1; + addr += UBIFS_BLOCK_SIZE; + } + if (err) { + if (err == -ENOENT) { + /* Not found, so it must be a hole */ + SetPageChecked(page); + dbg_gen("hole"); + goto out_free; + } + ubifs_err("cannot read page %lu of inode %lu, error %d", + page->index, inode->i_ino, err); + goto error; + } + +out_free: + kfree(dn); +out: + SetPageUptodate(page); + ClearPageError(page); + flush_dcache_page(page); + kunmap(page); + return 0; + +error: + kfree(dn); + ClearPageUptodate(page); + SetPageError(page); + flush_dcache_page(page); + kunmap(page); + return err; +} + +/** + * release_new_page_budget - release budget of a new page. + * @c: UBIFS file-system description object + * + * This is a helper function which releases budget corresponding to the budget + * of one new page of data. + */ +static void release_new_page_budget(struct ubifs_info *c) +{ + struct ubifs_budget_req req = { .recalculate = 1, .new_page = 1 }; + + ubifs_release_budget(c, &req); +} + +/** + * release_existing_page_budget - release budget of an existing page. + * @c: UBIFS file-system description object + * + * This is a helper function which releases budget corresponding to the budget + * of changing one one page of data which already exists on the flash media. + */ +static void release_existing_page_budget(struct ubifs_info *c) +{ + struct ubifs_budget_req req = { .dd_growth = c->page_budget}; + + ubifs_release_budget(c, &req); +} + +static int write_begin_slow(struct address_space *mapping, + loff_t pos, unsigned len, struct page **pagep) +{ + struct inode *inode = mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct ubifs_budget_req req = { .new_page = 1 }; + int uninitialized_var(err), appending = !!(pos + len > inode->i_size); + struct page *page; + + dbg_gen("ino %lu, pos %llu, len %u, i_size %lld", + inode->i_ino, pos, len, inode->i_size); + + /* + * At the slow path we have to budget before locking the page, because + * budgeting may force write-back, which would wait on locked pages and + * deadlock if we had the page locked. At this point we do not know + * anything about the page, so assume that this is a new page which is + * written to a hole. This corresponds to largest budget. Later the + * budget will be amended if this is not true. + */ + if (appending) + /* We are appending data, budget for inode change */ + req.dirtied_ino = 1; + + err = ubifs_budget_space(c, &req); + if (unlikely(err)) + return err; + + page = __grab_cache_page(mapping, index); + if (unlikely(!page)) { + ubifs_release_budget(c, &req); + return -ENOMEM; + } + + if (!PageUptodate(page)) { + if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) + SetPageChecked(page); + else { + err = do_readpage(page); + if (err) { + unlock_page(page); + page_cache_release(page); + return err; + } + } + + SetPageUptodate(page); + ClearPageError(page); + } + + if (PagePrivate(page)) + /* + * The page is dirty, which means it was budgeted twice: + * o first time the budget was allocated by the task which + * made the page dirty and set the PG_private flag; + * o and then we budgeted for it for the second time at the + * very beginning of this function. + * + * So what we have to do is to release the page budget we + * allocated. + */ + release_new_page_budget(c); + else if (!PageChecked(page)) + /* + * We are changing a page which already exists on the media. + * This means that changing the page does not make the amount + * of indexing information larger, and this part of the budget + * which we have already acquired may be released. + */ + ubifs_convert_page_budget(c); + + if (appending) { + struct ubifs_inode *ui = ubifs_inode(inode); + + /* + * 'ubifs_write_end()' is optimized from the fast-path part of + * 'ubifs_write_begin()' and expects the @ui_mutex to be locked + * if data is appended. + */ + mutex_lock(&ui->ui_mutex); + if (ui->dirty) + /* + * The inode is dirty already, so we may free the + * budget we allocated. + */ + ubifs_release_dirty_inode_budget(c, ui); + } + + *pagep = page; + return 0; +} + +/** + * allocate_budget - allocate budget for 'ubifs_write_begin()'. + * @c: UBIFS file-system description object + * @page: page to allocate budget for + * @ui: UBIFS inode object the page belongs to + * @appending: non-zero if the page is appended + * + * This is a helper function for 'ubifs_write_begin()' which allocates budget + * for the operation. The budget is allocated differently depending on whether + * this is appending, whether the page is dirty or not, and so on. This + * function leaves the @ui->ui_mutex locked in case of appending. Returns zero + * in case of success and %-ENOSPC in case of failure. + */ +static int allocate_budget(struct ubifs_info *c, struct page *page, + struct ubifs_inode *ui, int appending) +{ + struct ubifs_budget_req req = { .fast = 1 }; + + if (PagePrivate(page)) { + if (!appending) + /* + * The page is dirty and we are not appending, which + * means no budget is needed at all. + */ + return 0; + + mutex_lock(&ui->ui_mutex); + if (ui->dirty) + /* + * The page is dirty and we are appending, so the inode + * has to be marked as dirty. However, it is already + * dirty, so we do not need any budget. We may return, + * but @ui->ui_mutex hast to be left locked because we + * should prevent write-back from flushing the inode + * and freeing the budget. The lock will be released in + * 'ubifs_write_end()'. + */ + return 0; + + /* + * The page is dirty, we are appending, the inode is clean, so + * we need to budget the inode change. + */ + req.dirtied_ino = 1; + } else { + if (PageChecked(page)) + /* + * The page corresponds to a hole and does not + * exist on the media. So changing it makes + * make the amount of indexing information + * larger, and we have to budget for a new + * page. + */ + req.new_page = 1; + else + /* + * Not a hole, the change will not add any new + * indexing information, budget for page + * change. + */ + req.dirtied_page = 1; + + if (appending) { + mutex_lock(&ui->ui_mutex); + if (!ui->dirty) + /* + * The inode is clean but we will have to mark + * it as dirty because we are appending. This + * needs a budget. + */ + req.dirtied_ino = 1; + } + } + + return ubifs_budget_space(c, &req); +} + +/* + * This function is called when a page of data is going to be written. Since + * the page of data will not necessarily go to the flash straight away, UBIFS + * has to reserve space on the media for it, which is done by means of + * budgeting. + * + * This is the hot-path of the file-system and we are trying to optimize it as + * much as possible. For this reasons it is split on 2 parts - slow and fast. + * + * There many budgeting cases: + * o a new page is appended - we have to budget for a new page and for + * changing the inode; however, if the inode is already dirty, there is + * no need to budget for it; + * o an existing clean page is changed - we have budget for it; if the page + * does not exist on the media (a hole), we have to budget for a new + * page; otherwise, we may budget for changing an existing page; the + * difference between these cases is that changing an existing page does + * not introduce anything new to the FS indexing information, so it does + * not grow, and smaller budget is acquired in this case; + * o an existing dirty page is changed - no need to budget at all, because + * the page budget has been acquired by earlier, when the page has been + * marked dirty. + * + * UBIFS budgeting sub-system may force write-back if it thinks there is no + * space to reserve. This imposes some locking restrictions and makes it + * impossible to take into account the above cases, and makes it impossible to + * optimize budgeting. + * + * The solution for this is that the fast path of 'ubifs_write_begin()' assumes + * there is a plenty of flash space and the budget will be acquired quickly, + * without forcing write-back. The slow path does not make this assumption. + */ +static int ubifs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + int uninitialized_var(err), appending = !!(pos + len > inode->i_size); + struct page *page; + + + ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); + + if (unlikely(c->ro_media)) + return -EROFS; + + /* Try out the fast-path part first */ + page = __grab_cache_page(mapping, index); + if (unlikely(!page)) + return -ENOMEM; + + if (!PageUptodate(page)) { + /* The page is not loaded from the flash */ + if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) + /* + * We change whole page so no need to load it. But we + * have to set the @PG_checked flag to make the further + * code the page is new. This might be not true, but it + * is better to budget more that to read the page from + * the media. + */ + SetPageChecked(page); + else { + err = do_readpage(page); + if (err) { + unlock_page(page); + page_cache_release(page); + return err; + } + } + + SetPageUptodate(page); + ClearPageError(page); + } + + err = allocate_budget(c, page, ui, appending); + if (unlikely(err)) { + ubifs_assert(err == -ENOSPC); + /* + * Budgeting failed which means it would have to force + * write-back but didn't, because we set the @fast flag in the + * request. Write-back cannot be done now, while we have the + * page locked, because it would deadlock. Unlock and free + * everything and fall-back to slow-path. + */ + if (appending) { + ubifs_assert(mutex_is_locked(&ui->ui_mutex)); + mutex_unlock(&ui->ui_mutex); + } + unlock_page(page); + page_cache_release(page); + + return write_begin_slow(mapping, pos, len, pagep); + } + + /* + * Whee, we aquired budgeting quickly - without involving + * garbage-collection, committing or forceing write-back. We return + * with @ui->ui_mutex locked if we are appending pages, and unlocked + * otherwise. This is an optimization (slightly hacky though). + */ + *pagep = page; + return 0; + +} + +/** + * cancel_budget - cancel budget. + * @c: UBIFS file-system description object + * @page: page to cancel budget for + * @ui: UBIFS inode object the page belongs to + * @appending: non-zero if the page is appended + * + * This is a helper function for a page write operation. It unlocks the + * @ui->ui_mutex in case of appending. + */ +static void cancel_budget(struct ubifs_info *c, struct page *page, + struct ubifs_inode *ui, int appending) +{ + if (appending) { + if (!ui->dirty) + ubifs_release_dirty_inode_budget(c, ui); + mutex_unlock(&ui->ui_mutex); + } + if (!PagePrivate(page)) { + if (PageChecked(page)) + release_new_page_budget(c); + else + release_existing_page_budget(c); + } +} + +static int ubifs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_info *c = inode->i_sb->s_fs_info; + loff_t end_pos = pos + len; + int appending = !!(end_pos > inode->i_size); + + dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld", + inode->i_ino, pos, page->index, len, copied, inode->i_size); + + if (unlikely(copied < len && len == PAGE_CACHE_SIZE)) { + /* + * VFS copied less data to the page that it intended and + * declared in its '->write_begin()' call via the @len + * argument. If the page was not up-to-date, and @len was + * @PAGE_CACHE_SIZE, the 'ubifs_write_begin()' function did + * not load it from the media (for optimization reasons). This + * means that part of the page contains garbage. So read the + * page now. + */ + dbg_gen("copied %d instead of %d, read page and repeat", + copied, len); + cancel_budget(c, page, ui, appending); + + /* + * Return 0 to force VFS to repeat the whole operation, or the + * error code if 'do_readpage()' failes. + */ + copied = do_readpage(page); + goto out; + } + + if (!PagePrivate(page)) { + SetPagePrivate(page); + atomic_long_inc(&c->dirty_pg_cnt); + __set_page_dirty_nobuffers(page); + } + + if (appending) { + i_size_write(inode, end_pos); + ui->ui_size = end_pos; + /* + * Note, we do not set @I_DIRTY_PAGES (which means that the + * inode has dirty pages), this has been done in + * '__set_page_dirty_nobuffers()'. + */ + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + ubifs_assert(mutex_is_locked(&ui->ui_mutex)); + mutex_unlock(&ui->ui_mutex); + } + +out: + unlock_page(page); + page_cache_release(page); + return copied; +} + +static int ubifs_readpage(struct file *file, struct page *page) +{ + do_readpage(page); + unlock_page(page); + return 0; +} + +static int do_writepage(struct page *page, int len) +{ + int err = 0, i, blen; + unsigned int block; + void *addr; + union ubifs_key key; + struct inode *inode = page->mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + +#ifdef UBIFS_DEBUG + spin_lock(&ui->ui_lock); + ubifs_assert(page->index <= ui->synced_i_size << PAGE_CACHE_SIZE); + spin_unlock(&ui->ui_lock); +#endif + + /* Update radix tree tags */ + set_page_writeback(page); + + addr = kmap(page); + block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + i = 0; + while (len) { + blen = min_t(int, len, UBIFS_BLOCK_SIZE); + data_key_init(c, &key, inode->i_ino, block); + err = ubifs_jnl_write_data(c, inode, &key, addr, blen); + if (err) + break; + if (++i >= UBIFS_BLOCKS_PER_PAGE) + break; + block += 1; + addr += blen; + len -= blen; + } + if (err) { + SetPageError(page); + ubifs_err("cannot write page %lu of inode %lu, error %d", + page->index, inode->i_ino, err); + ubifs_ro_mode(c, err); + } + + ubifs_assert(PagePrivate(page)); + if (PageChecked(page)) + release_new_page_budget(c); + else + release_existing_page_budget(c); + + atomic_long_dec(&c->dirty_pg_cnt); + ClearPagePrivate(page); + ClearPageChecked(page); + + kunmap(page); + unlock_page(page); + end_page_writeback(page); + return err; +} + +/* + * When writing-back dirty inodes, VFS first writes-back pages belonging to the + * inode, then the inode itself. For UBIFS this may cause a problem. Consider a + * situation when a we have an inode with size 0, then a megabyte of data is + * appended to the inode, then write-back starts and flushes some amount of the + * dirty pages, the journal becomes full, commit happens and finishes, and then + * an unclean reboot happens. When the file system is mounted next time, the + * inode size would still be 0, but there would be many pages which are beyond + * the inode size, they would be indexed and consume flash space. Because the + * journal has been committed, the replay would not be able to detect this + * situation and correct the inode size. This means UBIFS would have to scan + * whole index and correct all inode sizes, which is long an unacceptable. + * + * To prevent situations like this, UBIFS writes pages back only if they are + * within last synchronized inode size, i.e. the the size which has been + * written to the flash media last time. Otherwise, UBIFS forces inode + * write-back, thus making sure the on-flash inode contains current inode size, + * and then keeps writing pages back. + * + * Some locking issues explanation. 'ubifs_writepage()' first is called with + * the page locked, and it locks @ui_mutex. However, write-back does take inode + * @i_mutex, which means other VFS operations may be run on this inode at the + * same time. And the problematic one is truncation to smaller size, from where + * we have to call 'vmtruncate()', which first changes @inode->i_size, then + * drops the truncated pages. And while dropping the pages, it takes the page + * lock. This means that 'do_truncation()' cannot call 'vmtruncate()' with + * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This + * means that @inode->i_size is changed while @ui_mutex is unlocked. + * + * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond + * inode size. How do we do this if @inode->i_size may became smaller while we + * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the + * @ui->ui_isize "shadow" field which UBIFS uses instead of @inode->i_size + * internally and updates it under @ui_mutex. + * + * Q: why we do not worry that if we race with truncation, we may end up with a + * situation when the inode is truncated while we are in the middle of + * 'do_writepage()', so we do write beyond inode size? + * A: If we are in the middle of 'do_writepage()', truncation would be locked + * on the page lock and it would not write the truncated inode node to the + * journal before we have finished. + */ +static int ubifs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct ubifs_inode *ui = ubifs_inode(inode); + loff_t i_size = i_size_read(inode), synced_i_size; + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + int err, len = i_size & (PAGE_CACHE_SIZE - 1); + void *kaddr; + + dbg_gen("ino %lu, pg %lu, pg flags %#lx", + inode->i_ino, page->index, page->flags); + ubifs_assert(PagePrivate(page)); + + /* Is the page fully outside @i_size? (truncate in progress) */ + if (page->index > end_index || (page->index == end_index && !len)) { + err = 0; + goto out_unlock; + } + + spin_lock(&ui->ui_lock); + synced_i_size = ui->synced_i_size; + spin_unlock(&ui->ui_lock); + + /* Is the page fully inside @i_size? */ + if (page->index < end_index) { + if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) { + err = inode->i_sb->s_op->write_inode(inode, 1); + if (err) + goto out_unlock; + /* + * The inode has been written, but the write-buffer has + * not been synchronized, so in case of an unclean + * reboot we may end up with some pages beyond inode + * size, but they would be in the journal (because + * commit flushes write buffers) and recovery would deal + * with this. + */ + } + return do_writepage(page, PAGE_CACHE_SIZE); + } + + /* + * The page straddles @i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + len, 0, PAGE_CACHE_SIZE - len); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + + if (i_size > synced_i_size) { + err = inode->i_sb->s_op->write_inode(inode, 1); + if (err) + goto out_unlock; + } + + return do_writepage(page, len); + +out_unlock: + unlock_page(page); + return err; +} + +/** + * do_attr_changes - change inode attributes. + * @inode: inode to change attributes for + * @attr: describes attributes to change + */ +static void do_attr_changes(struct inode *inode, const struct iattr *attr) +{ + if (attr->ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + if (attr->ia_valid & ATTR_ATIME) + inode->i_atime = timespec_trunc(attr->ia_atime, + inode->i_sb->s_time_gran); + if (attr->ia_valid & ATTR_MTIME) + inode->i_mtime = timespec_trunc(attr->ia_mtime, + inode->i_sb->s_time_gran); + if (attr->ia_valid & ATTR_CTIME) + inode->i_ctime = timespec_trunc(attr->ia_ctime, + inode->i_sb->s_time_gran); + if (attr->ia_valid & ATTR_MODE) { + umode_t mode = attr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + inode->i_mode = mode; + } +} + +/** + * do_truncation - truncate an inode. + * @c: UBIFS file-system description object + * @inode: inode to truncate + * @attr: inode attribute changes description + * + * This function implements VFS '->setattr()' call when the inode is truncated + * to a smaller size. Returns zero in case of success and a negative error code + * in case of failure. + */ +static int do_truncation(struct ubifs_info *c, struct inode *inode, + const struct iattr *attr) +{ + int err; + struct ubifs_budget_req req; + loff_t old_size = inode->i_size, new_size = attr->ia_size; + int offset = new_size & (UBIFS_BLOCK_SIZE - 1), budgeted = 1; + struct ubifs_inode *ui = ubifs_inode(inode); + + dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size); + memset(&req, 0, sizeof(struct ubifs_budget_req)); + + /* + * If this is truncation to a smaller size, and we do not truncate on a + * block boundary, budget for changing one data block, because the last + * block will be re-written. + */ + if (new_size & (UBIFS_BLOCK_SIZE - 1)) + req.dirtied_page = 1; + + req.dirtied_ino = 1; + /* A funny way to budget for truncation node */ + req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ; + err = ubifs_budget_space(c, &req); + if (err) { + /* + * Treat truncations to zero as deletion and always allow them, + * just like we do for '->unlink()'. + */ + if (new_size || err != -ENOSPC) + return err; + budgeted = 0; + } + + err = vmtruncate(inode, new_size); + if (err) + goto out_budg; + + if (offset) { + pgoff_t index = new_size >> PAGE_CACHE_SHIFT; + struct page *page; + + page = find_lock_page(inode->i_mapping, index); + if (page) { + if (PageDirty(page)) { + /* + * 'ubifs_jnl_truncate()' will try to truncate + * the last data node, but it contains + * out-of-date data because the page is dirty. + * Write the page now, so that + * 'ubifs_jnl_truncate()' will see an already + * truncated (and up to date) data node. + */ + ubifs_assert(PagePrivate(page)); + + clear_page_dirty_for_io(page); + if (UBIFS_BLOCKS_PER_PAGE_SHIFT) + offset = new_size & + (PAGE_CACHE_SIZE - 1); + err = do_writepage(page, offset); + page_cache_release(page); + if (err) + goto out_budg; + /* + * We could now tell 'ubifs_jnl_truncate()' not + * to read the last block. + */ + } else { + /* + * We could 'kmap()' the page and pass the data + * to 'ubifs_jnl_truncate()' to save it from + * having to read it. + */ + unlock_page(page); + page_cache_release(page); + } + } + } + + mutex_lock(&ui->ui_mutex); + ui->ui_size = inode->i_size; + /* Truncation changes inode [mc]time */ + inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + /* The other attributes may be changed at the same time as well */ + do_attr_changes(inode, attr); + + err = ubifs_jnl_truncate(c, inode, old_size, new_size); + mutex_unlock(&ui->ui_mutex); +out_budg: + if (budgeted) + ubifs_release_budget(c, &req); + else { + c->nospace = c->nospace_rp = 0; + smp_wmb(); + } + return err; +} + +/** + * do_setattr - change inode attributes. + * @c: UBIFS file-system description object + * @inode: inode to change attributes for + * @attr: inode attribute changes description + * + * This function implements VFS '->setattr()' call for all cases except + * truncations to smaller size. Returns zero in case of success and a negative + * error code in case of failure. + */ +static int do_setattr(struct ubifs_info *c, struct inode *inode, + const struct iattr *attr) +{ + int err, release; + loff_t new_size = attr->ia_size; + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_budget_req req = { .dirtied_ino = 1, + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + if (attr->ia_valid & ATTR_SIZE) { + dbg_gen("size %lld -> %lld", inode->i_size, new_size); + err = vmtruncate(inode, new_size); + if (err) + goto out; + } + + mutex_lock(&ui->ui_mutex); + if (attr->ia_valid & ATTR_SIZE) { + /* Truncation changes inode [mc]time */ + inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + /* 'vmtruncate()' changed @i_size, update @ui_size */ + ui->ui_size = inode->i_size; + } + + do_attr_changes(inode, attr); + + release = ui->dirty; + if (attr->ia_valid & ATTR_SIZE) + /* + * Inode length changed, so we have to make sure + * @I_DIRTY_DATASYNC is set. + */ + __mark_inode_dirty(inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC); + else + mark_inode_dirty_sync(inode); + mutex_unlock(&ui->ui_mutex); + + if (release) + ubifs_release_budget(c, &req); + if (IS_SYNC(inode)) + err = inode->i_sb->s_op->write_inode(inode, 1); + return err; + +out: + ubifs_release_budget(c, &req); + return err; +} + +int ubifs_setattr(struct dentry *dentry, struct iattr *attr) +{ + int err; + struct inode *inode = dentry->d_inode; + struct ubifs_info *c = inode->i_sb->s_fs_info; + + dbg_gen("ino %lu, mode %#x, ia_valid %#x", + inode->i_ino, inode->i_mode, attr->ia_valid); + err = inode_change_ok(inode, attr); + if (err) + return err; + + err = dbg_check_synced_i_size(inode); + if (err) + return err; + + if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size < inode->i_size) + /* Truncation to a smaller size */ + err = do_truncation(c, inode, attr); + else + err = do_setattr(c, inode, attr); + + return err; +} + +static void ubifs_invalidatepage(struct page *page, unsigned long offset) +{ + struct inode *inode = page->mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + + ubifs_assert(PagePrivate(page)); + if (offset) + /* Partial page remains dirty */ + return; + + if (PageChecked(page)) + release_new_page_budget(c); + else + release_existing_page_budget(c); + + atomic_long_dec(&c->dirty_pg_cnt); + ClearPagePrivate(page); + ClearPageChecked(page); +} + +static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ubifs_inode *ui = ubifs_inode(dentry->d_inode); + + nd_set_link(nd, ui->data); + return NULL; +} + +int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + struct ubifs_info *c = inode->i_sb->s_fs_info; + int err; + + dbg_gen("syncing inode %lu", inode->i_ino); + + /* + * VFS has already synchronized dirty pages for this inode. Synchronize + * the inode unless this is a 'datasync()' call. + */ + if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { + err = inode->i_sb->s_op->write_inode(inode, 1); + if (err) + return err; + } + + /* + * Nodes related to this inode may still sit in a write-buffer. Flush + * them. + */ + err = ubifs_sync_wbufs_by_inode(c, inode); + if (err) + return err; + + return 0; +} + +/** + * mctime_update_needed - check if mtime or ctime update is needed. + * @inode: the inode to do the check for + * @now: current time + * + * This helper function checks if the inode mtime/ctime should be updated or + * not. If current values of the time-stamps are within the UBIFS inode time + * granularity, they are not updated. This is an optimization. + */ +static inline int mctime_update_needed(const struct inode *inode, + const struct timespec *now) +{ + if (!timespec_equal(&inode->i_mtime, now) || + !timespec_equal(&inode->i_ctime, now)) + return 1; + return 0; +} + +/** + * update_ctime - update mtime and ctime of an inode. + * @c: UBIFS file-system description object + * @inode: inode to update + * + * This function updates mtime and ctime of the inode if it is not equivalent to + * current time. Returns zero in case of success and a negative error code in + * case of failure. + */ +static int update_mctime(struct ubifs_info *c, struct inode *inode) +{ + struct timespec now = ubifs_current_time(inode); + struct ubifs_inode *ui = ubifs_inode(inode); + + if (mctime_update_needed(inode, &now)) { + int err, release; + struct ubifs_budget_req req = { .dirtied_ino = 1, + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + mutex_lock(&ui->ui_mutex); + inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + release = ui->dirty; + mark_inode_dirty_sync(inode); + mutex_unlock(&ui->ui_mutex); + if (release) + ubifs_release_budget(c, &req); + } + + return 0; +} + +static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + int err; + ssize_t ret; + struct inode *inode = iocb->ki_filp->f_mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + + err = update_mctime(c, inode); + if (err) + return err; + + ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + if (ret < 0) + return ret; + + if (ret > 0 && (IS_SYNC(inode) || iocb->ki_filp->f_flags & O_SYNC)) { + err = ubifs_sync_wbufs_by_inode(c, inode); + if (err) + return err; + } + + return ret; +} + +static int ubifs_set_page_dirty(struct page *page) +{ + int ret; + + ret = __set_page_dirty_nobuffers(page); + /* + * An attempt to dirty a page without budgeting for it - should not + * happen. + */ + ubifs_assert(ret == 0); + return ret; +} + +static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) +{ + /* + * An attempt to release a dirty page without budgeting for it - should + * not happen. + */ + if (PageWriteback(page)) + return 0; + ubifs_assert(PagePrivate(page)); + ubifs_assert(0); + ClearPagePrivate(page); + ClearPageChecked(page); + return 1; +} + +/* + * mmap()d file has taken write protection fault and is being made + * writable. UBIFS must ensure page is budgeted for. + */ +static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) +{ + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct timespec now = ubifs_current_time(inode); + struct ubifs_budget_req req = { .new_page = 1 }; + int err, update_time; + + dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index, + i_size_read(inode)); + ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY)); + + if (unlikely(c->ro_media)) + return -EROFS; + + /* + * We have not locked @page so far so we may budget for changing the + * page. Note, we cannot do this after we locked the page, because + * budgeting may cause write-back which would cause deadlock. + * + * At the moment we do not know whether the page is dirty or not, so we + * assume that it is not and budget for a new page. We could look at + * the @PG_private flag and figure this out, but we may race with write + * back and the page state may change by the time we lock it, so this + * would need additional care. We do not bother with this at the + * moment, although it might be good idea to do. Instead, we allocate + * budget for a new page and amend it later on if the page was in fact + * dirty. + * + * The budgeting-related logic of this function is similar to what we + * do in 'ubifs_write_begin()' and 'ubifs_write_end()'. Glance there + * for more comments. + */ + update_time = mctime_update_needed(inode, &now); + if (update_time) + /* + * We have to change inode time stamp which requires extra + * budgeting. + */ + req.dirtied_ino = 1; + + err = ubifs_budget_space(c, &req); + if (unlikely(err)) { + if (err == -ENOSPC) + ubifs_warn("out of space for mmapped file " + "(inode number %lu)", inode->i_ino); + return err; + } + + lock_page(page); + if (unlikely(page->mapping != inode->i_mapping || + page_offset(page) > i_size_read(inode))) { + /* Page got truncated out from underneath us */ + err = -EINVAL; + goto out_unlock; + } + + if (PagePrivate(page)) + release_new_page_budget(c); + else { + if (!PageChecked(page)) + ubifs_convert_page_budget(c); + SetPagePrivate(page); + atomic_long_inc(&c->dirty_pg_cnt); + __set_page_dirty_nobuffers(page); + } + + if (update_time) { + int release; + struct ubifs_inode *ui = ubifs_inode(inode); + + mutex_lock(&ui->ui_mutex); + inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + release = ui->dirty; + mark_inode_dirty_sync(inode); + mutex_unlock(&ui->ui_mutex); + if (release) + ubifs_release_dirty_inode_budget(c, ui); + } + + unlock_page(page); + return 0; + +out_unlock: + unlock_page(page); + ubifs_release_budget(c, &req); + return err; +} + +static struct vm_operations_struct ubifs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = ubifs_vm_page_mkwrite, +}; + +static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int err; + + /* 'generic_file_mmap()' takes care of NOMMU case */ + err = generic_file_mmap(file, vma); + if (err) + return err; + vma->vm_ops = &ubifs_file_vm_ops; + return 0; +} + +struct address_space_operations ubifs_file_address_operations = { + .readpage = ubifs_readpage, + .writepage = ubifs_writepage, + .write_begin = ubifs_write_begin, + .write_end = ubifs_write_end, + .invalidatepage = ubifs_invalidatepage, + .set_page_dirty = ubifs_set_page_dirty, + .releasepage = ubifs_releasepage, +}; + +struct inode_operations ubifs_file_inode_operations = { + .setattr = ubifs_setattr, + .getattr = ubifs_getattr, +#ifdef CONFIG_UBIFS_FS_XATTR + .setxattr = ubifs_setxattr, + .getxattr = ubifs_getxattr, + .listxattr = ubifs_listxattr, + .removexattr = ubifs_removexattr, +#endif +}; + +struct inode_operations ubifs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = ubifs_follow_link, + .setattr = ubifs_setattr, + .getattr = ubifs_getattr, +}; + +struct file_operations ubifs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = ubifs_aio_write, + .mmap = ubifs_file_mmap, + .fsync = ubifs_fsync, + .unlocked_ioctl = ubifs_ioctl, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, +#ifdef CONFIG_COMPAT + .compat_ioctl = ubifs_compat_ioctl, +#endif +}; diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c new file mode 100644 index 000000000000..47814cde2407 --- /dev/null +++ b/fs/ubifs/find.c @@ -0,0 +1,977 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file contains functions for finding LEBs for various purposes e.g. + * garbage collection. In general, lprops category heaps and lists are used + * for fast access, falling back on scanning the LPT as a last resort. + */ + +#include <linux/sort.h> +#include "ubifs.h" + +/** + * struct scan_data - data provided to scan callback functions + * @min_space: minimum number of bytes for which to scan + * @pick_free: whether it is OK to scan for empty LEBs + * @lnum: LEB number found is returned here + * @exclude_index: whether to exclude index LEBs + */ +struct scan_data { + int min_space; + int pick_free; + int lnum; + int exclude_index; +}; + +/** + * valuable - determine whether LEB properties are valuable. + * @c: the UBIFS file-system description object + * @lprops: LEB properties + * + * This function return %1 if the LEB properties should be added to the LEB + * properties tree in memory. Otherwise %0 is returned. + */ +static int valuable(struct ubifs_info *c, const struct ubifs_lprops *lprops) +{ + int n, cat = lprops->flags & LPROPS_CAT_MASK; + struct ubifs_lpt_heap *heap; + + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + heap = &c->lpt_heap[cat - 1]; + if (heap->cnt < heap->max_cnt) + return 1; + if (lprops->free + lprops->dirty >= c->dark_wm) + return 1; + return 0; + case LPROPS_EMPTY: + n = c->lst.empty_lebs + c->freeable_cnt - + c->lst.taken_empty_lebs; + if (n < c->lsave_cnt) + return 1; + return 0; + case LPROPS_FREEABLE: + return 1; + case LPROPS_FRDI_IDX: + return 1; + } + return 0; +} + +/** + * scan_for_dirty_cb - dirty space scan callback. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @data: information passed to and from the caller of the scan + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). + */ +static int scan_for_dirty_cb(struct ubifs_info *c, + const struct ubifs_lprops *lprops, int in_tree, + struct scan_data *data) +{ + int ret = LPT_SCAN_CONTINUE; + + /* Exclude LEBs that are currently in use */ + if (lprops->flags & LPROPS_TAKEN) + return LPT_SCAN_CONTINUE; + /* Determine whether to add these LEB properties to the tree */ + if (!in_tree && valuable(c, lprops)) + ret |= LPT_SCAN_ADD; + /* Exclude LEBs with too little space */ + if (lprops->free + lprops->dirty < data->min_space) + return ret; + /* If specified, exclude index LEBs */ + if (data->exclude_index && lprops->flags & LPROPS_INDEX) + return ret; + /* If specified, exclude empty or freeable LEBs */ + if (lprops->free + lprops->dirty == c->leb_size) { + if (!data->pick_free) + return ret; + /* Exclude LEBs with too little dirty space (unless it is empty) */ + } else if (lprops->dirty < c->dead_wm) + return ret; + /* Finally we found space */ + data->lnum = lprops->lnum; + return LPT_SCAN_ADD | LPT_SCAN_STOP; +} + +/** + * scan_for_dirty - find a data LEB with free space. + * @c: the UBIFS file-system description object + * @min_space: minimum amount free plus dirty space the returned LEB has to + * have + * @pick_free: if it is OK to return a free or freeable LEB + * @exclude_index: whether to exclude index LEBs + * + * This function returns a pointer to the LEB properties found or a negative + * error code. + */ +static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c, + int min_space, int pick_free, + int exclude_index) +{ + const struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + struct scan_data data; + int err, i; + + /* There may be an LEB with enough dirty space on the free heap */ + heap = &c->lpt_heap[LPROPS_FREE - 1]; + for (i = 0; i < heap->cnt; i++) { + lprops = heap->arr[i]; + if (lprops->free + lprops->dirty < min_space) + continue; + if (lprops->dirty < c->dead_wm) + continue; + return lprops; + } + /* + * A LEB may have fallen off of the bottom of the dirty heap, and ended + * up as uncategorized even though it has enough dirty space for us now, + * so check the uncategorized list. N.B. neither empty nor freeable LEBs + * can end up as uncategorized because they are kept on lists not + * finite-sized heaps. + */ + list_for_each_entry(lprops, &c->uncat_list, list) { + if (lprops->flags & LPROPS_TAKEN) + continue; + if (lprops->free + lprops->dirty < min_space) + continue; + if (exclude_index && (lprops->flags & LPROPS_INDEX)) + continue; + if (lprops->dirty < c->dead_wm) + continue; + return lprops; + } + /* We have looked everywhere in main memory, now scan the flash */ + if (c->pnodes_have >= c->pnode_cnt) + /* All pnodes are in memory, so skip scan */ + return ERR_PTR(-ENOSPC); + data.min_space = min_space; + data.pick_free = pick_free; + data.lnum = -1; + data.exclude_index = exclude_index; + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, + (ubifs_lpt_scan_callback)scan_for_dirty_cb, + &data); + if (err) + return ERR_PTR(err); + ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt); + c->lscan_lnum = data.lnum; + lprops = ubifs_lpt_lookup_dirty(c, data.lnum); + if (IS_ERR(lprops)) + return lprops; + ubifs_assert(lprops->lnum == data.lnum); + ubifs_assert(lprops->free + lprops->dirty >= min_space); + ubifs_assert(lprops->dirty >= c->dead_wm || + (pick_free && + lprops->free + lprops->dirty == c->leb_size)); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!exclude_index || !(lprops->flags & LPROPS_INDEX)); + return lprops; +} + +/** + * ubifs_find_dirty_leb - find a dirty LEB for the Garbage Collector. + * @c: the UBIFS file-system description object + * @ret_lp: LEB properties are returned here on exit + * @min_space: minimum amount free plus dirty space the returned LEB has to + * have + * @pick_free: controls whether it is OK to pick empty or index LEBs + * + * This function tries to find a dirty logical eraseblock which has at least + * @min_space free and dirty space. It prefers to take an LEB from the dirty or + * dirty index heap, and it falls-back to LPT scanning if the heaps are empty + * or do not have an LEB which satisfies the @min_space criteria. + * + * Note, LEBs which have less than dead watermark of free + dirty space are + * never picked by this function. + * + * The additional @pick_free argument controls if this function has to return a + * free or freeable LEB if one is present. For example, GC must to set it to %1, + * when called from the journal space reservation function, because the + * appearance of free space may coincide with the loss of enough dirty space + * for GC to succeed anyway. + * + * In contrast, if the Garbage Collector is called from budgeting, it should + * just make free space, not return LEBs which are already free or freeable. + * + * In addition @pick_free is set to %2 by the recovery process in order to + * recover gc_lnum in which case an index LEB must not be returned. + * + * This function returns zero and the LEB properties of found dirty LEB in case + * of success, %-ENOSPC if no dirty LEB was found and a negative error code in + * case of other failures. The returned LEB is marked as "taken". + */ +int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, + int min_space, int pick_free) +{ + int err = 0, sum, exclude_index = pick_free == 2 ? 1 : 0; + const struct ubifs_lprops *lp = NULL, *idx_lp = NULL; + struct ubifs_lpt_heap *heap, *idx_heap; + + ubifs_get_lprops(c); + + if (pick_free) { + int lebs, rsvd_idx_lebs = 0; + + spin_lock(&c->space_lock); + lebs = c->lst.empty_lebs + c->idx_gc_cnt; + lebs += c->freeable_cnt - c->lst.taken_empty_lebs; + + /* + * Note, the index may consume more LEBs than have been reserved + * for it. It is OK because it might be consolidated by GC. + * But if the index takes fewer LEBs than it is reserved for it, + * this function must avoid picking those reserved LEBs. + */ + if (c->min_idx_lebs >= c->lst.idx_lebs) { + rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; + exclude_index = 1; + } + spin_unlock(&c->space_lock); + + /* Check if there are enough free LEBs for the index */ + if (rsvd_idx_lebs < lebs) { + /* OK, try to find an empty LEB */ + lp = ubifs_fast_find_empty(c); + if (lp) + goto found; + + /* Or a freeable LEB */ + lp = ubifs_fast_find_freeable(c); + if (lp) + goto found; + } else + /* + * We cannot pick free/freeable LEBs in the below code. + */ + pick_free = 0; + } else { + spin_lock(&c->space_lock); + exclude_index = (c->min_idx_lebs >= c->lst.idx_lebs); + spin_unlock(&c->space_lock); + } + + /* Look on the dirty and dirty index heaps */ + heap = &c->lpt_heap[LPROPS_DIRTY - 1]; + idx_heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; + + if (idx_heap->cnt && !exclude_index) { + idx_lp = idx_heap->arr[0]; + sum = idx_lp->free + idx_lp->dirty; + /* + * Since we reserve thrice as much space for the index than it + * actually takes, it does not make sense to pick indexing LEBs + * with less than, say, half LEB of dirty space. May be half is + * not the optimal boundary - this should be tested and + * checked. This boundary should determine how much we use + * in-the-gaps to consolidate the index comparing to how much + * we use garbage collector to consolidate it. The "half" + * criteria just feels to be fine. + */ + if (sum < min_space || sum < c->half_leb_size) + idx_lp = NULL; + } + + if (heap->cnt) { + lp = heap->arr[0]; + if (lp->dirty + lp->free < min_space) + lp = NULL; + } + + /* Pick the LEB with most space */ + if (idx_lp && lp) { + if (idx_lp->free + idx_lp->dirty >= lp->free + lp->dirty) + lp = idx_lp; + } else if (idx_lp && !lp) + lp = idx_lp; + + if (lp) { + ubifs_assert(lp->free + lp->dirty >= c->dead_wm); + goto found; + } + + /* Did not find a dirty LEB on the dirty heaps, have to scan */ + dbg_find("scanning LPT for a dirty LEB"); + lp = scan_for_dirty(c, min_space, pick_free, exclude_index); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + ubifs_assert(lp->dirty >= c->dead_wm || + (pick_free && lp->free + lp->dirty == c->leb_size)); + +found: + dbg_find("found LEB %d, free %d, dirty %d, flags %#x", + lp->lnum, lp->free, lp->dirty, lp->flags); + + lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, + lp->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + memcpy(ret_lp, lp, sizeof(struct ubifs_lprops)); + +out: + ubifs_release_lprops(c); + return err; +} + +/** + * scan_for_free_cb - free space scan callback. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @data: information passed to and from the caller of the scan + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). + */ +static int scan_for_free_cb(struct ubifs_info *c, + const struct ubifs_lprops *lprops, int in_tree, + struct scan_data *data) +{ + int ret = LPT_SCAN_CONTINUE; + + /* Exclude LEBs that are currently in use */ + if (lprops->flags & LPROPS_TAKEN) + return LPT_SCAN_CONTINUE; + /* Determine whether to add these LEB properties to the tree */ + if (!in_tree && valuable(c, lprops)) + ret |= LPT_SCAN_ADD; + /* Exclude index LEBs */ + if (lprops->flags & LPROPS_INDEX) + return ret; + /* Exclude LEBs with too little space */ + if (lprops->free < data->min_space) + return ret; + /* If specified, exclude empty LEBs */ + if (!data->pick_free && lprops->free == c->leb_size) + return ret; + /* + * LEBs that have only free and dirty space must not be allocated + * because they may have been unmapped already or they may have data + * that is obsolete only because of nodes that are still sitting in a + * wbuf. + */ + if (lprops->free + lprops->dirty == c->leb_size && lprops->dirty > 0) + return ret; + /* Finally we found space */ + data->lnum = lprops->lnum; + return LPT_SCAN_ADD | LPT_SCAN_STOP; +} + +/** + * do_find_free_space - find a data LEB with free space. + * @c: the UBIFS file-system description object + * @min_space: minimum amount of free space required + * @pick_free: whether it is OK to scan for empty LEBs + * @squeeze: whether to try to find space in a non-empty LEB first + * + * This function returns a pointer to the LEB properties found or a negative + * error code. + */ +static +const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c, + int min_space, int pick_free, + int squeeze) +{ + const struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + struct scan_data data; + int err, i; + + if (squeeze) { + lprops = ubifs_fast_find_free(c); + if (lprops && lprops->free >= min_space) + return lprops; + } + if (pick_free) { + lprops = ubifs_fast_find_empty(c); + if (lprops) + return lprops; + } + if (!squeeze) { + lprops = ubifs_fast_find_free(c); + if (lprops && lprops->free >= min_space) + return lprops; + } + /* There may be an LEB with enough free space on the dirty heap */ + heap = &c->lpt_heap[LPROPS_DIRTY - 1]; + for (i = 0; i < heap->cnt; i++) { + lprops = heap->arr[i]; + if (lprops->free >= min_space) + return lprops; + } + /* + * A LEB may have fallen off of the bottom of the free heap, and ended + * up as uncategorized even though it has enough free space for us now, + * so check the uncategorized list. N.B. neither empty nor freeable LEBs + * can end up as uncategorized because they are kept on lists not + * finite-sized heaps. + */ + list_for_each_entry(lprops, &c->uncat_list, list) { + if (lprops->flags & LPROPS_TAKEN) + continue; + if (lprops->flags & LPROPS_INDEX) + continue; + if (lprops->free >= min_space) + return lprops; + } + /* We have looked everywhere in main memory, now scan the flash */ + if (c->pnodes_have >= c->pnode_cnt) + /* All pnodes are in memory, so skip scan */ + return ERR_PTR(-ENOSPC); + data.min_space = min_space; + data.pick_free = pick_free; + data.lnum = -1; + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, + (ubifs_lpt_scan_callback)scan_for_free_cb, + &data); + if (err) + return ERR_PTR(err); + ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt); + c->lscan_lnum = data.lnum; + lprops = ubifs_lpt_lookup_dirty(c, data.lnum); + if (IS_ERR(lprops)) + return lprops; + ubifs_assert(lprops->lnum == data.lnum); + ubifs_assert(lprops->free >= min_space); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + return lprops; +} + +/** + * ubifs_find_free_space - find a data LEB with free space. + * @c: the UBIFS file-system description object + * @min_space: minimum amount of required free space + * @free: contains amount of free space in the LEB on exit + * @squeeze: whether to try to find space in a non-empty LEB first + * + * This function looks for an LEB with at least @min_space bytes of free space. + * It tries to find an empty LEB if possible. If no empty LEBs are available, + * this function searches for a non-empty data LEB. The returned LEB is marked + * as "taken". + * + * This function returns found LEB number in case of success, %-ENOSPC if it + * failed to find a LEB with @min_space bytes of free space and other a negative + * error codes in case of failure. + */ +int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, + int squeeze) +{ + const struct ubifs_lprops *lprops; + int lebs, rsvd_idx_lebs, pick_free = 0, err, lnum, flags; + + dbg_find("min_space %d", min_space); + ubifs_get_lprops(c); + + /* Check if there are enough empty LEBs for commit */ + spin_lock(&c->space_lock); + if (c->min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; + else + rsvd_idx_lebs = 0; + lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - + c->lst.taken_empty_lebs; + if (rsvd_idx_lebs < lebs) + /* + * OK to allocate an empty LEB, but we still don't want to go + * looking for one if there aren't any. + */ + if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) { + pick_free = 1; + /* + * Because we release the space lock, we must account + * for this allocation here. After the LEB properties + * flags have been updated, we subtract one. Note, the + * result of this is that lprops also decreases + * @taken_empty_lebs in 'ubifs_change_lp()', so it is + * off by one for a short period of time which may + * introduce a small disturbance to budgeting + * calculations, but this is harmless because at the + * worst case this would make the budgeting subsystem + * be more pessimistic than needed. + * + * Fundamentally, this is about serialization of the + * budgeting and lprops subsystems. We could make the + * @space_lock a mutex and avoid dropping it before + * calling 'ubifs_change_lp()', but mutex is more + * heavy-weight, and we want budgeting to be as fast as + * possible. + */ + c->lst.taken_empty_lebs += 1; + } + spin_unlock(&c->space_lock); + + lprops = do_find_free_space(c, min_space, pick_free, squeeze); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + + lnum = lprops->lnum; + flags = lprops->flags | LPROPS_TAKEN; + + lprops = ubifs_change_lp(c, lprops, LPROPS_NC, LPROPS_NC, flags, 0); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + + if (pick_free) { + spin_lock(&c->space_lock); + c->lst.taken_empty_lebs -= 1; + spin_unlock(&c->space_lock); + } + + *free = lprops->free; + ubifs_release_lprops(c); + + if (*free == c->leb_size) { + /* + * Ensure that empty LEBs have been unmapped. They may not have + * been, for example, because of an unclean unmount. Also + * LEBs that were freeable LEBs (free + dirty == leb_size) will + * not have been unmapped. + */ + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + + dbg_find("found LEB %d, free %d", lnum, *free); + ubifs_assert(*free >= min_space); + return lnum; + +out: + if (pick_free) { + spin_lock(&c->space_lock); + c->lst.taken_empty_lebs -= 1; + spin_unlock(&c->space_lock); + } + ubifs_release_lprops(c); + return err; +} + +/** + * scan_for_idx_cb - callback used by the scan for a free LEB for the index. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @data: information passed to and from the caller of the scan + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). + */ +static int scan_for_idx_cb(struct ubifs_info *c, + const struct ubifs_lprops *lprops, int in_tree, + struct scan_data *data) +{ + int ret = LPT_SCAN_CONTINUE; + + /* Exclude LEBs that are currently in use */ + if (lprops->flags & LPROPS_TAKEN) + return LPT_SCAN_CONTINUE; + /* Determine whether to add these LEB properties to the tree */ + if (!in_tree && valuable(c, lprops)) + ret |= LPT_SCAN_ADD; + /* Exclude index LEBS */ + if (lprops->flags & LPROPS_INDEX) + return ret; + /* Exclude LEBs that cannot be made empty */ + if (lprops->free + lprops->dirty != c->leb_size) + return ret; + /* + * We are allocating for the index so it is safe to allocate LEBs with + * only free and dirty space, because write buffers are sync'd at commit + * start. + */ + data->lnum = lprops->lnum; + return LPT_SCAN_ADD | LPT_SCAN_STOP; +} + +/** + * scan_for_leb_for_idx - scan for a free LEB for the index. + * @c: the UBIFS file-system description object + */ +static const struct ubifs_lprops *scan_for_leb_for_idx(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct scan_data data; + int err; + + data.lnum = -1; + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, + (ubifs_lpt_scan_callback)scan_for_idx_cb, + &data); + if (err) + return ERR_PTR(err); + ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt); + c->lscan_lnum = data.lnum; + lprops = ubifs_lpt_lookup_dirty(c, data.lnum); + if (IS_ERR(lprops)) + return lprops; + ubifs_assert(lprops->lnum == data.lnum); + ubifs_assert(lprops->free + lprops->dirty == c->leb_size); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + return lprops; +} + +/** + * ubifs_find_free_leb_for_idx - find a free LEB for the index. + * @c: the UBIFS file-system description object + * + * This function looks for a free LEB and returns that LEB number. The returned + * LEB is marked as "taken", "index". + * + * Only empty LEBs are allocated. This is for two reasons. First, the commit + * calculates the number of LEBs to allocate based on the assumption that they + * will be empty. Secondly, free space at the end of an index LEB is not + * guaranteed to be empty because it may have been used by the in-the-gaps + * method prior to an unclean unmount. + * + * If no LEB is found %-ENOSPC is returned. For other failures another negative + * error code is returned. + */ +int ubifs_find_free_leb_for_idx(struct ubifs_info *c) +{ + const struct ubifs_lprops *lprops; + int lnum = -1, err, flags; + + ubifs_get_lprops(c); + + lprops = ubifs_fast_find_empty(c); + if (!lprops) { + lprops = ubifs_fast_find_freeable(c); + if (!lprops) { + ubifs_assert(c->freeable_cnt == 0); + if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) { + lprops = scan_for_leb_for_idx(c); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + } + } + } + + if (!lprops) { + err = -ENOSPC; + goto out; + } + + lnum = lprops->lnum; + + dbg_find("found LEB %d, free %d, dirty %d, flags %#x", + lnum, lprops->free, lprops->dirty, lprops->flags); + + flags = lprops->flags | LPROPS_TAKEN | LPROPS_INDEX; + lprops = ubifs_change_lp(c, lprops, c->leb_size, 0, flags, 0); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + + ubifs_release_lprops(c); + + /* + * Ensure that empty LEBs have been unmapped. They may not have been, + * for example, because of an unclean unmount. Also LEBs that were + * freeable LEBs (free + dirty == leb_size) will not have been unmapped. + */ + err = ubifs_leb_unmap(c, lnum); + if (err) { + ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, + LPROPS_TAKEN | LPROPS_INDEX, 0); + return err; + } + + return lnum; + +out: + ubifs_release_lprops(c); + return err; +} + +static int cmp_dirty_idx(const struct ubifs_lprops **a, + const struct ubifs_lprops **b) +{ + const struct ubifs_lprops *lpa = *a; + const struct ubifs_lprops *lpb = *b; + + return lpa->dirty + lpa->free - lpb->dirty - lpb->free; +} + +static void swap_dirty_idx(struct ubifs_lprops **a, struct ubifs_lprops **b, + int size) +{ + struct ubifs_lprops *t = *a; + + *a = *b; + *b = t; +} + +/** + * ubifs_save_dirty_idx_lnums - save an array of the most dirty index LEB nos. + * @c: the UBIFS file-system description object + * + * This function is called each commit to create an array of LEB numbers of + * dirty index LEBs sorted in order of dirty and free space. This is used by + * the in-the-gaps method of TNC commit. + */ +int ubifs_save_dirty_idx_lnums(struct ubifs_info *c) +{ + int i; + + ubifs_get_lprops(c); + /* Copy the LPROPS_DIRTY_IDX heap */ + c->dirty_idx.cnt = c->lpt_heap[LPROPS_DIRTY_IDX - 1].cnt; + memcpy(c->dirty_idx.arr, c->lpt_heap[LPROPS_DIRTY_IDX - 1].arr, + sizeof(void *) * c->dirty_idx.cnt); + /* Sort it so that the dirtiest is now at the end */ + sort(c->dirty_idx.arr, c->dirty_idx.cnt, sizeof(void *), + (int (*)(const void *, const void *))cmp_dirty_idx, + (void (*)(void *, void *, int))swap_dirty_idx); + dbg_find("found %d dirty index LEBs", c->dirty_idx.cnt); + if (c->dirty_idx.cnt) + dbg_find("dirtiest index LEB is %d with dirty %d and free %d", + c->dirty_idx.arr[c->dirty_idx.cnt - 1]->lnum, + c->dirty_idx.arr[c->dirty_idx.cnt - 1]->dirty, + c->dirty_idx.arr[c->dirty_idx.cnt - 1]->free); + /* Replace the lprops pointers with LEB numbers */ + for (i = 0; i < c->dirty_idx.cnt; i++) + c->dirty_idx.arr[i] = (void *)(size_t)c->dirty_idx.arr[i]->lnum; + ubifs_release_lprops(c); + return 0; +} + +/** + * scan_dirty_idx_cb - callback used by the scan for a dirty index LEB. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @data: information passed to and from the caller of the scan + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). + */ +static int scan_dirty_idx_cb(struct ubifs_info *c, + const struct ubifs_lprops *lprops, int in_tree, + struct scan_data *data) +{ + int ret = LPT_SCAN_CONTINUE; + + /* Exclude LEBs that are currently in use */ + if (lprops->flags & LPROPS_TAKEN) + return LPT_SCAN_CONTINUE; + /* Determine whether to add these LEB properties to the tree */ + if (!in_tree && valuable(c, lprops)) + ret |= LPT_SCAN_ADD; + /* Exclude non-index LEBs */ + if (!(lprops->flags & LPROPS_INDEX)) + return ret; + /* Exclude LEBs with too little space */ + if (lprops->free + lprops->dirty < c->min_idx_node_sz) + return ret; + /* Finally we found space */ + data->lnum = lprops->lnum; + return LPT_SCAN_ADD | LPT_SCAN_STOP; +} + +/** + * find_dirty_idx_leb - find a dirty index LEB. + * @c: the UBIFS file-system description object + * + * This function returns LEB number upon success and a negative error code upon + * failure. In particular, -ENOSPC is returned if a dirty index LEB is not + * found. + * + * Note that this function scans the entire LPT but it is called very rarely. + */ +static int find_dirty_idx_leb(struct ubifs_info *c) +{ + const struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + struct scan_data data; + int err, i, ret; + + /* Check all structures in memory first */ + data.lnum = -1; + heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; + for (i = 0; i < heap->cnt; i++) { + lprops = heap->arr[i]; + ret = scan_dirty_idx_cb(c, lprops, 1, &data); + if (ret & LPT_SCAN_STOP) + goto found; + } + list_for_each_entry(lprops, &c->frdi_idx_list, list) { + ret = scan_dirty_idx_cb(c, lprops, 1, &data); + if (ret & LPT_SCAN_STOP) + goto found; + } + list_for_each_entry(lprops, &c->uncat_list, list) { + ret = scan_dirty_idx_cb(c, lprops, 1, &data); + if (ret & LPT_SCAN_STOP) + goto found; + } + if (c->pnodes_have >= c->pnode_cnt) + /* All pnodes are in memory, so skip scan */ + return -ENOSPC; + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, + (ubifs_lpt_scan_callback)scan_dirty_idx_cb, + &data); + if (err) + return err; +found: + ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt); + c->lscan_lnum = data.lnum; + lprops = ubifs_lpt_lookup_dirty(c, data.lnum); + if (IS_ERR(lprops)) + return PTR_ERR(lprops); + ubifs_assert(lprops->lnum == data.lnum); + ubifs_assert(lprops->free + lprops->dirty >= c->min_idx_node_sz); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert((lprops->flags & LPROPS_INDEX)); + + dbg_find("found dirty LEB %d, free %d, dirty %d, flags %#x", + lprops->lnum, lprops->free, lprops->dirty, lprops->flags); + + lprops = ubifs_change_lp(c, lprops, LPROPS_NC, LPROPS_NC, + lprops->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lprops)) + return PTR_ERR(lprops); + + return lprops->lnum; +} + +/** + * get_idx_gc_leb - try to get a LEB number from trivial GC. + * @c: the UBIFS file-system description object + */ +static int get_idx_gc_leb(struct ubifs_info *c) +{ + const struct ubifs_lprops *lp; + int err, lnum; + + err = ubifs_get_idx_gc_leb(c); + if (err < 0) + return err; + lnum = err; + /* + * The LEB was due to be unmapped after the commit but + * it is needed now for this commit. + */ + lp = ubifs_lpt_lookup_dirty(c, lnum); + if (unlikely(IS_ERR(lp))) + return PTR_ERR(lp); + lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, + lp->flags | LPROPS_INDEX, -1); + if (unlikely(IS_ERR(lp))) + return PTR_ERR(lp); + dbg_find("LEB %d, dirty %d and free %d flags %#x", + lp->lnum, lp->dirty, lp->free, lp->flags); + return lnum; +} + +/** + * find_dirtiest_idx_leb - find dirtiest index LEB from dirtiest array. + * @c: the UBIFS file-system description object + */ +static int find_dirtiest_idx_leb(struct ubifs_info *c) +{ + const struct ubifs_lprops *lp; + int lnum; + + while (1) { + if (!c->dirty_idx.cnt) + return -ENOSPC; + /* The lprops pointers were replaced by LEB numbers */ + lnum = (size_t)c->dirty_idx.arr[--c->dirty_idx.cnt]; + lp = ubifs_lpt_lookup(c, lnum); + if (IS_ERR(lp)) + return PTR_ERR(lp); + if ((lp->flags & LPROPS_TAKEN) || !(lp->flags & LPROPS_INDEX)) + continue; + lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, + lp->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lp)) + return PTR_ERR(lp); + break; + } + dbg_find("LEB %d, dirty %d and free %d flags %#x", lp->lnum, lp->dirty, + lp->free, lp->flags); + ubifs_assert(lp->flags | LPROPS_TAKEN); + ubifs_assert(lp->flags | LPROPS_INDEX); + return lnum; +} + +/** + * ubifs_find_dirty_idx_leb - try to find dirtiest index LEB as at last commit. + * @c: the UBIFS file-system description object + * + * This function attempts to find an untaken index LEB with the most free and + * dirty space that can be used without overwriting index nodes that were in the + * last index committed. + */ +int ubifs_find_dirty_idx_leb(struct ubifs_info *c) +{ + int err; + + ubifs_get_lprops(c); + + /* + * We made an array of the dirtiest index LEB numbers as at the start of + * last commit. Try that array first. + */ + err = find_dirtiest_idx_leb(c); + + /* Next try scanning the entire LPT */ + if (err == -ENOSPC) + err = find_dirty_idx_leb(c); + + /* Finally take any index LEBs awaiting trivial GC */ + if (err == -ENOSPC) + err = get_idx_gc_leb(c); + + ubifs_release_lprops(c); + return err; +} diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c new file mode 100644 index 000000000000..02aba36fe3d4 --- /dev/null +++ b/fs/ubifs/gc.c @@ -0,0 +1,787 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements garbage collection. The procedure for garbage collection + * is different depending on whether a LEB as an index LEB (contains index + * nodes) or not. For non-index LEBs, garbage collection finds a LEB which + * contains a lot of dirty space (obsolete nodes), and copies the non-obsolete + * nodes to the journal, at which point the garbage-collected LEB is free to be + * reused. For index LEBs, garbage collection marks the non-obsolete index nodes + * dirty in the TNC, and after the next commit, the garbage-collected LEB is + * to be reused. Garbage collection will cause the number of dirty index nodes + * to grow, however sufficient space is reserved for the index to ensure the + * commit will never run out of space. + */ + +#include <linux/pagemap.h> +#include "ubifs.h" + +/* + * GC tries to optimize the way it fit nodes to available space, and it sorts + * nodes a little. The below constants are watermarks which define "large", + * "medium", and "small" nodes. + */ +#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4) +#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ + +/* + * GC may need to move more then one LEB to make progress. The below constants + * define "soft" and "hard" limits on the number of LEBs the garbage collector + * may move. + */ +#define SOFT_LEBS_LIMIT 4 +#define HARD_LEBS_LIMIT 32 + +/** + * switch_gc_head - switch the garbage collection journal head. + * @c: UBIFS file-system description object + * @buf: buffer to write + * @len: length of the buffer to write + * @lnum: LEB number written is returned here + * @offs: offset written is returned here + * + * This function switch the GC head to the next LEB which is reserved in + * @c->gc_lnum. Returns %0 in case of success, %-EAGAIN if commit is required, + * and other negative error code in case of failures. + */ +static int switch_gc_head(struct ubifs_info *c) +{ + int err, gc_lnum = c->gc_lnum; + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; + + ubifs_assert(gc_lnum != -1); + dbg_gc("switch GC head from LEB %d:%d to LEB %d (waste %d bytes)", + wbuf->lnum, wbuf->offs + wbuf->used, gc_lnum, + c->leb_size - wbuf->offs - wbuf->used); + + err = ubifs_wbuf_sync_nolock(wbuf); + if (err) + return err; + + /* + * The GC write-buffer was synchronized, we may safely unmap + * 'c->gc_lnum'. + */ + err = ubifs_leb_unmap(c, gc_lnum); + if (err) + return err; + + err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0); + if (err) + return err; + + c->gc_lnum = -1; + err = ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0, UBI_LONGTERM); + return err; +} + +/** + * move_nodes - move nodes. + * @c: UBIFS file-system description object + * @sleb: describes nodes to move + * + * This function moves valid nodes from data LEB described by @sleb to the GC + * journal head. The obsolete nodes are dropped. + * + * When moving nodes we have to deal with classical bin-packing problem: the + * space in the current GC journal head LEB and in @c->gc_lnum are the "bins", + * where the nodes in the @sleb->nodes list are the elements which should be + * fit optimally to the bins. This function uses the "first fit decreasing" + * strategy, although it does not really sort the nodes but just split them on + * 3 classes - large, medium, and small, so they are roughly sorted. + * + * This function returns zero in case of success, %-EAGAIN if commit is + * required, and other negative error codes in case of other failures. + */ +static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) +{ + struct ubifs_scan_node *snod, *tmp; + struct list_head large, medium, small; + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; + int avail, err, min = INT_MAX; + + INIT_LIST_HEAD(&large); + INIT_LIST_HEAD(&medium); + INIT_LIST_HEAD(&small); + + list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { + struct list_head *lst; + + ubifs_assert(snod->type != UBIFS_IDX_NODE); + ubifs_assert(snod->type != UBIFS_REF_NODE); + ubifs_assert(snod->type != UBIFS_CS_NODE); + + err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, + snod->offs, 0); + if (err < 0) + goto out; + + lst = &snod->list; + list_del(lst); + if (!err) { + /* The node is obsolete, remove it from the list */ + kfree(snod); + continue; + } + + /* + * Sort the list of nodes so that large nodes go first, and + * small nodes go last. + */ + if (snod->len > MEDIUM_NODE_WM) + list_add(lst, &large); + else if (snod->len > SMALL_NODE_WM) + list_add(lst, &medium); + else + list_add(lst, &small); + + /* And find the smallest node */ + if (snod->len < min) + min = snod->len; + } + + /* + * Join the tree lists so that we'd have one roughly sorted list + * ('large' will be the head of the joined list). + */ + list_splice(&medium, large.prev); + list_splice(&small, large.prev); + + if (wbuf->lnum == -1) { + /* + * The GC journal head is not set, because it is the first GC + * invocation since mount. + */ + err = switch_gc_head(c); + if (err) + goto out; + } + + /* Write nodes to their new location. Use the first-fit strategy */ + while (1) { + avail = c->leb_size - wbuf->offs - wbuf->used; + list_for_each_entry_safe(snod, tmp, &large, list) { + int new_lnum, new_offs; + + if (avail < min) + break; + + if (snod->len > avail) + /* This node does not fit */ + continue; + + cond_resched(); + + new_lnum = wbuf->lnum; + new_offs = wbuf->offs + wbuf->used; + err = ubifs_wbuf_write_nolock(wbuf, snod->node, + snod->len); + if (err) + goto out; + err = ubifs_tnc_replace(c, &snod->key, sleb->lnum, + snod->offs, new_lnum, new_offs, + snod->len); + if (err) + goto out; + + avail = c->leb_size - wbuf->offs - wbuf->used; + list_del(&snod->list); + kfree(snod); + } + + if (list_empty(&large)) + break; + + /* + * Waste the rest of the space in the LEB and switch to the + * next LEB. + */ + err = switch_gc_head(c); + if (err) + goto out; + } + + return 0; + +out: + list_for_each_entry_safe(snod, tmp, &large, list) { + list_del(&snod->list); + kfree(snod); + } + return err; +} + +/** + * gc_sync_wbufs - sync write-buffers for GC. + * @c: UBIFS file-system description object + * + * We must guarantee that obsoleting nodes are on flash. Unfortunately they may + * be in a write-buffer instead. That is, a node could be written to a + * write-buffer, obsoleting another node in a LEB that is GC'd. If that LEB is + * erased before the write-buffer is sync'd and then there is an unclean + * unmount, then an existing node is lost. To avoid this, we sync all + * write-buffers. + * + * This function returns %0 on success or a negative error code on failure. + */ +static int gc_sync_wbufs(struct ubifs_info *c) +{ + int err, i; + + for (i = 0; i < c->jhead_cnt; i++) { + if (i == GCHD) + continue; + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + return err; + } + return 0; +} + +/** + * ubifs_garbage_collect_leb - garbage-collect a logical eraseblock. + * @c: UBIFS file-system description object + * @lp: describes the LEB to garbage collect + * + * This function garbage-collects an LEB and returns one of the @LEB_FREED, + * @LEB_RETAINED, etc positive codes in case of success, %-EAGAIN if commit is + * required, and other negative error codes in case of failures. + */ +int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; + int err = 0, lnum = lp->lnum; + + ubifs_assert(c->gc_lnum != -1 || wbuf->offs + wbuf->used == 0 || + c->need_recovery); + ubifs_assert(c->gc_lnum != lnum); + ubifs_assert(wbuf->lnum != lnum); + + /* + * We scan the entire LEB even though we only really need to scan up to + * (c->leb_size - lp->free). + */ + sleb = ubifs_scan(c, lnum, 0, c->sbuf); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + + ubifs_assert(!list_empty(&sleb->nodes)); + snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list); + + if (snod->type == UBIFS_IDX_NODE) { + struct ubifs_gced_idx_leb *idx_gc; + + dbg_gc("indexing LEB %d (free %d, dirty %d)", + lnum, lp->free, lp->dirty); + list_for_each_entry(snod, &sleb->nodes, list) { + struct ubifs_idx_node *idx = snod->node; + int level = le16_to_cpu(idx->level); + + ubifs_assert(snod->type == UBIFS_IDX_NODE); + key_read(c, ubifs_idx_key(c, idx), &snod->key); + err = ubifs_dirty_idx_node(c, &snod->key, level, lnum, + snod->offs); + if (err) + goto out; + } + + idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS); + if (!idx_gc) { + err = -ENOMEM; + goto out; + } + + idx_gc->lnum = lnum; + idx_gc->unmap = 0; + list_add(&idx_gc->list, &c->idx_gc); + + /* + * Don't release the LEB until after the next commit, because + * it may contain date which is needed for recovery. So + * although we freed this LEB, it will become usable only after + * the commit. + */ + err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, + LPROPS_INDEX, 1); + if (err) + goto out; + err = LEB_FREED_IDX; + } else { + dbg_gc("data LEB %d (free %d, dirty %d)", + lnum, lp->free, lp->dirty); + + err = move_nodes(c, sleb); + if (err) + goto out_inc_seq; + + err = gc_sync_wbufs(c); + if (err) + goto out_inc_seq; + + err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0); + if (err) + goto out_inc_seq; + + /* Allow for races with TNC */ + c->gced_lnum = lnum; + smp_wmb(); + c->gc_seq += 1; + smp_wmb(); + + if (c->gc_lnum == -1) { + c->gc_lnum = lnum; + err = LEB_RETAINED; + } else { + err = ubifs_wbuf_sync_nolock(wbuf); + if (err) + goto out; + + err = ubifs_leb_unmap(c, lnum); + if (err) + goto out; + + err = LEB_FREED; + } + } + +out: + ubifs_scan_destroy(sleb); + return err; + +out_inc_seq: + /* We may have moved at least some nodes so allow for races with TNC */ + c->gced_lnum = lnum; + smp_wmb(); + c->gc_seq += 1; + smp_wmb(); + goto out; +} + +/** + * ubifs_garbage_collect - UBIFS garbage collector. + * @c: UBIFS file-system description object + * @anyway: do GC even if there are free LEBs + * + * This function does out-of-place garbage collection. The return codes are: + * o positive LEB number if the LEB has been freed and may be used; + * o %-EAGAIN if the caller has to run commit; + * o %-ENOSPC if GC failed to make any progress; + * o other negative error codes in case of other errors. + * + * Garbage collector writes data to the journal when GC'ing data LEBs, and just + * marking indexing nodes dirty when GC'ing indexing LEBs. Thus, at some point + * commit may be required. But commit cannot be run from inside GC, because the + * caller might be holding the commit lock, so %-EAGAIN is returned instead; + * And this error code means that the caller has to run commit, and re-run GC + * if there is still no free space. + * + * There are many reasons why this function may return %-EAGAIN: + * o the log is full and there is no space to write an LEB reference for + * @c->gc_lnum; + * o the journal is too large and exceeds size limitations; + * o GC moved indexing LEBs, but they can be used only after the commit; + * o the shrinker fails to find clean znodes to free and requests the commit; + * o etc. + * + * Note, if the file-system is close to be full, this function may return + * %-EAGAIN infinitely, so the caller has to limit amount of re-invocations of + * the function. E.g., this happens if the limits on the journal size are too + * tough and GC writes too much to the journal before an LEB is freed. This + * might also mean that the journal is too large, and the TNC becomes to big, + * so that the shrinker is constantly called, finds not clean znodes to free, + * and requests commit. Well, this may also happen if the journal is all right, + * but another kernel process consumes too much memory. Anyway, infinite + * %-EAGAIN may happen, but in some extreme/misconfiguration cases. + */ +int ubifs_garbage_collect(struct ubifs_info *c, int anyway) +{ + int i, err, ret, min_space = c->dead_wm; + struct ubifs_lprops lp; + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; + + ubifs_assert_cmt_locked(c); + + if (ubifs_gc_should_commit(c)) + return -EAGAIN; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + + if (c->ro_media) { + ret = -EROFS; + goto out_unlock; + } + + /* We expect the write-buffer to be empty on entry */ + ubifs_assert(!wbuf->used); + + for (i = 0; ; i++) { + int space_before = c->leb_size - wbuf->offs - wbuf->used; + int space_after; + + cond_resched(); + + /* Give the commit an opportunity to run */ + if (ubifs_gc_should_commit(c)) { + ret = -EAGAIN; + break; + } + + if (i > SOFT_LEBS_LIMIT && !list_empty(&c->idx_gc)) { + /* + * We've done enough iterations. Indexing LEBs were + * moved and will be available after the commit. + */ + dbg_gc("soft limit, some index LEBs GC'ed, -EAGAIN"); + ubifs_commit_required(c); + ret = -EAGAIN; + break; + } + + if (i > HARD_LEBS_LIMIT) { + /* + * We've moved too many LEBs and have not made + * progress, give up. + */ + dbg_gc("hard limit, -ENOSPC"); + ret = -ENOSPC; + break; + } + + /* + * Empty and freeable LEBs can turn up while we waited for + * the wbuf lock, or while we have been running GC. In that + * case, we should just return one of those instead of + * continuing to GC dirty LEBs. Hence we request + * 'ubifs_find_dirty_leb()' to return an empty LEB if it can. + */ + ret = ubifs_find_dirty_leb(c, &lp, min_space, anyway ? 0 : 1); + if (ret) { + if (ret == -ENOSPC) + dbg_gc("no more dirty LEBs"); + break; + } + + dbg_gc("found LEB %d: free %d, dirty %d, sum %d " + "(min. space %d)", lp.lnum, lp.free, lp.dirty, + lp.free + lp.dirty, min_space); + + if (lp.free + lp.dirty == c->leb_size) { + /* An empty LEB was returned */ + dbg_gc("LEB %d is free, return it", lp.lnum); + /* + * ubifs_find_dirty_leb() doesn't return freeable index + * LEBs. + */ + ubifs_assert(!(lp.flags & LPROPS_INDEX)); + if (lp.free != c->leb_size) { + /* + * Write buffers must be sync'd before + * unmapping freeable LEBs, because one of them + * may contain data which obsoletes something + * in 'lp.pnum'. + */ + ret = gc_sync_wbufs(c); + if (ret) + goto out; + ret = ubifs_change_one_lp(c, lp.lnum, + c->leb_size, 0, 0, 0, + 0); + if (ret) + goto out; + } + ret = ubifs_leb_unmap(c, lp.lnum); + if (ret) + goto out; + ret = lp.lnum; + break; + } + + space_before = c->leb_size - wbuf->offs - wbuf->used; + if (wbuf->lnum == -1) + space_before = 0; + + ret = ubifs_garbage_collect_leb(c, &lp); + if (ret < 0) { + if (ret == -EAGAIN || ret == -ENOSPC) { + /* + * These codes are not errors, so we have to + * return the LEB to lprops. But if the + * 'ubifs_return_leb()' function fails, its + * failure code is propagated to the caller + * instead of the original '-EAGAIN' or + * '-ENOSPC'. + */ + err = ubifs_return_leb(c, lp.lnum); + if (err) + ret = err; + break; + } + goto out; + } + + if (ret == LEB_FREED) { + /* An LEB has been freed and is ready for use */ + dbg_gc("LEB %d freed, return", lp.lnum); + ret = lp.lnum; + break; + } + + if (ret == LEB_FREED_IDX) { + /* + * This was an indexing LEB and it cannot be + * immediately used. And instead of requesting the + * commit straight away, we try to garbage collect some + * more. + */ + dbg_gc("indexing LEB %d freed, continue", lp.lnum); + continue; + } + + ubifs_assert(ret == LEB_RETAINED); + space_after = c->leb_size - wbuf->offs - wbuf->used; + dbg_gc("LEB %d retained, freed %d bytes", lp.lnum, + space_after - space_before); + + if (space_after > space_before) { + /* GC makes progress, keep working */ + min_space >>= 1; + if (min_space < c->dead_wm) + min_space = c->dead_wm; + continue; + } + + dbg_gc("did not make progress"); + + /* + * GC moved an LEB bud have not done any progress. This means + * that the previous GC head LEB contained too few free space + * and the LEB which was GC'ed contained only large nodes which + * did not fit that space. + * + * We can do 2 things: + * 1. pick another LEB in a hope it'll contain a small node + * which will fit the space we have at the end of current GC + * head LEB, but there is no guarantee, so we try this out + * unless we have already been working for too long; + * 2. request an LEB with more dirty space, which will force + * 'ubifs_find_dirty_leb()' to start scanning the lprops + * table, instead of just picking one from the heap + * (previously it already picked the dirtiest LEB). + */ + if (i < SOFT_LEBS_LIMIT) { + dbg_gc("try again"); + continue; + } + + min_space <<= 1; + if (min_space > c->dark_wm) + min_space = c->dark_wm; + dbg_gc("set min. space to %d", min_space); + } + + if (ret == -ENOSPC && !list_empty(&c->idx_gc)) { + dbg_gc("no space, some index LEBs GC'ed, -EAGAIN"); + ubifs_commit_required(c); + ret = -EAGAIN; + } + + err = ubifs_wbuf_sync_nolock(wbuf); + if (!err) + err = ubifs_leb_unmap(c, c->gc_lnum); + if (err) { + ret = err; + goto out; + } +out_unlock: + mutex_unlock(&wbuf->io_mutex); + return ret; + +out: + ubifs_assert(ret < 0); + ubifs_assert(ret != -ENOSPC && ret != -EAGAIN); + ubifs_ro_mode(c, ret); + ubifs_wbuf_sync_nolock(wbuf); + mutex_unlock(&wbuf->io_mutex); + ubifs_return_leb(c, lp.lnum); + return ret; +} + +/** + * ubifs_gc_start_commit - garbage collection at start of commit. + * @c: UBIFS file-system description object + * + * If a LEB has only dirty and free space, then we may safely unmap it and make + * it free. Note, we cannot do this with indexing LEBs because dirty space may + * correspond index nodes that are required for recovery. In that case, the + * LEB cannot be unmapped until after the next commit. + * + * This function returns %0 upon success and a negative error code upon failure. + */ +int ubifs_gc_start_commit(struct ubifs_info *c) +{ + struct ubifs_gced_idx_leb *idx_gc; + const struct ubifs_lprops *lp; + int err = 0, flags; + + ubifs_get_lprops(c); + + /* + * Unmap (non-index) freeable LEBs. Note that recovery requires that all + * wbufs are sync'd before this, which is done in 'do_commit()'. + */ + while (1) { + lp = ubifs_fast_find_freeable(c); + if (unlikely(IS_ERR(lp))) { + err = PTR_ERR(lp); + goto out; + } + if (!lp) + break; + ubifs_assert(!(lp->flags & LPROPS_TAKEN)); + ubifs_assert(!(lp->flags & LPROPS_INDEX)); + err = ubifs_leb_unmap(c, lp->lnum); + if (err) + goto out; + lp = ubifs_change_lp(c, lp, c->leb_size, 0, lp->flags, 0); + if (unlikely(IS_ERR(lp))) { + err = PTR_ERR(lp); + goto out; + } + ubifs_assert(!(lp->flags & LPROPS_TAKEN)); + ubifs_assert(!(lp->flags & LPROPS_INDEX)); + } + + /* Mark GC'd index LEBs OK to unmap after this commit finishes */ + list_for_each_entry(idx_gc, &c->idx_gc, list) + idx_gc->unmap = 1; + + /* Record index freeable LEBs for unmapping after commit */ + while (1) { + lp = ubifs_fast_find_frdi_idx(c); + if (unlikely(IS_ERR(lp))) { + err = PTR_ERR(lp); + goto out; + } + if (!lp) + break; + idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS); + if (!idx_gc) { + err = -ENOMEM; + goto out; + } + ubifs_assert(!(lp->flags & LPROPS_TAKEN)); + ubifs_assert(lp->flags & LPROPS_INDEX); + /* Don't release the LEB until after the next commit */ + flags = (lp->flags | LPROPS_TAKEN) ^ LPROPS_INDEX; + lp = ubifs_change_lp(c, lp, c->leb_size, 0, flags, 1); + if (unlikely(IS_ERR(lp))) { + err = PTR_ERR(lp); + kfree(idx_gc); + goto out; + } + ubifs_assert(lp->flags & LPROPS_TAKEN); + ubifs_assert(!(lp->flags & LPROPS_INDEX)); + idx_gc->lnum = lp->lnum; + idx_gc->unmap = 1; + list_add(&idx_gc->list, &c->idx_gc); + } +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_gc_end_commit - garbage collection at end of commit. + * @c: UBIFS file-system description object + * + * This function completes out-of-place garbage collection of index LEBs. + */ +int ubifs_gc_end_commit(struct ubifs_info *c) +{ + struct ubifs_gced_idx_leb *idx_gc, *tmp; + struct ubifs_wbuf *wbuf; + int err = 0; + + wbuf = &c->jheads[GCHD].wbuf; + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + list_for_each_entry_safe(idx_gc, tmp, &c->idx_gc, list) + if (idx_gc->unmap) { + dbg_gc("LEB %d", idx_gc->lnum); + err = ubifs_leb_unmap(c, idx_gc->lnum); + if (err) + goto out; + err = ubifs_change_one_lp(c, idx_gc->lnum, LPROPS_NC, + LPROPS_NC, 0, LPROPS_TAKEN, -1); + if (err) + goto out; + list_del(&idx_gc->list); + kfree(idx_gc); + } +out: + mutex_unlock(&wbuf->io_mutex); + return err; +} + +/** + * ubifs_destroy_idx_gc - destroy idx_gc list. + * @c: UBIFS file-system description object + * + * This function destroys the idx_gc list. It is called when unmounting or + * remounting read-only so locks are not needed. + */ +void ubifs_destroy_idx_gc(struct ubifs_info *c) +{ + while (!list_empty(&c->idx_gc)) { + struct ubifs_gced_idx_leb *idx_gc; + + idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb, + list); + c->idx_gc_cnt -= 1; + list_del(&idx_gc->list); + kfree(idx_gc); + } + +} + +/** + * ubifs_get_idx_gc_leb - get a LEB from GC'd index LEB list. + * @c: UBIFS file-system description object + * + * Called during start commit so locks are not needed. + */ +int ubifs_get_idx_gc_leb(struct ubifs_info *c) +{ + struct ubifs_gced_idx_leb *idx_gc; + int lnum; + + if (list_empty(&c->idx_gc)) + return -ENOSPC; + idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb, list); + lnum = idx_gc->lnum; + /* c->idx_gc_cnt is updated by the caller when lprops are updated */ + list_del(&idx_gc->list); + kfree(idx_gc); + return lnum; +} diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c new file mode 100644 index 000000000000..054363f2b207 --- /dev/null +++ b/fs/ubifs/io.c @@ -0,0 +1,928 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * Copyright (C) 2006, 2007 University of Szeged, Hungary + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + * Zoltan Sogor + */ + +/* + * This file implements UBIFS I/O subsystem which provides various I/O-related + * helper functions (reading/writing/checking/validating nodes) and implements + * write-buffering support. Write buffers help to save space which otherwise + * would have been wasted for padding to the nearest minimal I/O unit boundary. + * Instead, data first goes to the write-buffer and is flushed when the + * buffer is full or when it is not used for some time (by timer). This is + * similarto the mechanism is used by JFFS2. + * + * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by + * mutexes defined inside these objects. Since sometimes upper-level code + * has to lock the write-buffer (e.g. journal space reservation code), many + * functions related to write-buffers have "nolock" suffix which means that the + * caller has to lock the write-buffer before calling this function. + * + * UBIFS stores nodes at 64 bit-aligned addresses. If the node length is not + * aligned, UBIFS starts the next node from the aligned address, and the padded + * bytes may contain any rubbish. In other words, UBIFS does not put padding + * bytes in those small gaps. Common headers of nodes store real node lengths, + * not aligned lengths. Indexing nodes also store real lengths in branches. + * + * UBIFS uses padding when it pads to the next min. I/O unit. In this case it + * uses padding nodes or padding bytes, if the padding node does not fit. + * + * All UBIFS nodes are protected by CRC checksums and UBIFS checks all nodes + * every time they are read from the flash media. + */ + +#include <linux/crc32.h> +#include "ubifs.h" + +/** + * ubifs_ro_mode - switch UBIFS to read read-only mode. + * @c: UBIFS file-system description object + * @err: error code which is the reason of switching to R/O mode + */ +void ubifs_ro_mode(struct ubifs_info *c, int err) +{ + if (!c->ro_media) { + c->ro_media = 1; + ubifs_warn("switched to read-only mode, error %d", err); + dbg_dump_stack(); + } +} + +/** + * ubifs_check_node - check node. + * @c: UBIFS file-system description object + * @buf: node to check + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * @quiet: print no messages + * + * This function checks node magic number and CRC checksum. This function also + * validates node length to prevent UBIFS from becoming crazy when an attacker + * feeds it a file-system image with incorrect nodes. For example, too large + * node length in the common header could cause UBIFS to read memory outside of + * allocated buffer when checking the CRC checksum. + * + * This function returns zero in case of success %-EUCLEAN in case of bad CRC + * or magic. + */ +int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, + int offs, int quiet) +{ + int err = -EINVAL, type, node_len; + uint32_t crc, node_crc, magic; + const struct ubifs_ch *ch = buf; + + ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + + magic = le32_to_cpu(ch->magic); + if (magic != UBIFS_NODE_MAGIC) { + if (!quiet) + ubifs_err("bad magic %#08x, expected %#08x", + magic, UBIFS_NODE_MAGIC); + err = -EUCLEAN; + goto out; + } + + type = ch->node_type; + if (type < 0 || type >= UBIFS_NODE_TYPES_CNT) { + if (!quiet) + ubifs_err("bad node type %d", type); + goto out; + } + + node_len = le32_to_cpu(ch->len); + if (node_len + offs > c->leb_size) + goto out_len; + + if (c->ranges[type].max_len == 0) { + if (node_len != c->ranges[type].len) + goto out_len; + } else if (node_len < c->ranges[type].min_len || + node_len > c->ranges[type].max_len) + goto out_len; + + crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); + node_crc = le32_to_cpu(ch->crc); + if (crc != node_crc) { + if (!quiet) + ubifs_err("bad CRC: calculated %#08x, read %#08x", + crc, node_crc); + err = -EUCLEAN; + goto out; + } + + return 0; + +out_len: + if (!quiet) + ubifs_err("bad node length %d", node_len); +out: + if (!quiet) { + ubifs_err("bad node at LEB %d:%d", lnum, offs); + dbg_dump_node(c, buf); + dbg_dump_stack(); + } + return err; +} + +/** + * ubifs_pad - pad flash space. + * @c: UBIFS file-system description object + * @buf: buffer to put padding to + * @pad: how many bytes to pad + * + * The flash media obliges us to write only in chunks of %c->min_io_size and + * when we have to write less data we add padding node to the write-buffer and + * pad it to the next minimal I/O unit's boundary. Padding nodes help when the + * media is being scanned. If the amount of wasted space is not enough to fit a + * padding node which takes %UBIFS_PAD_NODE_SZ bytes, we write padding bytes + * pattern (%UBIFS_PADDING_BYTE). + * + * Padding nodes are also used to fill gaps when the "commit-in-gaps" method is + * used. + */ +void ubifs_pad(const struct ubifs_info *c, void *buf, int pad) +{ + uint32_t crc; + + ubifs_assert(pad >= 0 && !(pad & 7)); + + if (pad >= UBIFS_PAD_NODE_SZ) { + struct ubifs_ch *ch = buf; + struct ubifs_pad_node *pad_node = buf; + + ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC); + ch->node_type = UBIFS_PAD_NODE; + ch->group_type = UBIFS_NO_NODE_GROUP; + ch->padding[0] = ch->padding[1] = 0; + ch->sqnum = 0; + ch->len = cpu_to_le32(UBIFS_PAD_NODE_SZ); + pad -= UBIFS_PAD_NODE_SZ; + pad_node->pad_len = cpu_to_le32(pad); + crc = crc32(UBIFS_CRC32_INIT, buf + 8, UBIFS_PAD_NODE_SZ - 8); + ch->crc = cpu_to_le32(crc); + memset(buf + UBIFS_PAD_NODE_SZ, 0, pad); + } else if (pad > 0) + /* Too little space, padding node won't fit */ + memset(buf, UBIFS_PADDING_BYTE, pad); +} + +/** + * next_sqnum - get next sequence number. + * @c: UBIFS file-system description object + */ +static unsigned long long next_sqnum(struct ubifs_info *c) +{ + unsigned long long sqnum; + + spin_lock(&c->cnt_lock); + sqnum = ++c->max_sqnum; + spin_unlock(&c->cnt_lock); + + if (unlikely(sqnum >= SQNUM_WARN_WATERMARK)) { + if (sqnum >= SQNUM_WATERMARK) { + ubifs_err("sequence number overflow %llu, end of life", + sqnum); + ubifs_ro_mode(c, -EINVAL); + } + ubifs_warn("running out of sequence numbers, end of life soon"); + } + + return sqnum; +} + +/** + * ubifs_prepare_node - prepare node to be written to flash. + * @c: UBIFS file-system description object + * @node: the node to pad + * @len: node length + * @pad: if the buffer has to be padded + * + * This function prepares node at @node to be written to the media - it + * calculates node CRC, fills the common header, and adds proper padding up to + * the next minimum I/O unit if @pad is not zero. + */ +void ubifs_prepare_node(struct ubifs_info *c, void *node, int len, int pad) +{ + uint32_t crc; + struct ubifs_ch *ch = node; + unsigned long long sqnum = next_sqnum(c); + + ubifs_assert(len >= UBIFS_CH_SZ); + + ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC); + ch->len = cpu_to_le32(len); + ch->group_type = UBIFS_NO_NODE_GROUP; + ch->sqnum = cpu_to_le64(sqnum); + ch->padding[0] = ch->padding[1] = 0; + crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8); + ch->crc = cpu_to_le32(crc); + + if (pad) { + len = ALIGN(len, 8); + pad = ALIGN(len, c->min_io_size) - len; + ubifs_pad(c, node + len, pad); + } +} + +/** + * ubifs_prep_grp_node - prepare node of a group to be written to flash. + * @c: UBIFS file-system description object + * @node: the node to pad + * @len: node length + * @last: indicates the last node of the group + * + * This function prepares node at @node to be written to the media - it + * calculates node CRC and fills the common header. + */ +void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last) +{ + uint32_t crc; + struct ubifs_ch *ch = node; + unsigned long long sqnum = next_sqnum(c); + + ubifs_assert(len >= UBIFS_CH_SZ); + + ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC); + ch->len = cpu_to_le32(len); + if (last) + ch->group_type = UBIFS_LAST_OF_NODE_GROUP; + else + ch->group_type = UBIFS_IN_NODE_GROUP; + ch->sqnum = cpu_to_le64(sqnum); + ch->padding[0] = ch->padding[1] = 0; + crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8); + ch->crc = cpu_to_le32(crc); +} + +/** + * wbuf_timer_callback - write-buffer timer callback function. + * @data: timer data (write-buffer descriptor) + * + * This function is called when the write-buffer timer expires. + */ +static void wbuf_timer_callback_nolock(unsigned long data) +{ + struct ubifs_wbuf *wbuf = (struct ubifs_wbuf *)data; + + wbuf->need_sync = 1; + wbuf->c->need_wbuf_sync = 1; + ubifs_wake_up_bgt(wbuf->c); +} + +/** + * new_wbuf_timer - start new write-buffer timer. + * @wbuf: write-buffer descriptor + */ +static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) +{ + ubifs_assert(!timer_pending(&wbuf->timer)); + + if (!wbuf->timeout) + return; + + wbuf->timer.expires = jiffies + wbuf->timeout; + add_timer(&wbuf->timer); +} + +/** + * cancel_wbuf_timer - cancel write-buffer timer. + * @wbuf: write-buffer descriptor + */ +static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) +{ + /* + * If the syncer is waiting for the lock (from the background thread's + * context) and another task is changing write-buffer then the syncing + * should be canceled. + */ + wbuf->need_sync = 0; + del_timer(&wbuf->timer); +} + +/** + * ubifs_wbuf_sync_nolock - synchronize write-buffer. + * @wbuf: write-buffer to synchronize + * + * This function synchronizes write-buffer @buf and returns zero in case of + * success or a negative error code in case of failure. + */ +int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) +{ + struct ubifs_info *c = wbuf->c; + int err, dirt; + + cancel_wbuf_timer_nolock(wbuf); + if (!wbuf->used || wbuf->lnum == -1) + /* Write-buffer is empty or not seeked */ + return 0; + + dbg_io("LEB %d:%d, %d bytes", + wbuf->lnum, wbuf->offs, wbuf->used); + ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY)); + ubifs_assert(!(wbuf->avail & 7)); + ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size); + + if (c->ro_media) + return -EROFS; + + ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail); + err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, + c->min_io_size, wbuf->dtype); + if (err) { + ubifs_err("cannot write %d bytes to LEB %d:%d", + c->min_io_size, wbuf->lnum, wbuf->offs); + dbg_dump_stack(); + return err; + } + + dirt = wbuf->avail; + + spin_lock(&wbuf->lock); + wbuf->offs += c->min_io_size; + wbuf->avail = c->min_io_size; + wbuf->used = 0; + wbuf->next_ino = 0; + spin_unlock(&wbuf->lock); + + if (wbuf->sync_callback) + err = wbuf->sync_callback(c, wbuf->lnum, + c->leb_size - wbuf->offs, dirt); + return err; +} + +/** + * ubifs_wbuf_seek_nolock - seek write-buffer. + * @wbuf: write-buffer + * @lnum: logical eraseblock number to seek to + * @offs: logical eraseblock offset to seek to + * @dtype: data type + * + * This function targets the write buffer to logical eraseblock @lnum:@offs. + * The write-buffer is synchronized if it is not empty. Returns zero in case of + * success and a negative error code in case of failure. + */ +int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, + int dtype) +{ + const struct ubifs_info *c = wbuf->c; + + dbg_io("LEB %d:%d", lnum, offs); + ubifs_assert(lnum >= 0 && lnum < c->leb_cnt); + ubifs_assert(offs >= 0 && offs <= c->leb_size); + ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7)); + ubifs_assert(lnum != wbuf->lnum); + + if (wbuf->used > 0) { + int err = ubifs_wbuf_sync_nolock(wbuf); + + if (err) + return err; + } + + spin_lock(&wbuf->lock); + wbuf->lnum = lnum; + wbuf->offs = offs; + wbuf->avail = c->min_io_size; + wbuf->used = 0; + spin_unlock(&wbuf->lock); + wbuf->dtype = dtype; + + return 0; +} + +/** + * ubifs_bg_wbufs_sync - synchronize write-buffers. + * @c: UBIFS file-system description object + * + * This function is called by background thread to synchronize write-buffers. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_bg_wbufs_sync(struct ubifs_info *c) +{ + int err, i; + + if (!c->need_wbuf_sync) + return 0; + c->need_wbuf_sync = 0; + + if (c->ro_media) { + err = -EROFS; + goto out_timers; + } + + dbg_io("synchronize"); + for (i = 0; i < c->jhead_cnt; i++) { + struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf; + + cond_resched(); + + /* + * If the mutex is locked then wbuf is being changed, so + * synchronization is not necessary. + */ + if (mutex_is_locked(&wbuf->io_mutex)) + continue; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + if (!wbuf->need_sync) { + mutex_unlock(&wbuf->io_mutex); + continue; + } + + err = ubifs_wbuf_sync_nolock(wbuf); + mutex_unlock(&wbuf->io_mutex); + if (err) { + ubifs_err("cannot sync write-buffer, error %d", err); + ubifs_ro_mode(c, err); + goto out_timers; + } + } + + return 0; + +out_timers: + /* Cancel all timers to prevent repeated errors */ + for (i = 0; i < c->jhead_cnt; i++) { + struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + cancel_wbuf_timer_nolock(wbuf); + mutex_unlock(&wbuf->io_mutex); + } + return err; +} + +/** + * ubifs_wbuf_write_nolock - write data to flash via write-buffer. + * @wbuf: write-buffer + * @buf: node to write + * @len: node length + * + * This function writes data to flash via write-buffer @wbuf. This means that + * the last piece of the node won't reach the flash media immediately if it + * does not take whole minimal I/O unit. Instead, the node will sit in RAM + * until the write-buffer is synchronized (e.g., by timer). + * + * This function returns zero in case of success and a negative error code in + * case of failure. If the node cannot be written because there is no more + * space in this logical eraseblock, %-ENOSPC is returned. + */ +int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) +{ + struct ubifs_info *c = wbuf->c; + int err, written, n, aligned_len = ALIGN(len, 8), offs; + + dbg_io("%d bytes (%s) to wbuf at LEB %d:%d", len, + dbg_ntype(((struct ubifs_ch *)buf)->node_type), wbuf->lnum, + wbuf->offs + wbuf->used); + ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt); + ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0); + ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size); + ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size); + ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); + + if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { + err = -ENOSPC; + goto out; + } + + cancel_wbuf_timer_nolock(wbuf); + + if (c->ro_media) + return -EROFS; + + if (aligned_len <= wbuf->avail) { + /* + * The node is not very large and fits entirely within + * write-buffer. + */ + memcpy(wbuf->buf + wbuf->used, buf, len); + + if (aligned_len == wbuf->avail) { + dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum, + wbuf->offs); + err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, + wbuf->offs, c->min_io_size, + wbuf->dtype); + if (err) + goto out; + + spin_lock(&wbuf->lock); + wbuf->offs += c->min_io_size; + wbuf->avail = c->min_io_size; + wbuf->used = 0; + wbuf->next_ino = 0; + spin_unlock(&wbuf->lock); + } else { + spin_lock(&wbuf->lock); + wbuf->avail -= aligned_len; + wbuf->used += aligned_len; + spin_unlock(&wbuf->lock); + } + + goto exit; + } + + /* + * The node is large enough and does not fit entirely within current + * minimal I/O unit. We have to fill and flush write-buffer and switch + * to the next min. I/O unit. + */ + dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum, wbuf->offs); + memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); + err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, + c->min_io_size, wbuf->dtype); + if (err) + goto out; + + offs = wbuf->offs + c->min_io_size; + len -= wbuf->avail; + aligned_len -= wbuf->avail; + written = wbuf->avail; + + /* + * The remaining data may take more whole min. I/O units, so write the + * remains multiple to min. I/O unit size directly to the flash media. + * We align node length to 8-byte boundary because we anyway flash wbuf + * if the remaining space is less than 8 bytes. + */ + n = aligned_len >> c->min_io_shift; + if (n) { + n <<= c->min_io_shift; + dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs); + err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n, + wbuf->dtype); + if (err) + goto out; + offs += n; + aligned_len -= n; + len -= n; + written += n; + } + + spin_lock(&wbuf->lock); + if (aligned_len) + /* + * And now we have what's left and what does not take whole + * min. I/O unit, so write it to the write-buffer and we are + * done. + */ + memcpy(wbuf->buf, buf + written, len); + + wbuf->offs = offs; + wbuf->used = aligned_len; + wbuf->avail = c->min_io_size - aligned_len; + wbuf->next_ino = 0; + spin_unlock(&wbuf->lock); + +exit: + if (wbuf->sync_callback) { + int free = c->leb_size - wbuf->offs - wbuf->used; + + err = wbuf->sync_callback(c, wbuf->lnum, free, 0); + if (err) + goto out; + } + + if (wbuf->used) + new_wbuf_timer_nolock(wbuf); + + return 0; + +out: + ubifs_err("cannot write %d bytes to LEB %d:%d, error %d", + len, wbuf->lnum, wbuf->offs, err); + dbg_dump_node(c, buf); + dbg_dump_stack(); + dbg_dump_leb(c, wbuf->lnum); + return err; +} + +/** + * ubifs_write_node - write node to the media. + * @c: UBIFS file-system description object + * @buf: the node to write + * @len: node length + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * @dtype: node life-time hint (%UBI_LONGTERM, %UBI_SHORTTERM, %UBI_UNKNOWN) + * + * This function automatically fills node magic number, assigns sequence + * number, and calculates node CRC checksum. The length of the @buf buffer has + * to be aligned to the minimal I/O unit size. This function automatically + * appends padding node and padding bytes if needed. Returns zero in case of + * success and a negative error code in case of failure. + */ +int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum, + int offs, int dtype) +{ + int err, buf_len = ALIGN(len, c->min_io_size); + + dbg_io("LEB %d:%d, %s, length %d (aligned %d)", + lnum, offs, dbg_ntype(((struct ubifs_ch *)buf)->node_type), len, + buf_len); + ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size); + + if (c->ro_media) + return -EROFS; + + ubifs_prepare_node(c, buf, len, 1); + err = ubi_leb_write(c->ubi, lnum, buf, offs, buf_len, dtype); + if (err) { + ubifs_err("cannot write %d bytes to LEB %d:%d, error %d", + buf_len, lnum, offs, err); + dbg_dump_node(c, buf); + dbg_dump_stack(); + } + + return err; +} + +/** + * ubifs_read_node_wbuf - read node from the media or write-buffer. + * @wbuf: wbuf to check for un-written data + * @buf: buffer to read to + * @type: node type + * @len: node length + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * + * This function reads a node of known type and length, checks it and stores + * in @buf. If the node partially or fully sits in the write-buffer, this + * function takes data from the buffer, otherwise it reads the flash media. + * Returns zero in case of success, %-EUCLEAN if CRC mismatched and a negative + * error code in case of failure. + */ +int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, + int lnum, int offs) +{ + const struct ubifs_info *c = wbuf->c; + int err, rlen, overlap; + struct ubifs_ch *ch = buf; + + dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len); + ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT); + + spin_lock(&wbuf->lock); + overlap = (lnum == wbuf->lnum && offs + len > wbuf->offs); + if (!overlap) { + /* We may safely unlock the write-buffer and read the data */ + spin_unlock(&wbuf->lock); + return ubifs_read_node(c, buf, type, len, lnum, offs); + } + + /* Don't read under wbuf */ + rlen = wbuf->offs - offs; + if (rlen < 0) + rlen = 0; + + /* Copy the rest from the write-buffer */ + memcpy(buf + rlen, wbuf->buf + offs + rlen - wbuf->offs, len - rlen); + spin_unlock(&wbuf->lock); + + if (rlen > 0) { + /* Read everything that goes before write-buffer */ + err = ubi_read(c->ubi, lnum, buf, offs, rlen); + if (err && err != -EBADMSG) { + ubifs_err("failed to read node %d from LEB %d:%d, " + "error %d", type, lnum, offs, err); + dbg_dump_stack(); + return err; + } + } + + if (type != ch->node_type) { + ubifs_err("bad node type (%d but expected %d)", + ch->node_type, type); + goto out; + } + + err = ubifs_check_node(c, buf, lnum, offs, 0); + if (err) { + ubifs_err("expected node type %d", type); + return err; + } + + rlen = le32_to_cpu(ch->len); + if (rlen != len) { + ubifs_err("bad node length %d, expected %d", rlen, len); + goto out; + } + + return 0; + +out: + ubifs_err("bad node at LEB %d:%d", lnum, offs); + dbg_dump_node(c, buf); + dbg_dump_stack(); + return -EINVAL; +} + +/** + * ubifs_read_node - read node. + * @c: UBIFS file-system description object + * @buf: buffer to read to + * @type: node type + * @len: node length (not aligned) + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * + * This function reads a node of known type and and length, checks it and + * stores in @buf. Returns zero in case of success, %-EUCLEAN if CRC mismatched + * and a negative error code in case of failure. + */ +int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, + int lnum, int offs) +{ + int err, l; + struct ubifs_ch *ch = buf; + + dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len); + ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(len >= UBIFS_CH_SZ && offs + len <= c->leb_size); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT); + + err = ubi_read(c->ubi, lnum, buf, offs, len); + if (err && err != -EBADMSG) { + ubifs_err("cannot read node %d from LEB %d:%d, error %d", + type, lnum, offs, err); + return err; + } + + if (type != ch->node_type) { + ubifs_err("bad node type (%d but expected %d)", + ch->node_type, type); + goto out; + } + + err = ubifs_check_node(c, buf, lnum, offs, 0); + if (err) { + ubifs_err("expected node type %d", type); + return err; + } + + l = le32_to_cpu(ch->len); + if (l != len) { + ubifs_err("bad node length %d, expected %d", l, len); + goto out; + } + + return 0; + +out: + ubifs_err("bad node at LEB %d:%d", lnum, offs); + dbg_dump_node(c, buf); + dbg_dump_stack(); + return -EINVAL; +} + +/** + * ubifs_wbuf_init - initialize write-buffer. + * @c: UBIFS file-system description object + * @wbuf: write-buffer to initialize + * + * This function initializes write buffer. Returns zero in case of success + * %-ENOMEM in case of failure. + */ +int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) +{ + size_t size; + + wbuf->buf = kmalloc(c->min_io_size, GFP_KERNEL); + if (!wbuf->buf) + return -ENOMEM; + + size = (c->min_io_size / UBIFS_CH_SZ + 1) * sizeof(ino_t); + wbuf->inodes = kmalloc(size, GFP_KERNEL); + if (!wbuf->inodes) { + kfree(wbuf->buf); + wbuf->buf = NULL; + return -ENOMEM; + } + + wbuf->used = 0; + wbuf->lnum = wbuf->offs = -1; + wbuf->avail = c->min_io_size; + wbuf->dtype = UBI_UNKNOWN; + wbuf->sync_callback = NULL; + mutex_init(&wbuf->io_mutex); + spin_lock_init(&wbuf->lock); + + wbuf->c = c; + init_timer(&wbuf->timer); + wbuf->timer.function = wbuf_timer_callback_nolock; + wbuf->timer.data = (unsigned long)wbuf; + wbuf->timeout = DEFAULT_WBUF_TIMEOUT; + wbuf->next_ino = 0; + + return 0; +} + +/** + * ubifs_wbuf_add_ino_nolock - add an inode number into the wbuf inode array. + * @wbuf: the write-buffer whereto add + * @inum: the inode number + * + * This function adds an inode number to the inode array of the write-buffer. + */ +void ubifs_wbuf_add_ino_nolock(struct ubifs_wbuf *wbuf, ino_t inum) +{ + if (!wbuf->buf) + /* NOR flash or something similar */ + return; + + spin_lock(&wbuf->lock); + if (wbuf->used) + wbuf->inodes[wbuf->next_ino++] = inum; + spin_unlock(&wbuf->lock); +} + +/** + * wbuf_has_ino - returns if the wbuf contains data from the inode. + * @wbuf: the write-buffer + * @inum: the inode number + * + * This function returns with %1 if the write-buffer contains some data from the + * given inode otherwise it returns with %0. + */ +static int wbuf_has_ino(struct ubifs_wbuf *wbuf, ino_t inum) +{ + int i, ret = 0; + + spin_lock(&wbuf->lock); + for (i = 0; i < wbuf->next_ino; i++) + if (inum == wbuf->inodes[i]) { + ret = 1; + break; + } + spin_unlock(&wbuf->lock); + + return ret; +} + +/** + * ubifs_sync_wbufs_by_inode - synchronize write-buffers for an inode. + * @c: UBIFS file-system description object + * @inode: inode to synchronize + * + * This function synchronizes write-buffers which contain nodes belonging to + * @inode. Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode) +{ + int i, err = 0; + + for (i = 0; i < c->jhead_cnt; i++) { + struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf; + + if (i == GCHD) + /* + * GC head is special, do not look at it. Even if the + * head contains something related to this inode, it is + * a _copy_ of corresponding on-flash node which sits + * somewhere else. + */ + continue; + + if (!wbuf_has_ino(wbuf, inode->i_ino)) + continue; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + if (wbuf_has_ino(wbuf, inode->i_ino)) + err = ubifs_wbuf_sync_nolock(wbuf); + mutex_unlock(&wbuf->io_mutex); + + if (err) { + ubifs_ro_mode(c, err); + return err; + } + } + return 0; +} diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c new file mode 100644 index 000000000000..5e82cffe9695 --- /dev/null +++ b/fs/ubifs/ioctl.c @@ -0,0 +1,204 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * Copyright (C) 2006, 2007 University of Szeged, Hungary + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Zoltan Sogor + * Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* This file implements EXT2-compatible extended attribute ioctl() calls */ + +#include <linux/compat.h> +#include <linux/smp_lock.h> +#include <linux/mount.h> +#include "ubifs.h" + +/** + * ubifs_set_inode_flags - set VFS inode flags. + * @inode: VFS inode to set flags for + * + * This function propagates flags from UBIFS inode object to VFS inode object. + */ +void ubifs_set_inode_flags(struct inode *inode) +{ + unsigned int flags = ubifs_inode(inode)->flags; + + inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_DIRSYNC); + if (flags & UBIFS_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & UBIFS_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & UBIFS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (flags & UBIFS_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; +} + +/* + * ioctl2ubifs - convert ioctl inode flags to UBIFS inode flags. + * @ioctl_flags: flags to convert + * + * This function convert ioctl flags (@FS_COMPR_FL, etc) to UBIFS inode flags + * (@UBIFS_COMPR_FL, etc). + */ +static int ioctl2ubifs(int ioctl_flags) +{ + int ubifs_flags = 0; + + if (ioctl_flags & FS_COMPR_FL) + ubifs_flags |= UBIFS_COMPR_FL; + if (ioctl_flags & FS_SYNC_FL) + ubifs_flags |= UBIFS_SYNC_FL; + if (ioctl_flags & FS_APPEND_FL) + ubifs_flags |= UBIFS_APPEND_FL; + if (ioctl_flags & FS_IMMUTABLE_FL) + ubifs_flags |= UBIFS_IMMUTABLE_FL; + if (ioctl_flags & FS_DIRSYNC_FL) + ubifs_flags |= UBIFS_DIRSYNC_FL; + + return ubifs_flags; +} + +/* + * ubifs2ioctl - convert UBIFS inode flags to ioctl inode flags. + * @ubifs_flags: flags to convert + * + * This function convert UBIFS (@UBIFS_COMPR_FL, etc) to ioctl flags + * (@FS_COMPR_FL, etc). + */ +static int ubifs2ioctl(int ubifs_flags) +{ + int ioctl_flags = 0; + + if (ubifs_flags & UBIFS_COMPR_FL) + ioctl_flags |= FS_COMPR_FL; + if (ubifs_flags & UBIFS_SYNC_FL) + ioctl_flags |= FS_SYNC_FL; + if (ubifs_flags & UBIFS_APPEND_FL) + ioctl_flags |= FS_APPEND_FL; + if (ubifs_flags & UBIFS_IMMUTABLE_FL) + ioctl_flags |= FS_IMMUTABLE_FL; + if (ubifs_flags & UBIFS_DIRSYNC_FL) + ioctl_flags |= FS_DIRSYNC_FL; + + return ioctl_flags; +} + +static int setflags(struct inode *inode, int flags) +{ + int oldflags, err, release; + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_budget_req req = { .dirtied_ino = 1, + .dirtied_ino_d = ui->data_len }; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + */ + mutex_lock(&ui->ui_mutex); + oldflags = ubifs2ioctl(ui->flags); + if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + err = -EPERM; + goto out_unlock; + } + } + + ui->flags = ioctl2ubifs(flags); + ubifs_set_inode_flags(inode); + inode->i_ctime = ubifs_current_time(inode); + release = ui->dirty; + mark_inode_dirty_sync(inode); + mutex_unlock(&ui->ui_mutex); + + if (release) + ubifs_release_budget(c, &req); + if (IS_SYNC(inode)) + err = write_inode_now(inode, 1); + return err; + +out_unlock: + ubifs_err("can't modify inode %lu attributes", inode->i_ino); + mutex_unlock(&ui->ui_mutex); + ubifs_release_budget(c, &req); + return err; +} + +long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int flags, err; + struct inode *inode = file->f_path.dentry->d_inode; + + switch (cmd) { + case FS_IOC_GETFLAGS: + flags = ubifs2ioctl(ubifs_inode(inode)->flags); + + return put_user(flags, (int __user *) arg); + + case FS_IOC_SETFLAGS: { + if (IS_RDONLY(inode)) + return -EROFS; + + if (!is_owner_or_cap(inode)) + return -EACCES; + + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + + if (!S_ISDIR(inode->i_mode)) + flags &= ~FS_DIRSYNC_FL; + + /* + * Make sure the file-system is read-write and make sure it + * will not become read-only while we are changing the flags. + */ + err = mnt_want_write(file->f_path.mnt); + if (err) + return err; + err = setflags(inode, flags); + mnt_drop_write(file->f_path.mnt); + return err; + } + + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + default: + return -ENOIOCTLCMD; + } + return ubifs_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); +} +#endif diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c new file mode 100644 index 000000000000..22993f867d19 --- /dev/null +++ b/fs/ubifs/journal.c @@ -0,0 +1,1441 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS journal. + * + * The journal consists of 2 parts - the log and bud LEBs. The log has fixed + * length and position, while a bud logical eraseblock is any LEB in the main + * area. Buds contain file system data - data nodes, inode nodes, etc. The log + * contains only references to buds and some other stuff like commit + * start node. The idea is that when we commit the journal, we do + * not copy the data, the buds just become indexed. Since after the commit the + * nodes in bud eraseblocks become leaf nodes of the file system index tree, we + * use term "bud". Analogy is obvious, bud eraseblocks contain nodes which will + * become leafs in the future. + * + * The journal is multi-headed because we want to write data to the journal as + * optimally as possible. It is nice to have nodes belonging to the same inode + * in one LEB, so we may write data owned by different inodes to different + * journal heads, although at present only one data head is used. + * + * For recovery reasons, the base head contains all inode nodes, all directory + * entry nodes and all truncate nodes. This means that the other heads contain + * only data nodes. + * + * Bud LEBs may be half-indexed. For example, if the bud was not full at the + * time of commit, the bud is retained to continue to be used in the journal, + * even though the "front" of the LEB is now indexed. In that case, the log + * reference contains the offset where the bud starts for the purposes of the + * journal. + * + * The journal size has to be limited, because the larger is the journal, the + * longer it takes to mount UBIFS (scanning the journal) and the more memory it + * takes (indexing in the TNC). + * + * All the journal write operations like 'ubifs_jnl_update()' here, which write + * multiple UBIFS nodes to the journal at one go, are atomic with respect to + * unclean reboots. Should the unclean reboot happen, the recovery code drops + * all the nodes. + */ + +#include "ubifs.h" + +/** + * zero_ino_node_unused - zero out unused fields of an on-flash inode node. + * @ino: the inode to zero out + */ +static inline void zero_ino_node_unused(struct ubifs_ino_node *ino) +{ + memset(ino->padding1, 0, 4); + memset(ino->padding2, 0, 26); +} + +/** + * zero_dent_node_unused - zero out unused fields of an on-flash directory + * entry node. + * @dent: the directory entry to zero out + */ +static inline void zero_dent_node_unused(struct ubifs_dent_node *dent) +{ + dent->padding1 = 0; + memset(dent->padding2, 0, 4); +} + +/** + * zero_data_node_unused - zero out unused fields of an on-flash data node. + * @data: the data node to zero out + */ +static inline void zero_data_node_unused(struct ubifs_data_node *data) +{ + memset(data->padding, 0, 2); +} + +/** + * zero_trun_node_unused - zero out unused fields of an on-flash truncation + * node. + * @trun: the truncation node to zero out + */ +static inline void zero_trun_node_unused(struct ubifs_trun_node *trun) +{ + memset(trun->padding, 0, 12); +} + +/** + * reserve_space - reserve space in the journal. + * @c: UBIFS file-system description object + * @jhead: journal head number + * @len: node length + * + * This function reserves space in journal head @head. If the reservation + * succeeded, the journal head stays locked and later has to be unlocked using + * 'release_head()'. 'write_node()' and 'write_head()' functions also unlock + * it. Returns zero in case of success, %-EAGAIN if commit has to be done, and + * other negative error codes in case of other failures. + */ +static int reserve_space(struct ubifs_info *c, int jhead, int len) +{ + int err = 0, err1, retries = 0, avail, lnum, offs, free, squeeze; + struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; + + /* + * Typically, the base head has smaller nodes written to it, so it is + * better to try to allocate space at the ends of eraseblocks. This is + * what the squeeze parameter does. + */ + squeeze = (jhead == BASEHD); +again: + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + + if (c->ro_media) { + err = -EROFS; + goto out_unlock; + } + + avail = c->leb_size - wbuf->offs - wbuf->used; + if (wbuf->lnum != -1 && avail >= len) + return 0; + + /* + * Write buffer wasn't seek'ed or there is no enough space - look for an + * LEB with some empty space. + */ + lnum = ubifs_find_free_space(c, len, &free, squeeze); + if (lnum >= 0) { + /* Found an LEB, add it to the journal head */ + offs = c->leb_size - free; + err = ubifs_add_bud_to_log(c, jhead, lnum, offs); + if (err) + goto out_return; + /* A new bud was successfully allocated and added to the log */ + goto out; + } + + err = lnum; + if (err != -ENOSPC) + goto out_unlock; + + /* + * No free space, we have to run garbage collector to make + * some. But the write-buffer mutex has to be unlocked because + * GC also takes it. + */ + dbg_jnl("no free space jhead %d, run GC", jhead); + mutex_unlock(&wbuf->io_mutex); + + lnum = ubifs_garbage_collect(c, 0); + if (lnum < 0) { + err = lnum; + if (err != -ENOSPC) + return err; + + /* + * GC could not make a free LEB. But someone else may + * have allocated new bud for this journal head, + * because we dropped @wbuf->io_mutex, so try once + * again. + */ + dbg_jnl("GC couldn't make a free LEB for jhead %d", jhead); + if (retries++ < 2) { + dbg_jnl("retry (%d)", retries); + goto again; + } + + dbg_jnl("return -ENOSPC"); + return err; + } + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + dbg_jnl("got LEB %d for jhead %d", lnum, jhead); + avail = c->leb_size - wbuf->offs - wbuf->used; + + if (wbuf->lnum != -1 && avail >= len) { + /* + * Someone else has switched the journal head and we have + * enough space now. This happens when more then one process is + * trying to write to the same journal head at the same time. + */ + dbg_jnl("return LEB %d back, already have LEB %d:%d", + lnum, wbuf->lnum, wbuf->offs + wbuf->used); + err = ubifs_return_leb(c, lnum); + if (err) + goto out_unlock; + return 0; + } + + err = ubifs_add_bud_to_log(c, jhead, lnum, 0); + if (err) + goto out_return; + offs = 0; + +out: + err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, UBI_SHORTTERM); + if (err) + goto out_unlock; + + return 0; + +out_unlock: + mutex_unlock(&wbuf->io_mutex); + return err; + +out_return: + /* An error occurred and the LEB has to be returned to lprops */ + ubifs_assert(err < 0); + err1 = ubifs_return_leb(c, lnum); + if (err1 && err == -EAGAIN) + /* + * Return original error code only if it is not %-EAGAIN, + * which is not really an error. Otherwise, return the error + * code of 'ubifs_return_leb()'. + */ + err = err1; + mutex_unlock(&wbuf->io_mutex); + return err; +} + +/** + * write_node - write node to a journal head. + * @c: UBIFS file-system description object + * @jhead: journal head + * @node: node to write + * @len: node length + * @lnum: LEB number written is returned here + * @offs: offset written is returned here + * + * This function writes a node to reserved space of journal head @jhead. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +static int write_node(struct ubifs_info *c, int jhead, void *node, int len, + int *lnum, int *offs) +{ + struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; + + ubifs_assert(jhead != GCHD); + + *lnum = c->jheads[jhead].wbuf.lnum; + *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used; + + dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len); + ubifs_prepare_node(c, node, len, 0); + + return ubifs_wbuf_write_nolock(wbuf, node, len); +} + +/** + * write_head - write data to a journal head. + * @c: UBIFS file-system description object + * @jhead: journal head + * @buf: buffer to write + * @len: length to write + * @lnum: LEB number written is returned here + * @offs: offset written is returned here + * @sync: non-zero if the write-buffer has to by synchronized + * + * This function is the same as 'write_node()' but it does not assume the + * buffer it is writing is a node, so it does not prepare it (which means + * initializing common header and calculating CRC). + */ +static int write_head(struct ubifs_info *c, int jhead, void *buf, int len, + int *lnum, int *offs, int sync) +{ + int err; + struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; + + ubifs_assert(jhead != GCHD); + + *lnum = c->jheads[jhead].wbuf.lnum; + *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used; + dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len); + + err = ubifs_wbuf_write_nolock(wbuf, buf, len); + if (err) + return err; + if (sync) + err = ubifs_wbuf_sync_nolock(wbuf); + return err; +} + +/** + * make_reservation - reserve journal space. + * @c: UBIFS file-system description object + * @jhead: journal head + * @len: how many bytes to reserve + * + * This function makes space reservation in journal head @jhead. The function + * takes the commit lock and locks the journal head, and the caller has to + * unlock the head and finish the reservation with 'finish_reservation()'. + * Returns zero in case of success and a negative error code in case of + * failure. + * + * Note, the journal head may be unlocked as soon as the data is written, while + * the commit lock has to be released after the data has been added to the + * TNC. + */ +static int make_reservation(struct ubifs_info *c, int jhead, int len) +{ + int err, cmt_retries = 0, nospc_retries = 0; + +again: + down_read(&c->commit_sem); + err = reserve_space(c, jhead, len); + if (!err) + return 0; + up_read(&c->commit_sem); + + if (err == -ENOSPC) { + /* + * GC could not make any progress. We should try to commit + * once because it could make some dirty space and GC would + * make progress, so make the error -EAGAIN so that the below + * will commit and re-try. + */ + if (nospc_retries++ < 2) { + dbg_jnl("no space, retry"); + err = -EAGAIN; + } + + /* + * This means that the budgeting is incorrect. We always have + * to be able to write to the media, because all operations are + * budgeted. Deletions are not budgeted, though, but we reserve + * an extra LEB for them. + */ + } + + if (err != -EAGAIN) + goto out; + + /* + * -EAGAIN means that the journal is full or too large, or the above + * code wants to do one commit. Do this and re-try. + */ + if (cmt_retries > 128) { + /* + * This should not happen unless the journal size limitations + * are too tough. + */ + ubifs_err("stuck in space allocation"); + err = -ENOSPC; + goto out; + } else if (cmt_retries > 32) + ubifs_warn("too many space allocation re-tries (%d)", + cmt_retries); + + dbg_jnl("-EAGAIN, commit and retry (retried %d times)", + cmt_retries); + cmt_retries += 1; + + err = ubifs_run_commit(c); + if (err) + return err; + goto again; + +out: + ubifs_err("cannot reserve %d bytes in jhead %d, error %d", + len, jhead, err); + if (err == -ENOSPC) { + /* This are some budgeting problems, print useful information */ + down_write(&c->commit_sem); + spin_lock(&c->space_lock); + dbg_dump_stack(); + dbg_dump_budg(c); + spin_unlock(&c->space_lock); + dbg_dump_lprops(c); + cmt_retries = dbg_check_lprops(c); + up_write(&c->commit_sem); + } + return err; +} + +/** + * release_head - release a journal head. + * @c: UBIFS file-system description object + * @jhead: journal head + * + * This function releases journal head @jhead which was locked by + * the 'make_reservation()' function. It has to be called after each successful + * 'make_reservation()' invocation. + */ +static inline void release_head(struct ubifs_info *c, int jhead) +{ + mutex_unlock(&c->jheads[jhead].wbuf.io_mutex); +} + +/** + * finish_reservation - finish a reservation. + * @c: UBIFS file-system description object + * + * This function finishes journal space reservation. It must be called after + * 'make_reservation()'. + */ +static void finish_reservation(struct ubifs_info *c) +{ + up_read(&c->commit_sem); +} + +/** + * get_dent_type - translate VFS inode mode to UBIFS directory entry type. + * @mode: inode mode + */ +static int get_dent_type(int mode) +{ + switch (mode & S_IFMT) { + case S_IFREG: + return UBIFS_ITYPE_REG; + case S_IFDIR: + return UBIFS_ITYPE_DIR; + case S_IFLNK: + return UBIFS_ITYPE_LNK; + case S_IFBLK: + return UBIFS_ITYPE_BLK; + case S_IFCHR: + return UBIFS_ITYPE_CHR; + case S_IFIFO: + return UBIFS_ITYPE_FIFO; + case S_IFSOCK: + return UBIFS_ITYPE_SOCK; + default: + BUG(); + } + return 0; +} + +/** + * pack_inode - pack an inode node. + * @c: UBIFS file-system description object + * @ino: buffer in which to pack inode node + * @inode: inode to pack + * @last: indicates the last node of the group + */ +static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino, + const struct inode *inode, int last) +{ + int data_len = 0, last_reference = !inode->i_nlink; + struct ubifs_inode *ui = ubifs_inode(inode); + + ino->ch.node_type = UBIFS_INO_NODE; + ino_key_init_flash(c, &ino->key, inode->i_ino); + ino->creat_sqnum = cpu_to_le64(ui->creat_sqnum); + ino->atime_sec = cpu_to_le64(inode->i_atime.tv_sec); + ino->atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + ino->ctime_sec = cpu_to_le64(inode->i_ctime.tv_sec); + ino->ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ino->mtime_sec = cpu_to_le64(inode->i_mtime.tv_sec); + ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + ino->uid = cpu_to_le32(inode->i_uid); + ino->gid = cpu_to_le32(inode->i_gid); + ino->mode = cpu_to_le32(inode->i_mode); + ino->flags = cpu_to_le32(ui->flags); + ino->size = cpu_to_le64(ui->ui_size); + ino->nlink = cpu_to_le32(inode->i_nlink); + ino->compr_type = cpu_to_le16(ui->compr_type); + ino->data_len = cpu_to_le32(ui->data_len); + ino->xattr_cnt = cpu_to_le32(ui->xattr_cnt); + ino->xattr_size = cpu_to_le32(ui->xattr_size); + ino->xattr_names = cpu_to_le32(ui->xattr_names); + zero_ino_node_unused(ino); + + /* + * Drop the attached data if this is a deletion inode, the data is not + * needed anymore. + */ + if (!last_reference) { + memcpy(ino->data, ui->data, ui->data_len); + data_len = ui->data_len; + } + + ubifs_prep_grp_node(c, ino, UBIFS_INO_NODE_SZ + data_len, last); +} + +/** + * mark_inode_clean - mark UBIFS inode as clean. + * @c: UBIFS file-system description object + * @ui: UBIFS inode to mark as clean + * + * This helper function marks UBIFS inode @ui as clean by cleaning the + * @ui->dirty flag and releasing its budget. Note, VFS may still treat the + * inode as dirty and try to write it back, but 'ubifs_write_inode()' would + * just do nothing. + */ +static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui) +{ + if (ui->dirty) + ubifs_release_dirty_inode_budget(c, ui); + ui->dirty = 0; +} + +/** + * ubifs_jnl_update - update inode. + * @c: UBIFS file-system description object + * @dir: parent inode or host inode in case of extended attributes + * @nm: directory entry name + * @inode: inode to update + * @deletion: indicates a directory entry deletion i.e unlink or rmdir + * @xent: non-zero if the directory entry is an extended attribute entry + * + * This function updates an inode by writing a directory entry (or extended + * attribute entry), the inode itself, and the parent directory inode (or the + * host inode) to the journal. + * + * The function writes the host inode @dir last, which is important in case of + * extended attributes. Indeed, then we guarantee that if the host inode gets + * synchronized (with 'fsync()'), and the write-buffer it sits in gets flushed, + * the extended attribute inode gets flushed too. And this is exactly what the + * user expects - synchronizing the host inode synchronizes its extended + * attributes. Similarly, this guarantees that if @dir is synchronized, its + * directory entry corresponding to @nm gets synchronized too. + * + * If the inode (@inode) or the parent directory (@dir) are synchronous, this + * function synchronizes the write-buffer. + * + * This function marks the @dir and @inode inodes as clean and returns zero on + * success. In case of failure, a negative error code is returned. + */ +int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, + const struct qstr *nm, const struct inode *inode, + int deletion, int xent) +{ + int err, dlen, ilen, len, lnum, ino_offs, dent_offs; + int aligned_dlen, aligned_ilen, sync = IS_DIRSYNC(dir); + int last_reference = !!(deletion && inode->i_nlink == 0); + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_dent_node *dent; + struct ubifs_ino_node *ino; + union ubifs_key dent_key, ino_key; + + dbg_jnl("ino %lu, dent '%.*s', data len %d in dir ino %lu", + inode->i_ino, nm->len, nm->name, ui->data_len, dir->i_ino); + ubifs_assert(dir_ui->data_len == 0); + ubifs_assert(mutex_is_locked(&dir_ui->ui_mutex)); + + dlen = UBIFS_DENT_NODE_SZ + nm->len + 1; + ilen = UBIFS_INO_NODE_SZ; + + /* + * If the last reference to the inode is being deleted, then there is + * no need to attach and write inode data, it is being deleted anyway. + * And if the inode is being deleted, no need to synchronize + * write-buffer even if the inode is synchronous. + */ + if (!last_reference) { + ilen += ui->data_len; + sync |= IS_SYNC(inode); + } + + aligned_dlen = ALIGN(dlen, 8); + aligned_ilen = ALIGN(ilen, 8); + len = aligned_dlen + aligned_ilen + UBIFS_INO_NODE_SZ; + dent = kmalloc(len, GFP_NOFS); + if (!dent) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, len); + if (err) + goto out_free; + + if (!xent) { + dent->ch.node_type = UBIFS_DENT_NODE; + dent_key_init(c, &dent_key, dir->i_ino, nm); + } else { + dent->ch.node_type = UBIFS_XENT_NODE; + xent_key_init(c, &dent_key, dir->i_ino, nm); + } + + key_write(c, &dent_key, dent->key); + dent->inum = deletion ? 0 : cpu_to_le64(inode->i_ino); + dent->type = get_dent_type(inode->i_mode); + dent->nlen = cpu_to_le16(nm->len); + memcpy(dent->name, nm->name, nm->len); + dent->name[nm->len] = '\0'; + zero_dent_node_unused(dent); + ubifs_prep_grp_node(c, dent, dlen, 0); + + ino = (void *)dent + aligned_dlen; + pack_inode(c, ino, inode, 0); + ino = (void *)ino + aligned_ilen; + pack_inode(c, ino, dir, 1); + + if (last_reference) { + err = ubifs_add_orphan(c, inode->i_ino); + if (err) { + release_head(c, BASEHD); + goto out_finish; + } + ui->del_cmtno = c->cmt_no; + } + + err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync); + if (err) + goto out_release; + if (!sync) { + struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf; + + ubifs_wbuf_add_ino_nolock(wbuf, inode->i_ino); + ubifs_wbuf_add_ino_nolock(wbuf, dir->i_ino); + } + release_head(c, BASEHD); + kfree(dent); + + if (deletion) { + err = ubifs_tnc_remove_nm(c, &dent_key, nm); + if (err) + goto out_ro; + err = ubifs_add_dirt(c, lnum, dlen); + } else + err = ubifs_tnc_add_nm(c, &dent_key, lnum, dent_offs, dlen, nm); + if (err) + goto out_ro; + + /* + * Note, we do not remove the inode from TNC even if the last reference + * to it has just been deleted, because the inode may still be opened. + * Instead, the inode has been added to orphan lists and the orphan + * subsystem will take further care about it. + */ + ino_key_init(c, &ino_key, inode->i_ino); + ino_offs = dent_offs + aligned_dlen; + err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, ilen); + if (err) + goto out_ro; + + ino_key_init(c, &ino_key, dir->i_ino); + ino_offs += aligned_ilen; + err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, UBIFS_INO_NODE_SZ); + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&ui->ui_lock); + ui->synced_i_size = ui->ui_size; + spin_unlock(&ui->ui_lock); + mark_inode_clean(c, ui); + mark_inode_clean(c, dir_ui); + return 0; + +out_finish: + finish_reservation(c); +out_free: + kfree(dent); + return err; + +out_release: + release_head(c, BASEHD); +out_ro: + ubifs_ro_mode(c, err); + if (last_reference) + ubifs_delete_orphan(c, inode->i_ino); + finish_reservation(c); + return err; +} + +/** + * ubifs_jnl_write_data - write a data node to the journal. + * @c: UBIFS file-system description object + * @inode: inode the data node belongs to + * @key: node key + * @buf: buffer to write + * @len: data length (must not exceed %UBIFS_BLOCK_SIZE) + * + * This function writes a data node to the journal. Returns %0 if the data node + * was successfully written, and a negative error code in case of failure. + */ +int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, + const union ubifs_key *key, const void *buf, int len) +{ + struct ubifs_data_node *data; + int err, lnum, offs, compr_type, out_len; + int dlen = UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR; + struct ubifs_inode *ui = ubifs_inode(inode); + + dbg_jnl("ino %lu, blk %u, len %d, key %s", key_inum(c, key), + key_block(c, key), len, DBGKEY(key)); + ubifs_assert(len <= UBIFS_BLOCK_SIZE); + + data = kmalloc(dlen, GFP_NOFS); + if (!data) + return -ENOMEM; + + data->ch.node_type = UBIFS_DATA_NODE; + key_write(c, key, &data->key); + data->size = cpu_to_le32(len); + zero_data_node_unused(data); + + if (!(ui->flags && UBIFS_COMPR_FL)) + /* Compression is disabled for this inode */ + compr_type = UBIFS_COMPR_NONE; + else + compr_type = ui->compr_type; + + out_len = dlen - UBIFS_DATA_NODE_SZ; + ubifs_compress(buf, len, &data->data, &out_len, &compr_type); + ubifs_assert(out_len <= UBIFS_BLOCK_SIZE); + + dlen = UBIFS_DATA_NODE_SZ + out_len; + data->compr_type = cpu_to_le16(compr_type); + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, DATAHD, dlen); + if (err) + goto out_free; + + err = write_node(c, DATAHD, data, dlen, &lnum, &offs); + if (err) + goto out_release; + ubifs_wbuf_add_ino_nolock(&c->jheads[DATAHD].wbuf, key_inum(c, key)); + release_head(c, DATAHD); + + err = ubifs_tnc_add(c, key, lnum, offs, dlen); + if (err) + goto out_ro; + + finish_reservation(c); + kfree(data); + return 0; + +out_release: + release_head(c, DATAHD); +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); +out_free: + kfree(data); + return err; +} + +/** + * ubifs_jnl_write_inode - flush inode to the journal. + * @c: UBIFS file-system description object + * @inode: inode to flush + * + * This function writes inode @inode to the journal. If the inode is + * synchronous, it also synchronizes the write-buffer. Returns zero in case of + * success and a negative error code in case of failure. + */ +int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) +{ + int err, lnum, offs; + struct ubifs_ino_node *ino; + struct ubifs_inode *ui = ubifs_inode(inode); + int sync = 0, len = UBIFS_INO_NODE_SZ, last_reference = !inode->i_nlink; + + dbg_jnl("ino %lu, nlink %u", inode->i_ino, inode->i_nlink); + + /* + * If the inode is being deleted, do not write the attached data. No + * need to synchronize the write-buffer either. + */ + if (!last_reference) { + len += ui->data_len; + sync = IS_SYNC(inode); + } + ino = kmalloc(len, GFP_NOFS); + if (!ino) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, len); + if (err) + goto out_free; + + pack_inode(c, ino, inode, 1); + err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync); + if (err) + goto out_release; + if (!sync) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, + inode->i_ino); + release_head(c, BASEHD); + + if (last_reference) { + err = ubifs_tnc_remove_ino(c, inode->i_ino); + if (err) + goto out_ro; + ubifs_delete_orphan(c, inode->i_ino); + err = ubifs_add_dirt(c, lnum, len); + } else { + union ubifs_key key; + + ino_key_init(c, &key, inode->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, len); + } + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&ui->ui_lock); + ui->synced_i_size = ui->ui_size; + spin_unlock(&ui->ui_lock); + kfree(ino); + return 0; + +out_release: + release_head(c, BASEHD); +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); +out_free: + kfree(ino); + return err; +} + +/** + * ubifs_jnl_delete_inode - delete an inode. + * @c: UBIFS file-system description object + * @inode: inode to delete + * + * This function deletes inode @inode which includes removing it from orphans, + * deleting it from TNC and, in some cases, writing a deletion inode to the + * journal. + * + * When regular file inodes are unlinked or a directory inode is removed, the + * 'ubifs_jnl_update()' function writes a corresponding deletion inode and + * direntry to the media, and adds the inode to orphans. After this, when the + * last reference to this inode has been dropped, this function is called. In + * general, it has to write one more deletion inode to the media, because if + * a commit happened between 'ubifs_jnl_update()' and + * 'ubifs_jnl_delete_inode()', the deletion inode is not in the journal + * anymore, and in fact it might not be on the flash anymore, because it might + * have been garbage-collected already. And for optimization reasons UBIFS does + * not read the orphan area if it has been unmounted cleanly, so it would have + * no indication in the journal that there is a deleted inode which has to be + * removed from TNC. + * + * However, if there was no commit between 'ubifs_jnl_update()' and + * 'ubifs_jnl_delete_inode()', then there is no need to write the deletion + * inode to the media for the second time. And this is quite a typical case. + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode) +{ + int err; + struct ubifs_inode *ui = ubifs_inode(inode); + + ubifs_assert(inode->i_nlink == 0); + + if (ui->del_cmtno != c->cmt_no) + /* A commit happened for sure */ + return ubifs_jnl_write_inode(c, inode); + + down_read(&c->commit_sem); + /* + * Check commit number again, because the first test has been done + * without @c->commit_sem, so a commit might have happened. + */ + if (ui->del_cmtno != c->cmt_no) { + up_read(&c->commit_sem); + return ubifs_jnl_write_inode(c, inode); + } + + err = ubifs_tnc_remove_ino(c, inode->i_ino); + if (err) + ubifs_ro_mode(c, err); + else + ubifs_delete_orphan(c, inode->i_ino); + up_read(&c->commit_sem); + return err; +} + +/** + * ubifs_jnl_rename - rename a directory entry. + * @c: UBIFS file-system description object + * @old_dir: parent inode of directory entry to rename + * @old_dentry: directory entry to rename + * @new_dir: parent inode of directory entry to rename + * @new_dentry: new directory entry (or directory entry to replace) + * @sync: non-zero if the write-buffer has to be synchronized + * + * This function implements the re-name operation which may involve writing up + * to 3 inodes and 2 directory entries. It marks the written inodes as clean + * and returns zero on success. In case of failure, a negative error code is + * returned. + */ +int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, + const struct dentry *old_dentry, + const struct inode *new_dir, + const struct dentry *new_dentry, int sync) +{ + void *p; + union ubifs_key key; + struct ubifs_dent_node *dent, *dent2; + int err, dlen1, dlen2, ilen, lnum, offs, len; + const struct inode *old_inode = old_dentry->d_inode; + const struct inode *new_inode = new_dentry->d_inode; + int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ; + int last_reference = !!(new_inode && new_inode->i_nlink == 0); + int move = (old_dir != new_dir); + struct ubifs_inode *uninitialized_var(new_ui); + + dbg_jnl("dent '%.*s' in dir ino %lu to dent '%.*s' in dir ino %lu", + old_dentry->d_name.len, old_dentry->d_name.name, + old_dir->i_ino, new_dentry->d_name.len, + new_dentry->d_name.name, new_dir->i_ino); + ubifs_assert(ubifs_inode(old_dir)->data_len == 0); + ubifs_assert(ubifs_inode(new_dir)->data_len == 0); + ubifs_assert(mutex_is_locked(&ubifs_inode(old_dir)->ui_mutex)); + ubifs_assert(mutex_is_locked(&ubifs_inode(new_dir)->ui_mutex)); + + dlen1 = UBIFS_DENT_NODE_SZ + new_dentry->d_name.len + 1; + dlen2 = UBIFS_DENT_NODE_SZ + old_dentry->d_name.len + 1; + if (new_inode) { + new_ui = ubifs_inode(new_inode); + ubifs_assert(mutex_is_locked(&new_ui->ui_mutex)); + ilen = UBIFS_INO_NODE_SZ; + if (!last_reference) + ilen += new_ui->data_len; + } else + ilen = 0; + + aligned_dlen1 = ALIGN(dlen1, 8); + aligned_dlen2 = ALIGN(dlen2, 8); + len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8); + if (old_dir != new_dir) + len += plen; + dent = kmalloc(len, GFP_NOFS); + if (!dent) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, len); + if (err) + goto out_free; + + /* Make new dent */ + dent->ch.node_type = UBIFS_DENT_NODE; + dent_key_init_flash(c, &dent->key, new_dir->i_ino, &new_dentry->d_name); + dent->inum = cpu_to_le64(old_inode->i_ino); + dent->type = get_dent_type(old_inode->i_mode); + dent->nlen = cpu_to_le16(new_dentry->d_name.len); + memcpy(dent->name, new_dentry->d_name.name, new_dentry->d_name.len); + dent->name[new_dentry->d_name.len] = '\0'; + zero_dent_node_unused(dent); + ubifs_prep_grp_node(c, dent, dlen1, 0); + + /* Make deletion dent */ + dent2 = (void *)dent + aligned_dlen1; + dent2->ch.node_type = UBIFS_DENT_NODE; + dent_key_init_flash(c, &dent2->key, old_dir->i_ino, + &old_dentry->d_name); + dent2->inum = 0; + dent2->type = DT_UNKNOWN; + dent2->nlen = cpu_to_le16(old_dentry->d_name.len); + memcpy(dent2->name, old_dentry->d_name.name, old_dentry->d_name.len); + dent2->name[old_dentry->d_name.len] = '\0'; + zero_dent_node_unused(dent2); + ubifs_prep_grp_node(c, dent2, dlen2, 0); + + p = (void *)dent2 + aligned_dlen2; + if (new_inode) { + pack_inode(c, p, new_inode, 0); + p += ALIGN(ilen, 8); + } + + if (!move) + pack_inode(c, p, old_dir, 1); + else { + pack_inode(c, p, old_dir, 0); + p += ALIGN(plen, 8); + pack_inode(c, p, new_dir, 1); + } + + if (last_reference) { + err = ubifs_add_orphan(c, new_inode->i_ino); + if (err) { + release_head(c, BASEHD); + goto out_finish; + } + new_ui->del_cmtno = c->cmt_no; + } + + err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync); + if (err) + goto out_release; + if (!sync) { + struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf; + + ubifs_wbuf_add_ino_nolock(wbuf, new_dir->i_ino); + ubifs_wbuf_add_ino_nolock(wbuf, old_dir->i_ino); + if (new_inode) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, + new_inode->i_ino); + } + release_head(c, BASEHD); + + dent_key_init(c, &key, new_dir->i_ino, &new_dentry->d_name); + err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, &new_dentry->d_name); + if (err) + goto out_ro; + + err = ubifs_add_dirt(c, lnum, dlen2); + if (err) + goto out_ro; + + dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name); + err = ubifs_tnc_remove_nm(c, &key, &old_dentry->d_name); + if (err) + goto out_ro; + + offs += aligned_dlen1 + aligned_dlen2; + if (new_inode) { + ino_key_init(c, &key, new_inode->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, ilen); + if (err) + goto out_ro; + offs += ALIGN(ilen, 8); + } + + ino_key_init(c, &key, old_dir->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, plen); + if (err) + goto out_ro; + + if (old_dir != new_dir) { + offs += ALIGN(plen, 8); + ino_key_init(c, &key, new_dir->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, plen); + if (err) + goto out_ro; + } + + finish_reservation(c); + if (new_inode) { + mark_inode_clean(c, new_ui); + spin_lock(&new_ui->ui_lock); + new_ui->synced_i_size = new_ui->ui_size; + spin_unlock(&new_ui->ui_lock); + } + mark_inode_clean(c, ubifs_inode(old_dir)); + if (move) + mark_inode_clean(c, ubifs_inode(new_dir)); + kfree(dent); + return 0; + +out_release: + release_head(c, BASEHD); +out_ro: + ubifs_ro_mode(c, err); + if (last_reference) + ubifs_delete_orphan(c, new_inode->i_ino); +out_finish: + finish_reservation(c); +out_free: + kfree(dent); + return err; +} + +/** + * recomp_data_node - re-compress a truncated data node. + * @dn: data node to re-compress + * @new_len: new length + * + * This function is used when an inode is truncated and the last data node of + * the inode has to be re-compressed and re-written. + */ +static int recomp_data_node(struct ubifs_data_node *dn, int *new_len) +{ + void *buf; + int err, len, compr_type, out_len; + + out_len = le32_to_cpu(dn->size); + buf = kmalloc(out_len * WORST_COMPR_FACTOR, GFP_NOFS); + if (!buf) + return -ENOMEM; + + len = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + compr_type = le16_to_cpu(dn->compr_type); + err = ubifs_decompress(&dn->data, len, buf, &out_len, compr_type); + if (err) + goto out; + + ubifs_compress(buf, *new_len, &dn->data, &out_len, &compr_type); + ubifs_assert(out_len <= UBIFS_BLOCK_SIZE); + dn->compr_type = cpu_to_le16(compr_type); + dn->size = cpu_to_le32(*new_len); + *new_len = UBIFS_DATA_NODE_SZ + out_len; +out: + kfree(buf); + return err; +} + +/** + * ubifs_jnl_truncate - update the journal for a truncation. + * @c: UBIFS file-system description object + * @inode: inode to truncate + * @old_size: old size + * @new_size: new size + * + * When the size of a file decreases due to truncation, a truncation node is + * written, the journal tree is updated, and the last data block is re-written + * if it has been affected. The inode is also updated in order to synchronize + * the new inode size. + * + * This function marks the inode as clean and returns zero on success. In case + * of failure, a negative error code is returned. + */ +int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, + loff_t old_size, loff_t new_size) +{ + union ubifs_key key, to_key; + struct ubifs_ino_node *ino; + struct ubifs_trun_node *trun; + struct ubifs_data_node *uninitialized_var(dn); + int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode); + struct ubifs_inode *ui = ubifs_inode(inode); + ino_t inum = inode->i_ino; + unsigned int blk; + + dbg_jnl("ino %lu, size %lld -> %lld", inum, old_size, new_size); + ubifs_assert(!ui->data_len); + ubifs_assert(S_ISREG(inode->i_mode)); + ubifs_assert(mutex_is_locked(&ui->ui_mutex)); + + sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ + + UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR; + ino = kmalloc(sz, GFP_NOFS); + if (!ino) + return -ENOMEM; + + trun = (void *)ino + UBIFS_INO_NODE_SZ; + trun->ch.node_type = UBIFS_TRUN_NODE; + trun->inum = cpu_to_le32(inum); + trun->old_size = cpu_to_le64(old_size); + trun->new_size = cpu_to_le64(new_size); + zero_trun_node_unused(trun); + + dlen = new_size & (UBIFS_BLOCK_SIZE - 1); + if (dlen) { + /* Get last data block so it can be truncated */ + dn = (void *)trun + UBIFS_TRUN_NODE_SZ; + blk = new_size >> UBIFS_BLOCK_SHIFT; + data_key_init(c, &key, inum, blk); + dbg_jnl("last block key %s", DBGKEY(&key)); + err = ubifs_tnc_lookup(c, &key, dn); + if (err == -ENOENT) + dlen = 0; /* Not found (so it is a hole) */ + else if (err) + goto out_free; + else { + if (le32_to_cpu(dn->size) <= dlen) + dlen = 0; /* Nothing to do */ + else { + int compr_type = le16_to_cpu(dn->compr_type); + + if (compr_type != UBIFS_COMPR_NONE) { + err = recomp_data_node(dn, &dlen); + if (err) + goto out_free; + } else { + dn->size = cpu_to_le32(dlen); + dlen += UBIFS_DATA_NODE_SZ; + } + zero_data_node_unused(dn); + } + } + } + + /* Must make reservation before allocating sequence numbers */ + len = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ; + if (dlen) + len += dlen; + err = make_reservation(c, BASEHD, len); + if (err) + goto out_free; + + pack_inode(c, ino, inode, 0); + ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 0 : 1); + if (dlen) + ubifs_prep_grp_node(c, dn, dlen, 1); + + err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync); + if (err) + goto out_release; + if (!sync) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, inum); + release_head(c, BASEHD); + + if (dlen) { + sz = offs + UBIFS_INO_NODE_SZ + UBIFS_TRUN_NODE_SZ; + err = ubifs_tnc_add(c, &key, lnum, sz, dlen); + if (err) + goto out_ro; + } + + ino_key_init(c, &key, inum); + err = ubifs_tnc_add(c, &key, lnum, offs, UBIFS_INO_NODE_SZ); + if (err) + goto out_ro; + + err = ubifs_add_dirt(c, lnum, UBIFS_TRUN_NODE_SZ); + if (err) + goto out_ro; + + bit = new_size & (UBIFS_BLOCK_SIZE - 1); + blk = (new_size >> UBIFS_BLOCK_SHIFT) + (bit ? 1 : 0); + data_key_init(c, &key, inum, blk); + + bit = old_size & (UBIFS_BLOCK_SIZE - 1); + blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0: 1); + data_key_init(c, &to_key, inum, blk); + + err = ubifs_tnc_remove_range(c, &key, &to_key); + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&ui->ui_lock); + ui->synced_i_size = ui->ui_size; + spin_unlock(&ui->ui_lock); + mark_inode_clean(c, ui); + kfree(ino); + return 0; + +out_release: + release_head(c, BASEHD); +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); +out_free: + kfree(ino); + return err; +} + +#ifdef CONFIG_UBIFS_FS_XATTR + +/** + * ubifs_jnl_delete_xattr - delete an extended attribute. + * @c: UBIFS file-system description object + * @host: host inode + * @inode: extended attribute inode + * @nm: extended attribute entry name + * + * This function delete an extended attribute which is very similar to + * un-linking regular files - it writes a deletion xentry, a deletion inode and + * updates the target inode. Returns zero in case of success and a negative + * error code in case of failure. + */ +int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, + const struct inode *inode, const struct qstr *nm) +{ + int err, xlen, hlen, len, lnum, xent_offs, aligned_xlen; + struct ubifs_dent_node *xent; + struct ubifs_ino_node *ino; + union ubifs_key xent_key, key1, key2; + int sync = IS_DIRSYNC(host); + struct ubifs_inode *host_ui = ubifs_inode(host); + + dbg_jnl("host %lu, xattr ino %lu, name '%s', data len %d", + host->i_ino, inode->i_ino, nm->name, + ubifs_inode(inode)->data_len); + ubifs_assert(inode->i_nlink == 0); + ubifs_assert(mutex_is_locked(&host_ui->ui_mutex)); + + /* + * Since we are deleting the inode, we do not bother to attach any data + * to it and assume its length is %UBIFS_INO_NODE_SZ. + */ + xlen = UBIFS_DENT_NODE_SZ + nm->len + 1; + aligned_xlen = ALIGN(xlen, 8); + hlen = host_ui->data_len + UBIFS_INO_NODE_SZ; + len = aligned_xlen + UBIFS_INO_NODE_SZ + ALIGN(hlen, 8); + + xent = kmalloc(len, GFP_NOFS); + if (!xent) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, len); + if (err) { + kfree(xent); + return err; + } + + xent->ch.node_type = UBIFS_XENT_NODE; + xent_key_init(c, &xent_key, host->i_ino, nm); + key_write(c, &xent_key, xent->key); + xent->inum = 0; + xent->type = get_dent_type(inode->i_mode); + xent->nlen = cpu_to_le16(nm->len); + memcpy(xent->name, nm->name, nm->len); + xent->name[nm->len] = '\0'; + zero_dent_node_unused(xent); + ubifs_prep_grp_node(c, xent, xlen, 0); + + ino = (void *)xent + aligned_xlen; + pack_inode(c, ino, inode, 0); + ino = (void *)ino + UBIFS_INO_NODE_SZ; + pack_inode(c, ino, host, 1); + + err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync); + if (!sync && !err) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, host->i_ino); + release_head(c, BASEHD); + kfree(xent); + if (err) + goto out_ro; + + /* Remove the extended attribute entry from TNC */ + err = ubifs_tnc_remove_nm(c, &xent_key, nm); + if (err) + goto out_ro; + err = ubifs_add_dirt(c, lnum, xlen); + if (err) + goto out_ro; + + /* + * Remove all nodes belonging to the extended attribute inode from TNC. + * Well, there actually must be only one node - the inode itself. + */ + lowest_ino_key(c, &key1, inode->i_ino); + highest_ino_key(c, &key2, inode->i_ino); + err = ubifs_tnc_remove_range(c, &key1, &key2); + if (err) + goto out_ro; + err = ubifs_add_dirt(c, lnum, UBIFS_INO_NODE_SZ); + if (err) + goto out_ro; + + /* And update TNC with the new host inode position */ + ino_key_init(c, &key1, host->i_ino); + err = ubifs_tnc_add(c, &key1, lnum, xent_offs + len - hlen, hlen); + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&host_ui->ui_lock); + host_ui->synced_i_size = host_ui->ui_size; + spin_unlock(&host_ui->ui_lock); + mark_inode_clean(c, host_ui); + return 0; + +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); + return err; +} + +/** + * ubifs_jnl_change_xattr - change an extended attribute. + * @c: UBIFS file-system description object + * @inode: extended attribute inode + * @host: host inode + * + * This function writes the updated version of an extended attribute inode and + * the host inode tho the journal (to the base head). The host inode is written + * after the extended attribute inode in order to guarantee that the extended + * attribute will be flushed when the inode is synchronized by 'fsync()' and + * consequently, the write-buffer is synchronized. This function returns zero + * in case of success and a negative error code in case of failure. + */ +int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode, + const struct inode *host) +{ + int err, len1, len2, aligned_len, aligned_len1, lnum, offs; + struct ubifs_inode *host_ui = ubifs_inode(host); + struct ubifs_ino_node *ino; + union ubifs_key key; + int sync = IS_DIRSYNC(host); + + dbg_jnl("ino %lu, ino %lu", host->i_ino, inode->i_ino); + ubifs_assert(host->i_nlink > 0); + ubifs_assert(inode->i_nlink > 0); + ubifs_assert(mutex_is_locked(&host_ui->ui_mutex)); + + len1 = UBIFS_INO_NODE_SZ + host_ui->data_len; + len2 = UBIFS_INO_NODE_SZ + ubifs_inode(inode)->data_len; + aligned_len1 = ALIGN(len1, 8); + aligned_len = aligned_len1 + ALIGN(len2, 8); + + ino = kmalloc(aligned_len, GFP_NOFS); + if (!ino) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, aligned_len); + if (err) + goto out_free; + + pack_inode(c, ino, host, 0); + pack_inode(c, (void *)ino + aligned_len1, inode, 1); + + err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0); + if (!sync && !err) { + struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf; + + ubifs_wbuf_add_ino_nolock(wbuf, host->i_ino); + ubifs_wbuf_add_ino_nolock(wbuf, inode->i_ino); + } + release_head(c, BASEHD); + if (err) + goto out_ro; + + ino_key_init(c, &key, host->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, len1); + if (err) + goto out_ro; + + ino_key_init(c, &key, inode->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs + aligned_len1, len2); + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&host_ui->ui_lock); + host_ui->synced_i_size = host_ui->ui_size; + spin_unlock(&host_ui->ui_lock); + mark_inode_clean(c, host_ui); + kfree(ino); + return 0; + +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); +out_free: + kfree(ino); + return err; +} + +#endif /* CONFIG_UBIFS_FS_XATTR */ diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h new file mode 100644 index 000000000000..8f7476007549 --- /dev/null +++ b/fs/ubifs/key.h @@ -0,0 +1,533 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This header contains various key-related definitions and helper function. + * UBIFS allows several key schemes, so we access key fields only via these + * helpers. At the moment only one key scheme is supported. + * + * Simple key scheme + * ~~~~~~~~~~~~~~~~~ + * + * Keys are 64-bits long. First 32-bits are inode number (parent inode number + * in case of direntry key). Next 3 bits are node type. The last 29 bits are + * 4KiB offset in case of inode node, and direntry hash in case of a direntry + * node. We use "r5" hash borrowed from reiserfs. + */ + +#ifndef __UBIFS_KEY_H__ +#define __UBIFS_KEY_H__ + +/** + * key_r5_hash - R5 hash function (borrowed from reiserfs). + * @s: direntry name + * @len: name length + */ +static inline uint32_t key_r5_hash(const char *s, int len) +{ + uint32_t a = 0; + const signed char *str = (const signed char *)s; + + while (*str) { + a += *str << 4; + a += *str >> 4; + a *= 11; + str++; + } + + a &= UBIFS_S_KEY_HASH_MASK; + + /* + * We use hash values as offset in directories, so values %0 and %1 are + * reserved for "." and "..". %2 is reserved for "end of readdir" + * marker. + */ + if (unlikely(a >= 0 && a <= 2)) + a += 3; + return a; +} + +/** + * key_test_hash - testing hash function. + * @str: direntry name + * @len: name length + */ +static inline uint32_t key_test_hash(const char *str, int len) +{ + uint32_t a = 0; + + len = min_t(uint32_t, len, 4); + memcpy(&a, str, len); + a &= UBIFS_S_KEY_HASH_MASK; + if (unlikely(a >= 0 && a <= 2)) + a += 3; + return a; +} + +/** + * ino_key_init - initialize inode key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + */ +static inline void ino_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = UBIFS_INO_KEY << UBIFS_S_KEY_BLOCK_BITS; +} + +/** + * ino_key_init_flash - initialize on-flash inode key. + * @c: UBIFS file-system description object + * @k: key to initialize + * @inum: inode number + */ +static inline void ino_key_init_flash(const struct ubifs_info *c, void *k, + ino_t inum) +{ + union ubifs_key *key = k; + + key->j32[0] = cpu_to_le32(inum); + key->j32[1] = cpu_to_le32(UBIFS_INO_KEY << UBIFS_S_KEY_BLOCK_BITS); + memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8); +} + +/** + * lowest_ino_key - get the lowest possible inode key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + */ +static inline void lowest_ino_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = 0; +} + +/** + * highest_ino_key - get the highest possible inode key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + */ +static inline void highest_ino_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = 0xffffffff; +} + +/** + * dent_key_init - initialize directory entry key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: parent inode number + * @nm: direntry name and length + */ +static inline void dent_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum, + const struct qstr *nm) +{ + uint32_t hash = c->key_hash(nm->name, nm->len); + + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->u32[0] = inum; + key->u32[1] = hash | (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS); +} + +/** + * dent_key_init_hash - initialize directory entry key without re-calculating + * hash function. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: parent inode number + * @hash: direntry name hash + */ +static inline void dent_key_init_hash(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum, + uint32_t hash) +{ + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->u32[0] = inum; + key->u32[1] = hash | (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS); +} + +/** + * dent_key_init_flash - initialize on-flash directory entry key. + * @c: UBIFS file-system description object + * @k: key to initialize + * @inum: parent inode number + * @nm: direntry name and length + */ +static inline void dent_key_init_flash(const struct ubifs_info *c, void *k, + ino_t inum, const struct qstr *nm) +{ + union ubifs_key *key = k; + uint32_t hash = c->key_hash(nm->name, nm->len); + + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->j32[0] = cpu_to_le32(inum); + key->j32[1] = cpu_to_le32(hash | + (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS)); + memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8); +} + +/** + * lowest_dent_key - get the lowest possible directory entry key. + * @c: UBIFS file-system description object + * @key: where to store the lowest key + * @inum: parent inode number + */ +static inline void lowest_dent_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS; +} + +/** + * xent_key_init - initialize extended attribute entry key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: host inode number + * @nm: extended attribute entry name and length + */ +static inline void xent_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum, + const struct qstr *nm) +{ + uint32_t hash = c->key_hash(nm->name, nm->len); + + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->u32[0] = inum; + key->u32[1] = hash | (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS); +} + +/** + * xent_key_init_hash - initialize extended attribute entry key without + * re-calculating hash function. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: host inode number + * @hash: extended attribute entry name hash + */ +static inline void xent_key_init_hash(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum, + uint32_t hash) +{ + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->u32[0] = inum; + key->u32[1] = hash | (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS); +} + +/** + * xent_key_init_flash - initialize on-flash extended attribute entry key. + * @c: UBIFS file-system description object + * @k: key to initialize + * @inum: host inode number + * @nm: extended attribute entry name and length + */ +static inline void xent_key_init_flash(const struct ubifs_info *c, void *k, + ino_t inum, const struct qstr *nm) +{ + union ubifs_key *key = k; + uint32_t hash = c->key_hash(nm->name, nm->len); + + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->j32[0] = cpu_to_le32(inum); + key->j32[1] = cpu_to_le32(hash | + (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS)); + memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8); +} + +/** + * lowest_xent_key - get the lowest possible extended attribute entry key. + * @c: UBIFS file-system description object + * @key: where to store the lowest key + * @inum: host inode number + */ +static inline void lowest_xent_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS; +} + +/** + * data_key_init - initialize data key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + * @block: block number + */ +static inline void data_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum, + unsigned int block) +{ + ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK)); + key->u32[0] = inum; + key->u32[1] = block | (UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS); +} + +/** + * data_key_init_flash - initialize on-flash data key. + * @c: UBIFS file-system description object + * @k: key to initialize + * @inum: inode number + * @block: block number + */ +static inline void data_key_init_flash(const struct ubifs_info *c, void *k, + ino_t inum, unsigned int block) +{ + union ubifs_key *key = k; + + ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK)); + key->j32[0] = cpu_to_le32(inum); + key->j32[1] = cpu_to_le32(block | + (UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS)); + memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8); +} + +/** + * trun_key_init - initialize truncation node key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + * + * Note, UBIFS does not have truncation keys on the media and this function is + * only used for purposes of replay. + */ +static inline void trun_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = UBIFS_TRUN_KEY << UBIFS_S_KEY_BLOCK_BITS; +} + +/** + * key_type - get key type. + * @c: UBIFS file-system description object + * @key: key to get type of + */ +static inline int key_type(const struct ubifs_info *c, + const union ubifs_key *key) +{ + return key->u32[1] >> UBIFS_S_KEY_BLOCK_BITS; +} + +/** + * key_type_flash - get type of a on-flash formatted key. + * @c: UBIFS file-system description object + * @k: key to get type of + */ +static inline int key_type_flash(const struct ubifs_info *c, const void *k) +{ + const union ubifs_key *key = k; + + return le32_to_cpu(key->u32[1]) >> UBIFS_S_KEY_BLOCK_BITS; +} + +/** + * key_inum - fetch inode number from key. + * @c: UBIFS file-system description object + * @k: key to fetch inode number from + */ +static inline ino_t key_inum(const struct ubifs_info *c, const void *k) +{ + const union ubifs_key *key = k; + + return key->u32[0]; +} + +/** + * key_inum_flash - fetch inode number from an on-flash formatted key. + * @c: UBIFS file-system description object + * @k: key to fetch inode number from + */ +static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k) +{ + const union ubifs_key *key = k; + + return le32_to_cpu(key->j32[0]); +} + +/** + * key_hash - get directory entry hash. + * @c: UBIFS file-system description object + * @key: the key to get hash from + */ +static inline int key_hash(const struct ubifs_info *c, + const union ubifs_key *key) +{ + return key->u32[1] & UBIFS_S_KEY_HASH_MASK; +} + +/** + * key_hash_flash - get directory entry hash from an on-flash formatted key. + * @c: UBIFS file-system description object + * @k: the key to get hash from + */ +static inline int key_hash_flash(const struct ubifs_info *c, const void *k) +{ + const union ubifs_key *key = k; + + return le32_to_cpu(key->j32[1]) & UBIFS_S_KEY_HASH_MASK; +} + +/** + * key_block - get data block number. + * @c: UBIFS file-system description object + * @key: the key to get the block number from + */ +static inline unsigned int key_block(const struct ubifs_info *c, + const union ubifs_key *key) +{ + return key->u32[1] & UBIFS_S_KEY_BLOCK_MASK; +} + +/** + * key_block_flash - get data block number from an on-flash formatted key. + * @c: UBIFS file-system description object + * @k: the key to get the block number from + */ +static inline unsigned int key_block_flash(const struct ubifs_info *c, + const void *k) +{ + const union ubifs_key *key = k; + + return le32_to_cpu(key->u32[1]) & UBIFS_S_KEY_BLOCK_MASK; +} + +/** + * key_read - transform a key to in-memory format. + * @c: UBIFS file-system description object + * @from: the key to transform + * @to: the key to store the result + */ +static inline void key_read(const struct ubifs_info *c, const void *from, + union ubifs_key *to) +{ + const union ubifs_key *f = from; + + to->u32[0] = le32_to_cpu(f->j32[0]); + to->u32[1] = le32_to_cpu(f->j32[1]); +} + +/** + * key_write - transform a key from in-memory format. + * @c: UBIFS file-system description object + * @from: the key to transform + * @to: the key to store the result + */ +static inline void key_write(const struct ubifs_info *c, + const union ubifs_key *from, void *to) +{ + union ubifs_key *t = to; + + t->j32[0] = cpu_to_le32(from->u32[0]); + t->j32[1] = cpu_to_le32(from->u32[1]); + memset(to + 8, 0, UBIFS_MAX_KEY_LEN - 8); +} + +/** + * key_write_idx - transform a key from in-memory format for the index. + * @c: UBIFS file-system description object + * @from: the key to transform + * @to: the key to store the result + */ +static inline void key_write_idx(const struct ubifs_info *c, + const union ubifs_key *from, void *to) +{ + union ubifs_key *t = to; + + t->j32[0] = cpu_to_le32(from->u32[0]); + t->j32[1] = cpu_to_le32(from->u32[1]); +} + +/** + * key_copy - copy a key. + * @c: UBIFS file-system description object + * @from: the key to copy from + * @to: the key to copy to + */ +static inline void key_copy(const struct ubifs_info *c, + const union ubifs_key *from, union ubifs_key *to) +{ + to->u64[0] = from->u64[0]; +} + +/** + * keys_cmp - compare keys. + * @c: UBIFS file-system description object + * @key1: the first key to compare + * @key2: the second key to compare + * + * This function compares 2 keys and returns %-1 if @key1 is less than + * @key2, 0 if the keys are equivalent and %1 if @key1 is greater than @key2. + */ +static inline int keys_cmp(const struct ubifs_info *c, + const union ubifs_key *key1, + const union ubifs_key *key2) +{ + if (key1->u32[0] < key2->u32[0]) + return -1; + if (key1->u32[0] > key2->u32[0]) + return 1; + if (key1->u32[1] < key2->u32[1]) + return -1; + if (key1->u32[1] > key2->u32[1]) + return 1; + + return 0; +} + +/** + * is_hash_key - is a key vulnerable to hash collisions. + * @c: UBIFS file-system description object + * @key: key + * + * This function returns %1 if @key is a hashed key or %0 otherwise. + */ +static inline int is_hash_key(const struct ubifs_info *c, + const union ubifs_key *key) +{ + int type = key_type(c, key); + + return type == UBIFS_DENT_KEY || type == UBIFS_XENT_KEY; +} + +/** + * key_max_inode_size - get maximum file size allowed by current key format. + * @c: UBIFS file-system description object + */ +static inline unsigned long long key_max_inode_size(const struct ubifs_info *c) +{ + switch (c->key_fmt) { + case UBIFS_SIMPLE_KEY_FMT: + return (1ULL << UBIFS_S_KEY_BLOCK_BITS) * UBIFS_BLOCK_SIZE; + default: + return 0; + } +} +#endif /* !__UBIFS_KEY_H__ */ diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c new file mode 100644 index 000000000000..3e0aa7367556 --- /dev/null +++ b/fs/ubifs/log.c @@ -0,0 +1,807 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file is a part of UBIFS journal implementation and contains various + * functions which manipulate the log. The log is a fixed area on the flash + * which does not contain any data but refers to buds. The log is a part of the + * journal. + */ + +#include "ubifs.h" + +#ifdef CONFIG_UBIFS_FS_DEBUG +static int dbg_check_bud_bytes(struct ubifs_info *c); +#else +#define dbg_check_bud_bytes(c) 0 +#endif + +/** + * ubifs_search_bud - search bud LEB. + * @c: UBIFS file-system description object + * @lnum: logical eraseblock number to search + * + * This function searches bud LEB @lnum. Returns bud description object in case + * of success and %NULL if there is no bud with this LEB number. + */ +struct ubifs_bud *ubifs_search_bud(struct ubifs_info *c, int lnum) +{ + struct rb_node *p; + struct ubifs_bud *bud; + + spin_lock(&c->buds_lock); + p = c->buds.rb_node; + while (p) { + bud = rb_entry(p, struct ubifs_bud, rb); + if (lnum < bud->lnum) + p = p->rb_left; + else if (lnum > bud->lnum) + p = p->rb_right; + else { + spin_unlock(&c->buds_lock); + return bud; + } + } + spin_unlock(&c->buds_lock); + return NULL; +} + +/** + * ubifs_get_wbuf - get the wbuf associated with a LEB, if there is one. + * @c: UBIFS file-system description object + * @lnum: logical eraseblock number to search + * + * This functions returns the wbuf for @lnum or %NULL if there is not one. + */ +struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum) +{ + struct rb_node *p; + struct ubifs_bud *bud; + int jhead; + + if (!c->jheads) + return NULL; + + spin_lock(&c->buds_lock); + p = c->buds.rb_node; + while (p) { + bud = rb_entry(p, struct ubifs_bud, rb); + if (lnum < bud->lnum) + p = p->rb_left; + else if (lnum > bud->lnum) + p = p->rb_right; + else { + jhead = bud->jhead; + spin_unlock(&c->buds_lock); + return &c->jheads[jhead].wbuf; + } + } + spin_unlock(&c->buds_lock); + return NULL; +} + +/** + * next_log_lnum - switch to the next log LEB. + * @c: UBIFS file-system description object + * @lnum: current log LEB + */ +static inline int next_log_lnum(const struct ubifs_info *c, int lnum) +{ + lnum += 1; + if (lnum > c->log_last) + lnum = UBIFS_LOG_LNUM; + + return lnum; +} + +/** + * empty_log_bytes - calculate amount of empty space in the log. + * @c: UBIFS file-system description object + */ +static inline long long empty_log_bytes(const struct ubifs_info *c) +{ + long long h, t; + + h = (long long)c->lhead_lnum * c->leb_size + c->lhead_offs; + t = (long long)c->ltail_lnum * c->leb_size; + + if (h >= t) + return c->log_bytes - h + t; + else + return t - h; +} + +/** + * ubifs_add_bud - add bud LEB to the tree of buds and its journal head list. + * @c: UBIFS file-system description object + * @bud: the bud to add + */ +void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud) +{ + struct rb_node **p, *parent = NULL; + struct ubifs_bud *b; + struct ubifs_jhead *jhead; + + spin_lock(&c->buds_lock); + p = &c->buds.rb_node; + while (*p) { + parent = *p; + b = rb_entry(parent, struct ubifs_bud, rb); + ubifs_assert(bud->lnum != b->lnum); + if (bud->lnum < b->lnum) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(&bud->rb, parent, p); + rb_insert_color(&bud->rb, &c->buds); + if (c->jheads) { + jhead = &c->jheads[bud->jhead]; + list_add_tail(&bud->list, &jhead->buds_list); + } else + ubifs_assert(c->replaying && (c->vfs_sb->s_flags & MS_RDONLY)); + + /* + * Note, although this is a new bud, we anyway account this space now, + * before any data has been written to it, because this is about to + * guarantee fixed mount time, and this bud will anyway be read and + * scanned. + */ + c->bud_bytes += c->leb_size - bud->start; + + dbg_log("LEB %d:%d, jhead %d, bud_bytes %lld", bud->lnum, + bud->start, bud->jhead, c->bud_bytes); + spin_unlock(&c->buds_lock); +} + +/** + * ubifs_create_buds_lists - create journal head buds lists for remount rw. + * @c: UBIFS file-system description object + */ +void ubifs_create_buds_lists(struct ubifs_info *c) +{ + struct rb_node *p; + + spin_lock(&c->buds_lock); + p = rb_first(&c->buds); + while (p) { + struct ubifs_bud *bud = rb_entry(p, struct ubifs_bud, rb); + struct ubifs_jhead *jhead = &c->jheads[bud->jhead]; + + list_add_tail(&bud->list, &jhead->buds_list); + p = rb_next(p); + } + spin_unlock(&c->buds_lock); +} + +/** + * ubifs_add_bud_to_log - add a new bud to the log. + * @c: UBIFS file-system description object + * @jhead: journal head the bud belongs to + * @lnum: LEB number of the bud + * @offs: starting offset of the bud + * + * This function writes reference node for the new bud LEB @lnum it to the log, + * and adds it to the buds tress. It also makes sure that log size does not + * exceed the 'c->max_bud_bytes' limit. Returns zero in case of success, + * %-EAGAIN if commit is required, and a negative error codes in case of + * failure. + */ +int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) +{ + int err; + struct ubifs_bud *bud; + struct ubifs_ref_node *ref; + + bud = kmalloc(sizeof(struct ubifs_bud), GFP_NOFS); + if (!bud) + return -ENOMEM; + ref = kzalloc(c->ref_node_alsz, GFP_NOFS); + if (!ref) { + kfree(bud); + return -ENOMEM; + } + + mutex_lock(&c->log_mutex); + + if (c->ro_media) { + err = -EROFS; + goto out_unlock; + } + + /* Make sure we have enough space in the log */ + if (empty_log_bytes(c) - c->ref_node_alsz < c->min_log_bytes) { + dbg_log("not enough log space - %lld, required %d", + empty_log_bytes(c), c->min_log_bytes); + ubifs_commit_required(c); + err = -EAGAIN; + goto out_unlock; + } + + /* + * Make sure the the amount of space in buds will not exceed + * 'c->max_bud_bytes' limit, because we want to guarantee mount time + * limits. + * + * It is not necessary to hold @c->buds_lock when reading @c->bud_bytes + * because we are holding @c->log_mutex. All @c->bud_bytes take place + * when both @c->log_mutex and @c->bud_bytes are locked. + */ + if (c->bud_bytes + c->leb_size - offs > c->max_bud_bytes) { + dbg_log("bud bytes %lld (%lld max), require commit", + c->bud_bytes, c->max_bud_bytes); + ubifs_commit_required(c); + err = -EAGAIN; + goto out_unlock; + } + + /* + * If the journal is full enough - start background commit. Note, it is + * OK to read 'c->cmt_state' without spinlock because integer reads + * are atomic in the kernel. + */ + if (c->bud_bytes >= c->bg_bud_bytes && + c->cmt_state == COMMIT_RESTING) { + dbg_log("bud bytes %lld (%lld max), initiate BG commit", + c->bud_bytes, c->max_bud_bytes); + ubifs_request_bg_commit(c); + } + + bud->lnum = lnum; + bud->start = offs; + bud->jhead = jhead; + + ref->ch.node_type = UBIFS_REF_NODE; + ref->lnum = cpu_to_le32(bud->lnum); + ref->offs = cpu_to_le32(bud->start); + ref->jhead = cpu_to_le32(jhead); + + if (c->lhead_offs > c->leb_size - c->ref_node_alsz) { + c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); + c->lhead_offs = 0; + } + + if (c->lhead_offs == 0) { + /* Must ensure next log LEB has been unmapped */ + err = ubifs_leb_unmap(c, c->lhead_lnum); + if (err) + goto out_unlock; + } + + if (bud->start == 0) { + /* + * Before writing the LEB reference which refers an empty LEB + * to the log, we have to make sure it is mapped, because + * otherwise we'd risk to refer an LEB with garbage in case of + * an unclean reboot, because the target LEB might have been + * unmapped, but not yet physically erased. + */ + err = ubi_leb_map(c->ubi, bud->lnum, UBI_SHORTTERM); + if (err) + goto out_unlock; + } + + dbg_log("write ref LEB %d:%d", + c->lhead_lnum, c->lhead_offs); + err = ubifs_write_node(c, ref, UBIFS_REF_NODE_SZ, c->lhead_lnum, + c->lhead_offs, UBI_SHORTTERM); + if (err) + goto out_unlock; + + c->lhead_offs += c->ref_node_alsz; + + ubifs_add_bud(c, bud); + + mutex_unlock(&c->log_mutex); + kfree(ref); + return 0; + +out_unlock: + if (err != -EAGAIN) + ubifs_ro_mode(c, err); + mutex_unlock(&c->log_mutex); + kfree(ref); + kfree(bud); + return err; +} + +/** + * remove_buds - remove used buds. + * @c: UBIFS file-system description object + * + * This function removes use buds from the buds tree. It does not remove the + * buds which are pointed to by journal heads. + */ +static void remove_buds(struct ubifs_info *c) +{ + struct rb_node *p; + + ubifs_assert(list_empty(&c->old_buds)); + c->cmt_bud_bytes = 0; + spin_lock(&c->buds_lock); + p = rb_first(&c->buds); + while (p) { + struct rb_node *p1 = p; + struct ubifs_bud *bud; + struct ubifs_wbuf *wbuf; + + p = rb_next(p); + bud = rb_entry(p1, struct ubifs_bud, rb); + wbuf = &c->jheads[bud->jhead].wbuf; + + if (wbuf->lnum == bud->lnum) { + /* + * Do not remove buds which are pointed to by journal + * heads (non-closed buds). + */ + c->cmt_bud_bytes += wbuf->offs - bud->start; + dbg_log("preserve %d:%d, jhead %d, bud bytes %d, " + "cmt_bud_bytes %lld", bud->lnum, bud->start, + bud->jhead, wbuf->offs - bud->start, + c->cmt_bud_bytes); + bud->start = wbuf->offs; + } else { + c->cmt_bud_bytes += c->leb_size - bud->start; + dbg_log("remove %d:%d, jhead %d, bud bytes %d, " + "cmt_bud_bytes %lld", bud->lnum, bud->start, + bud->jhead, c->leb_size - bud->start, + c->cmt_bud_bytes); + rb_erase(p1, &c->buds); + list_del(&bud->list); + /* + * If the commit does not finish, the recovery will need + * to replay the journal, in which case the old buds + * must be unchanged. Do not release them until post + * commit i.e. do not allow them to be garbage + * collected. + */ + list_add(&bud->list, &c->old_buds); + } + } + spin_unlock(&c->buds_lock); +} + +/** + * ubifs_log_start_commit - start commit. + * @c: UBIFS file-system description object + * @ltail_lnum: return new log tail LEB number + * + * The commit operation starts with writing "commit start" node to the log and + * reference nodes for all journal heads which will define new journal after + * the commit has been finished. The commit start and reference nodes are + * written in one go to the nearest empty log LEB (hence, when commit is + * finished UBIFS may safely unmap all the previous log LEBs). This function + * returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) +{ + void *buf; + struct ubifs_cs_node *cs; + struct ubifs_ref_node *ref; + int err, i, max_len, len; + + err = dbg_check_bud_bytes(c); + if (err) + return err; + + max_len = UBIFS_CS_NODE_SZ + c->jhead_cnt * UBIFS_REF_NODE_SZ; + max_len = ALIGN(max_len, c->min_io_size); + buf = cs = kmalloc(max_len, GFP_NOFS); + if (!buf) + return -ENOMEM; + + cs->ch.node_type = UBIFS_CS_NODE; + cs->cmt_no = cpu_to_le64(c->cmt_no); + ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0); + + /* + * Note, we do not lock 'c->log_mutex' because this is the commit start + * phase and we are exclusively using the log. And we do not lock + * write-buffer because nobody can write to the file-system at this + * phase. + */ + + len = UBIFS_CS_NODE_SZ; + for (i = 0; i < c->jhead_cnt; i++) { + int lnum = c->jheads[i].wbuf.lnum; + int offs = c->jheads[i].wbuf.offs; + + if (lnum == -1 || offs == c->leb_size) + continue; + + dbg_log("add ref to LEB %d:%d for jhead %d", lnum, offs, i); + ref = buf + len; + ref->ch.node_type = UBIFS_REF_NODE; + ref->lnum = cpu_to_le32(lnum); + ref->offs = cpu_to_le32(offs); + ref->jhead = cpu_to_le32(i); + + ubifs_prepare_node(c, ref, UBIFS_REF_NODE_SZ, 0); + len += UBIFS_REF_NODE_SZ; + } + + ubifs_pad(c, buf + len, ALIGN(len, c->min_io_size) - len); + + /* Switch to the next log LEB */ + if (c->lhead_offs) { + c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); + c->lhead_offs = 0; + } + + if (c->lhead_offs == 0) { + /* Must ensure next LEB has been unmapped */ + err = ubifs_leb_unmap(c, c->lhead_lnum); + if (err) + goto out; + } + + len = ALIGN(len, c->min_io_size); + dbg_log("writing commit start at LEB %d:0, len %d", c->lhead_lnum, len); + err = ubifs_leb_write(c, c->lhead_lnum, cs, 0, len, UBI_SHORTTERM); + if (err) + goto out; + + *ltail_lnum = c->lhead_lnum; + + c->lhead_offs += len; + if (c->lhead_offs == c->leb_size) { + c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); + c->lhead_offs = 0; + } + + remove_buds(c); + + /* + * We have started the commit and now users may use the rest of the log + * for new writes. + */ + c->min_log_bytes = 0; + +out: + kfree(buf); + return err; +} + +/** + * ubifs_log_end_commit - end commit. + * @c: UBIFS file-system description object + * @ltail_lnum: new log tail LEB number + * + * This function is called on when the commit operation was finished. It + * moves log tail to new position and unmaps LEBs which contain obsolete data. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_log_end_commit(struct ubifs_info *c, int ltail_lnum) +{ + int err; + + /* + * At this phase we have to lock 'c->log_mutex' because UBIFS allows FS + * writes during commit. Its only short "commit" start phase when + * writers are blocked. + */ + mutex_lock(&c->log_mutex); + + dbg_log("old tail was LEB %d:0, new tail is LEB %d:0", + c->ltail_lnum, ltail_lnum); + + c->ltail_lnum = ltail_lnum; + /* + * The commit is finished and from now on it must be guaranteed that + * there is always enough space for the next commit. + */ + c->min_log_bytes = c->leb_size; + + spin_lock(&c->buds_lock); + c->bud_bytes -= c->cmt_bud_bytes; + spin_unlock(&c->buds_lock); + + err = dbg_check_bud_bytes(c); + + mutex_unlock(&c->log_mutex); + return err; +} + +/** + * ubifs_log_post_commit - things to do after commit is completed. + * @c: UBIFS file-system description object + * @old_ltail_lnum: old log tail LEB number + * + * Release buds only after commit is completed, because they must be unchanged + * if recovery is needed. + * + * Unmap log LEBs only after commit is completed, because they may be needed for + * recovery. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum) +{ + int lnum, err = 0; + + while (!list_empty(&c->old_buds)) { + struct ubifs_bud *bud; + + bud = list_entry(c->old_buds.next, struct ubifs_bud, list); + err = ubifs_return_leb(c, bud->lnum); + if (err) + return err; + list_del(&bud->list); + kfree(bud); + } + mutex_lock(&c->log_mutex); + for (lnum = old_ltail_lnum; lnum != c->ltail_lnum; + lnum = next_log_lnum(c, lnum)) { + dbg_log("unmap log LEB %d", lnum); + err = ubifs_leb_unmap(c, lnum); + if (err) + goto out; + } +out: + mutex_unlock(&c->log_mutex); + return err; +} + +/** + * struct done_ref - references that have been done. + * @rb: rb-tree node + * @lnum: LEB number + */ +struct done_ref { + struct rb_node rb; + int lnum; +}; + +/** + * done_already - determine if a reference has been done already. + * @done_tree: rb-tree to store references that have been done + * @lnum: LEB number of reference + * + * This function returns %1 if the reference has been done, %0 if not, otherwise + * a negative error code is returned. + */ +static int done_already(struct rb_root *done_tree, int lnum) +{ + struct rb_node **p = &done_tree->rb_node, *parent = NULL; + struct done_ref *dr; + + while (*p) { + parent = *p; + dr = rb_entry(parent, struct done_ref, rb); + if (lnum < dr->lnum) + p = &(*p)->rb_left; + else if (lnum > dr->lnum) + p = &(*p)->rb_right; + else + return 1; + } + + dr = kzalloc(sizeof(struct done_ref), GFP_NOFS); + if (!dr) + return -ENOMEM; + + dr->lnum = lnum; + + rb_link_node(&dr->rb, parent, p); + rb_insert_color(&dr->rb, done_tree); + + return 0; +} + +/** + * destroy_done_tree - destroy the done tree. + * @done_tree: done tree to destroy + */ +static void destroy_done_tree(struct rb_root *done_tree) +{ + struct rb_node *this = done_tree->rb_node; + struct done_ref *dr; + + while (this) { + if (this->rb_left) { + this = this->rb_left; + continue; + } else if (this->rb_right) { + this = this->rb_right; + continue; + } + dr = rb_entry(this, struct done_ref, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &dr->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(dr); + } +} + +/** + * add_node - add a node to the consolidated log. + * @c: UBIFS file-system description object + * @buf: buffer to which to add + * @lnum: LEB number to which to write is passed and returned here + * @offs: offset to where to write is passed and returned here + * @node: node to add + * + * This function returns %0 on success and a negative error code on failure. + */ +static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs, + void *node) +{ + struct ubifs_ch *ch = node; + int len = le32_to_cpu(ch->len), remains = c->leb_size - *offs; + + if (len > remains) { + int sz = ALIGN(*offs, c->min_io_size), err; + + ubifs_pad(c, buf + *offs, sz - *offs); + err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM); + if (err) + return err; + *lnum = next_log_lnum(c, *lnum); + *offs = 0; + } + memcpy(buf + *offs, node, len); + *offs += ALIGN(len, 8); + return 0; +} + +/** + * ubifs_consolidate_log - consolidate the log. + * @c: UBIFS file-system description object + * + * Repeated failed commits could cause the log to be full, but at least 1 LEB is + * needed for commit. This function rewrites the reference nodes in the log + * omitting duplicates, and failed CS nodes, and leaving no gaps. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_consolidate_log(struct ubifs_info *c) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + struct rb_root done_tree = RB_ROOT; + int lnum, err, first = 1, write_lnum, offs = 0; + void *buf; + + dbg_rcvry("log tail LEB %d, log head LEB %d", c->ltail_lnum, + c->lhead_lnum); + buf = vmalloc(c->leb_size); + if (!buf) + return -ENOMEM; + lnum = c->ltail_lnum; + write_lnum = lnum; + while (1) { + sleb = ubifs_scan(c, lnum, 0, c->sbuf); + if (IS_ERR(sleb)) { + err = PTR_ERR(sleb); + goto out_free; + } + list_for_each_entry(snod, &sleb->nodes, list) { + switch (snod->type) { + case UBIFS_REF_NODE: { + struct ubifs_ref_node *ref = snod->node; + int ref_lnum = le32_to_cpu(ref->lnum); + + err = done_already(&done_tree, ref_lnum); + if (err < 0) + goto out_scan; + if (err != 1) { + err = add_node(c, buf, &write_lnum, + &offs, snod->node); + if (err) + goto out_scan; + } + break; + } + case UBIFS_CS_NODE: + if (!first) + break; + err = add_node(c, buf, &write_lnum, &offs, + snod->node); + if (err) + goto out_scan; + first = 0; + break; + } + } + ubifs_scan_destroy(sleb); + if (lnum == c->lhead_lnum) + break; + lnum = next_log_lnum(c, lnum); + } + if (offs) { + int sz = ALIGN(offs, c->min_io_size); + + ubifs_pad(c, buf + offs, sz - offs); + err = ubifs_leb_change(c, write_lnum, buf, sz, UBI_SHORTTERM); + if (err) + goto out_free; + offs = ALIGN(offs, c->min_io_size); + } + destroy_done_tree(&done_tree); + vfree(buf); + if (write_lnum == c->lhead_lnum) { + ubifs_err("log is too full"); + return -EINVAL; + } + /* Unmap remaining LEBs */ + lnum = write_lnum; + do { + lnum = next_log_lnum(c, lnum); + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } while (lnum != c->lhead_lnum); + c->lhead_lnum = write_lnum; + c->lhead_offs = offs; + dbg_rcvry("new log head at %d:%d", c->lhead_lnum, c->lhead_offs); + return 0; + +out_scan: + ubifs_scan_destroy(sleb); +out_free: + destroy_done_tree(&done_tree); + vfree(buf); + return err; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * dbg_check_bud_bytes - make sure bud bytes calculation are all right. + * @c: UBIFS file-system description object + * + * This function makes sure the amount of flash space used by closed buds + * ('c->bud_bytes' is correct). Returns zero in case of success and %-EINVAL in + * case of failure. + */ +static int dbg_check_bud_bytes(struct ubifs_info *c) +{ + int i, err = 0; + struct ubifs_bud *bud; + long long bud_bytes = 0; + + if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + return 0; + + spin_lock(&c->buds_lock); + for (i = 0; i < c->jhead_cnt; i++) + list_for_each_entry(bud, &c->jheads[i].buds_list, list) + bud_bytes += c->leb_size - bud->start; + + if (c->bud_bytes != bud_bytes) { + ubifs_err("bad bud_bytes %lld, calculated %lld", + c->bud_bytes, bud_bytes); + err = -EINVAL; + } + spin_unlock(&c->buds_lock); + + return err; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c new file mode 100644 index 000000000000..2ba93da71b65 --- /dev/null +++ b/fs/ubifs/lprops.c @@ -0,0 +1,1357 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements the functions that access LEB properties and their + * categories. LEBs are categorized based on the needs of UBIFS, and the + * categories are stored as either heaps or lists to provide a fast way of + * finding a LEB in a particular category. For example, UBIFS may need to find + * an empty LEB for the journal, or a very dirty LEB for garbage collection. + */ + +#include "ubifs.h" + +/** + * get_heap_comp_val - get the LEB properties value for heap comparisons. + * @lprops: LEB properties + * @cat: LEB category + */ +static int get_heap_comp_val(struct ubifs_lprops *lprops, int cat) +{ + switch (cat) { + case LPROPS_FREE: + return lprops->free; + case LPROPS_DIRTY_IDX: + return lprops->free + lprops->dirty; + default: + return lprops->dirty; + } +} + +/** + * move_up_lpt_heap - move a new heap entry up as far as possible. + * @c: UBIFS file-system description object + * @heap: LEB category heap + * @lprops: LEB properties to move + * @cat: LEB category + * + * New entries to a heap are added at the bottom and then moved up until the + * parent's value is greater. In the case of LPT's category heaps, the value + * is either the amount of free space or the amount of dirty space, depending + * on the category. + */ +static void move_up_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, + struct ubifs_lprops *lprops, int cat) +{ + int val1, val2, hpos; + + hpos = lprops->hpos; + if (!hpos) + return; /* Already top of the heap */ + val1 = get_heap_comp_val(lprops, cat); + /* Compare to parent and, if greater, move up the heap */ + do { + int ppos = (hpos - 1) / 2; + + val2 = get_heap_comp_val(heap->arr[ppos], cat); + if (val2 >= val1) + return; + /* Greater than parent so move up */ + heap->arr[ppos]->hpos = hpos; + heap->arr[hpos] = heap->arr[ppos]; + heap->arr[ppos] = lprops; + lprops->hpos = ppos; + hpos = ppos; + } while (hpos); +} + +/** + * adjust_lpt_heap - move a changed heap entry up or down the heap. + * @c: UBIFS file-system description object + * @heap: LEB category heap + * @lprops: LEB properties to move + * @hpos: heap position of @lprops + * @cat: LEB category + * + * Changed entries in a heap are moved up or down until the parent's value is + * greater. In the case of LPT's category heaps, the value is either the amount + * of free space or the amount of dirty space, depending on the category. + */ +static void adjust_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, + struct ubifs_lprops *lprops, int hpos, int cat) +{ + int val1, val2, val3, cpos; + + val1 = get_heap_comp_val(lprops, cat); + /* Compare to parent and, if greater than parent, move up the heap */ + if (hpos) { + int ppos = (hpos - 1) / 2; + + val2 = get_heap_comp_val(heap->arr[ppos], cat); + if (val1 > val2) { + /* Greater than parent so move up */ + while (1) { + heap->arr[ppos]->hpos = hpos; + heap->arr[hpos] = heap->arr[ppos]; + heap->arr[ppos] = lprops; + lprops->hpos = ppos; + hpos = ppos; + if (!hpos) + return; + ppos = (hpos - 1) / 2; + val2 = get_heap_comp_val(heap->arr[ppos], cat); + if (val1 <= val2) + return; + /* Still greater than parent so keep going */ + } + } + } + /* Not greater than parent, so compare to children */ + while (1) { + /* Compare to left child */ + cpos = hpos * 2 + 1; + if (cpos >= heap->cnt) + return; + val2 = get_heap_comp_val(heap->arr[cpos], cat); + if (val1 < val2) { + /* Less than left child, so promote biggest child */ + if (cpos + 1 < heap->cnt) { + val3 = get_heap_comp_val(heap->arr[cpos + 1], + cat); + if (val3 > val2) + cpos += 1; /* Right child is bigger */ + } + heap->arr[cpos]->hpos = hpos; + heap->arr[hpos] = heap->arr[cpos]; + heap->arr[cpos] = lprops; + lprops->hpos = cpos; + hpos = cpos; + continue; + } + /* Compare to right child */ + cpos += 1; + if (cpos >= heap->cnt) + return; + val3 = get_heap_comp_val(heap->arr[cpos], cat); + if (val1 < val3) { + /* Less than right child, so promote right child */ + heap->arr[cpos]->hpos = hpos; + heap->arr[hpos] = heap->arr[cpos]; + heap->arr[cpos] = lprops; + lprops->hpos = cpos; + hpos = cpos; + continue; + } + return; + } +} + +/** + * add_to_lpt_heap - add LEB properties to a LEB category heap. + * @c: UBIFS file-system description object + * @lprops: LEB properties to add + * @cat: LEB category + * + * This function returns %1 if @lprops is added to the heap for LEB category + * @cat, otherwise %0 is returned because the heap is full. + */ +static int add_to_lpt_heap(struct ubifs_info *c, struct ubifs_lprops *lprops, + int cat) +{ + struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1]; + + if (heap->cnt >= heap->max_cnt) { + const int b = LPT_HEAP_SZ / 2 - 1; + int cpos, val1, val2; + + /* Compare to some other LEB on the bottom of heap */ + /* Pick a position kind of randomly */ + cpos = (((size_t)lprops >> 4) & b) + b; + ubifs_assert(cpos >= b); + ubifs_assert(cpos < LPT_HEAP_SZ); + ubifs_assert(cpos < heap->cnt); + + val1 = get_heap_comp_val(lprops, cat); + val2 = get_heap_comp_val(heap->arr[cpos], cat); + if (val1 > val2) { + struct ubifs_lprops *lp; + + lp = heap->arr[cpos]; + lp->flags &= ~LPROPS_CAT_MASK; + lp->flags |= LPROPS_UNCAT; + list_add(&lp->list, &c->uncat_list); + lprops->hpos = cpos; + heap->arr[cpos] = lprops; + move_up_lpt_heap(c, heap, lprops, cat); + dbg_check_heap(c, heap, cat, lprops->hpos); + return 1; /* Added to heap */ + } + dbg_check_heap(c, heap, cat, -1); + return 0; /* Not added to heap */ + } else { + lprops->hpos = heap->cnt++; + heap->arr[lprops->hpos] = lprops; + move_up_lpt_heap(c, heap, lprops, cat); + dbg_check_heap(c, heap, cat, lprops->hpos); + return 1; /* Added to heap */ + } +} + +/** + * remove_from_lpt_heap - remove LEB properties from a LEB category heap. + * @c: UBIFS file-system description object + * @lprops: LEB properties to remove + * @cat: LEB category + */ +static void remove_from_lpt_heap(struct ubifs_info *c, + struct ubifs_lprops *lprops, int cat) +{ + struct ubifs_lpt_heap *heap; + int hpos = lprops->hpos; + + heap = &c->lpt_heap[cat - 1]; + ubifs_assert(hpos >= 0 && hpos < heap->cnt); + ubifs_assert(heap->arr[hpos] == lprops); + heap->cnt -= 1; + if (hpos < heap->cnt) { + heap->arr[hpos] = heap->arr[heap->cnt]; + heap->arr[hpos]->hpos = hpos; + adjust_lpt_heap(c, heap, heap->arr[hpos], hpos, cat); + } + dbg_check_heap(c, heap, cat, -1); +} + +/** + * lpt_heap_replace - replace lprops in a category heap. + * @c: UBIFS file-system description object + * @old_lprops: LEB properties to replace + * @new_lprops: LEB properties with which to replace + * @cat: LEB category + * + * During commit it is sometimes necessary to copy a pnode (see dirty_cow_pnode) + * and the lprops that the pnode contains. When that happens, references in + * the category heaps to those lprops must be updated to point to the new + * lprops. This function does that. + */ +static void lpt_heap_replace(struct ubifs_info *c, + struct ubifs_lprops *old_lprops, + struct ubifs_lprops *new_lprops, int cat) +{ + struct ubifs_lpt_heap *heap; + int hpos = new_lprops->hpos; + + heap = &c->lpt_heap[cat - 1]; + heap->arr[hpos] = new_lprops; +} + +/** + * ubifs_add_to_cat - add LEB properties to a category list or heap. + * @c: UBIFS file-system description object + * @lprops: LEB properties to add + * @cat: LEB category to which to add + * + * LEB properties are categorized to enable fast find operations. + */ +void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, + int cat) +{ + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + if (add_to_lpt_heap(c, lprops, cat)) + break; + /* No more room on heap so make it uncategorized */ + cat = LPROPS_UNCAT; + /* Fall through */ + case LPROPS_UNCAT: + list_add(&lprops->list, &c->uncat_list); + break; + case LPROPS_EMPTY: + list_add(&lprops->list, &c->empty_list); + break; + case LPROPS_FREEABLE: + list_add(&lprops->list, &c->freeable_list); + c->freeable_cnt += 1; + break; + case LPROPS_FRDI_IDX: + list_add(&lprops->list, &c->frdi_idx_list); + break; + default: + ubifs_assert(0); + } + lprops->flags &= ~LPROPS_CAT_MASK; + lprops->flags |= cat; +} + +/** + * ubifs_remove_from_cat - remove LEB properties from a category list or heap. + * @c: UBIFS file-system description object + * @lprops: LEB properties to remove + * @cat: LEB category from which to remove + * + * LEB properties are categorized to enable fast find operations. + */ +static void ubifs_remove_from_cat(struct ubifs_info *c, + struct ubifs_lprops *lprops, int cat) +{ + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + remove_from_lpt_heap(c, lprops, cat); + break; + case LPROPS_FREEABLE: + c->freeable_cnt -= 1; + ubifs_assert(c->freeable_cnt >= 0); + /* Fall through */ + case LPROPS_UNCAT: + case LPROPS_EMPTY: + case LPROPS_FRDI_IDX: + ubifs_assert(!list_empty(&lprops->list)); + list_del(&lprops->list); + break; + default: + ubifs_assert(0); + } +} + +/** + * ubifs_replace_cat - replace lprops in a category list or heap. + * @c: UBIFS file-system description object + * @old_lprops: LEB properties to replace + * @new_lprops: LEB properties with which to replace + * + * During commit it is sometimes necessary to copy a pnode (see dirty_cow_pnode) + * and the lprops that the pnode contains. When that happens, references in + * category lists and heaps must be replaced. This function does that. + */ +void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops, + struct ubifs_lprops *new_lprops) +{ + int cat; + + cat = new_lprops->flags & LPROPS_CAT_MASK; + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + lpt_heap_replace(c, old_lprops, new_lprops, cat); + break; + case LPROPS_UNCAT: + case LPROPS_EMPTY: + case LPROPS_FREEABLE: + case LPROPS_FRDI_IDX: + list_replace(&old_lprops->list, &new_lprops->list); + break; + default: + ubifs_assert(0); + } +} + +/** + * ubifs_ensure_cat - ensure LEB properties are categorized. + * @c: UBIFS file-system description object + * @lprops: LEB properties + * + * A LEB may have fallen off of the bottom of a heap, and ended up as + * uncategorized even though it has enough space for us now. If that is the case + * this function will put the LEB back onto a heap. + */ +void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops) +{ + int cat = lprops->flags & LPROPS_CAT_MASK; + + if (cat != LPROPS_UNCAT) + return; + cat = ubifs_categorize_lprops(c, lprops); + if (cat == LPROPS_UNCAT) + return; + ubifs_remove_from_cat(c, lprops, LPROPS_UNCAT); + ubifs_add_to_cat(c, lprops, cat); +} + +/** + * ubifs_categorize_lprops - categorize LEB properties. + * @c: UBIFS file-system description object + * @lprops: LEB properties to categorize + * + * LEB properties are categorized to enable fast find operations. This function + * returns the LEB category to which the LEB properties belong. Note however + * that if the LEB category is stored as a heap and the heap is full, the + * LEB properties may have their category changed to %LPROPS_UNCAT. + */ +int ubifs_categorize_lprops(const struct ubifs_info *c, + const struct ubifs_lprops *lprops) +{ + if (lprops->flags & LPROPS_TAKEN) + return LPROPS_UNCAT; + + if (lprops->free == c->leb_size) { + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + return LPROPS_EMPTY; + } + + if (lprops->free + lprops->dirty == c->leb_size) { + if (lprops->flags & LPROPS_INDEX) + return LPROPS_FRDI_IDX; + else + return LPROPS_FREEABLE; + } + + if (lprops->flags & LPROPS_INDEX) { + if (lprops->dirty + lprops->free >= c->min_idx_node_sz) + return LPROPS_DIRTY_IDX; + } else { + if (lprops->dirty >= c->dead_wm && + lprops->dirty > lprops->free) + return LPROPS_DIRTY; + if (lprops->free > 0) + return LPROPS_FREE; + } + + return LPROPS_UNCAT; +} + +/** + * change_category - change LEB properties category. + * @c: UBIFS file-system description object + * @lprops: LEB properties to recategorize + * + * LEB properties are categorized to enable fast find operations. When the LEB + * properties change they must be recategorized. + */ +static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops) +{ + int old_cat = lprops->flags & LPROPS_CAT_MASK; + int new_cat = ubifs_categorize_lprops(c, lprops); + + if (old_cat == new_cat) { + struct ubifs_lpt_heap *heap = &c->lpt_heap[new_cat - 1]; + + /* lprops on a heap now must be moved up or down */ + if (new_cat < 1 || new_cat > LPROPS_HEAP_CNT) + return; /* Not on a heap */ + heap = &c->lpt_heap[new_cat - 1]; + adjust_lpt_heap(c, heap, lprops, lprops->hpos, new_cat); + } else { + ubifs_remove_from_cat(c, lprops, old_cat); + ubifs_add_to_cat(c, lprops, new_cat); + } +} + +/** + * ubifs_get_lprops - get reference to LEB properties. + * @c: the UBIFS file-system description object + * + * This function locks lprops. Lprops have to be unlocked by + * 'ubifs_release_lprops()'. + */ +void ubifs_get_lprops(struct ubifs_info *c) +{ + mutex_lock(&c->lp_mutex); +} + +/** + * calc_dark - calculate LEB dark space size. + * @c: the UBIFS file-system description object + * @spc: amount of free and dirty space in the LEB + * + * This function calculates amount of dark space in an LEB which has @spc bytes + * of free and dirty space. Returns the calculations result. + * + * Dark space is the space which is not always usable - it depends on which + * nodes are written in which order. E.g., if an LEB has only 512 free bytes, + * it is dark space, because it cannot fit a large data node. So UBIFS cannot + * count on this LEB and treat these 512 bytes as usable because it is not true + * if, for example, only big chunks of uncompressible data will be written to + * the FS. + */ +static int calc_dark(struct ubifs_info *c, int spc) +{ + ubifs_assert(!(spc & 7)); + + if (spc < c->dark_wm) + return spc; + + /* + * If we have slightly more space then the dark space watermark, we can + * anyway safely assume it we'll be able to write a node of the + * smallest size there. + */ + if (spc - c->dark_wm < MIN_WRITE_SZ) + return spc - MIN_WRITE_SZ; + + return c->dark_wm; +} + +/** + * is_lprops_dirty - determine if LEB properties are dirty. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to test + */ +static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops) +{ + struct ubifs_pnode *pnode; + int pos; + + pos = (lprops->lnum - c->main_first) & (UBIFS_LPT_FANOUT - 1); + pnode = (struct ubifs_pnode *)container_of(lprops - pos, + struct ubifs_pnode, + lprops[0]); + return !test_bit(COW_ZNODE, &pnode->flags) && + test_bit(DIRTY_CNODE, &pnode->flags); +} + +/** + * ubifs_change_lp - change LEB properties. + * @c: the UBIFS file-system description object + * @lp: LEB properties to change + * @free: new free space amount + * @dirty: new dirty space amount + * @flags: new flags + * @idx_gc_cnt: change to the count of idx_gc list + * + * This function changes LEB properties. This function does not change a LEB + * property (@free, @dirty or @flag) if the value passed is %LPROPS_NC. + * + * This function returns a pointer to the updated LEB properties on success + * and a negative error code on failure. N.B. the LEB properties may have had to + * be copied (due to COW) and consequently the pointer returned may not be the + * same as the pointer passed. + */ +const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, + const struct ubifs_lprops *lp, + int free, int dirty, int flags, + int idx_gc_cnt) +{ + /* + * This is the only function that is allowed to change lprops, so we + * discard the const qualifier. + */ + struct ubifs_lprops *lprops = (struct ubifs_lprops *)lp; + + dbg_lp("LEB %d, free %d, dirty %d, flags %d", + lprops->lnum, free, dirty, flags); + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + ubifs_assert(c->lst.empty_lebs >= 0 && + c->lst.empty_lebs <= c->main_lebs); + ubifs_assert(c->freeable_cnt >= 0); + ubifs_assert(c->freeable_cnt <= c->main_lebs); + ubifs_assert(c->lst.taken_empty_lebs >= 0); + ubifs_assert(c->lst.taken_empty_lebs <= c->lst.empty_lebs); + ubifs_assert(!(c->lst.total_free & 7) && !(c->lst.total_dirty & 7)); + ubifs_assert(!(c->lst.total_dead & 7) && !(c->lst.total_dark & 7)); + ubifs_assert(!(c->lst.total_used & 7)); + ubifs_assert(free == LPROPS_NC || free >= 0); + ubifs_assert(dirty == LPROPS_NC || dirty >= 0); + + if (!is_lprops_dirty(c, lprops)) { + lprops = ubifs_lpt_lookup_dirty(c, lprops->lnum); + if (IS_ERR(lprops)) + return lprops; + } else + ubifs_assert(lprops == ubifs_lpt_lookup_dirty(c, lprops->lnum)); + + ubifs_assert(!(lprops->free & 7) && !(lprops->dirty & 7)); + + spin_lock(&c->space_lock); + + if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size) + c->lst.taken_empty_lebs -= 1; + + if (!(lprops->flags & LPROPS_INDEX)) { + int old_spc; + + old_spc = lprops->free + lprops->dirty; + if (old_spc < c->dead_wm) + c->lst.total_dead -= old_spc; + else + c->lst.total_dark -= calc_dark(c, old_spc); + + c->lst.total_used -= c->leb_size - old_spc; + } + + if (free != LPROPS_NC) { + free = ALIGN(free, 8); + c->lst.total_free += free - lprops->free; + + /* Increase or decrease empty LEBs counter if needed */ + if (free == c->leb_size) { + if (lprops->free != c->leb_size) + c->lst.empty_lebs += 1; + } else if (lprops->free == c->leb_size) + c->lst.empty_lebs -= 1; + lprops->free = free; + } + + if (dirty != LPROPS_NC) { + dirty = ALIGN(dirty, 8); + c->lst.total_dirty += dirty - lprops->dirty; + lprops->dirty = dirty; + } + + if (flags != LPROPS_NC) { + /* Take care about indexing LEBs counter if needed */ + if ((lprops->flags & LPROPS_INDEX)) { + if (!(flags & LPROPS_INDEX)) + c->lst.idx_lebs -= 1; + } else if (flags & LPROPS_INDEX) + c->lst.idx_lebs += 1; + lprops->flags = flags; + } + + if (!(lprops->flags & LPROPS_INDEX)) { + int new_spc; + + new_spc = lprops->free + lprops->dirty; + if (new_spc < c->dead_wm) + c->lst.total_dead += new_spc; + else + c->lst.total_dark += calc_dark(c, new_spc); + + c->lst.total_used += c->leb_size - new_spc; + } + + if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size) + c->lst.taken_empty_lebs += 1; + + change_category(c, lprops); + + c->idx_gc_cnt += idx_gc_cnt; + + spin_unlock(&c->space_lock); + + return lprops; +} + +/** + * ubifs_release_lprops - release lprops lock. + * @c: the UBIFS file-system description object + * + * This function has to be called after each 'ubifs_get_lprops()' call to + * unlock lprops. + */ +void ubifs_release_lprops(struct ubifs_info *c) +{ + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + ubifs_assert(c->lst.empty_lebs >= 0 && + c->lst.empty_lebs <= c->main_lebs); + + mutex_unlock(&c->lp_mutex); +} + +/** + * ubifs_get_lp_stats - get lprops statistics. + * @c: UBIFS file-system description object + * @st: return statistics + */ +void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *st) +{ + spin_lock(&c->space_lock); + memcpy(st, &c->lst, sizeof(struct ubifs_lp_stats)); + spin_unlock(&c->space_lock); +} + +/** + * ubifs_change_one_lp - change LEB properties. + * @c: the UBIFS file-system description object + * @lnum: LEB to change properties for + * @free: amount of free space + * @dirty: amount of dirty space + * @flags_set: flags to set + * @flags_clean: flags to clean + * @idx_gc_cnt: change to the count of idx_gc list + * + * This function changes properties of LEB @lnum. It is a helper wrapper over + * 'ubifs_change_lp()' which hides lprops get/release. The arguments are the + * same as in case of 'ubifs_change_lp()'. Returns zero in case of success and + * a negative error code in case of failure. + */ +int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, + int flags_set, int flags_clean, int idx_gc_cnt) +{ + int err = 0, flags; + const struct ubifs_lprops *lp; + + ubifs_get_lprops(c); + + lp = ubifs_lpt_lookup_dirty(c, lnum); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + flags = (lp->flags | flags_set) & ~flags_clean; + lp = ubifs_change_lp(c, lp, free, dirty, flags, idx_gc_cnt); + if (IS_ERR(lp)) + err = PTR_ERR(lp); + +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_update_one_lp - update LEB properties. + * @c: the UBIFS file-system description object + * @lnum: LEB to change properties for + * @free: amount of free space + * @dirty: amount of dirty space to add + * @flags_set: flags to set + * @flags_clean: flags to clean + * + * This function is the same as 'ubifs_change_one_lp()' but @dirty is added to + * current dirty space, not substitutes it. + */ +int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, + int flags_set, int flags_clean) +{ + int err = 0, flags; + const struct ubifs_lprops *lp; + + ubifs_get_lprops(c); + + lp = ubifs_lpt_lookup_dirty(c, lnum); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + flags = (lp->flags | flags_set) & ~flags_clean; + lp = ubifs_change_lp(c, lp, free, lp->dirty + dirty, flags, 0); + if (IS_ERR(lp)) + err = PTR_ERR(lp); + +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_read_one_lp - read LEB properties. + * @c: the UBIFS file-system description object + * @lnum: LEB to read properties for + * @lp: where to store read properties + * + * This helper function reads properties of a LEB @lnum and stores them in @lp. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp) +{ + int err = 0; + const struct ubifs_lprops *lpp; + + ubifs_get_lprops(c); + + lpp = ubifs_lpt_lookup(c, lnum); + if (IS_ERR(lpp)) { + err = PTR_ERR(lpp); + goto out; + } + + memcpy(lp, lpp, sizeof(struct ubifs_lprops)); + +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_fast_find_free - try to find a LEB with free space quickly. + * @c: the UBIFS file-system description object + * + * This function returns LEB properties for a LEB with free space or %NULL if + * the function is unable to find a LEB quickly. + */ +const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + + heap = &c->lpt_heap[LPROPS_FREE - 1]; + if (heap->cnt == 0) + return NULL; + + lprops = heap->arr[0]; + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + return lprops; +} + +/** + * ubifs_fast_find_empty - try to find an empty LEB quickly. + * @c: the UBIFS file-system description object + * + * This function returns LEB properties for an empty LEB or %NULL if the + * function is unable to find an empty LEB quickly. + */ +const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + + if (list_empty(&c->empty_list)) + return NULL; + + lprops = list_entry(c->empty_list.next, struct ubifs_lprops, list); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + ubifs_assert(lprops->free == c->leb_size); + return lprops; +} + +/** + * ubifs_fast_find_freeable - try to find a freeable LEB quickly. + * @c: the UBIFS file-system description object + * + * This function returns LEB properties for a freeable LEB or %NULL if the + * function is unable to find a freeable LEB quickly. + */ +const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + + if (list_empty(&c->freeable_list)) + return NULL; + + lprops = list_entry(c->freeable_list.next, struct ubifs_lprops, list); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + ubifs_assert(lprops->free + lprops->dirty == c->leb_size); + ubifs_assert(c->freeable_cnt > 0); + return lprops; +} + +/** + * ubifs_fast_find_frdi_idx - try to find a freeable index LEB quickly. + * @c: the UBIFS file-system description object + * + * This function returns LEB properties for a freeable index LEB or %NULL if the + * function is unable to find a freeable index LEB quickly. + */ +const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + + if (list_empty(&c->frdi_idx_list)) + return NULL; + + lprops = list_entry(c->frdi_idx_list.next, struct ubifs_lprops, list); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert((lprops->flags & LPROPS_INDEX)); + ubifs_assert(lprops->free + lprops->dirty == c->leb_size); + return lprops; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * dbg_check_cats - check category heaps and lists. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_check_cats(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct list_head *pos; + int i, cat; + + if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS))) + return 0; + + list_for_each_entry(lprops, &c->empty_list, list) { + if (lprops->free != c->leb_size) { + ubifs_err("non-empty LEB %d on empty list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + ubifs_err("taken LEB %d on empty list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + } + + i = 0; + list_for_each_entry(lprops, &c->freeable_list, list) { + if (lprops->free + lprops->dirty != c->leb_size) { + ubifs_err("non-freeable LEB %d on freeable list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + ubifs_err("taken LEB %d on freeable list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + i += 1; + } + if (i != c->freeable_cnt) { + ubifs_err("freeable list count %d expected %d", i, + c->freeable_cnt); + return -EINVAL; + } + + i = 0; + list_for_each(pos, &c->idx_gc) + i += 1; + if (i != c->idx_gc_cnt) { + ubifs_err("idx_gc list count %d expected %d", i, + c->idx_gc_cnt); + return -EINVAL; + } + + list_for_each_entry(lprops, &c->frdi_idx_list, list) { + if (lprops->free + lprops->dirty != c->leb_size) { + ubifs_err("non-freeable LEB %d on frdi_idx list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + ubifs_err("taken LEB %d on frdi_idx list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + if (!(lprops->flags & LPROPS_INDEX)) { + ubifs_err("non-index LEB %d on frdi_idx list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + } + + for (cat = 1; cat <= LPROPS_HEAP_CNT; cat++) { + struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1]; + + for (i = 0; i < heap->cnt; i++) { + lprops = heap->arr[i]; + if (!lprops) { + ubifs_err("null ptr in LPT heap cat %d", cat); + return -EINVAL; + } + if (lprops->hpos != i) { + ubifs_err("bad ptr in LPT heap cat %d", cat); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + ubifs_err("taken LEB in LPT heap cat %d", cat); + return -EINVAL; + } + } + } + + return 0; +} + +void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, + int add_pos) +{ + int i = 0, j, err = 0; + + if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS))) + return; + + for (i = 0; i < heap->cnt; i++) { + struct ubifs_lprops *lprops = heap->arr[i]; + struct ubifs_lprops *lp; + + if (i != add_pos) + if ((lprops->flags & LPROPS_CAT_MASK) != cat) { + err = 1; + goto out; + } + if (lprops->hpos != i) { + err = 2; + goto out; + } + lp = ubifs_lpt_lookup(c, lprops->lnum); + if (IS_ERR(lp)) { + err = 3; + goto out; + } + if (lprops != lp) { + dbg_msg("lprops %zx lp %zx lprops->lnum %d lp->lnum %d", + (size_t)lprops, (size_t)lp, lprops->lnum, + lp->lnum); + err = 4; + goto out; + } + for (j = 0; j < i; j++) { + lp = heap->arr[j]; + if (lp == lprops) { + err = 5; + goto out; + } + if (lp->lnum == lprops->lnum) { + err = 6; + goto out; + } + } + } +out: + if (err) { + dbg_msg("failed cat %d hpos %d err %d", cat, i, err); + dbg_dump_stack(); + dbg_dump_heap(c, heap, cat); + } +} + +/** + * struct scan_check_data - data provided to scan callback function. + * @lst: LEB properties statistics + * @err: error code + */ +struct scan_check_data { + struct ubifs_lp_stats lst; + int err; +}; + +/** + * scan_check_cb - scan callback. + * @c: the UBIFS file-system description object + * @lp: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @data: information passed to and from the caller of the scan + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). + */ +static int scan_check_cb(struct ubifs_info *c, + const struct ubifs_lprops *lp, int in_tree, + struct scan_check_data *data) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + struct ubifs_lp_stats *lst = &data->lst; + int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty; + + cat = lp->flags & LPROPS_CAT_MASK; + if (cat != LPROPS_UNCAT) { + cat = ubifs_categorize_lprops(c, lp); + if (cat != (lp->flags & LPROPS_CAT_MASK)) { + ubifs_err("bad LEB category %d expected %d", + (lp->flags & LPROPS_CAT_MASK), cat); + goto out; + } + } + + /* Check lp is on its category list (if it has one) */ + if (in_tree) { + struct list_head *list = NULL; + + switch (cat) { + case LPROPS_EMPTY: + list = &c->empty_list; + break; + case LPROPS_FREEABLE: + list = &c->freeable_list; + break; + case LPROPS_FRDI_IDX: + list = &c->frdi_idx_list; + break; + case LPROPS_UNCAT: + list = &c->uncat_list; + break; + } + if (list) { + struct ubifs_lprops *lprops; + int found = 0; + + list_for_each_entry(lprops, list, list) { + if (lprops == lp) { + found = 1; + break; + } + } + if (!found) { + ubifs_err("bad LPT list (category %d)", cat); + goto out; + } + } + } + + /* Check lp is on its category heap (if it has one) */ + if (in_tree && cat > 0 && cat <= LPROPS_HEAP_CNT) { + struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1]; + + if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) || + lp != heap->arr[lp->hpos]) { + ubifs_err("bad LPT heap (category %d)", cat); + goto out; + } + } + + sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); + if (IS_ERR(sleb)) { + /* + * After an unclean unmount, empty and freeable LEBs + * may contain garbage. + */ + if (lp->free == c->leb_size) { + ubifs_err("scan errors were in empty LEB " + "- continuing checking"); + lst->empty_lebs += 1; + lst->total_free += c->leb_size; + lst->total_dark += calc_dark(c, c->leb_size); + return LPT_SCAN_CONTINUE; + } + + if (lp->free + lp->dirty == c->leb_size && + !(lp->flags & LPROPS_INDEX)) { + ubifs_err("scan errors were in freeable LEB " + "- continuing checking"); + lst->total_free += lp->free; + lst->total_dirty += lp->dirty; + lst->total_dark += calc_dark(c, c->leb_size); + return LPT_SCAN_CONTINUE; + } + data->err = PTR_ERR(sleb); + return LPT_SCAN_STOP; + } + + is_idx = -1; + list_for_each_entry(snod, &sleb->nodes, list) { + int found, level = 0; + + cond_resched(); + + if (is_idx == -1) + is_idx = (snod->type == UBIFS_IDX_NODE) ? 1 : 0; + + if (is_idx && snod->type != UBIFS_IDX_NODE) { + ubifs_err("indexing node in data LEB %d:%d", + lnum, snod->offs); + goto out_destroy; + } + + if (snod->type == UBIFS_IDX_NODE) { + struct ubifs_idx_node *idx = snod->node; + + key_read(c, ubifs_idx_key(c, idx), &snod->key); + level = le16_to_cpu(idx->level); + } + + found = ubifs_tnc_has_node(c, &snod->key, level, lnum, + snod->offs, is_idx); + if (found) { + if (found < 0) + goto out_destroy; + used += ALIGN(snod->len, 8); + } + } + + free = c->leb_size - sleb->endpt; + dirty = sleb->endpt - used; + + if (free > c->leb_size || free < 0 || dirty > c->leb_size || + dirty < 0) { + ubifs_err("bad calculated accounting for LEB %d: " + "free %d, dirty %d", lnum, free, dirty); + goto out_destroy; + } + + if (lp->free + lp->dirty == c->leb_size && + free + dirty == c->leb_size) + if ((is_idx && !(lp->flags & LPROPS_INDEX)) || + (!is_idx && free == c->leb_size) || + lp->free == c->leb_size) { + /* + * Empty or freeable LEBs could contain index + * nodes from an uncompleted commit due to an + * unclean unmount. Or they could be empty for + * the same reason. Or it may simply not have been + * unmapped. + */ + free = lp->free; + dirty = lp->dirty; + is_idx = 0; + } + + if (is_idx && lp->free + lp->dirty == free + dirty && + lnum != c->ihead_lnum) { + /* + * After an unclean unmount, an index LEB could have a different + * amount of free space than the value recorded by lprops. That + * is because the in-the-gaps method may use free space or + * create free space (as a side-effect of using ubi_leb_change + * and not writing the whole LEB). The incorrect free space + * value is not a problem because the index is only ever + * allocated empty LEBs, so there will never be an attempt to + * write to the free space at the end of an index LEB - except + * by the in-the-gaps method for which it is not a problem. + */ + free = lp->free; + dirty = lp->dirty; + } + + if (lp->free != free || lp->dirty != dirty) + goto out_print; + + if (is_idx && !(lp->flags & LPROPS_INDEX)) { + if (free == c->leb_size) + /* Free but not unmapped LEB, it's fine */ + is_idx = 0; + else { + ubifs_err("indexing node without indexing " + "flag"); + goto out_print; + } + } + + if (!is_idx && (lp->flags & LPROPS_INDEX)) { + ubifs_err("data node with indexing flag"); + goto out_print; + } + + if (free == c->leb_size) + lst->empty_lebs += 1; + + if (is_idx) + lst->idx_lebs += 1; + + if (!(lp->flags & LPROPS_INDEX)) + lst->total_used += c->leb_size - free - dirty; + lst->total_free += free; + lst->total_dirty += dirty; + + if (!(lp->flags & LPROPS_INDEX)) { + int spc = free + dirty; + + if (spc < c->dead_wm) + lst->total_dead += spc; + else + lst->total_dark += calc_dark(c, spc); + } + + ubifs_scan_destroy(sleb); + + return LPT_SCAN_CONTINUE; + +out_print: + ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, " + "should be free %d, dirty %d", + lnum, lp->free, lp->dirty, lp->flags, free, dirty); + dbg_dump_leb(c, lnum); +out_destroy: + ubifs_scan_destroy(sleb); +out: + data->err = -EINVAL; + return LPT_SCAN_STOP; +} + +/** + * dbg_check_lprops - check all LEB properties. + * @c: UBIFS file-system description object + * + * This function checks all LEB properties and makes sure they are all correct. + * It returns zero if everything is fine, %-EINVAL if there is an inconsistency + * and other negative error codes in case of other errors. This function is + * called while the file system is locked (because of commit start), so no + * additional locking is required. Note that locking the LPT mutex would cause + * a circular lock dependency with the TNC mutex. + */ +int dbg_check_lprops(struct ubifs_info *c) +{ + int i, err; + struct scan_check_data data; + struct ubifs_lp_stats *lst = &data.lst; + + if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + return 0; + + /* + * As we are going to scan the media, the write buffers have to be + * synchronized. + */ + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + return err; + } + + memset(lst, 0, sizeof(struct ubifs_lp_stats)); + + data.err = 0; + err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1, + (ubifs_lpt_scan_callback)scan_check_cb, + &data); + if (err && err != -ENOSPC) + goto out; + if (data.err) { + err = data.err; + goto out; + } + + if (lst->empty_lebs != c->lst.empty_lebs || + lst->idx_lebs != c->lst.idx_lebs || + lst->total_free != c->lst.total_free || + lst->total_dirty != c->lst.total_dirty || + lst->total_used != c->lst.total_used) { + ubifs_err("bad overall accounting"); + ubifs_err("calculated: empty_lebs %d, idx_lebs %d, " + "total_free %lld, total_dirty %lld, total_used %lld", + lst->empty_lebs, lst->idx_lebs, lst->total_free, + lst->total_dirty, lst->total_used); + ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, " + "total_free %lld, total_dirty %lld, total_used %lld", + c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free, + c->lst.total_dirty, c->lst.total_used); + err = -EINVAL; + goto out; + } + + if (lst->total_dead != c->lst.total_dead || + lst->total_dark != c->lst.total_dark) { + ubifs_err("bad dead/dark space accounting"); + ubifs_err("calculated: total_dead %lld, total_dark %lld", + lst->total_dead, lst->total_dark); + ubifs_err("read from lprops: total_dead %lld, total_dark %lld", + c->lst.total_dead, c->lst.total_dark); + err = -EINVAL; + goto out; + } + + err = dbg_check_cats(c); +out: + return err; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c new file mode 100644 index 000000000000..9ff2463177e5 --- /dev/null +++ b/fs/ubifs/lpt.c @@ -0,0 +1,2243 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements the LEB properties tree (LPT) area. The LPT area + * contains the LEB properties tree, a table of LPT area eraseblocks (ltab), and + * (for the "big" model) a table of saved LEB numbers (lsave). The LPT area sits + * between the log and the orphan area. + * + * The LPT area is like a miniature self-contained file system. It is required + * that it never runs out of space, is fast to access and update, and scales + * logarithmically. The LEB properties tree is implemented as a wandering tree + * much like the TNC, and the LPT area has its own garbage collection. + * + * The LPT has two slightly different forms called the "small model" and the + * "big model". The small model is used when the entire LEB properties table + * can be written into a single eraseblock. In that case, garbage collection + * consists of just writing the whole table, which therefore makes all other + * eraseblocks reusable. In the case of the big model, dirty eraseblocks are + * selected for garbage collection, which consists are marking the nodes in + * that LEB as dirty, and then only the dirty nodes are written out. Also, in + * the case of the big model, a table of LEB numbers is saved so that the entire + * LPT does not to be scanned looking for empty eraseblocks when UBIFS is first + * mounted. + */ + +#include <linux/crc16.h> +#include "ubifs.h" + +/** + * do_calc_lpt_geom - calculate sizes for the LPT area. + * @c: the UBIFS file-system description object + * + * Calculate the sizes of LPT bit fields, nodes, and tree, based on the + * properties of the flash and whether LPT is "big" (c->big_lpt). + */ +static void do_calc_lpt_geom(struct ubifs_info *c) +{ + int i, n, bits, per_leb_wastage, max_pnode_cnt; + long long sz, tot_wastage; + + n = c->main_lebs + c->max_leb_cnt - c->leb_cnt; + max_pnode_cnt = DIV_ROUND_UP(n, UBIFS_LPT_FANOUT); + + c->lpt_hght = 1; + n = UBIFS_LPT_FANOUT; + while (n < max_pnode_cnt) { + c->lpt_hght += 1; + n <<= UBIFS_LPT_FANOUT_SHIFT; + } + + c->pnode_cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT); + + n = DIV_ROUND_UP(c->pnode_cnt, UBIFS_LPT_FANOUT); + c->nnode_cnt = n; + for (i = 1; i < c->lpt_hght; i++) { + n = DIV_ROUND_UP(n, UBIFS_LPT_FANOUT); + c->nnode_cnt += n; + } + + c->space_bits = fls(c->leb_size) - 3; + c->lpt_lnum_bits = fls(c->lpt_lebs); + c->lpt_offs_bits = fls(c->leb_size - 1); + c->lpt_spc_bits = fls(c->leb_size); + + n = DIV_ROUND_UP(c->max_leb_cnt, UBIFS_LPT_FANOUT); + c->pcnt_bits = fls(n - 1); + + c->lnum_bits = fls(c->max_leb_cnt - 1); + + bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS + + (c->big_lpt ? c->pcnt_bits : 0) + + (c->space_bits * 2 + 1) * UBIFS_LPT_FANOUT; + c->pnode_sz = (bits + 7) / 8; + + bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS + + (c->big_lpt ? c->pcnt_bits : 0) + + (c->lpt_lnum_bits + c->lpt_offs_bits) * UBIFS_LPT_FANOUT; + c->nnode_sz = (bits + 7) / 8; + + bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS + + c->lpt_lebs * c->lpt_spc_bits * 2; + c->ltab_sz = (bits + 7) / 8; + + bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS + + c->lnum_bits * c->lsave_cnt; + c->lsave_sz = (bits + 7) / 8; + + /* Calculate the minimum LPT size */ + c->lpt_sz = (long long)c->pnode_cnt * c->pnode_sz; + c->lpt_sz += (long long)c->nnode_cnt * c->nnode_sz; + c->lpt_sz += c->ltab_sz; + c->lpt_sz += c->lsave_sz; + + /* Add wastage */ + sz = c->lpt_sz; + per_leb_wastage = max_t(int, c->pnode_sz, c->nnode_sz); + sz += per_leb_wastage; + tot_wastage = per_leb_wastage; + while (sz > c->leb_size) { + sz += per_leb_wastage; + sz -= c->leb_size; + tot_wastage += per_leb_wastage; + } + tot_wastage += ALIGN(sz, c->min_io_size) - sz; + c->lpt_sz += tot_wastage; +} + +/** + * ubifs_calc_lpt_geom - calculate and check sizes for the LPT area. + * @c: the UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_calc_lpt_geom(struct ubifs_info *c) +{ + int lebs_needed; + uint64_t sz; + + do_calc_lpt_geom(c); + + /* Verify that lpt_lebs is big enough */ + sz = c->lpt_sz * 2; /* Must have at least 2 times the size */ + sz += c->leb_size - 1; + do_div(sz, c->leb_size); + lebs_needed = sz; + if (lebs_needed > c->lpt_lebs) { + ubifs_err("too few LPT LEBs"); + return -EINVAL; + } + + /* Verify that ltab fits in a single LEB (since ltab is a single node */ + if (c->ltab_sz > c->leb_size) { + ubifs_err("LPT ltab too big"); + return -EINVAL; + } + + c->check_lpt_free = c->big_lpt; + + return 0; +} + +/** + * calc_dflt_lpt_geom - calculate default LPT geometry. + * @c: the UBIFS file-system description object + * @main_lebs: number of main area LEBs is passed and returned here + * @big_lpt: whether the LPT area is "big" is returned here + * + * The size of the LPT area depends on parameters that themselves are dependent + * on the size of the LPT area. This function, successively recalculates the LPT + * area geometry until the parameters and resultant geometry are consistent. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs, + int *big_lpt) +{ + int i, lebs_needed; + uint64_t sz; + + /* Start by assuming the minimum number of LPT LEBs */ + c->lpt_lebs = UBIFS_MIN_LPT_LEBS; + c->main_lebs = *main_lebs - c->lpt_lebs; + if (c->main_lebs <= 0) + return -EINVAL; + + /* And assume we will use the small LPT model */ + c->big_lpt = 0; + + /* + * Calculate the geometry based on assumptions above and then see if it + * makes sense + */ + do_calc_lpt_geom(c); + + /* Small LPT model must have lpt_sz < leb_size */ + if (c->lpt_sz > c->leb_size) { + /* Nope, so try again using big LPT model */ + c->big_lpt = 1; + do_calc_lpt_geom(c); + } + + /* Now check there are enough LPT LEBs */ + for (i = 0; i < 64 ; i++) { + sz = c->lpt_sz * 4; /* Allow 4 times the size */ + sz += c->leb_size - 1; + do_div(sz, c->leb_size); + lebs_needed = sz; + if (lebs_needed > c->lpt_lebs) { + /* Not enough LPT LEBs so try again with more */ + c->lpt_lebs = lebs_needed; + c->main_lebs = *main_lebs - c->lpt_lebs; + if (c->main_lebs <= 0) + return -EINVAL; + do_calc_lpt_geom(c); + continue; + } + if (c->ltab_sz > c->leb_size) { + ubifs_err("LPT ltab too big"); + return -EINVAL; + } + *main_lebs = c->main_lebs; + *big_lpt = c->big_lpt; + return 0; + } + return -EINVAL; +} + +/** + * pack_bits - pack bit fields end-to-end. + * @addr: address at which to pack (passed and next address returned) + * @pos: bit position at which to pack (passed and next position returned) + * @val: value to pack + * @nrbits: number of bits of value to pack (1-32) + */ +static void pack_bits(uint8_t **addr, int *pos, uint32_t val, int nrbits) +{ + uint8_t *p = *addr; + int b = *pos; + + ubifs_assert(nrbits > 0); + ubifs_assert(nrbits <= 32); + ubifs_assert(*pos >= 0); + ubifs_assert(*pos < 8); + ubifs_assert((val >> nrbits) == 0 || nrbits == 32); + if (b) { + *p |= ((uint8_t)val) << b; + nrbits += b; + if (nrbits > 8) { + *++p = (uint8_t)(val >>= (8 - b)); + if (nrbits > 16) { + *++p = (uint8_t)(val >>= 8); + if (nrbits > 24) { + *++p = (uint8_t)(val >>= 8); + if (nrbits > 32) + *++p = (uint8_t)(val >>= 8); + } + } + } + } else { + *p = (uint8_t)val; + if (nrbits > 8) { + *++p = (uint8_t)(val >>= 8); + if (nrbits > 16) { + *++p = (uint8_t)(val >>= 8); + if (nrbits > 24) + *++p = (uint8_t)(val >>= 8); + } + } + } + b = nrbits & 7; + if (b == 0) + p++; + *addr = p; + *pos = b; +} + +/** + * ubifs_unpack_bits - unpack bit fields. + * @addr: address at which to unpack (passed and next address returned) + * @pos: bit position at which to unpack (passed and next position returned) + * @nrbits: number of bits of value to unpack (1-32) + * + * This functions returns the value unpacked. + */ +uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits) +{ + const int k = 32 - nrbits; + uint8_t *p = *addr; + int b = *pos; + uint32_t val; + + ubifs_assert(nrbits > 0); + ubifs_assert(nrbits <= 32); + ubifs_assert(*pos >= 0); + ubifs_assert(*pos < 8); + if (b) { + val = p[1] | ((uint32_t)p[2] << 8) | ((uint32_t)p[3] << 16) | + ((uint32_t)p[4] << 24); + val <<= (8 - b); + val |= *p >> b; + nrbits += b; + } else + val = p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16) | + ((uint32_t)p[3] << 24); + val <<= k; + val >>= k; + b = nrbits & 7; + p += nrbits / 8; + *addr = p; + *pos = b; + ubifs_assert((val >> nrbits) == 0 || nrbits - b == 32); + return val; +} + +/** + * ubifs_pack_pnode - pack all the bit fields of a pnode. + * @c: UBIFS file-system description object + * @buf: buffer into which to pack + * @pnode: pnode to pack + */ +void ubifs_pack_pnode(struct ubifs_info *c, void *buf, + struct ubifs_pnode *pnode) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0; + uint16_t crc; + + pack_bits(&addr, &pos, UBIFS_LPT_PNODE, UBIFS_LPT_TYPE_BITS); + if (c->big_lpt) + pack_bits(&addr, &pos, pnode->num, c->pcnt_bits); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + pack_bits(&addr, &pos, pnode->lprops[i].free >> 3, + c->space_bits); + pack_bits(&addr, &pos, pnode->lprops[i].dirty >> 3, + c->space_bits); + if (pnode->lprops[i].flags & LPROPS_INDEX) + pack_bits(&addr, &pos, 1, 1); + else + pack_bits(&addr, &pos, 0, 1); + } + crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + c->pnode_sz - UBIFS_LPT_CRC_BYTES); + addr = buf; + pos = 0; + pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS); +} + +/** + * ubifs_pack_nnode - pack all the bit fields of a nnode. + * @c: UBIFS file-system description object + * @buf: buffer into which to pack + * @nnode: nnode to pack + */ +void ubifs_pack_nnode(struct ubifs_info *c, void *buf, + struct ubifs_nnode *nnode) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0; + uint16_t crc; + + pack_bits(&addr, &pos, UBIFS_LPT_NNODE, UBIFS_LPT_TYPE_BITS); + if (c->big_lpt) + pack_bits(&addr, &pos, nnode->num, c->pcnt_bits); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + int lnum = nnode->nbranch[i].lnum; + + if (lnum == 0) + lnum = c->lpt_last + 1; + pack_bits(&addr, &pos, lnum - c->lpt_first, c->lpt_lnum_bits); + pack_bits(&addr, &pos, nnode->nbranch[i].offs, + c->lpt_offs_bits); + } + crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + c->nnode_sz - UBIFS_LPT_CRC_BYTES); + addr = buf; + pos = 0; + pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS); +} + +/** + * ubifs_pack_ltab - pack the LPT's own lprops table. + * @c: UBIFS file-system description object + * @buf: buffer into which to pack + * @ltab: LPT's own lprops table to pack + */ +void ubifs_pack_ltab(struct ubifs_info *c, void *buf, + struct ubifs_lpt_lprops *ltab) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0; + uint16_t crc; + + pack_bits(&addr, &pos, UBIFS_LPT_LTAB, UBIFS_LPT_TYPE_BITS); + for (i = 0; i < c->lpt_lebs; i++) { + pack_bits(&addr, &pos, ltab[i].free, c->lpt_spc_bits); + pack_bits(&addr, &pos, ltab[i].dirty, c->lpt_spc_bits); + } + crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + c->ltab_sz - UBIFS_LPT_CRC_BYTES); + addr = buf; + pos = 0; + pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS); +} + +/** + * ubifs_pack_lsave - pack the LPT's save table. + * @c: UBIFS file-system description object + * @buf: buffer into which to pack + * @lsave: LPT's save table to pack + */ +void ubifs_pack_lsave(struct ubifs_info *c, void *buf, int *lsave) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0; + uint16_t crc; + + pack_bits(&addr, &pos, UBIFS_LPT_LSAVE, UBIFS_LPT_TYPE_BITS); + for (i = 0; i < c->lsave_cnt; i++) + pack_bits(&addr, &pos, lsave[i], c->lnum_bits); + crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + c->lsave_sz - UBIFS_LPT_CRC_BYTES); + addr = buf; + pos = 0; + pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS); +} + +/** + * ubifs_add_lpt_dirt - add dirty space to LPT LEB properties. + * @c: UBIFS file-system description object + * @lnum: LEB number to which to add dirty space + * @dirty: amount of dirty space to add + */ +void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty) +{ + if (!dirty || !lnum) + return; + dbg_lp("LEB %d add %d to %d", + lnum, dirty, c->ltab[lnum - c->lpt_first].dirty); + ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); + c->ltab[lnum - c->lpt_first].dirty += dirty; +} + +/** + * set_ltab - set LPT LEB properties. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @free: amount of free space + * @dirty: amount of dirty space + */ +static void set_ltab(struct ubifs_info *c, int lnum, int free, int dirty) +{ + dbg_lp("LEB %d free %d dirty %d to %d %d", + lnum, c->ltab[lnum - c->lpt_first].free, + c->ltab[lnum - c->lpt_first].dirty, free, dirty); + ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); + c->ltab[lnum - c->lpt_first].free = free; + c->ltab[lnum - c->lpt_first].dirty = dirty; +} + +/** + * ubifs_add_nnode_dirt - add dirty space to LPT LEB properties. + * @c: UBIFS file-system description object + * @nnode: nnode for which to add dirt + */ +void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode) +{ + struct ubifs_nnode *np = nnode->parent; + + if (np) + ubifs_add_lpt_dirt(c, np->nbranch[nnode->iip].lnum, + c->nnode_sz); + else { + ubifs_add_lpt_dirt(c, c->lpt_lnum, c->nnode_sz); + if (!(c->lpt_drty_flgs & LTAB_DIRTY)) { + c->lpt_drty_flgs |= LTAB_DIRTY; + ubifs_add_lpt_dirt(c, c->ltab_lnum, c->ltab_sz); + } + } +} + +/** + * add_pnode_dirt - add dirty space to LPT LEB properties. + * @c: UBIFS file-system description object + * @pnode: pnode for which to add dirt + */ +static void add_pnode_dirt(struct ubifs_info *c, struct ubifs_pnode *pnode) +{ + ubifs_add_lpt_dirt(c, pnode->parent->nbranch[pnode->iip].lnum, + c->pnode_sz); +} + +/** + * calc_nnode_num - calculate nnode number. + * @row: the row in the tree (root is zero) + * @col: the column in the row (leftmost is zero) + * + * The nnode number is a number that uniquely identifies a nnode and can be used + * easily to traverse the tree from the root to that nnode. + * + * This function calculates and returns the nnode number for the nnode at @row + * and @col. + */ +static int calc_nnode_num(int row, int col) +{ + int num, bits; + + num = 1; + while (row--) { + bits = (col & (UBIFS_LPT_FANOUT - 1)); + col >>= UBIFS_LPT_FANOUT_SHIFT; + num <<= UBIFS_LPT_FANOUT_SHIFT; + num |= bits; + } + return num; +} + +/** + * calc_nnode_num_from_parent - calculate nnode number. + * @c: UBIFS file-system description object + * @parent: parent nnode + * @iip: index in parent + * + * The nnode number is a number that uniquely identifies a nnode and can be used + * easily to traverse the tree from the root to that nnode. + * + * This function calculates and returns the nnode number based on the parent's + * nnode number and the index in parent. + */ +static int calc_nnode_num_from_parent(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip) +{ + int num, shft; + + if (!parent) + return 1; + shft = (c->lpt_hght - parent->level) * UBIFS_LPT_FANOUT_SHIFT; + num = parent->num ^ (1 << shft); + num |= (UBIFS_LPT_FANOUT + iip) << shft; + return num; +} + +/** + * calc_pnode_num_from_parent - calculate pnode number. + * @c: UBIFS file-system description object + * @parent: parent nnode + * @iip: index in parent + * + * The pnode number is a number that uniquely identifies a pnode and can be used + * easily to traverse the tree from the root to that pnode. + * + * This function calculates and returns the pnode number based on the parent's + * nnode number and the index in parent. + */ +static int calc_pnode_num_from_parent(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip) +{ + int i, n = c->lpt_hght - 1, pnum = parent->num, num = 0; + + for (i = 0; i < n; i++) { + num <<= UBIFS_LPT_FANOUT_SHIFT; + num |= pnum & (UBIFS_LPT_FANOUT - 1); + pnum >>= UBIFS_LPT_FANOUT_SHIFT; + } + num <<= UBIFS_LPT_FANOUT_SHIFT; + num |= iip; + return num; +} + +/** + * ubifs_create_dflt_lpt - create default LPT. + * @c: UBIFS file-system description object + * @main_lebs: number of main area LEBs is passed and returned here + * @lpt_first: LEB number of first LPT LEB + * @lpt_lebs: number of LEBs for LPT is passed and returned here + * @big_lpt: use big LPT model is passed and returned here + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, + int *lpt_lebs, int *big_lpt) +{ + int lnum, err = 0, node_sz, iopos, i, j, cnt, len, alen, row; + int blnum, boffs, bsz, bcnt; + struct ubifs_pnode *pnode = NULL; + struct ubifs_nnode *nnode = NULL; + void *buf = NULL, *p; + struct ubifs_lpt_lprops *ltab = NULL; + int *lsave = NULL; + + err = calc_dflt_lpt_geom(c, main_lebs, big_lpt); + if (err) + return err; + *lpt_lebs = c->lpt_lebs; + + /* Needed by 'ubifs_pack_nnode()' and 'set_ltab()' */ + c->lpt_first = lpt_first; + /* Needed by 'set_ltab()' */ + c->lpt_last = lpt_first + c->lpt_lebs - 1; + /* Needed by 'ubifs_pack_lsave()' */ + c->main_first = c->leb_cnt - *main_lebs; + + lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_KERNEL); + pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_KERNEL); + nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_KERNEL); + buf = vmalloc(c->leb_size); + ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); + if (!pnode || !nnode || !buf || !ltab || !lsave) { + err = -ENOMEM; + goto out; + } + + ubifs_assert(!c->ltab); + c->ltab = ltab; /* Needed by set_ltab */ + + /* Initialize LPT's own lprops */ + for (i = 0; i < c->lpt_lebs; i++) { + ltab[i].free = c->leb_size; + ltab[i].dirty = 0; + ltab[i].tgc = 0; + ltab[i].cmt = 0; + } + + lnum = lpt_first; + p = buf; + /* Number of leaf nodes (pnodes) */ + cnt = c->pnode_cnt; + + /* + * The first pnode contains the LEB properties for the LEBs that contain + * the root inode node and the root index node of the index tree. + */ + node_sz = ALIGN(ubifs_idx_node_sz(c, 1), 8); + iopos = ALIGN(node_sz, c->min_io_size); + pnode->lprops[0].free = c->leb_size - iopos; + pnode->lprops[0].dirty = iopos - node_sz; + pnode->lprops[0].flags = LPROPS_INDEX; + + node_sz = UBIFS_INO_NODE_SZ; + iopos = ALIGN(node_sz, c->min_io_size); + pnode->lprops[1].free = c->leb_size - iopos; + pnode->lprops[1].dirty = iopos - node_sz; + + for (i = 2; i < UBIFS_LPT_FANOUT; i++) + pnode->lprops[i].free = c->leb_size; + + /* Add first pnode */ + ubifs_pack_pnode(c, p, pnode); + p += c->pnode_sz; + len = c->pnode_sz; + pnode->num += 1; + + /* Reset pnode values for remaining pnodes */ + pnode->lprops[0].free = c->leb_size; + pnode->lprops[0].dirty = 0; + pnode->lprops[0].flags = 0; + + pnode->lprops[1].free = c->leb_size; + pnode->lprops[1].dirty = 0; + + /* + * To calculate the internal node branches, we keep information about + * the level below. + */ + blnum = lnum; /* LEB number of level below */ + boffs = 0; /* Offset of level below */ + bcnt = cnt; /* Number of nodes in level below */ + bsz = c->pnode_sz; /* Size of nodes in level below */ + + /* Add all remaining pnodes */ + for (i = 1; i < cnt; i++) { + if (len + c->pnode_sz > c->leb_size) { + alen = ALIGN(len, c->min_io_size); + set_ltab(c, lnum, c->leb_size - alen, alen - len); + memset(p, 0xff, alen - len); + err = ubi_leb_change(c->ubi, lnum++, buf, alen, + UBI_SHORTTERM); + if (err) + goto out; + p = buf; + len = 0; + } + ubifs_pack_pnode(c, p, pnode); + p += c->pnode_sz; + len += c->pnode_sz; + /* + * pnodes are simply numbered left to right starting at zero, + * which means the pnode number can be used easily to traverse + * down the tree to the corresponding pnode. + */ + pnode->num += 1; + } + + row = 0; + for (i = UBIFS_LPT_FANOUT; cnt > i; i <<= UBIFS_LPT_FANOUT_SHIFT) + row += 1; + /* Add all nnodes, one level at a time */ + while (1) { + /* Number of internal nodes (nnodes) at next level */ + cnt = DIV_ROUND_UP(cnt, UBIFS_LPT_FANOUT); + for (i = 0; i < cnt; i++) { + if (len + c->nnode_sz > c->leb_size) { + alen = ALIGN(len, c->min_io_size); + set_ltab(c, lnum, c->leb_size - alen, + alen - len); + memset(p, 0xff, alen - len); + err = ubi_leb_change(c->ubi, lnum++, buf, alen, + UBI_SHORTTERM); + if (err) + goto out; + p = buf; + len = 0; + } + /* Only 1 nnode at this level, so it is the root */ + if (cnt == 1) { + c->lpt_lnum = lnum; + c->lpt_offs = len; + } + /* Set branches to the level below */ + for (j = 0; j < UBIFS_LPT_FANOUT; j++) { + if (bcnt) { + if (boffs + bsz > c->leb_size) { + blnum += 1; + boffs = 0; + } + nnode->nbranch[j].lnum = blnum; + nnode->nbranch[j].offs = boffs; + boffs += bsz; + bcnt--; + } else { + nnode->nbranch[j].lnum = 0; + nnode->nbranch[j].offs = 0; + } + } + nnode->num = calc_nnode_num(row, i); + ubifs_pack_nnode(c, p, nnode); + p += c->nnode_sz; + len += c->nnode_sz; + } + /* Only 1 nnode at this level, so it is the root */ + if (cnt == 1) + break; + /* Update the information about the level below */ + bcnt = cnt; + bsz = c->nnode_sz; + row -= 1; + } + + if (*big_lpt) { + /* Need to add LPT's save table */ + if (len + c->lsave_sz > c->leb_size) { + alen = ALIGN(len, c->min_io_size); + set_ltab(c, lnum, c->leb_size - alen, alen - len); + memset(p, 0xff, alen - len); + err = ubi_leb_change(c->ubi, lnum++, buf, alen, + UBI_SHORTTERM); + if (err) + goto out; + p = buf; + len = 0; + } + + c->lsave_lnum = lnum; + c->lsave_offs = len; + + for (i = 0; i < c->lsave_cnt && i < *main_lebs; i++) + lsave[i] = c->main_first + i; + for (; i < c->lsave_cnt; i++) + lsave[i] = c->main_first; + + ubifs_pack_lsave(c, p, lsave); + p += c->lsave_sz; + len += c->lsave_sz; + } + + /* Need to add LPT's own LEB properties table */ + if (len + c->ltab_sz > c->leb_size) { + alen = ALIGN(len, c->min_io_size); + set_ltab(c, lnum, c->leb_size - alen, alen - len); + memset(p, 0xff, alen - len); + err = ubi_leb_change(c->ubi, lnum++, buf, alen, UBI_SHORTTERM); + if (err) + goto out; + p = buf; + len = 0; + } + + c->ltab_lnum = lnum; + c->ltab_offs = len; + + /* Update ltab before packing it */ + len += c->ltab_sz; + alen = ALIGN(len, c->min_io_size); + set_ltab(c, lnum, c->leb_size - alen, alen - len); + + ubifs_pack_ltab(c, p, ltab); + p += c->ltab_sz; + + /* Write remaining buffer */ + memset(p, 0xff, alen - len); + err = ubi_leb_change(c->ubi, lnum, buf, alen, UBI_SHORTTERM); + if (err) + goto out; + + c->nhead_lnum = lnum; + c->nhead_offs = ALIGN(len, c->min_io_size); + + dbg_lp("space_bits %d", c->space_bits); + dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits); + dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits); + dbg_lp("lpt_spc_bits %d", c->lpt_spc_bits); + dbg_lp("pcnt_bits %d", c->pcnt_bits); + dbg_lp("lnum_bits %d", c->lnum_bits); + dbg_lp("pnode_sz %d", c->pnode_sz); + dbg_lp("nnode_sz %d", c->nnode_sz); + dbg_lp("ltab_sz %d", c->ltab_sz); + dbg_lp("lsave_sz %d", c->lsave_sz); + dbg_lp("lsave_cnt %d", c->lsave_cnt); + dbg_lp("lpt_hght %d", c->lpt_hght); + dbg_lp("big_lpt %d", c->big_lpt); + dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs); + dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs); + dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs); + if (c->big_lpt) + dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs); +out: + c->ltab = NULL; + kfree(lsave); + vfree(ltab); + vfree(buf); + kfree(nnode); + kfree(pnode); + return err; +} + +/** + * update_cats - add LEB properties of a pnode to LEB category lists and heaps. + * @c: UBIFS file-system description object + * @pnode: pnode + * + * When a pnode is loaded into memory, the LEB properties it contains are added, + * by this function, to the LEB category lists and heaps. + */ +static void update_cats(struct ubifs_info *c, struct ubifs_pnode *pnode) +{ + int i; + + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + int cat = pnode->lprops[i].flags & LPROPS_CAT_MASK; + int lnum = pnode->lprops[i].lnum; + + if (!lnum) + return; + ubifs_add_to_cat(c, &pnode->lprops[i], cat); + } +} + +/** + * replace_cats - add LEB properties of a pnode to LEB category lists and heaps. + * @c: UBIFS file-system description object + * @old_pnode: pnode copied + * @new_pnode: pnode copy + * + * During commit it is sometimes necessary to copy a pnode + * (see dirty_cow_pnode). When that happens, references in + * category lists and heaps must be replaced. This function does that. + */ +static void replace_cats(struct ubifs_info *c, struct ubifs_pnode *old_pnode, + struct ubifs_pnode *new_pnode) +{ + int i; + + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + if (!new_pnode->lprops[i].lnum) + return; + ubifs_replace_cat(c, &old_pnode->lprops[i], + &new_pnode->lprops[i]); + } +} + +/** + * check_lpt_crc - check LPT node crc is correct. + * @c: UBIFS file-system description object + * @buf: buffer containing node + * @len: length of node + * + * This function returns %0 on success and a negative error code on failure. + */ +static int check_lpt_crc(void *buf, int len) +{ + int pos = 0; + uint8_t *addr = buf; + uint16_t crc, calc_crc; + + crc = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_CRC_BITS); + calc_crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + len - UBIFS_LPT_CRC_BYTES); + if (crc != calc_crc) { + ubifs_err("invalid crc in LPT node: crc %hx calc %hx", crc, + calc_crc); + dbg_dump_stack(); + return -EINVAL; + } + return 0; +} + +/** + * check_lpt_type - check LPT node type is correct. + * @c: UBIFS file-system description object + * @addr: address of type bit field is passed and returned updated here + * @pos: position of type bit field is passed and returned updated here + * @type: expected type + * + * This function returns %0 on success and a negative error code on failure. + */ +static int check_lpt_type(uint8_t **addr, int *pos, int type) +{ + int node_type; + + node_type = ubifs_unpack_bits(addr, pos, UBIFS_LPT_TYPE_BITS); + if (node_type != type) { + ubifs_err("invalid type (%d) in LPT node type %d", node_type, + type); + dbg_dump_stack(); + return -EINVAL; + } + return 0; +} + +/** + * unpack_pnode - unpack a pnode. + * @c: UBIFS file-system description object + * @buf: buffer containing packed pnode to unpack + * @pnode: pnode structure to fill + * + * This function returns %0 on success and a negative error code on failure. + */ +static int unpack_pnode(struct ubifs_info *c, void *buf, + struct ubifs_pnode *pnode) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0, err; + + err = check_lpt_type(&addr, &pos, UBIFS_LPT_PNODE); + if (err) + return err; + if (c->big_lpt) + pnode->num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops * const lprops = &pnode->lprops[i]; + + lprops->free = ubifs_unpack_bits(&addr, &pos, c->space_bits); + lprops->free <<= 3; + lprops->dirty = ubifs_unpack_bits(&addr, &pos, c->space_bits); + lprops->dirty <<= 3; + + if (ubifs_unpack_bits(&addr, &pos, 1)) + lprops->flags = LPROPS_INDEX; + else + lprops->flags = 0; + lprops->flags |= ubifs_categorize_lprops(c, lprops); + } + err = check_lpt_crc(buf, c->pnode_sz); + return err; +} + +/** + * unpack_nnode - unpack a nnode. + * @c: UBIFS file-system description object + * @buf: buffer containing packed nnode to unpack + * @nnode: nnode structure to fill + * + * This function returns %0 on success and a negative error code on failure. + */ +static int unpack_nnode(struct ubifs_info *c, void *buf, + struct ubifs_nnode *nnode) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0, err; + + err = check_lpt_type(&addr, &pos, UBIFS_LPT_NNODE); + if (err) + return err; + if (c->big_lpt) + nnode->num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + int lnum; + + lnum = ubifs_unpack_bits(&addr, &pos, c->lpt_lnum_bits) + + c->lpt_first; + if (lnum == c->lpt_last + 1) + lnum = 0; + nnode->nbranch[i].lnum = lnum; + nnode->nbranch[i].offs = ubifs_unpack_bits(&addr, &pos, + c->lpt_offs_bits); + } + err = check_lpt_crc(buf, c->nnode_sz); + return err; +} + +/** + * unpack_ltab - unpack the LPT's own lprops table. + * @c: UBIFS file-system description object + * @buf: buffer from which to unpack + * + * This function returns %0 on success and a negative error code on failure. + */ +static int unpack_ltab(struct ubifs_info *c, void *buf) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0, err; + + err = check_lpt_type(&addr, &pos, UBIFS_LPT_LTAB); + if (err) + return err; + for (i = 0; i < c->lpt_lebs; i++) { + int free = ubifs_unpack_bits(&addr, &pos, c->lpt_spc_bits); + int dirty = ubifs_unpack_bits(&addr, &pos, c->lpt_spc_bits); + + if (free < 0 || free > c->leb_size || dirty < 0 || + dirty > c->leb_size || free + dirty > c->leb_size) + return -EINVAL; + + c->ltab[i].free = free; + c->ltab[i].dirty = dirty; + c->ltab[i].tgc = 0; + c->ltab[i].cmt = 0; + } + err = check_lpt_crc(buf, c->ltab_sz); + return err; +} + +/** + * unpack_lsave - unpack the LPT's save table. + * @c: UBIFS file-system description object + * @buf: buffer from which to unpack + * + * This function returns %0 on success and a negative error code on failure. + */ +static int unpack_lsave(struct ubifs_info *c, void *buf) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0, err; + + err = check_lpt_type(&addr, &pos, UBIFS_LPT_LSAVE); + if (err) + return err; + for (i = 0; i < c->lsave_cnt; i++) { + int lnum = ubifs_unpack_bits(&addr, &pos, c->lnum_bits); + + if (lnum < c->main_first || lnum >= c->leb_cnt) + return -EINVAL; + c->lsave[i] = lnum; + } + err = check_lpt_crc(buf, c->lsave_sz); + return err; +} + +/** + * validate_nnode - validate a nnode. + * @c: UBIFS file-system description object + * @nnode: nnode to validate + * @parent: parent nnode (or NULL for the root nnode) + * @iip: index in parent + * + * This function returns %0 on success and a negative error code on failure. + */ +static int validate_nnode(struct ubifs_info *c, struct ubifs_nnode *nnode, + struct ubifs_nnode *parent, int iip) +{ + int i, lvl, max_offs; + + if (c->big_lpt) { + int num = calc_nnode_num_from_parent(c, parent, iip); + + if (nnode->num != num) + return -EINVAL; + } + lvl = parent ? parent->level - 1 : c->lpt_hght; + if (lvl < 1) + return -EINVAL; + if (lvl == 1) + max_offs = c->leb_size - c->pnode_sz; + else + max_offs = c->leb_size - c->nnode_sz; + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + int lnum = nnode->nbranch[i].lnum; + int offs = nnode->nbranch[i].offs; + + if (lnum == 0) { + if (offs != 0) + return -EINVAL; + continue; + } + if (lnum < c->lpt_first || lnum > c->lpt_last) + return -EINVAL; + if (offs < 0 || offs > max_offs) + return -EINVAL; + } + return 0; +} + +/** + * validate_pnode - validate a pnode. + * @c: UBIFS file-system description object + * @pnode: pnode to validate + * @parent: parent nnode + * @iip: index in parent + * + * This function returns %0 on success and a negative error code on failure. + */ +static int validate_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, int iip) +{ + int i; + + if (c->big_lpt) { + int num = calc_pnode_num_from_parent(c, parent, iip); + + if (pnode->num != num) + return -EINVAL; + } + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + int free = pnode->lprops[i].free; + int dirty = pnode->lprops[i].dirty; + + if (free < 0 || free > c->leb_size || free % c->min_io_size || + (free & 7)) + return -EINVAL; + if (dirty < 0 || dirty > c->leb_size || (dirty & 7)) + return -EINVAL; + if (dirty + free > c->leb_size) + return -EINVAL; + } + return 0; +} + +/** + * set_pnode_lnum - set LEB numbers on a pnode. + * @c: UBIFS file-system description object + * @pnode: pnode to update + * + * This function calculates the LEB numbers for the LEB properties it contains + * based on the pnode number. + */ +static void set_pnode_lnum(struct ubifs_info *c, struct ubifs_pnode *pnode) +{ + int i, lnum; + + lnum = (pnode->num << UBIFS_LPT_FANOUT_SHIFT) + c->main_first; + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + if (lnum >= c->leb_cnt) + return; + pnode->lprops[i].lnum = lnum++; + } +} + +/** + * ubifs_read_nnode - read a nnode from flash and link it to the tree in memory. + * @c: UBIFS file-system description object + * @parent: parent nnode (or NULL for the root) + * @iip: index in parent + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch = NULL; + struct ubifs_nnode *nnode = NULL; + void *buf = c->lpt_nod_buf; + int err, lnum, offs; + + if (parent) { + branch = &parent->nbranch[iip]; + lnum = branch->lnum; + offs = branch->offs; + } else { + lnum = c->lpt_lnum; + offs = c->lpt_offs; + } + nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_NOFS); + if (!nnode) { + err = -ENOMEM; + goto out; + } + if (lnum == 0) { + /* + * This nnode was not written which just means that the LEB + * properties in the subtree below it describe empty LEBs. We + * make the nnode as though we had read it, which in fact means + * doing almost nothing. + */ + if (c->big_lpt) + nnode->num = calc_nnode_num_from_parent(c, parent, iip); + } else { + err = ubi_read(c->ubi, lnum, buf, offs, c->nnode_sz); + if (err) + goto out; + err = unpack_nnode(c, buf, nnode); + if (err) + goto out; + } + err = validate_nnode(c, nnode, parent, iip); + if (err) + goto out; + if (!c->big_lpt) + nnode->num = calc_nnode_num_from_parent(c, parent, iip); + if (parent) { + branch->nnode = nnode; + nnode->level = parent->level - 1; + } else { + c->nroot = nnode; + nnode->level = c->lpt_hght; + } + nnode->parent = parent; + nnode->iip = iip; + return 0; + +out: + ubifs_err("error %d reading nnode at %d:%d", err, lnum, offs); + kfree(nnode); + return err; +} + +/** + * read_pnode - read a pnode from flash and link it to the tree in memory. + * @c: UBIFS file-system description object + * @parent: parent nnode + * @iip: index in parent + * + * This function returns %0 on success and a negative error code on failure. + */ +static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch; + struct ubifs_pnode *pnode = NULL; + void *buf = c->lpt_nod_buf; + int err, lnum, offs; + + branch = &parent->nbranch[iip]; + lnum = branch->lnum; + offs = branch->offs; + pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_NOFS); + if (!pnode) { + err = -ENOMEM; + goto out; + } + if (lnum == 0) { + /* + * This pnode was not written which just means that the LEB + * properties in it describe empty LEBs. We make the pnode as + * though we had read it. + */ + int i; + + if (c->big_lpt) + pnode->num = calc_pnode_num_from_parent(c, parent, iip); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops * const lprops = &pnode->lprops[i]; + + lprops->free = c->leb_size; + lprops->flags = ubifs_categorize_lprops(c, lprops); + } + } else { + err = ubi_read(c->ubi, lnum, buf, offs, c->pnode_sz); + if (err) + goto out; + err = unpack_pnode(c, buf, pnode); + if (err) + goto out; + } + err = validate_pnode(c, pnode, parent, iip); + if (err) + goto out; + if (!c->big_lpt) + pnode->num = calc_pnode_num_from_parent(c, parent, iip); + branch->pnode = pnode; + pnode->parent = parent; + pnode->iip = iip; + set_pnode_lnum(c, pnode); + c->pnodes_have += 1; + return 0; + +out: + ubifs_err("error %d reading pnode at %d:%d", err, lnum, offs); + dbg_dump_pnode(c, pnode, parent, iip); + dbg_msg("calc num: %d", calc_pnode_num_from_parent(c, parent, iip)); + kfree(pnode); + return err; +} + +/** + * read_ltab - read LPT's own lprops table. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +static int read_ltab(struct ubifs_info *c) +{ + int err; + void *buf; + + buf = vmalloc(c->ltab_sz); + if (!buf) + return -ENOMEM; + err = ubi_read(c->ubi, c->ltab_lnum, buf, c->ltab_offs, c->ltab_sz); + if (err) + goto out; + err = unpack_ltab(c, buf); +out: + vfree(buf); + return err; +} + +/** + * read_lsave - read LPT's save table. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +static int read_lsave(struct ubifs_info *c) +{ + int err, i; + void *buf; + + buf = vmalloc(c->lsave_sz); + if (!buf) + return -ENOMEM; + err = ubi_read(c->ubi, c->lsave_lnum, buf, c->lsave_offs, c->lsave_sz); + if (err) + goto out; + err = unpack_lsave(c, buf); + if (err) + goto out; + for (i = 0; i < c->lsave_cnt; i++) { + int lnum = c->lsave[i]; + + /* + * Due to automatic resizing, the values in the lsave table + * could be beyond the volume size - just ignore them. + */ + if (lnum >= c->leb_cnt) + continue; + ubifs_lpt_lookup(c, lnum); + } +out: + vfree(buf); + return err; +} + +/** + * ubifs_get_nnode - get a nnode. + * @c: UBIFS file-system description object + * @parent: parent nnode (or NULL for the root) + * @iip: index in parent + * + * This function returns a pointer to the nnode on success or a negative error + * code on failure. + */ +struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch; + struct ubifs_nnode *nnode; + int err; + + branch = &parent->nbranch[iip]; + nnode = branch->nnode; + if (nnode) + return nnode; + err = ubifs_read_nnode(c, parent, iip); + if (err) + return ERR_PTR(err); + return branch->nnode; +} + +/** + * ubifs_get_pnode - get a pnode. + * @c: UBIFS file-system description object + * @parent: parent nnode + * @iip: index in parent + * + * This function returns a pointer to the pnode on success or a negative error + * code on failure. + */ +struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch; + struct ubifs_pnode *pnode; + int err; + + branch = &parent->nbranch[iip]; + pnode = branch->pnode; + if (pnode) + return pnode; + err = read_pnode(c, parent, iip); + if (err) + return ERR_PTR(err); + update_cats(c, branch->pnode); + return branch->pnode; +} + +/** + * ubifs_lpt_lookup - lookup LEB properties in the LPT. + * @c: UBIFS file-system description object + * @lnum: LEB number to lookup + * + * This function returns a pointer to the LEB properties on success or a + * negative error code on failure. + */ +struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum) +{ + int err, i, h, iip, shft; + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return ERR_PTR(err); + } + nnode = c->nroot; + i = lnum - c->main_first; + shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; + for (h = 1; h < c->lpt_hght; h++) { + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return ERR_PTR(PTR_ERR(nnode)); + } + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + pnode = ubifs_get_pnode(c, nnode, iip); + if (IS_ERR(pnode)) + return ERR_PTR(PTR_ERR(pnode)); + iip = (i & (UBIFS_LPT_FANOUT - 1)); + dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum, + pnode->lprops[iip].free, pnode->lprops[iip].dirty, + pnode->lprops[iip].flags); + return &pnode->lprops[iip]; +} + +/** + * dirty_cow_nnode - ensure a nnode is not being committed. + * @c: UBIFS file-system description object + * @nnode: nnode to check + * + * Returns dirtied nnode on success or negative error code on failure. + */ +static struct ubifs_nnode *dirty_cow_nnode(struct ubifs_info *c, + struct ubifs_nnode *nnode) +{ + struct ubifs_nnode *n; + int i; + + if (!test_bit(COW_CNODE, &nnode->flags)) { + /* nnode is not being committed */ + if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + } + return nnode; + } + + /* nnode is being committed, so copy it */ + n = kmalloc(sizeof(struct ubifs_nnode), GFP_NOFS); + if (unlikely(!n)) + return ERR_PTR(-ENOMEM); + + memcpy(n, nnode, sizeof(struct ubifs_nnode)); + n->cnext = NULL; + __set_bit(DIRTY_CNODE, &n->flags); + __clear_bit(COW_CNODE, &n->flags); + + /* The children now have new parent */ + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_nbranch *branch = &n->nbranch[i]; + + if (branch->cnode) + branch->cnode->parent = n; + } + + ubifs_assert(!test_bit(OBSOLETE_CNODE, &nnode->flags)); + __set_bit(OBSOLETE_CNODE, &nnode->flags); + + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + if (nnode->parent) + nnode->parent->nbranch[n->iip].nnode = n; + else + c->nroot = n; + return n; +} + +/** + * dirty_cow_pnode - ensure a pnode is not being committed. + * @c: UBIFS file-system description object + * @pnode: pnode to check + * + * Returns dirtied pnode on success or negative error code on failure. + */ +static struct ubifs_pnode *dirty_cow_pnode(struct ubifs_info *c, + struct ubifs_pnode *pnode) +{ + struct ubifs_pnode *p; + + if (!test_bit(COW_CNODE, &pnode->flags)) { + /* pnode is not being committed */ + if (!test_and_set_bit(DIRTY_CNODE, &pnode->flags)) { + c->dirty_pn_cnt += 1; + add_pnode_dirt(c, pnode); + } + return pnode; + } + + /* pnode is being committed, so copy it */ + p = kmalloc(sizeof(struct ubifs_pnode), GFP_NOFS); + if (unlikely(!p)) + return ERR_PTR(-ENOMEM); + + memcpy(p, pnode, sizeof(struct ubifs_pnode)); + p->cnext = NULL; + __set_bit(DIRTY_CNODE, &p->flags); + __clear_bit(COW_CNODE, &p->flags); + replace_cats(c, pnode, p); + + ubifs_assert(!test_bit(OBSOLETE_CNODE, &pnode->flags)); + __set_bit(OBSOLETE_CNODE, &pnode->flags); + + c->dirty_pn_cnt += 1; + add_pnode_dirt(c, pnode); + pnode->parent->nbranch[p->iip].pnode = p; + return p; +} + +/** + * ubifs_lpt_lookup_dirty - lookup LEB properties in the LPT. + * @c: UBIFS file-system description object + * @lnum: LEB number to lookup + * + * This function returns a pointer to the LEB properties on success or a + * negative error code on failure. + */ +struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum) +{ + int err, i, h, iip, shft; + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return ERR_PTR(err); + } + nnode = c->nroot; + nnode = dirty_cow_nnode(c, nnode); + if (IS_ERR(nnode)) + return ERR_PTR(PTR_ERR(nnode)); + i = lnum - c->main_first; + shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; + for (h = 1; h < c->lpt_hght; h++) { + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return ERR_PTR(PTR_ERR(nnode)); + nnode = dirty_cow_nnode(c, nnode); + if (IS_ERR(nnode)) + return ERR_PTR(PTR_ERR(nnode)); + } + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + pnode = ubifs_get_pnode(c, nnode, iip); + if (IS_ERR(pnode)) + return ERR_PTR(PTR_ERR(pnode)); + pnode = dirty_cow_pnode(c, pnode); + if (IS_ERR(pnode)) + return ERR_PTR(PTR_ERR(pnode)); + iip = (i & (UBIFS_LPT_FANOUT - 1)); + dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum, + pnode->lprops[iip].free, pnode->lprops[iip].dirty, + pnode->lprops[iip].flags); + ubifs_assert(test_bit(DIRTY_CNODE, &pnode->flags)); + return &pnode->lprops[iip]; +} + +/** + * lpt_init_rd - initialize the LPT for reading. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +static int lpt_init_rd(struct ubifs_info *c) +{ + int err, i; + + c->ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); + if (!c->ltab) + return -ENOMEM; + + i = max_t(int, c->nnode_sz, c->pnode_sz); + c->lpt_nod_buf = kmalloc(i, GFP_KERNEL); + if (!c->lpt_nod_buf) + return -ENOMEM; + + for (i = 0; i < LPROPS_HEAP_CNT; i++) { + c->lpt_heap[i].arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ, + GFP_KERNEL); + if (!c->lpt_heap[i].arr) + return -ENOMEM; + c->lpt_heap[i].cnt = 0; + c->lpt_heap[i].max_cnt = LPT_HEAP_SZ; + } + + c->dirty_idx.arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ, GFP_KERNEL); + if (!c->dirty_idx.arr) + return -ENOMEM; + c->dirty_idx.cnt = 0; + c->dirty_idx.max_cnt = LPT_HEAP_SZ; + + err = read_ltab(c); + if (err) + return err; + + dbg_lp("space_bits %d", c->space_bits); + dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits); + dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits); + dbg_lp("lpt_spc_bits %d", c->lpt_spc_bits); + dbg_lp("pcnt_bits %d", c->pcnt_bits); + dbg_lp("lnum_bits %d", c->lnum_bits); + dbg_lp("pnode_sz %d", c->pnode_sz); + dbg_lp("nnode_sz %d", c->nnode_sz); + dbg_lp("ltab_sz %d", c->ltab_sz); + dbg_lp("lsave_sz %d", c->lsave_sz); + dbg_lp("lsave_cnt %d", c->lsave_cnt); + dbg_lp("lpt_hght %d", c->lpt_hght); + dbg_lp("big_lpt %d", c->big_lpt); + dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs); + dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs); + dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs); + if (c->big_lpt) + dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs); + + return 0; +} + +/** + * lpt_init_wr - initialize the LPT for writing. + * @c: UBIFS file-system description object + * + * 'lpt_init_rd()' must have been called already. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int lpt_init_wr(struct ubifs_info *c) +{ + int err, i; + + c->ltab_cmt = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); + if (!c->ltab_cmt) + return -ENOMEM; + + c->lpt_buf = vmalloc(c->leb_size); + if (!c->lpt_buf) + return -ENOMEM; + + if (c->big_lpt) { + c->lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_NOFS); + if (!c->lsave) + return -ENOMEM; + err = read_lsave(c); + if (err) + return err; + } + + for (i = 0; i < c->lpt_lebs; i++) + if (c->ltab[i].free == c->leb_size) { + err = ubifs_leb_unmap(c, i + c->lpt_first); + if (err) + return err; + } + + return 0; +} + +/** + * ubifs_lpt_init - initialize the LPT. + * @c: UBIFS file-system description object + * @rd: whether to initialize lpt for reading + * @wr: whether to initialize lpt for writing + * + * For mounting 'rw', @rd and @wr are both true. For mounting 'ro', @rd is true + * and @wr is false. For mounting from 'ro' to 'rw', @rd is false and @wr is + * true. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr) +{ + int err; + + if (rd) { + err = lpt_init_rd(c); + if (err) + return err; + } + + if (wr) { + err = lpt_init_wr(c); + if (err) + return err; + } + + return 0; +} + +/** + * struct lpt_scan_node - somewhere to put nodes while we scan LPT. + * @nnode: where to keep a nnode + * @pnode: where to keep a pnode + * @cnode: where to keep a cnode + * @in_tree: is the node in the tree in memory + * @ptr.nnode: pointer to the nnode (if it is an nnode) which may be here or in + * the tree + * @ptr.pnode: ditto for pnode + * @ptr.cnode: ditto for cnode + */ +struct lpt_scan_node { + union { + struct ubifs_nnode nnode; + struct ubifs_pnode pnode; + struct ubifs_cnode cnode; + }; + int in_tree; + union { + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + struct ubifs_cnode *cnode; + } ptr; +}; + +/** + * scan_get_nnode - for the scan, get a nnode from either the tree or flash. + * @c: the UBIFS file-system description object + * @path: where to put the nnode + * @parent: parent of the nnode + * @iip: index in parent of the nnode + * + * This function returns a pointer to the nnode on success or a negative error + * code on failure. + */ +static struct ubifs_nnode *scan_get_nnode(struct ubifs_info *c, + struct lpt_scan_node *path, + struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch; + struct ubifs_nnode *nnode; + void *buf = c->lpt_nod_buf; + int err; + + branch = &parent->nbranch[iip]; + nnode = branch->nnode; + if (nnode) { + path->in_tree = 1; + path->ptr.nnode = nnode; + return nnode; + } + nnode = &path->nnode; + path->in_tree = 0; + path->ptr.nnode = nnode; + memset(nnode, 0, sizeof(struct ubifs_nnode)); + if (branch->lnum == 0) { + /* + * This nnode was not written which just means that the LEB + * properties in the subtree below it describe empty LEBs. We + * make the nnode as though we had read it, which in fact means + * doing almost nothing. + */ + if (c->big_lpt) + nnode->num = calc_nnode_num_from_parent(c, parent, iip); + } else { + err = ubi_read(c->ubi, branch->lnum, buf, branch->offs, + c->nnode_sz); + if (err) + return ERR_PTR(err); + err = unpack_nnode(c, buf, nnode); + if (err) + return ERR_PTR(err); + } + err = validate_nnode(c, nnode, parent, iip); + if (err) + return ERR_PTR(err); + if (!c->big_lpt) + nnode->num = calc_nnode_num_from_parent(c, parent, iip); + nnode->level = parent->level - 1; + nnode->parent = parent; + nnode->iip = iip; + return nnode; +} + +/** + * scan_get_pnode - for the scan, get a pnode from either the tree or flash. + * @c: the UBIFS file-system description object + * @path: where to put the pnode + * @parent: parent of the pnode + * @iip: index in parent of the pnode + * + * This function returns a pointer to the pnode on success or a negative error + * code on failure. + */ +static struct ubifs_pnode *scan_get_pnode(struct ubifs_info *c, + struct lpt_scan_node *path, + struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch; + struct ubifs_pnode *pnode; + void *buf = c->lpt_nod_buf; + int err; + + branch = &parent->nbranch[iip]; + pnode = branch->pnode; + if (pnode) { + path->in_tree = 1; + path->ptr.pnode = pnode; + return pnode; + } + pnode = &path->pnode; + path->in_tree = 0; + path->ptr.pnode = pnode; + memset(pnode, 0, sizeof(struct ubifs_pnode)); + if (branch->lnum == 0) { + /* + * This pnode was not written which just means that the LEB + * properties in it describe empty LEBs. We make the pnode as + * though we had read it. + */ + int i; + + if (c->big_lpt) + pnode->num = calc_pnode_num_from_parent(c, parent, iip); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops * const lprops = &pnode->lprops[i]; + + lprops->free = c->leb_size; + lprops->flags = ubifs_categorize_lprops(c, lprops); + } + } else { + ubifs_assert(branch->lnum >= c->lpt_first && + branch->lnum <= c->lpt_last); + ubifs_assert(branch->offs >= 0 && branch->offs < c->leb_size); + err = ubi_read(c->ubi, branch->lnum, buf, branch->offs, + c->pnode_sz); + if (err) + return ERR_PTR(err); + err = unpack_pnode(c, buf, pnode); + if (err) + return ERR_PTR(err); + } + err = validate_pnode(c, pnode, parent, iip); + if (err) + return ERR_PTR(err); + if (!c->big_lpt) + pnode->num = calc_pnode_num_from_parent(c, parent, iip); + pnode->parent = parent; + pnode->iip = iip; + set_pnode_lnum(c, pnode); + return pnode; +} + +/** + * ubifs_lpt_scan_nolock - scan the LPT. + * @c: the UBIFS file-system description object + * @start_lnum: LEB number from which to start scanning + * @end_lnum: LEB number at which to stop scanning + * @scan_cb: callback function called for each lprops + * @data: data to be passed to the callback function + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_lpt_scan_nolock(struct ubifs_info *c, int start_lnum, int end_lnum, + ubifs_lpt_scan_callback scan_cb, void *data) +{ + int err = 0, i, h, iip, shft; + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + struct lpt_scan_node *path; + + if (start_lnum == -1) { + start_lnum = end_lnum + 1; + if (start_lnum >= c->leb_cnt) + start_lnum = c->main_first; + } + + ubifs_assert(start_lnum >= c->main_first && start_lnum < c->leb_cnt); + ubifs_assert(end_lnum >= c->main_first && end_lnum < c->leb_cnt); + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return err; + } + + path = kmalloc(sizeof(struct lpt_scan_node) * (c->lpt_hght + 1), + GFP_NOFS); + if (!path) + return -ENOMEM; + + path[0].ptr.nnode = c->nroot; + path[0].in_tree = 1; +again: + /* Descend to the pnode containing start_lnum */ + nnode = c->nroot; + i = start_lnum - c->main_first; + shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; + for (h = 1; h < c->lpt_hght; h++) { + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + nnode = scan_get_nnode(c, path + h, nnode, iip); + if (IS_ERR(nnode)) { + err = PTR_ERR(nnode); + goto out; + } + } + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + pnode = scan_get_pnode(c, path + h, nnode, iip); + if (IS_ERR(pnode)) { + err = PTR_ERR(pnode); + goto out; + } + iip = (i & (UBIFS_LPT_FANOUT - 1)); + + /* Loop for each lprops */ + while (1) { + struct ubifs_lprops *lprops = &pnode->lprops[iip]; + int ret, lnum = lprops->lnum; + + ret = scan_cb(c, lprops, path[h].in_tree, data); + if (ret < 0) { + err = ret; + goto out; + } + if (ret & LPT_SCAN_ADD) { + /* Add all the nodes in path to the tree in memory */ + for (h = 1; h < c->lpt_hght; h++) { + const size_t sz = sizeof(struct ubifs_nnode); + struct ubifs_nnode *parent; + + if (path[h].in_tree) + continue; + nnode = kmalloc(sz, GFP_NOFS); + if (!nnode) { + err = -ENOMEM; + goto out; + } + memcpy(nnode, &path[h].nnode, sz); + parent = nnode->parent; + parent->nbranch[nnode->iip].nnode = nnode; + path[h].ptr.nnode = nnode; + path[h].in_tree = 1; + path[h + 1].cnode.parent = nnode; + } + if (path[h].in_tree) + ubifs_ensure_cat(c, lprops); + else { + const size_t sz = sizeof(struct ubifs_pnode); + struct ubifs_nnode *parent; + + pnode = kmalloc(sz, GFP_NOFS); + if (!pnode) { + err = -ENOMEM; + goto out; + } + memcpy(pnode, &path[h].pnode, sz); + parent = pnode->parent; + parent->nbranch[pnode->iip].pnode = pnode; + path[h].ptr.pnode = pnode; + path[h].in_tree = 1; + update_cats(c, pnode); + c->pnodes_have += 1; + } + err = dbg_check_lpt_nodes(c, (struct ubifs_cnode *) + c->nroot, 0, 0); + if (err) + goto out; + err = dbg_check_cats(c); + if (err) + goto out; + } + if (ret & LPT_SCAN_STOP) { + err = 0; + break; + } + /* Get the next lprops */ + if (lnum == end_lnum) { + /* + * We got to the end without finding what we were + * looking for + */ + err = -ENOSPC; + goto out; + } + if (lnum + 1 >= c->leb_cnt) { + /* Wrap-around to the beginning */ + start_lnum = c->main_first; + goto again; + } + if (iip + 1 < UBIFS_LPT_FANOUT) { + /* Next lprops is in the same pnode */ + iip += 1; + continue; + } + /* We need to get the next pnode. Go up until we can go right */ + iip = pnode->iip; + while (1) { + h -= 1; + ubifs_assert(h >= 0); + nnode = path[h].ptr.nnode; + if (iip + 1 < UBIFS_LPT_FANOUT) + break; + iip = nnode->iip; + } + /* Go right */ + iip += 1; + /* Descend to the pnode */ + h += 1; + for (; h < c->lpt_hght; h++) { + nnode = scan_get_nnode(c, path + h, nnode, iip); + if (IS_ERR(nnode)) { + err = PTR_ERR(nnode); + goto out; + } + iip = 0; + } + pnode = scan_get_pnode(c, path + h, nnode, iip); + if (IS_ERR(pnode)) { + err = PTR_ERR(pnode); + goto out; + } + iip = 0; + } +out: + kfree(path); + return err; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * dbg_chk_pnode - check a pnode. + * @c: the UBIFS file-system description object + * @pnode: pnode to check + * @col: pnode column + * + * This function returns %0 on success and a negative error code on failure. + */ +static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, + int col) +{ + int i; + + if (pnode->num != col) { + dbg_err("pnode num %d expected %d parent num %d iip %d", + pnode->num, col, pnode->parent->num, pnode->iip); + return -EINVAL; + } + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops *lp, *lprops = &pnode->lprops[i]; + int lnum = (pnode->num << UBIFS_LPT_FANOUT_SHIFT) + i + + c->main_first; + int found, cat = lprops->flags & LPROPS_CAT_MASK; + struct ubifs_lpt_heap *heap; + struct list_head *list = NULL; + + if (lnum >= c->leb_cnt) + continue; + if (lprops->lnum != lnum) { + dbg_err("bad LEB number %d expected %d", + lprops->lnum, lnum); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + if (cat != LPROPS_UNCAT) { + dbg_err("LEB %d taken but not uncat %d", + lprops->lnum, cat); + return -EINVAL; + } + continue; + } + if (lprops->flags & LPROPS_INDEX) { + switch (cat) { + case LPROPS_UNCAT: + case LPROPS_DIRTY_IDX: + case LPROPS_FRDI_IDX: + break; + default: + dbg_err("LEB %d index but cat %d", + lprops->lnum, cat); + return -EINVAL; + } + } else { + switch (cat) { + case LPROPS_UNCAT: + case LPROPS_DIRTY: + case LPROPS_FREE: + case LPROPS_EMPTY: + case LPROPS_FREEABLE: + break; + default: + dbg_err("LEB %d not index but cat %d", + lprops->lnum, cat); + return -EINVAL; + } + } + switch (cat) { + case LPROPS_UNCAT: + list = &c->uncat_list; + break; + case LPROPS_EMPTY: + list = &c->empty_list; + break; + case LPROPS_FREEABLE: + list = &c->freeable_list; + break; + case LPROPS_FRDI_IDX: + list = &c->frdi_idx_list; + break; + } + found = 0; + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + heap = &c->lpt_heap[cat - 1]; + if (lprops->hpos < heap->cnt && + heap->arr[lprops->hpos] == lprops) + found = 1; + break; + case LPROPS_UNCAT: + case LPROPS_EMPTY: + case LPROPS_FREEABLE: + case LPROPS_FRDI_IDX: + list_for_each_entry(lp, list, list) + if (lprops == lp) { + found = 1; + break; + } + break; + } + if (!found) { + dbg_err("LEB %d cat %d not found in cat heap/list", + lprops->lnum, cat); + return -EINVAL; + } + switch (cat) { + case LPROPS_EMPTY: + if (lprops->free != c->leb_size) { + dbg_err("LEB %d cat %d free %d dirty %d", + lprops->lnum, cat, lprops->free, + lprops->dirty); + return -EINVAL; + } + case LPROPS_FREEABLE: + case LPROPS_FRDI_IDX: + if (lprops->free + lprops->dirty != c->leb_size) { + dbg_err("LEB %d cat %d free %d dirty %d", + lprops->lnum, cat, lprops->free, + lprops->dirty); + return -EINVAL; + } + } + } + return 0; +} + +/** + * dbg_check_lpt_nodes - check nnodes and pnodes. + * @c: the UBIFS file-system description object + * @cnode: next cnode (nnode or pnode) to check + * @row: row of cnode (root is zero) + * @col: column of cnode (leftmost is zero) + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, + int row, int col) +{ + struct ubifs_nnode *nnode, *nn; + struct ubifs_cnode *cn; + int num, iip = 0, err; + + if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + return 0; + + while (cnode) { + ubifs_assert(row >= 0); + nnode = cnode->parent; + if (cnode->level) { + /* cnode is a nnode */ + num = calc_nnode_num(row, col); + if (cnode->num != num) { + dbg_err("nnode num %d expected %d " + "parent num %d iip %d", cnode->num, num, + (nnode ? nnode->num : 0), cnode->iip); + return -EINVAL; + } + nn = (struct ubifs_nnode *)cnode; + while (iip < UBIFS_LPT_FANOUT) { + cn = nn->nbranch[iip].cnode; + if (cn) { + /* Go down */ + row += 1; + col <<= UBIFS_LPT_FANOUT_SHIFT; + col += iip; + iip = 0; + cnode = cn; + break; + } + /* Go right */ + iip += 1; + } + if (iip < UBIFS_LPT_FANOUT) + continue; + } else { + struct ubifs_pnode *pnode; + + /* cnode is a pnode */ + pnode = (struct ubifs_pnode *)cnode; + err = dbg_chk_pnode(c, pnode, col); + if (err) + return err; + } + /* Go up and to the right */ + row -= 1; + col >>= UBIFS_LPT_FANOUT_SHIFT; + iip = cnode->iip + 1; + cnode = (struct ubifs_cnode *)nnode; + } + return 0; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c new file mode 100644 index 000000000000..5f0b83e20af6 --- /dev/null +++ b/fs/ubifs/lpt_commit.c @@ -0,0 +1,1648 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements commit-related functionality of the LEB properties + * subsystem. + */ + +#include <linux/crc16.h> +#include "ubifs.h" + +/** + * first_dirty_cnode - find first dirty cnode. + * @c: UBIFS file-system description object + * @nnode: nnode at which to start + * + * This function returns the first dirty cnode or %NULL if there is not one. + */ +static struct ubifs_cnode *first_dirty_cnode(struct ubifs_nnode *nnode) +{ + ubifs_assert(nnode); + while (1) { + int i, cont = 0; + + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_cnode *cnode; + + cnode = nnode->nbranch[i].cnode; + if (cnode && + test_bit(DIRTY_CNODE, &cnode->flags)) { + if (cnode->level == 0) + return cnode; + nnode = (struct ubifs_nnode *)cnode; + cont = 1; + break; + } + } + if (!cont) + return (struct ubifs_cnode *)nnode; + } +} + +/** + * next_dirty_cnode - find next dirty cnode. + * @cnode: cnode from which to begin searching + * + * This function returns the next dirty cnode or %NULL if there is not one. + */ +static struct ubifs_cnode *next_dirty_cnode(struct ubifs_cnode *cnode) +{ + struct ubifs_nnode *nnode; + int i; + + ubifs_assert(cnode); + nnode = cnode->parent; + if (!nnode) + return NULL; + for (i = cnode->iip + 1; i < UBIFS_LPT_FANOUT; i++) { + cnode = nnode->nbranch[i].cnode; + if (cnode && test_bit(DIRTY_CNODE, &cnode->flags)) { + if (cnode->level == 0) + return cnode; /* cnode is a pnode */ + /* cnode is a nnode */ + return first_dirty_cnode((struct ubifs_nnode *)cnode); + } + } + return (struct ubifs_cnode *)nnode; +} + +/** + * get_cnodes_to_commit - create list of dirty cnodes to commit. + * @c: UBIFS file-system description object + * + * This function returns the number of cnodes to commit. + */ +static int get_cnodes_to_commit(struct ubifs_info *c) +{ + struct ubifs_cnode *cnode, *cnext; + int cnt = 0; + + if (!c->nroot) + return 0; + + if (!test_bit(DIRTY_CNODE, &c->nroot->flags)) + return 0; + + c->lpt_cnext = first_dirty_cnode(c->nroot); + cnode = c->lpt_cnext; + if (!cnode) + return 0; + cnt += 1; + while (1) { + ubifs_assert(!test_bit(COW_ZNODE, &cnode->flags)); + __set_bit(COW_ZNODE, &cnode->flags); + cnext = next_dirty_cnode(cnode); + if (!cnext) { + cnode->cnext = c->lpt_cnext; + break; + } + cnode->cnext = cnext; + cnode = cnext; + cnt += 1; + } + dbg_cmt("committing %d cnodes", cnt); + dbg_lp("committing %d cnodes", cnt); + ubifs_assert(cnt == c->dirty_nn_cnt + c->dirty_pn_cnt); + return cnt; +} + +/** + * upd_ltab - update LPT LEB properties. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @free: amount of free space + * @dirty: amount of dirty space to add + */ +static void upd_ltab(struct ubifs_info *c, int lnum, int free, int dirty) +{ + dbg_lp("LEB %d free %d dirty %d to %d +%d", + lnum, c->ltab[lnum - c->lpt_first].free, + c->ltab[lnum - c->lpt_first].dirty, free, dirty); + ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); + c->ltab[lnum - c->lpt_first].free = free; + c->ltab[lnum - c->lpt_first].dirty += dirty; +} + +/** + * alloc_lpt_leb - allocate an LPT LEB that is empty. + * @c: UBIFS file-system description object + * @lnum: LEB number is passed and returned here + * + * This function finds the next empty LEB in the ltab starting from @lnum. If a + * an empty LEB is found it is returned in @lnum and the function returns %0. + * Otherwise the function returns -ENOSPC. Note however, that LPT is designed + * never to run out of space. + */ +static int alloc_lpt_leb(struct ubifs_info *c, int *lnum) +{ + int i, n; + + n = *lnum - c->lpt_first + 1; + for (i = n; i < c->lpt_lebs; i++) { + if (c->ltab[i].tgc || c->ltab[i].cmt) + continue; + if (c->ltab[i].free == c->leb_size) { + c->ltab[i].cmt = 1; + *lnum = i + c->lpt_first; + return 0; + } + } + + for (i = 0; i < n; i++) { + if (c->ltab[i].tgc || c->ltab[i].cmt) + continue; + if (c->ltab[i].free == c->leb_size) { + c->ltab[i].cmt = 1; + *lnum = i + c->lpt_first; + return 0; + } + } + dbg_err("last LEB %d", *lnum); + dump_stack(); + return -ENOSPC; +} + +/** + * layout_cnodes - layout cnodes for commit. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +static int layout_cnodes(struct ubifs_info *c) +{ + int lnum, offs, len, alen, done_lsave, done_ltab, err; + struct ubifs_cnode *cnode; + + cnode = c->lpt_cnext; + if (!cnode) + return 0; + lnum = c->nhead_lnum; + offs = c->nhead_offs; + /* Try to place lsave and ltab nicely */ + done_lsave = !c->big_lpt; + done_ltab = 0; + if (!done_lsave && offs + c->lsave_sz <= c->leb_size) { + done_lsave = 1; + c->lsave_lnum = lnum; + c->lsave_offs = offs; + offs += c->lsave_sz; + } + + if (offs + c->ltab_sz <= c->leb_size) { + done_ltab = 1; + c->ltab_lnum = lnum; + c->ltab_offs = offs; + offs += c->ltab_sz; + } + + do { + if (cnode->level) { + len = c->nnode_sz; + c->dirty_nn_cnt -= 1; + } else { + len = c->pnode_sz; + c->dirty_pn_cnt -= 1; + } + while (offs + len > c->leb_size) { + alen = ALIGN(offs, c->min_io_size); + upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + err = alloc_lpt_leb(c, &lnum); + if (err) + return err; + offs = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + /* Try to place lsave and ltab nicely */ + if (!done_lsave) { + done_lsave = 1; + c->lsave_lnum = lnum; + c->lsave_offs = offs; + offs += c->lsave_sz; + continue; + } + if (!done_ltab) { + done_ltab = 1; + c->ltab_lnum = lnum; + c->ltab_offs = offs; + offs += c->ltab_sz; + continue; + } + break; + } + if (cnode->parent) { + cnode->parent->nbranch[cnode->iip].lnum = lnum; + cnode->parent->nbranch[cnode->iip].offs = offs; + } else { + c->lpt_lnum = lnum; + c->lpt_offs = offs; + } + offs += len; + cnode = cnode->cnext; + } while (cnode && cnode != c->lpt_cnext); + + /* Make sure to place LPT's save table */ + if (!done_lsave) { + if (offs + c->lsave_sz > c->leb_size) { + alen = ALIGN(offs, c->min_io_size); + upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + err = alloc_lpt_leb(c, &lnum); + if (err) + return err; + offs = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + } + done_lsave = 1; + c->lsave_lnum = lnum; + c->lsave_offs = offs; + offs += c->lsave_sz; + } + + /* Make sure to place LPT's own lprops table */ + if (!done_ltab) { + if (offs + c->ltab_sz > c->leb_size) { + alen = ALIGN(offs, c->min_io_size); + upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + err = alloc_lpt_leb(c, &lnum); + if (err) + return err; + offs = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + } + done_ltab = 1; + c->ltab_lnum = lnum; + c->ltab_offs = offs; + offs += c->ltab_sz; + } + + alen = ALIGN(offs, c->min_io_size); + upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + return 0; +} + +/** + * realloc_lpt_leb - allocate an LPT LEB that is empty. + * @c: UBIFS file-system description object + * @lnum: LEB number is passed and returned here + * + * This function duplicates exactly the results of the function alloc_lpt_leb. + * It is used during end commit to reallocate the same LEB numbers that were + * allocated by alloc_lpt_leb during start commit. + * + * This function finds the next LEB that was allocated by the alloc_lpt_leb + * function starting from @lnum. If a LEB is found it is returned in @lnum and + * the function returns %0. Otherwise the function returns -ENOSPC. + * Note however, that LPT is designed never to run out of space. + */ +static int realloc_lpt_leb(struct ubifs_info *c, int *lnum) +{ + int i, n; + + n = *lnum - c->lpt_first + 1; + for (i = n; i < c->lpt_lebs; i++) + if (c->ltab[i].cmt) { + c->ltab[i].cmt = 0; + *lnum = i + c->lpt_first; + return 0; + } + + for (i = 0; i < n; i++) + if (c->ltab[i].cmt) { + c->ltab[i].cmt = 0; + *lnum = i + c->lpt_first; + return 0; + } + dbg_err("last LEB %d", *lnum); + dump_stack(); + return -ENOSPC; +} + +/** + * write_cnodes - write cnodes for commit. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +static int write_cnodes(struct ubifs_info *c) +{ + int lnum, offs, len, from, err, wlen, alen, done_ltab, done_lsave; + struct ubifs_cnode *cnode; + void *buf = c->lpt_buf; + + cnode = c->lpt_cnext; + if (!cnode) + return 0; + lnum = c->nhead_lnum; + offs = c->nhead_offs; + from = offs; + /* Ensure empty LEB is unmapped */ + if (offs == 0) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + /* Try to place lsave and ltab nicely */ + done_lsave = !c->big_lpt; + done_ltab = 0; + if (!done_lsave && offs + c->lsave_sz <= c->leb_size) { + done_lsave = 1; + ubifs_pack_lsave(c, buf + offs, c->lsave); + offs += c->lsave_sz; + } + + if (offs + c->ltab_sz <= c->leb_size) { + done_ltab = 1; + ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); + offs += c->ltab_sz; + } + + /* Loop for each cnode */ + do { + if (cnode->level) + len = c->nnode_sz; + else + len = c->pnode_sz; + while (offs + len > c->leb_size) { + wlen = offs - from; + if (wlen) { + alen = ALIGN(wlen, c->min_io_size); + memset(buf + offs, 0xff, alen - wlen); + err = ubifs_leb_write(c, lnum, buf + from, from, + alen, UBI_SHORTTERM); + if (err) + return err; + } + err = realloc_lpt_leb(c, &lnum); + if (err) + return err; + offs = 0; + from = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + /* Try to place lsave and ltab nicely */ + if (!done_lsave) { + done_lsave = 1; + ubifs_pack_lsave(c, buf + offs, c->lsave); + offs += c->lsave_sz; + continue; + } + if (!done_ltab) { + done_ltab = 1; + ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); + offs += c->ltab_sz; + continue; + } + break; + } + if (cnode->level) + ubifs_pack_nnode(c, buf + offs, + (struct ubifs_nnode *)cnode); + else + ubifs_pack_pnode(c, buf + offs, + (struct ubifs_pnode *)cnode); + /* + * The reason for the barriers is the same as in case of TNC. + * See comment in 'write_index()'. 'dirty_cow_nnode()' and + * 'dirty_cow_pnode()' are the functions for which this is + * important. + */ + clear_bit(DIRTY_CNODE, &cnode->flags); + smp_mb__before_clear_bit(); + clear_bit(COW_ZNODE, &cnode->flags); + smp_mb__after_clear_bit(); + offs += len; + cnode = cnode->cnext; + } while (cnode && cnode != c->lpt_cnext); + + /* Make sure to place LPT's save table */ + if (!done_lsave) { + if (offs + c->lsave_sz > c->leb_size) { + wlen = offs - from; + alen = ALIGN(wlen, c->min_io_size); + memset(buf + offs, 0xff, alen - wlen); + err = ubifs_leb_write(c, lnum, buf + from, from, alen, + UBI_SHORTTERM); + if (err) + return err; + err = realloc_lpt_leb(c, &lnum); + if (err) + return err; + offs = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + done_lsave = 1; + ubifs_pack_lsave(c, buf + offs, c->lsave); + offs += c->lsave_sz; + } + + /* Make sure to place LPT's own lprops table */ + if (!done_ltab) { + if (offs + c->ltab_sz > c->leb_size) { + wlen = offs - from; + alen = ALIGN(wlen, c->min_io_size); + memset(buf + offs, 0xff, alen - wlen); + err = ubifs_leb_write(c, lnum, buf + from, from, alen, + UBI_SHORTTERM); + if (err) + return err; + err = realloc_lpt_leb(c, &lnum); + if (err) + return err; + offs = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + done_ltab = 1; + ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); + offs += c->ltab_sz; + } + + /* Write remaining data in buffer */ + wlen = offs - from; + alen = ALIGN(wlen, c->min_io_size); + memset(buf + offs, 0xff, alen - wlen); + err = ubifs_leb_write(c, lnum, buf + from, from, alen, UBI_SHORTTERM); + if (err) + return err; + c->nhead_lnum = lnum; + c->nhead_offs = ALIGN(offs, c->min_io_size); + + dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs); + dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs); + dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs); + if (c->big_lpt) + dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs); + return 0; +} + +/** + * next_pnode - find next pnode. + * @c: UBIFS file-system description object + * @pnode: pnode + * + * This function returns the next pnode or %NULL if there are no more pnodes. + */ +static struct ubifs_pnode *next_pnode(struct ubifs_info *c, + struct ubifs_pnode *pnode) +{ + struct ubifs_nnode *nnode; + int iip; + + /* Try to go right */ + nnode = pnode->parent; + iip = pnode->iip + 1; + if (iip < UBIFS_LPT_FANOUT) { + /* We assume here that LEB zero is never an LPT LEB */ + if (nnode->nbranch[iip].lnum) + return ubifs_get_pnode(c, nnode, iip); + else + return NULL; + } + + /* Go up while can't go right */ + do { + iip = nnode->iip + 1; + nnode = nnode->parent; + if (!nnode) + return NULL; + /* We assume here that LEB zero is never an LPT LEB */ + } while (iip >= UBIFS_LPT_FANOUT || !nnode->nbranch[iip].lnum); + + /* Go right */ + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return (void *)nnode; + + /* Go down to level 1 */ + while (nnode->level > 1) { + nnode = ubifs_get_nnode(c, nnode, 0); + if (IS_ERR(nnode)) + return (void *)nnode; + } + + return ubifs_get_pnode(c, nnode, 0); +} + +/** + * pnode_lookup - lookup a pnode in the LPT. + * @c: UBIFS file-system description object + * @i: pnode number (0 to main_lebs - 1) + * + * This function returns a pointer to the pnode on success or a negative + * error code on failure. + */ +static struct ubifs_pnode *pnode_lookup(struct ubifs_info *c, int i) +{ + int err, h, iip, shft; + struct ubifs_nnode *nnode; + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return ERR_PTR(err); + } + i <<= UBIFS_LPT_FANOUT_SHIFT; + nnode = c->nroot; + shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; + for (h = 1; h < c->lpt_hght; h++) { + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return ERR_PTR(PTR_ERR(nnode)); + } + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + return ubifs_get_pnode(c, nnode, iip); +} + +/** + * add_pnode_dirt - add dirty space to LPT LEB properties. + * @c: UBIFS file-system description object + * @pnode: pnode for which to add dirt + */ +static void add_pnode_dirt(struct ubifs_info *c, struct ubifs_pnode *pnode) +{ + ubifs_add_lpt_dirt(c, pnode->parent->nbranch[pnode->iip].lnum, + c->pnode_sz); +} + +/** + * do_make_pnode_dirty - mark a pnode dirty. + * @c: UBIFS file-system description object + * @pnode: pnode to mark dirty + */ +static void do_make_pnode_dirty(struct ubifs_info *c, struct ubifs_pnode *pnode) +{ + /* Assumes cnext list is empty i.e. not called during commit */ + if (!test_and_set_bit(DIRTY_CNODE, &pnode->flags)) { + struct ubifs_nnode *nnode; + + c->dirty_pn_cnt += 1; + add_pnode_dirt(c, pnode); + /* Mark parent and ancestors dirty too */ + nnode = pnode->parent; + while (nnode) { + if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + nnode = nnode->parent; + } else + break; + } + } +} + +/** + * make_tree_dirty - mark the entire LEB properties tree dirty. + * @c: UBIFS file-system description object + * + * This function is used by the "small" LPT model to cause the entire LEB + * properties tree to be written. The "small" LPT model does not use LPT + * garbage collection because it is more efficient to write the entire tree + * (because it is small). + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_tree_dirty(struct ubifs_info *c) +{ + struct ubifs_pnode *pnode; + + pnode = pnode_lookup(c, 0); + while (pnode) { + do_make_pnode_dirty(c, pnode); + pnode = next_pnode(c, pnode); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + } + return 0; +} + +/** + * need_write_all - determine if the LPT area is running out of free space. + * @c: UBIFS file-system description object + * + * This function returns %1 if the LPT area is running out of free space and %0 + * if it is not. + */ +static int need_write_all(struct ubifs_info *c) +{ + long long free = 0; + int i; + + for (i = 0; i < c->lpt_lebs; i++) { + if (i + c->lpt_first == c->nhead_lnum) + free += c->leb_size - c->nhead_offs; + else if (c->ltab[i].free == c->leb_size) + free += c->leb_size; + else if (c->ltab[i].free + c->ltab[i].dirty == c->leb_size) + free += c->leb_size; + } + /* Less than twice the size left */ + if (free <= c->lpt_sz * 2) + return 1; + return 0; +} + +/** + * lpt_tgc_start - start trivial garbage collection of LPT LEBs. + * @c: UBIFS file-system description object + * + * LPT trivial garbage collection is where a LPT LEB contains only dirty and + * free space and so may be reused as soon as the next commit is completed. + * This function is called during start commit to mark LPT LEBs for trivial GC. + */ +static void lpt_tgc_start(struct ubifs_info *c) +{ + int i; + + for (i = 0; i < c->lpt_lebs; i++) { + if (i + c->lpt_first == c->nhead_lnum) + continue; + if (c->ltab[i].dirty > 0 && + c->ltab[i].free + c->ltab[i].dirty == c->leb_size) { + c->ltab[i].tgc = 1; + c->ltab[i].free = c->leb_size; + c->ltab[i].dirty = 0; + dbg_lp("LEB %d", i + c->lpt_first); + } + } +} + +/** + * lpt_tgc_end - end trivial garbage collection of LPT LEBs. + * @c: UBIFS file-system description object + * + * LPT trivial garbage collection is where a LPT LEB contains only dirty and + * free space and so may be reused as soon as the next commit is completed. + * This function is called after the commit is completed (master node has been + * written) and unmaps LPT LEBs that were marked for trivial GC. + */ +static int lpt_tgc_end(struct ubifs_info *c) +{ + int i, err; + + for (i = 0; i < c->lpt_lebs; i++) + if (c->ltab[i].tgc) { + err = ubifs_leb_unmap(c, i + c->lpt_first); + if (err) + return err; + c->ltab[i].tgc = 0; + dbg_lp("LEB %d", i + c->lpt_first); + } + return 0; +} + +/** + * populate_lsave - fill the lsave array with important LEB numbers. + * @c: the UBIFS file-system description object + * + * This function is only called for the "big" model. It records a small number + * of LEB numbers of important LEBs. Important LEBs are ones that are (from + * most important to least important): empty, freeable, freeable index, dirty + * index, dirty or free. Upon mount, we read this list of LEB numbers and bring + * their pnodes into memory. That will stop us from having to scan the LPT + * straight away. For the "small" model we assume that scanning the LPT is no + * big deal. + */ +static void populate_lsave(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + int i, cnt = 0; + + ubifs_assert(c->big_lpt); + if (!(c->lpt_drty_flgs & LSAVE_DIRTY)) { + c->lpt_drty_flgs |= LSAVE_DIRTY; + ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz); + } + list_for_each_entry(lprops, &c->empty_list, list) { + c->lsave[cnt++] = lprops->lnum; + if (cnt >= c->lsave_cnt) + return; + } + list_for_each_entry(lprops, &c->freeable_list, list) { + c->lsave[cnt++] = lprops->lnum; + if (cnt >= c->lsave_cnt) + return; + } + list_for_each_entry(lprops, &c->frdi_idx_list, list) { + c->lsave[cnt++] = lprops->lnum; + if (cnt >= c->lsave_cnt) + return; + } + heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; + for (i = 0; i < heap->cnt; i++) { + c->lsave[cnt++] = heap->arr[i]->lnum; + if (cnt >= c->lsave_cnt) + return; + } + heap = &c->lpt_heap[LPROPS_DIRTY - 1]; + for (i = 0; i < heap->cnt; i++) { + c->lsave[cnt++] = heap->arr[i]->lnum; + if (cnt >= c->lsave_cnt) + return; + } + heap = &c->lpt_heap[LPROPS_FREE - 1]; + for (i = 0; i < heap->cnt; i++) { + c->lsave[cnt++] = heap->arr[i]->lnum; + if (cnt >= c->lsave_cnt) + return; + } + /* Fill it up completely */ + while (cnt < c->lsave_cnt) + c->lsave[cnt++] = c->main_first; +} + +/** + * nnode_lookup - lookup a nnode in the LPT. + * @c: UBIFS file-system description object + * @i: nnode number + * + * This function returns a pointer to the nnode on success or a negative + * error code on failure. + */ +static struct ubifs_nnode *nnode_lookup(struct ubifs_info *c, int i) +{ + int err, iip; + struct ubifs_nnode *nnode; + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return ERR_PTR(err); + } + nnode = c->nroot; + while (1) { + iip = i & (UBIFS_LPT_FANOUT - 1); + i >>= UBIFS_LPT_FANOUT_SHIFT; + if (!i) + break; + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return nnode; + } + return nnode; +} + +/** + * make_nnode_dirty - find a nnode and, if found, make it dirty. + * @c: UBIFS file-system description object + * @node_num: nnode number of nnode to make dirty + * @lnum: LEB number where nnode was written + * @offs: offset where nnode was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_nnode_dirty(struct ubifs_info *c, int node_num, int lnum, + int offs) +{ + struct ubifs_nnode *nnode; + + nnode = nnode_lookup(c, node_num); + if (IS_ERR(nnode)) + return PTR_ERR(nnode); + if (nnode->parent) { + struct ubifs_nbranch *branch; + + branch = &nnode->parent->nbranch[nnode->iip]; + if (branch->lnum != lnum || branch->offs != offs) + return 0; /* nnode is obsolete */ + } else if (c->lpt_lnum != lnum || c->lpt_offs != offs) + return 0; /* nnode is obsolete */ + /* Assumes cnext list is empty i.e. not called during commit */ + if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + /* Mark parent and ancestors dirty too */ + nnode = nnode->parent; + while (nnode) { + if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + nnode = nnode->parent; + } else + break; + } + } + return 0; +} + +/** + * make_pnode_dirty - find a pnode and, if found, make it dirty. + * @c: UBIFS file-system description object + * @node_num: pnode number of pnode to make dirty + * @lnum: LEB number where pnode was written + * @offs: offset where pnode was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_pnode_dirty(struct ubifs_info *c, int node_num, int lnum, + int offs) +{ + struct ubifs_pnode *pnode; + struct ubifs_nbranch *branch; + + pnode = pnode_lookup(c, node_num); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + branch = &pnode->parent->nbranch[pnode->iip]; + if (branch->lnum != lnum || branch->offs != offs) + return 0; + do_make_pnode_dirty(c, pnode); + return 0; +} + +/** + * make_ltab_dirty - make ltab node dirty. + * @c: UBIFS file-system description object + * @lnum: LEB number where ltab was written + * @offs: offset where ltab was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_ltab_dirty(struct ubifs_info *c, int lnum, int offs) +{ + if (lnum != c->ltab_lnum || offs != c->ltab_offs) + return 0; /* This ltab node is obsolete */ + if (!(c->lpt_drty_flgs & LTAB_DIRTY)) { + c->lpt_drty_flgs |= LTAB_DIRTY; + ubifs_add_lpt_dirt(c, c->ltab_lnum, c->ltab_sz); + } + return 0; +} + +/** + * make_lsave_dirty - make lsave node dirty. + * @c: UBIFS file-system description object + * @lnum: LEB number where lsave was written + * @offs: offset where lsave was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_lsave_dirty(struct ubifs_info *c, int lnum, int offs) +{ + if (lnum != c->lsave_lnum || offs != c->lsave_offs) + return 0; /* This lsave node is obsolete */ + if (!(c->lpt_drty_flgs & LSAVE_DIRTY)) { + c->lpt_drty_flgs |= LSAVE_DIRTY; + ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz); + } + return 0; +} + +/** + * make_node_dirty - make node dirty. + * @c: UBIFS file-system description object + * @node_type: LPT node type + * @node_num: node number + * @lnum: LEB number where node was written + * @offs: offset where node was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_node_dirty(struct ubifs_info *c, int node_type, int node_num, + int lnum, int offs) +{ + switch (node_type) { + case UBIFS_LPT_NNODE: + return make_nnode_dirty(c, node_num, lnum, offs); + case UBIFS_LPT_PNODE: + return make_pnode_dirty(c, node_num, lnum, offs); + case UBIFS_LPT_LTAB: + return make_ltab_dirty(c, lnum, offs); + case UBIFS_LPT_LSAVE: + return make_lsave_dirty(c, lnum, offs); + } + return -EINVAL; +} + +/** + * get_lpt_node_len - return the length of a node based on its type. + * @c: UBIFS file-system description object + * @node_type: LPT node type + */ +static int get_lpt_node_len(struct ubifs_info *c, int node_type) +{ + switch (node_type) { + case UBIFS_LPT_NNODE: + return c->nnode_sz; + case UBIFS_LPT_PNODE: + return c->pnode_sz; + case UBIFS_LPT_LTAB: + return c->ltab_sz; + case UBIFS_LPT_LSAVE: + return c->lsave_sz; + } + return 0; +} + +/** + * get_pad_len - return the length of padding in a buffer. + * @c: UBIFS file-system description object + * @buf: buffer + * @len: length of buffer + */ +static int get_pad_len(struct ubifs_info *c, uint8_t *buf, int len) +{ + int offs, pad_len; + + if (c->min_io_size == 1) + return 0; + offs = c->leb_size - len; + pad_len = ALIGN(offs, c->min_io_size) - offs; + return pad_len; +} + +/** + * get_lpt_node_type - return type (and node number) of a node in a buffer. + * @c: UBIFS file-system description object + * @buf: buffer + * @node_num: node number is returned here + */ +static int get_lpt_node_type(struct ubifs_info *c, uint8_t *buf, int *node_num) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int pos = 0, node_type; + + node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS); + *node_num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits); + return node_type; +} + +/** + * is_a_node - determine if a buffer contains a node. + * @c: UBIFS file-system description object + * @buf: buffer + * @len: length of buffer + * + * This function returns %1 if the buffer contains a node or %0 if it does not. + */ +static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int pos = 0, node_type, node_len; + uint16_t crc, calc_crc; + + node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS); + if (node_type == UBIFS_LPT_NOT_A_NODE) + return 0; + node_len = get_lpt_node_len(c, node_type); + if (!node_len || node_len > len) + return 0; + pos = 0; + addr = buf; + crc = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_CRC_BITS); + calc_crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + node_len - UBIFS_LPT_CRC_BYTES); + if (crc != calc_crc) + return 0; + return 1; +} + + +/** + * lpt_gc_lnum - garbage collect a LPT LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number to garbage collect + * + * LPT garbage collection is used only for the "big" LPT model + * (c->big_lpt == 1). Garbage collection simply involves marking all the nodes + * in the LEB being garbage-collected as dirty. The dirty nodes are written + * next commit, after which the LEB is free to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int lpt_gc_lnum(struct ubifs_info *c, int lnum) +{ + int err, len = c->leb_size, node_type, node_num, node_len, offs; + void *buf = c->lpt_buf; + + dbg_lp("LEB %d", lnum); + err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); + if (err) { + ubifs_err("cannot read LEB %d, error %d", lnum, err); + return err; + } + while (1) { + if (!is_a_node(c, buf, len)) { + int pad_len; + + pad_len = get_pad_len(c, buf, len); + if (pad_len) { + buf += pad_len; + len -= pad_len; + continue; + } + return 0; + } + node_type = get_lpt_node_type(c, buf, &node_num); + node_len = get_lpt_node_len(c, node_type); + offs = c->leb_size - len; + ubifs_assert(node_len != 0); + mutex_lock(&c->lp_mutex); + err = make_node_dirty(c, node_type, node_num, lnum, offs); + mutex_unlock(&c->lp_mutex); + if (err) + return err; + buf += node_len; + len -= node_len; + } + return 0; +} + +/** + * lpt_gc - LPT garbage collection. + * @c: UBIFS file-system description object + * + * Select a LPT LEB for LPT garbage collection and call 'lpt_gc_lnum()'. + * Returns %0 on success and a negative error code on failure. + */ +static int lpt_gc(struct ubifs_info *c) +{ + int i, lnum = -1, dirty = 0; + + mutex_lock(&c->lp_mutex); + for (i = 0; i < c->lpt_lebs; i++) { + ubifs_assert(!c->ltab[i].tgc); + if (i + c->lpt_first == c->nhead_lnum || + c->ltab[i].free + c->ltab[i].dirty == c->leb_size) + continue; + if (c->ltab[i].dirty > dirty) { + dirty = c->ltab[i].dirty; + lnum = i + c->lpt_first; + } + } + mutex_unlock(&c->lp_mutex); + if (lnum == -1) + return -ENOSPC; + return lpt_gc_lnum(c, lnum); +} + +/** + * ubifs_lpt_start_commit - UBIFS commit starts. + * @c: the UBIFS file-system description object + * + * This function has to be called when UBIFS starts the commit operation. + * This function "freezes" all currently dirty LEB properties and does not + * change them anymore. Further changes are saved and tracked separately + * because they are not part of this commit. This function returns zero in case + * of success and a negative error code in case of failure. + */ +int ubifs_lpt_start_commit(struct ubifs_info *c) +{ + int err, cnt; + + dbg_lp(""); + + mutex_lock(&c->lp_mutex); + err = dbg_check_ltab(c); + if (err) + goto out; + + if (c->check_lpt_free) { + /* + * We ensure there is enough free space in + * ubifs_lpt_post_commit() by marking nodes dirty. That + * information is lost when we unmount, so we also need + * to check free space once after mounting also. + */ + c->check_lpt_free = 0; + while (need_write_all(c)) { + mutex_unlock(&c->lp_mutex); + err = lpt_gc(c); + if (err) + return err; + mutex_lock(&c->lp_mutex); + } + } + + lpt_tgc_start(c); + + if (!c->dirty_pn_cnt) { + dbg_cmt("no cnodes to commit"); + err = 0; + goto out; + } + + if (!c->big_lpt && need_write_all(c)) { + /* If needed, write everything */ + err = make_tree_dirty(c); + if (err) + goto out; + lpt_tgc_start(c); + } + + if (c->big_lpt) + populate_lsave(c); + + cnt = get_cnodes_to_commit(c); + ubifs_assert(cnt != 0); + + err = layout_cnodes(c); + if (err) + goto out; + + /* Copy the LPT's own lprops for end commit to write */ + memcpy(c->ltab_cmt, c->ltab, + sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); + c->lpt_drty_flgs &= ~(LTAB_DIRTY | LSAVE_DIRTY); + +out: + mutex_unlock(&c->lp_mutex); + return err; +} + +/** + * free_obsolete_cnodes - free obsolete cnodes for commit end. + * @c: UBIFS file-system description object + */ +static void free_obsolete_cnodes(struct ubifs_info *c) +{ + struct ubifs_cnode *cnode, *cnext; + + cnext = c->lpt_cnext; + if (!cnext) + return; + do { + cnode = cnext; + cnext = cnode->cnext; + if (test_bit(OBSOLETE_CNODE, &cnode->flags)) + kfree(cnode); + else + cnode->cnext = NULL; + } while (cnext != c->lpt_cnext); + c->lpt_cnext = NULL; +} + +/** + * ubifs_lpt_end_commit - finish the commit operation. + * @c: the UBIFS file-system description object + * + * This function has to be called when the commit operation finishes. It + * flushes the changes which were "frozen" by 'ubifs_lprops_start_commit()' to + * the media. Returns zero in case of success and a negative error code in case + * of failure. + */ +int ubifs_lpt_end_commit(struct ubifs_info *c) +{ + int err; + + dbg_lp(""); + + if (!c->lpt_cnext) + return 0; + + err = write_cnodes(c); + if (err) + return err; + + mutex_lock(&c->lp_mutex); + free_obsolete_cnodes(c); + mutex_unlock(&c->lp_mutex); + + return 0; +} + +/** + * ubifs_lpt_post_commit - post commit LPT trivial GC and LPT GC. + * @c: UBIFS file-system description object + * + * LPT trivial GC is completed after a commit. Also LPT GC is done after a + * commit for the "big" LPT model. + */ +int ubifs_lpt_post_commit(struct ubifs_info *c) +{ + int err; + + mutex_lock(&c->lp_mutex); + err = lpt_tgc_end(c); + if (err) + goto out; + if (c->big_lpt) + while (need_write_all(c)) { + mutex_unlock(&c->lp_mutex); + err = lpt_gc(c); + if (err) + return err; + mutex_lock(&c->lp_mutex); + } +out: + mutex_unlock(&c->lp_mutex); + return err; +} + +/** + * first_nnode - find the first nnode in memory. + * @c: UBIFS file-system description object + * @hght: height of tree where nnode found is returned here + * + * This function returns a pointer to the nnode found or %NULL if no nnode is + * found. This function is a helper to 'ubifs_lpt_free()'. + */ +static struct ubifs_nnode *first_nnode(struct ubifs_info *c, int *hght) +{ + struct ubifs_nnode *nnode; + int h, i, found; + + nnode = c->nroot; + *hght = 0; + if (!nnode) + return NULL; + for (h = 1; h < c->lpt_hght; h++) { + found = 0; + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + if (nnode->nbranch[i].nnode) { + found = 1; + nnode = nnode->nbranch[i].nnode; + *hght = h; + break; + } + } + if (!found) + break; + } + return nnode; +} + +/** + * next_nnode - find the next nnode in memory. + * @c: UBIFS file-system description object + * @nnode: nnode from which to start. + * @hght: height of tree where nnode is, is passed and returned here + * + * This function returns a pointer to the nnode found or %NULL if no nnode is + * found. This function is a helper to 'ubifs_lpt_free()'. + */ +static struct ubifs_nnode *next_nnode(struct ubifs_info *c, + struct ubifs_nnode *nnode, int *hght) +{ + struct ubifs_nnode *parent; + int iip, h, i, found; + + parent = nnode->parent; + if (!parent) + return NULL; + if (nnode->iip == UBIFS_LPT_FANOUT - 1) { + *hght -= 1; + return parent; + } + for (iip = nnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) { + nnode = parent->nbranch[iip].nnode; + if (nnode) + break; + } + if (!nnode) { + *hght -= 1; + return parent; + } + for (h = *hght + 1; h < c->lpt_hght; h++) { + found = 0; + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + if (nnode->nbranch[i].nnode) { + found = 1; + nnode = nnode->nbranch[i].nnode; + *hght = h; + break; + } + } + if (!found) + break; + } + return nnode; +} + +/** + * ubifs_lpt_free - free resources owned by the LPT. + * @c: UBIFS file-system description object + * @wr_only: free only resources used for writing + */ +void ubifs_lpt_free(struct ubifs_info *c, int wr_only) +{ + struct ubifs_nnode *nnode; + int i, hght; + + /* Free write-only things first */ + + free_obsolete_cnodes(c); /* Leftover from a failed commit */ + + vfree(c->ltab_cmt); + c->ltab_cmt = NULL; + vfree(c->lpt_buf); + c->lpt_buf = NULL; + kfree(c->lsave); + c->lsave = NULL; + + if (wr_only) + return; + + /* Now free the rest */ + + nnode = first_nnode(c, &hght); + while (nnode) { + for (i = 0; i < UBIFS_LPT_FANOUT; i++) + kfree(nnode->nbranch[i].nnode); + nnode = next_nnode(c, nnode, &hght); + } + for (i = 0; i < LPROPS_HEAP_CNT; i++) + kfree(c->lpt_heap[i].arr); + kfree(c->dirty_idx.arr); + kfree(c->nroot); + vfree(c->ltab); + kfree(c->lpt_nod_buf); +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * dbg_is_all_ff - determine if a buffer contains only 0xff bytes. + * @buf: buffer + * @len: buffer length + */ +static int dbg_is_all_ff(uint8_t *buf, int len) +{ + int i; + + for (i = 0; i < len; i++) + if (buf[i] != 0xff) + return 0; + return 1; +} + +/** + * dbg_is_nnode_dirty - determine if a nnode is dirty. + * @c: the UBIFS file-system description object + * @lnum: LEB number where nnode was written + * @offs: offset where nnode was written + */ +static int dbg_is_nnode_dirty(struct ubifs_info *c, int lnum, int offs) +{ + struct ubifs_nnode *nnode; + int hght; + + /* Entire tree is in memory so first_nnode / next_nnode are ok */ + nnode = first_nnode(c, &hght); + for (; nnode; nnode = next_nnode(c, nnode, &hght)) { + struct ubifs_nbranch *branch; + + cond_resched(); + if (nnode->parent) { + branch = &nnode->parent->nbranch[nnode->iip]; + if (branch->lnum != lnum || branch->offs != offs) + continue; + if (test_bit(DIRTY_CNODE, &nnode->flags)) + return 1; + return 0; + } else { + if (c->lpt_lnum != lnum || c->lpt_offs != offs) + continue; + if (test_bit(DIRTY_CNODE, &nnode->flags)) + return 1; + return 0; + } + } + return 1; +} + +/** + * dbg_is_pnode_dirty - determine if a pnode is dirty. + * @c: the UBIFS file-system description object + * @lnum: LEB number where pnode was written + * @offs: offset where pnode was written + */ +static int dbg_is_pnode_dirty(struct ubifs_info *c, int lnum, int offs) +{ + int i, cnt; + + cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT); + for (i = 0; i < cnt; i++) { + struct ubifs_pnode *pnode; + struct ubifs_nbranch *branch; + + cond_resched(); + pnode = pnode_lookup(c, i); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + branch = &pnode->parent->nbranch[pnode->iip]; + if (branch->lnum != lnum || branch->offs != offs) + continue; + if (test_bit(DIRTY_CNODE, &pnode->flags)) + return 1; + return 0; + } + return 1; +} + +/** + * dbg_is_ltab_dirty - determine if a ltab node is dirty. + * @c: the UBIFS file-system description object + * @lnum: LEB number where ltab node was written + * @offs: offset where ltab node was written + */ +static int dbg_is_ltab_dirty(struct ubifs_info *c, int lnum, int offs) +{ + if (lnum != c->ltab_lnum || offs != c->ltab_offs) + return 1; + return (c->lpt_drty_flgs & LTAB_DIRTY) != 0; +} + +/** + * dbg_is_lsave_dirty - determine if a lsave node is dirty. + * @c: the UBIFS file-system description object + * @lnum: LEB number where lsave node was written + * @offs: offset where lsave node was written + */ +static int dbg_is_lsave_dirty(struct ubifs_info *c, int lnum, int offs) +{ + if (lnum != c->lsave_lnum || offs != c->lsave_offs) + return 1; + return (c->lpt_drty_flgs & LSAVE_DIRTY) != 0; +} + +/** + * dbg_is_node_dirty - determine if a node is dirty. + * @c: the UBIFS file-system description object + * @node_type: node type + * @lnum: LEB number where node was written + * @offs: offset where node was written + */ +static int dbg_is_node_dirty(struct ubifs_info *c, int node_type, int lnum, + int offs) +{ + switch (node_type) { + case UBIFS_LPT_NNODE: + return dbg_is_nnode_dirty(c, lnum, offs); + case UBIFS_LPT_PNODE: + return dbg_is_pnode_dirty(c, lnum, offs); + case UBIFS_LPT_LTAB: + return dbg_is_ltab_dirty(c, lnum, offs); + case UBIFS_LPT_LSAVE: + return dbg_is_lsave_dirty(c, lnum, offs); + } + return 1; +} + +/** + * dbg_check_ltab_lnum - check the ltab for a LPT LEB number. + * @c: the UBIFS file-system description object + * @lnum: LEB number where node was written + * @offs: offset where node was written + * + * This function returns %0 on success and a negative error code on failure. + */ +static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) +{ + int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len; + int ret; + void *buf = c->dbg_buf; + + dbg_lp("LEB %d", lnum); + err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); + if (err) { + dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err); + return err; + } + while (1) { + if (!is_a_node(c, buf, len)) { + int i, pad_len; + + pad_len = get_pad_len(c, buf, len); + if (pad_len) { + buf += pad_len; + len -= pad_len; + dirty += pad_len; + continue; + } + if (!dbg_is_all_ff(buf, len)) { + dbg_msg("invalid empty space in LEB %d at %d", + lnum, c->leb_size - len); + err = -EINVAL; + } + i = lnum - c->lpt_first; + if (len != c->ltab[i].free) { + dbg_msg("invalid free space in LEB %d " + "(free %d, expected %d)", + lnum, len, c->ltab[i].free); + err = -EINVAL; + } + if (dirty != c->ltab[i].dirty) { + dbg_msg("invalid dirty space in LEB %d " + "(dirty %d, expected %d)", + lnum, dirty, c->ltab[i].dirty); + err = -EINVAL; + } + return err; + } + node_type = get_lpt_node_type(c, buf, &node_num); + node_len = get_lpt_node_len(c, node_type); + ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len); + if (ret == 1) + dirty += node_len; + buf += node_len; + len -= node_len; + } +} + +/** + * dbg_check_ltab - check the free and dirty space in the ltab. + * @c: the UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_check_ltab(struct ubifs_info *c) +{ + int lnum, err, i, cnt; + + if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + return 0; + + /* Bring the entire tree into memory */ + cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT); + for (i = 0; i < cnt; i++) { + struct ubifs_pnode *pnode; + + pnode = pnode_lookup(c, i); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + cond_resched(); + } + + /* Check nodes */ + err = dbg_check_lpt_nodes(c, (struct ubifs_cnode *)c->nroot, 0, 0); + if (err) + return err; + + /* Check each LEB */ + for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) { + err = dbg_check_ltab_lnum(c, lnum); + if (err) { + dbg_err("failed at LEB %d", lnum); + return err; + } + } + + dbg_lp("succeeded"); + return 0; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c new file mode 100644 index 000000000000..71d5493bf565 --- /dev/null +++ b/fs/ubifs/master.c @@ -0,0 +1,387 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* This file implements reading and writing the master node */ + +#include "ubifs.h" + +/** + * scan_for_master - search the valid master node. + * @c: UBIFS file-system description object + * + * This function scans the master node LEBs and search for the latest master + * node. Returns zero in case of success and a negative error code in case of + * failure. + */ +static int scan_for_master(struct ubifs_info *c) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + int lnum, offs = 0, nodes_cnt; + + lnum = UBIFS_MST_LNUM; + + sleb = ubifs_scan(c, lnum, 0, c->sbuf); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + nodes_cnt = sleb->nodes_cnt; + if (nodes_cnt > 0) { + snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, + list); + if (snod->type != UBIFS_MST_NODE) + goto out; + memcpy(c->mst_node, snod->node, snod->len); + offs = snod->offs; + } + ubifs_scan_destroy(sleb); + + lnum += 1; + + sleb = ubifs_scan(c, lnum, 0, c->sbuf); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + if (sleb->nodes_cnt != nodes_cnt) + goto out; + if (!sleb->nodes_cnt) + goto out; + snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list); + if (snod->type != UBIFS_MST_NODE) + goto out; + if (snod->offs != offs) + goto out; + if (memcmp((void *)c->mst_node + UBIFS_CH_SZ, + (void *)snod->node + UBIFS_CH_SZ, + UBIFS_MST_NODE_SZ - UBIFS_CH_SZ)) + goto out; + c->mst_offs = offs; + ubifs_scan_destroy(sleb); + return 0; + +out: + ubifs_scan_destroy(sleb); + return -EINVAL; +} + +/** + * validate_master - validate master node. + * @c: UBIFS file-system description object + * + * This function validates data which was read from master node. Returns zero + * if the data is all right and %-EINVAL if not. + */ +static int validate_master(const struct ubifs_info *c) +{ + long long main_sz; + int err; + + if (c->max_sqnum >= SQNUM_WATERMARK) { + err = 1; + goto out; + } + + if (c->cmt_no >= c->max_sqnum) { + err = 2; + goto out; + } + + if (c->highest_inum >= INUM_WATERMARK) { + err = 3; + goto out; + } + + if (c->lhead_lnum < UBIFS_LOG_LNUM || + c->lhead_lnum >= UBIFS_LOG_LNUM + c->log_lebs || + c->lhead_offs < 0 || c->lhead_offs >= c->leb_size || + c->lhead_offs & (c->min_io_size - 1)) { + err = 4; + goto out; + } + + if (c->zroot.lnum >= c->leb_cnt || c->zroot.lnum < c->main_first || + c->zroot.offs >= c->leb_size || c->zroot.offs & 7) { + err = 5; + goto out; + } + + if (c->zroot.len < c->ranges[UBIFS_IDX_NODE].min_len || + c->zroot.len > c->ranges[UBIFS_IDX_NODE].max_len) { + err = 6; + goto out; + } + + if (c->gc_lnum >= c->leb_cnt || c->gc_lnum < c->main_first) { + err = 7; + goto out; + } + + if (c->ihead_lnum >= c->leb_cnt || c->ihead_lnum < c->main_first || + c->ihead_offs % c->min_io_size || c->ihead_offs < 0 || + c->ihead_offs > c->leb_size || c->ihead_offs & 7) { + err = 8; + goto out; + } + + main_sz = (long long)c->main_lebs * c->leb_size; + if (c->old_idx_sz & 7 || c->old_idx_sz >= main_sz) { + err = 9; + goto out; + } + + if (c->lpt_lnum < c->lpt_first || c->lpt_lnum > c->lpt_last || + c->lpt_offs < 0 || c->lpt_offs + c->nnode_sz > c->leb_size) { + err = 10; + goto out; + } + + if (c->nhead_lnum < c->lpt_first || c->nhead_lnum > c->lpt_last || + c->nhead_offs < 0 || c->nhead_offs % c->min_io_size || + c->nhead_offs > c->leb_size) { + err = 11; + goto out; + } + + if (c->ltab_lnum < c->lpt_first || c->ltab_lnum > c->lpt_last || + c->ltab_offs < 0 || + c->ltab_offs + c->ltab_sz > c->leb_size) { + err = 12; + goto out; + } + + if (c->big_lpt && (c->lsave_lnum < c->lpt_first || + c->lsave_lnum > c->lpt_last || c->lsave_offs < 0 || + c->lsave_offs + c->lsave_sz > c->leb_size)) { + err = 13; + goto out; + } + + if (c->lscan_lnum < c->main_first || c->lscan_lnum >= c->leb_cnt) { + err = 14; + goto out; + } + + if (c->lst.empty_lebs < 0 || c->lst.empty_lebs > c->main_lebs - 2) { + err = 15; + goto out; + } + + if (c->lst.idx_lebs < 0 || c->lst.idx_lebs > c->main_lebs - 1) { + err = 16; + goto out; + } + + if (c->lst.total_free < 0 || c->lst.total_free > main_sz || + c->lst.total_free & 7) { + err = 17; + goto out; + } + + if (c->lst.total_dirty < 0 || (c->lst.total_dirty & 7)) { + err = 18; + goto out; + } + + if (c->lst.total_used < 0 || (c->lst.total_used & 7)) { + err = 19; + goto out; + } + + if (c->lst.total_free + c->lst.total_dirty + + c->lst.total_used > main_sz) { + err = 20; + goto out; + } + + if (c->lst.total_dead + c->lst.total_dark + + c->lst.total_used + c->old_idx_sz > main_sz) { + err = 21; + goto out; + } + + if (c->lst.total_dead < 0 || + c->lst.total_dead > c->lst.total_free + c->lst.total_dirty || + c->lst.total_dead & 7) { + err = 22; + goto out; + } + + if (c->lst.total_dark < 0 || + c->lst.total_dark > c->lst.total_free + c->lst.total_dirty || + c->lst.total_dark & 7) { + err = 23; + goto out; + } + + return 0; + +out: + ubifs_err("bad master node at offset %d error %d", c->mst_offs, err); + dbg_dump_node(c, c->mst_node); + return -EINVAL; +} + +/** + * ubifs_read_master - read master node. + * @c: UBIFS file-system description object + * + * This function finds and reads the master node during file-system mount. If + * the flash is empty, it creates default master node as well. Returns zero in + * case of success and a negative error code in case of failure. + */ +int ubifs_read_master(struct ubifs_info *c) +{ + int err, old_leb_cnt; + + c->mst_node = kzalloc(c->mst_node_alsz, GFP_KERNEL); + if (!c->mst_node) + return -ENOMEM; + + err = scan_for_master(c); + if (err) { + err = ubifs_recover_master_node(c); + if (err) + /* + * Note, we do not free 'c->mst_node' here because the + * unmount routine will take care of this. + */ + return err; + } + + /* Make sure that the recovery flag is clear */ + c->mst_node->flags &= cpu_to_le32(~UBIFS_MST_RCVRY); + + c->max_sqnum = le64_to_cpu(c->mst_node->ch.sqnum); + c->highest_inum = le64_to_cpu(c->mst_node->highest_inum); + c->cmt_no = le64_to_cpu(c->mst_node->cmt_no); + c->zroot.lnum = le32_to_cpu(c->mst_node->root_lnum); + c->zroot.offs = le32_to_cpu(c->mst_node->root_offs); + c->zroot.len = le32_to_cpu(c->mst_node->root_len); + c->lhead_lnum = le32_to_cpu(c->mst_node->log_lnum); + c->gc_lnum = le32_to_cpu(c->mst_node->gc_lnum); + c->ihead_lnum = le32_to_cpu(c->mst_node->ihead_lnum); + c->ihead_offs = le32_to_cpu(c->mst_node->ihead_offs); + c->old_idx_sz = le64_to_cpu(c->mst_node->index_size); + c->lpt_lnum = le32_to_cpu(c->mst_node->lpt_lnum); + c->lpt_offs = le32_to_cpu(c->mst_node->lpt_offs); + c->nhead_lnum = le32_to_cpu(c->mst_node->nhead_lnum); + c->nhead_offs = le32_to_cpu(c->mst_node->nhead_offs); + c->ltab_lnum = le32_to_cpu(c->mst_node->ltab_lnum); + c->ltab_offs = le32_to_cpu(c->mst_node->ltab_offs); + c->lsave_lnum = le32_to_cpu(c->mst_node->lsave_lnum); + c->lsave_offs = le32_to_cpu(c->mst_node->lsave_offs); + c->lscan_lnum = le32_to_cpu(c->mst_node->lscan_lnum); + c->lst.empty_lebs = le32_to_cpu(c->mst_node->empty_lebs); + c->lst.idx_lebs = le32_to_cpu(c->mst_node->idx_lebs); + old_leb_cnt = le32_to_cpu(c->mst_node->leb_cnt); + c->lst.total_free = le64_to_cpu(c->mst_node->total_free); + c->lst.total_dirty = le64_to_cpu(c->mst_node->total_dirty); + c->lst.total_used = le64_to_cpu(c->mst_node->total_used); + c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead); + c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark); + + c->calc_idx_sz = c->old_idx_sz; + + if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS)) + c->no_orphs = 1; + + if (old_leb_cnt != c->leb_cnt) { + /* The file system has been resized */ + int growth = c->leb_cnt - old_leb_cnt; + + if (c->leb_cnt < old_leb_cnt || + c->leb_cnt < UBIFS_MIN_LEB_CNT) { + ubifs_err("bad leb_cnt on master node"); + dbg_dump_node(c, c->mst_node); + return -EINVAL; + } + + dbg_mnt("Auto resizing (master) from %d LEBs to %d LEBs", + old_leb_cnt, c->leb_cnt); + c->lst.empty_lebs += growth; + c->lst.total_free += growth * (long long)c->leb_size; + c->lst.total_dark += growth * (long long)c->dark_wm; + + /* + * Reflect changes back onto the master node. N.B. the master + * node gets written immediately whenever mounting (or + * remounting) in read-write mode, so we do not need to write it + * here. + */ + c->mst_node->leb_cnt = cpu_to_le32(c->leb_cnt); + c->mst_node->empty_lebs = cpu_to_le32(c->lst.empty_lebs); + c->mst_node->total_free = cpu_to_le64(c->lst.total_free); + c->mst_node->total_dark = cpu_to_le64(c->lst.total_dark); + } + + err = validate_master(c); + if (err) + return err; + + err = dbg_old_index_check_init(c, &c->zroot); + + return err; +} + +/** + * ubifs_write_master - write master node. + * @c: UBIFS file-system description object + * + * This function writes the master node. The caller has to take the + * @c->mst_mutex lock before calling this function. Returns zero in case of + * success and a negative error code in case of failure. The master node is + * written twice to enable recovery. + */ +int ubifs_write_master(struct ubifs_info *c) +{ + int err, lnum, offs, len; + + if (c->ro_media) + return -EINVAL; + + lnum = UBIFS_MST_LNUM; + offs = c->mst_offs + c->mst_node_alsz; + len = UBIFS_MST_NODE_SZ; + + if (offs + UBIFS_MST_NODE_SZ > c->leb_size) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + offs = 0; + } + + c->mst_offs = offs; + c->mst_node->highest_inum = cpu_to_le64(c->highest_inum); + + err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM); + if (err) + return err; + + lnum += 1; + + if (offs == 0) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM); + + return err; +} diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h new file mode 100644 index 000000000000..4c12a9215d7f --- /dev/null +++ b/fs/ubifs/misc.h @@ -0,0 +1,313 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file contains miscellaneous helper functions. + */ + +#ifndef __UBIFS_MISC_H__ +#define __UBIFS_MISC_H__ + +/** + * ubifs_zn_dirty - check if znode is dirty. + * @znode: znode to check + * + * This helper function returns %1 if @znode is dirty and %0 otherwise. + */ +static inline int ubifs_zn_dirty(const struct ubifs_znode *znode) +{ + return !!test_bit(DIRTY_ZNODE, &znode->flags); +} + +/** + * ubifs_wake_up_bgt - wake up background thread. + * @c: UBIFS file-system description object + */ +static inline void ubifs_wake_up_bgt(struct ubifs_info *c) +{ + if (c->bgt && !c->need_bgt) { + c->need_bgt = 1; + wake_up_process(c->bgt); + } +} + +/** + * ubifs_tnc_find_child - find next child in znode. + * @znode: znode to search at + * @start: the zbranch index to start at + * + * This helper function looks for znode child starting at index @start. Returns + * the child or %NULL if no children were found. + */ +static inline struct ubifs_znode * +ubifs_tnc_find_child(struct ubifs_znode *znode, int start) +{ + while (start < znode->child_cnt) { + if (znode->zbranch[start].znode) + return znode->zbranch[start].znode; + start += 1; + } + + return NULL; +} + +/** + * ubifs_inode - get UBIFS inode information by VFS 'struct inode' object. + * @inode: the VFS 'struct inode' pointer + */ +static inline struct ubifs_inode *ubifs_inode(const struct inode *inode) +{ + return container_of(inode, struct ubifs_inode, vfs_inode); +} + +/** + * ubifs_compr_present - check if compressor was compiled in. + * @compr_type: compressor type to check + * + * This function returns %1 of compressor of type @compr_type is present, and + * %0 if not. + */ +static inline int ubifs_compr_present(int compr_type) +{ + ubifs_assert(compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT); + return !!ubifs_compressors[compr_type]->capi_name; +} + +/** + * ubifs_compr_name - get compressor name string by its type. + * @compr_type: compressor type + * + * This function returns compressor type string. + */ +static inline const char *ubifs_compr_name(int compr_type) +{ + ubifs_assert(compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT); + return ubifs_compressors[compr_type]->name; +} + +/** + * ubifs_wbuf_sync - synchronize write-buffer. + * @wbuf: write-buffer to synchronize + * + * This is the same as as 'ubifs_wbuf_sync_nolock()' but it does not assume + * that the write-buffer is already locked. + */ +static inline int ubifs_wbuf_sync(struct ubifs_wbuf *wbuf) +{ + int err; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + err = ubifs_wbuf_sync_nolock(wbuf); + mutex_unlock(&wbuf->io_mutex); + return err; +} + +/** + * ubifs_leb_unmap - unmap an LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number to unmap + * + * This function returns %0 on success and a negative error code on failure. + */ +static inline int ubifs_leb_unmap(const struct ubifs_info *c, int lnum) +{ + int err; + + if (c->ro_media) + return -EROFS; + err = ubi_leb_unmap(c->ubi, lnum); + if (err) { + ubifs_err("unmap LEB %d failed, error %d", lnum, err); + return err; + } + + return 0; +} + +/** + * ubifs_leb_write - write to a LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number to write + * @buf: buffer to write from + * @offs: offset within LEB to write to + * @len: length to write + * @dtype: data type + * + * This function returns %0 on success and a negative error code on failure. + */ +static inline int ubifs_leb_write(const struct ubifs_info *c, int lnum, + const void *buf, int offs, int len, int dtype) +{ + int err; + + if (c->ro_media) + return -EROFS; + err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype); + if (err) { + ubifs_err("writing %d bytes at %d:%d, error %d", + len, lnum, offs, err); + return err; + } + + return 0; +} + +/** + * ubifs_leb_change - atomic LEB change. + * @c: UBIFS file-system description object + * @lnum: LEB number to write + * @buf: buffer to write from + * @len: length to write + * @dtype: data type + * + * This function returns %0 on success and a negative error code on failure. + */ +static inline int ubifs_leb_change(const struct ubifs_info *c, int lnum, + const void *buf, int len, int dtype) +{ + int err; + + if (c->ro_media) + return -EROFS; + err = ubi_leb_change(c->ubi, lnum, buf, len, dtype); + if (err) { + ubifs_err("changing %d bytes in LEB %d, error %d", + len, lnum, err); + return err; + } + + return 0; +} + +/** + * ubifs_encode_dev - encode device node IDs. + * @dev: UBIFS device node information + * @rdev: device IDs to encode + * + * This is a helper function which encodes major/minor numbers of a device node + * into UBIFS device node description. We use standard Linux "new" and "huge" + * encodings. + */ +static inline int ubifs_encode_dev(union ubifs_dev_desc *dev, dev_t rdev) +{ + if (new_valid_dev(rdev)) { + dev->new = cpu_to_le32(new_encode_dev(rdev)); + return sizeof(dev->new); + } else { + dev->huge = cpu_to_le64(huge_encode_dev(rdev)); + return sizeof(dev->huge); + } +} + +/** + * ubifs_add_dirt - add dirty space to LEB properties. + * @c: the UBIFS file-system description object + * @lnum: LEB to add dirty space for + * @dirty: dirty space to add + * + * This is a helper function which increased amount of dirty LEB space. Returns + * zero in case of success and a negative error code in case of failure. + */ +static inline int ubifs_add_dirt(struct ubifs_info *c, int lnum, int dirty) +{ + return ubifs_update_one_lp(c, lnum, LPROPS_NC, dirty, 0, 0); +} + +/** + * ubifs_return_leb - return LEB to lprops. + * @c: the UBIFS file-system description object + * @lnum: LEB to return + * + * This helper function cleans the "taken" flag of a logical eraseblock in the + * lprops. Returns zero in case of success and a negative error code in case of + * failure. + */ +static inline int ubifs_return_leb(struct ubifs_info *c, int lnum) +{ + return ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, + LPROPS_TAKEN, 0); +} + +/** + * ubifs_idx_node_sz - return index node size. + * @c: the UBIFS file-system description object + * @child_cnt: number of children of this index node + */ +static inline int ubifs_idx_node_sz(const struct ubifs_info *c, int child_cnt) +{ + return UBIFS_IDX_NODE_SZ + (UBIFS_BRANCH_SZ + c->key_len) * child_cnt; +} + +/** + * ubifs_idx_branch - return pointer to an index branch. + * @c: the UBIFS file-system description object + * @idx: index node + * @bnum: branch number + */ +static inline +struct ubifs_branch *ubifs_idx_branch(const struct ubifs_info *c, + const struct ubifs_idx_node *idx, + int bnum) +{ + return (struct ubifs_branch *)((void *)idx->branches + + (UBIFS_BRANCH_SZ + c->key_len) * bnum); +} + +/** + * ubifs_idx_key - return pointer to an index key. + * @c: the UBIFS file-system description object + * @idx: index node + */ +static inline void *ubifs_idx_key(const struct ubifs_info *c, + const struct ubifs_idx_node *idx) +{ + return (void *)((struct ubifs_branch *)idx->branches)->key; +} + +/** + * ubifs_current_time - round current time to time granularity. + * @inode: inode + */ +static inline struct timespec ubifs_current_time(struct inode *inode) +{ + return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? + current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; +} + +/** + * ubifs_tnc_lookup - look up a file-system node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * + * This function look up and reads node with key @key. The caller has to make + * sure the @node buffer is large enough to fit the node. Returns zero in case + * of success, %-ENOENT if the node was not found, and a negative error code in + * case of failure. + */ +static inline int ubifs_tnc_lookup(struct ubifs_info *c, + const union ubifs_key *key, void *node) +{ + return ubifs_tnc_locate(c, key, node, NULL, NULL); +} + +#endif /* __UBIFS_MISC_H__ */ diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c new file mode 100644 index 000000000000..02d3462f4d3e --- /dev/null +++ b/fs/ubifs/orphan.c @@ -0,0 +1,958 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Author: Adrian Hunter + */ + +#include "ubifs.h" + +/* + * An orphan is an inode number whose inode node has been committed to the index + * with a link count of zero. That happens when an open file is deleted + * (unlinked) and then a commit is run. In the normal course of events the inode + * would be deleted when the file is closed. However in the case of an unclean + * unmount, orphans need to be accounted for. After an unclean unmount, the + * orphans' inodes must be deleted which means either scanning the entire index + * looking for them, or keeping a list on flash somewhere. This unit implements + * the latter approach. + * + * The orphan area is a fixed number of LEBs situated between the LPT area and + * the main area. The number of orphan area LEBs is specified when the file + * system is created. The minimum number is 1. The size of the orphan area + * should be so that it can hold the maximum number of orphans that are expected + * to ever exist at one time. + * + * The number of orphans that can fit in a LEB is: + * + * (c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64) + * + * For example: a 15872 byte LEB can fit 1980 orphans so 1 LEB may be enough. + * + * Orphans are accumulated in a rb-tree. When an inode's link count drops to + * zero, the inode number is added to the rb-tree. It is removed from the tree + * when the inode is deleted. Any new orphans that are in the orphan tree when + * the commit is run, are written to the orphan area in 1 or more orph nodes. + * If the orphan area is full, it is consolidated to make space. There is + * always enough space because validation prevents the user from creating more + * than the maximum number of orphans allowed. + */ + +#ifdef CONFIG_UBIFS_FS_DEBUG +static int dbg_check_orphans(struct ubifs_info *c); +#else +#define dbg_check_orphans(c) 0 +#endif + +/** + * ubifs_add_orphan - add an orphan. + * @c: UBIFS file-system description object + * @inum: orphan inode number + * + * Add an orphan. This function is called when an inodes link count drops to + * zero. + */ +int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) +{ + struct ubifs_orphan *orphan, *o; + struct rb_node **p, *parent = NULL; + + orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_NOFS); + if (!orphan) + return -ENOMEM; + orphan->inum = inum; + orphan->new = 1; + + spin_lock(&c->orphan_lock); + if (c->tot_orphans >= c->max_orphans) { + spin_unlock(&c->orphan_lock); + kfree(orphan); + return -ENFILE; + } + p = &c->orph_tree.rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct ubifs_orphan, rb); + if (inum < o->inum) + p = &(*p)->rb_left; + else if (inum > o->inum) + p = &(*p)->rb_right; + else { + dbg_err("orphaned twice"); + spin_unlock(&c->orphan_lock); + kfree(orphan); + return 0; + } + } + c->tot_orphans += 1; + c->new_orphans += 1; + rb_link_node(&orphan->rb, parent, p); + rb_insert_color(&orphan->rb, &c->orph_tree); + list_add_tail(&orphan->list, &c->orph_list); + list_add_tail(&orphan->new_list, &c->orph_new); + spin_unlock(&c->orphan_lock); + dbg_gen("ino %lu", inum); + return 0; +} + +/** + * ubifs_delete_orphan - delete an orphan. + * @c: UBIFS file-system description object + * @inum: orphan inode number + * + * Delete an orphan. This function is called when an inode is deleted. + */ +void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum) +{ + struct ubifs_orphan *o; + struct rb_node *p; + + spin_lock(&c->orphan_lock); + p = c->orph_tree.rb_node; + while (p) { + o = rb_entry(p, struct ubifs_orphan, rb); + if (inum < o->inum) + p = p->rb_left; + else if (inum > o->inum) + p = p->rb_right; + else { + if (o->dnext) { + spin_unlock(&c->orphan_lock); + dbg_gen("deleted twice ino %lu", inum); + return; + } + if (o->cnext) { + o->dnext = c->orph_dnext; + c->orph_dnext = o; + spin_unlock(&c->orphan_lock); + dbg_gen("delete later ino %lu", inum); + return; + } + rb_erase(p, &c->orph_tree); + list_del(&o->list); + c->tot_orphans -= 1; + if (o->new) { + list_del(&o->new_list); + c->new_orphans -= 1; + } + spin_unlock(&c->orphan_lock); + kfree(o); + dbg_gen("inum %lu", inum); + return; + } + } + spin_unlock(&c->orphan_lock); + dbg_err("missing orphan ino %lu", inum); + dbg_dump_stack(); +} + +/** + * ubifs_orphan_start_commit - start commit of orphans. + * @c: UBIFS file-system description object + * + * Start commit of orphans. + */ +int ubifs_orphan_start_commit(struct ubifs_info *c) +{ + struct ubifs_orphan *orphan, **last; + + spin_lock(&c->orphan_lock); + last = &c->orph_cnext; + list_for_each_entry(orphan, &c->orph_new, new_list) { + ubifs_assert(orphan->new); + orphan->new = 0; + *last = orphan; + last = &orphan->cnext; + } + *last = orphan->cnext; + c->cmt_orphans = c->new_orphans; + c->new_orphans = 0; + dbg_cmt("%d orphans to commit", c->cmt_orphans); + INIT_LIST_HEAD(&c->orph_new); + if (c->tot_orphans == 0) + c->no_orphs = 1; + else + c->no_orphs = 0; + spin_unlock(&c->orphan_lock); + return 0; +} + +/** + * avail_orphs - calculate available space. + * @c: UBIFS file-system description object + * + * This function returns the number of orphans that can be written in the + * available space. + */ +static int avail_orphs(struct ubifs_info *c) +{ + int avail_lebs, avail, gap; + + avail_lebs = c->orph_lebs - (c->ohead_lnum - c->orph_first) - 1; + avail = avail_lebs * + ((c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64)); + gap = c->leb_size - c->ohead_offs; + if (gap >= UBIFS_ORPH_NODE_SZ + sizeof(__le64)) + avail += (gap - UBIFS_ORPH_NODE_SZ) / sizeof(__le64); + return avail; +} + +/** + * tot_avail_orphs - calculate total space. + * @c: UBIFS file-system description object + * + * This function returns the number of orphans that can be written in half + * the total space. That leaves half the space for adding new orphans. + */ +static int tot_avail_orphs(struct ubifs_info *c) +{ + int avail_lebs, avail; + + avail_lebs = c->orph_lebs; + avail = avail_lebs * + ((c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64)); + return avail / 2; +} + +/** + * do_write_orph_node - write a node + * @c: UBIFS file-system description object + * @len: length of node + * @atomic: write atomically + * + * This function writes a node to the orphan head from the orphan buffer. If + * %atomic is not zero, then the write is done atomically. On success, %0 is + * returned, otherwise a negative error code is returned. + */ +static int do_write_orph_node(struct ubifs_info *c, int len, int atomic) +{ + int err = 0; + + if (atomic) { + ubifs_assert(c->ohead_offs == 0); + ubifs_prepare_node(c, c->orph_buf, len, 1); + len = ALIGN(len, c->min_io_size); + err = ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len, + UBI_SHORTTERM); + } else { + if (c->ohead_offs == 0) { + /* Ensure LEB has been unmapped */ + err = ubifs_leb_unmap(c, c->ohead_lnum); + if (err) + return err; + } + err = ubifs_write_node(c, c->orph_buf, len, c->ohead_lnum, + c->ohead_offs, UBI_SHORTTERM); + } + return err; +} + +/** + * write_orph_node - write an orph node + * @c: UBIFS file-system description object + * @atomic: write atomically + * + * This function builds an orph node from the cnext list and writes it to the + * orphan head. On success, %0 is returned, otherwise a negative error code + * is returned. + */ +static int write_orph_node(struct ubifs_info *c, int atomic) +{ + struct ubifs_orphan *orphan, *cnext; + struct ubifs_orph_node *orph; + int gap, err, len, cnt, i; + + ubifs_assert(c->cmt_orphans > 0); + gap = c->leb_size - c->ohead_offs; + if (gap < UBIFS_ORPH_NODE_SZ + sizeof(__le64)) { + c->ohead_lnum += 1; + c->ohead_offs = 0; + gap = c->leb_size; + if (c->ohead_lnum > c->orph_last) { + /* + * We limit the number of orphans so that this should + * never happen. + */ + ubifs_err("out of space in orphan area"); + return -EINVAL; + } + } + cnt = (gap - UBIFS_ORPH_NODE_SZ) / sizeof(__le64); + if (cnt > c->cmt_orphans) + cnt = c->cmt_orphans; + len = UBIFS_ORPH_NODE_SZ + cnt * sizeof(__le64); + ubifs_assert(c->orph_buf); + orph = c->orph_buf; + orph->ch.node_type = UBIFS_ORPH_NODE; + spin_lock(&c->orphan_lock); + cnext = c->orph_cnext; + for (i = 0; i < cnt; i++) { + orphan = cnext; + orph->inos[i] = cpu_to_le64(orphan->inum); + cnext = orphan->cnext; + orphan->cnext = NULL; + } + c->orph_cnext = cnext; + c->cmt_orphans -= cnt; + spin_unlock(&c->orphan_lock); + if (c->cmt_orphans) + orph->cmt_no = cpu_to_le64(c->cmt_no); + else + /* Mark the last node of the commit */ + orph->cmt_no = cpu_to_le64((c->cmt_no) | (1ULL << 63)); + ubifs_assert(c->ohead_offs + len <= c->leb_size); + ubifs_assert(c->ohead_lnum >= c->orph_first); + ubifs_assert(c->ohead_lnum <= c->orph_last); + err = do_write_orph_node(c, len, atomic); + c->ohead_offs += ALIGN(len, c->min_io_size); + c->ohead_offs = ALIGN(c->ohead_offs, 8); + return err; +} + +/** + * write_orph_nodes - write orph nodes until there are no more to commit + * @c: UBIFS file-system description object + * @atomic: write atomically + * + * This function writes orph nodes for all the orphans to commit. On success, + * %0 is returned, otherwise a negative error code is returned. + */ +static int write_orph_nodes(struct ubifs_info *c, int atomic) +{ + int err; + + while (c->cmt_orphans > 0) { + err = write_orph_node(c, atomic); + if (err) + return err; + } + if (atomic) { + int lnum; + + /* Unmap any unused LEBs after consolidation */ + lnum = c->ohead_lnum + 1; + for (lnum = c->ohead_lnum + 1; lnum <= c->orph_last; lnum++) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + } + return 0; +} + +/** + * consolidate - consolidate the orphan area. + * @c: UBIFS file-system description object + * + * This function enables consolidation by putting all the orphans into the list + * to commit. The list is in the order that the orphans were added, and the + * LEBs are written atomically in order, so at no time can orphans be lost by + * an unclean unmount. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int consolidate(struct ubifs_info *c) +{ + int tot_avail = tot_avail_orphs(c), err = 0; + + spin_lock(&c->orphan_lock); + dbg_cmt("there is space for %d orphans and there are %d", + tot_avail, c->tot_orphans); + if (c->tot_orphans - c->new_orphans <= tot_avail) { + struct ubifs_orphan *orphan, **last; + int cnt = 0; + + /* Change the cnext list to include all non-new orphans */ + last = &c->orph_cnext; + list_for_each_entry(orphan, &c->orph_list, list) { + if (orphan->new) + continue; + *last = orphan; + last = &orphan->cnext; + cnt += 1; + } + *last = orphan->cnext; + ubifs_assert(cnt == c->tot_orphans - c->new_orphans); + c->cmt_orphans = cnt; + c->ohead_lnum = c->orph_first; + c->ohead_offs = 0; + } else { + /* + * We limit the number of orphans so that this should + * never happen. + */ + ubifs_err("out of space in orphan area"); + err = -EINVAL; + } + spin_unlock(&c->orphan_lock); + return err; +} + +/** + * commit_orphans - commit orphans. + * @c: UBIFS file-system description object + * + * This function commits orphans to flash. On success, %0 is returned, + * otherwise a negative error code is returned. + */ +static int commit_orphans(struct ubifs_info *c) +{ + int avail, atomic = 0, err; + + ubifs_assert(c->cmt_orphans > 0); + avail = avail_orphs(c); + if (avail < c->cmt_orphans) { + /* Not enough space to write new orphans, so consolidate */ + err = consolidate(c); + if (err) + return err; + atomic = 1; + } + err = write_orph_nodes(c, atomic); + return err; +} + +/** + * erase_deleted - erase the orphans marked for deletion. + * @c: UBIFS file-system description object + * + * During commit, the orphans being committed cannot be deleted, so they are + * marked for deletion and deleted by this function. Also, the recovery + * adds killed orphans to the deletion list, and therefore they are deleted + * here too. + */ +static void erase_deleted(struct ubifs_info *c) +{ + struct ubifs_orphan *orphan, *dnext; + + spin_lock(&c->orphan_lock); + dnext = c->orph_dnext; + while (dnext) { + orphan = dnext; + dnext = orphan->dnext; + ubifs_assert(!orphan->new); + rb_erase(&orphan->rb, &c->orph_tree); + list_del(&orphan->list); + c->tot_orphans -= 1; + dbg_gen("deleting orphan ino %lu", orphan->inum); + kfree(orphan); + } + c->orph_dnext = NULL; + spin_unlock(&c->orphan_lock); +} + +/** + * ubifs_orphan_end_commit - end commit of orphans. + * @c: UBIFS file-system description object + * + * End commit of orphans. + */ +int ubifs_orphan_end_commit(struct ubifs_info *c) +{ + int err; + + if (c->cmt_orphans != 0) { + err = commit_orphans(c); + if (err) + return err; + } + erase_deleted(c); + err = dbg_check_orphans(c); + return err; +} + +/** + * clear_orphans - erase all LEBs used for orphans. + * @c: UBIFS file-system description object + * + * If recovery is not required, then the orphans from the previous session + * are not needed. This function locates the LEBs used to record + * orphans, and un-maps them. + */ +static int clear_orphans(struct ubifs_info *c) +{ + int lnum, err; + + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + c->ohead_lnum = c->orph_first; + c->ohead_offs = 0; + return 0; +} + +/** + * insert_dead_orphan - insert an orphan. + * @c: UBIFS file-system description object + * @inum: orphan inode number + * + * This function is a helper to the 'do_kill_orphans()' function. The orphan + * must be kept until the next commit, so it is added to the rb-tree and the + * deletion list. + */ +static int insert_dead_orphan(struct ubifs_info *c, ino_t inum) +{ + struct ubifs_orphan *orphan, *o; + struct rb_node **p, *parent = NULL; + + orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_KERNEL); + if (!orphan) + return -ENOMEM; + orphan->inum = inum; + + p = &c->orph_tree.rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct ubifs_orphan, rb); + if (inum < o->inum) + p = &(*p)->rb_left; + else if (inum > o->inum) + p = &(*p)->rb_right; + else { + /* Already added - no problem */ + kfree(orphan); + return 0; + } + } + c->tot_orphans += 1; + rb_link_node(&orphan->rb, parent, p); + rb_insert_color(&orphan->rb, &c->orph_tree); + list_add_tail(&orphan->list, &c->orph_list); + orphan->dnext = c->orph_dnext; + c->orph_dnext = orphan; + dbg_mnt("ino %lu, new %d, tot %d", + inum, c->new_orphans, c->tot_orphans); + return 0; +} + +/** + * do_kill_orphans - remove orphan inodes from the index. + * @c: UBIFS file-system description object + * @sleb: scanned LEB + * @last_cmt_no: cmt_no of last orph node read is passed and returned here + * @outofdate: whether the LEB is out of date is returned here + * @last_flagged: whether the end orph node is encountered + * + * This function is a helper to the 'kill_orphans()' function. It goes through + * every orphan node in a LEB and for every inode number recorded, removes + * all keys for that inode from the TNC. + */ +static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + unsigned long long *last_cmt_no, int *outofdate, + int *last_flagged) +{ + struct ubifs_scan_node *snod; + struct ubifs_orph_node *orph; + unsigned long long cmt_no; + ino_t inum; + int i, n, err, first = 1; + + list_for_each_entry(snod, &sleb->nodes, list) { + if (snod->type != UBIFS_ORPH_NODE) { + ubifs_err("invalid node type %d in orphan area at " + "%d:%d", snod->type, sleb->lnum, snod->offs); + dbg_dump_node(c, snod->node); + return -EINVAL; + } + + orph = snod->node; + + /* Check commit number */ + cmt_no = le64_to_cpu(orph->cmt_no) & LLONG_MAX; + /* + * The commit number on the master node may be less, because + * of a failed commit. If there are several failed commits in a + * row, the commit number written on orph nodes will continue to + * increase (because the commit number is adjusted here) even + * though the commit number on the master node stays the same + * because the master node has not been re-written. + */ + if (cmt_no > c->cmt_no) + c->cmt_no = cmt_no; + if (cmt_no < *last_cmt_no && *last_flagged) { + /* + * The last orph node had a higher commit number and was + * flagged as the last written for that commit number. + * That makes this orph node, out of date. + */ + if (!first) { + ubifs_err("out of order commit number %llu in " + "orphan node at %d:%d", + cmt_no, sleb->lnum, snod->offs); + dbg_dump_node(c, snod->node); + return -EINVAL; + } + dbg_rcvry("out of date LEB %d", sleb->lnum); + *outofdate = 1; + return 0; + } + + if (first) + first = 0; + + n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3; + for (i = 0; i < n; i++) { + inum = le64_to_cpu(orph->inos[i]); + dbg_rcvry("deleting orphaned inode %lu", inum); + err = ubifs_tnc_remove_ino(c, inum); + if (err) + return err; + err = insert_dead_orphan(c, inum); + if (err) + return err; + } + + *last_cmt_no = cmt_no; + if (le64_to_cpu(orph->cmt_no) & (1ULL << 63)) { + dbg_rcvry("last orph node for commit %llu at %d:%d", + cmt_no, sleb->lnum, snod->offs); + *last_flagged = 1; + } else + *last_flagged = 0; + } + + return 0; +} + +/** + * kill_orphans - remove all orphan inodes from the index. + * @c: UBIFS file-system description object + * + * If recovery is required, then orphan inodes recorded during the previous + * session (which ended with an unclean unmount) must be deleted from the index. + * This is done by updating the TNC, but since the index is not updated until + * the next commit, the LEBs where the orphan information is recorded are not + * erased until the next commit. + */ +static int kill_orphans(struct ubifs_info *c) +{ + unsigned long long last_cmt_no = 0; + int lnum, err = 0, outofdate = 0, last_flagged = 0; + + c->ohead_lnum = c->orph_first; + c->ohead_offs = 0; + /* Check no-orphans flag and skip this if no orphans */ + if (c->no_orphs) { + dbg_rcvry("no orphans"); + return 0; + } + /* + * Orph nodes always start at c->orph_first and are written to each + * successive LEB in turn. Generally unused LEBs will have been unmapped + * but may contain out of date orph nodes if the unmap didn't go + * through. In addition, the last orph node written for each commit is + * marked (top bit of orph->cmt_no is set to 1). It is possible that + * there are orph nodes from the next commit (i.e. the commit did not + * complete successfully). In that case, no orphans will have been lost + * due to the way that orphans are written, and any orphans added will + * be valid orphans anyway and so can be deleted. + */ + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { + struct ubifs_scan_leb *sleb; + + dbg_rcvry("LEB %d", lnum); + sleb = ubifs_scan(c, lnum, 0, c->sbuf); + if (IS_ERR(sleb)) { + sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0); + if (IS_ERR(sleb)) { + err = PTR_ERR(sleb); + break; + } + } + err = do_kill_orphans(c, sleb, &last_cmt_no, &outofdate, + &last_flagged); + if (err || outofdate) { + ubifs_scan_destroy(sleb); + break; + } + if (sleb->endpt) { + c->ohead_lnum = lnum; + c->ohead_offs = sleb->endpt; + } + ubifs_scan_destroy(sleb); + } + return err; +} + +/** + * ubifs_mount_orphans - delete orphan inodes and erase LEBs that recorded them. + * @c: UBIFS file-system description object + * @unclean: indicates recovery from unclean unmount + * @read_only: indicates read only mount + * + * This function is called when mounting to erase orphans from the previous + * session. If UBIFS was not unmounted cleanly, then the inodes recorded as + * orphans are deleted. + */ +int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only) +{ + int err = 0; + + c->max_orphans = tot_avail_orphs(c); + + if (!read_only) { + c->orph_buf = vmalloc(c->leb_size); + if (!c->orph_buf) + return -ENOMEM; + } + + if (unclean) + err = kill_orphans(c); + else if (!read_only) + err = clear_orphans(c); + + return err; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +struct check_orphan { + struct rb_node rb; + ino_t inum; +}; + +struct check_info { + unsigned long last_ino; + unsigned long tot_inos; + unsigned long missing; + unsigned long long leaf_cnt; + struct ubifs_ino_node *node; + struct rb_root root; +}; + +static int dbg_find_orphan(struct ubifs_info *c, ino_t inum) +{ + struct ubifs_orphan *o; + struct rb_node *p; + + spin_lock(&c->orphan_lock); + p = c->orph_tree.rb_node; + while (p) { + o = rb_entry(p, struct ubifs_orphan, rb); + if (inum < o->inum) + p = p->rb_left; + else if (inum > o->inum) + p = p->rb_right; + else { + spin_unlock(&c->orphan_lock); + return 1; + } + } + spin_unlock(&c->orphan_lock); + return 0; +} + +static int dbg_ins_check_orphan(struct rb_root *root, ino_t inum) +{ + struct check_orphan *orphan, *o; + struct rb_node **p, *parent = NULL; + + orphan = kzalloc(sizeof(struct check_orphan), GFP_NOFS); + if (!orphan) + return -ENOMEM; + orphan->inum = inum; + + p = &root->rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct check_orphan, rb); + if (inum < o->inum) + p = &(*p)->rb_left; + else if (inum > o->inum) + p = &(*p)->rb_right; + else { + kfree(orphan); + return 0; + } + } + rb_link_node(&orphan->rb, parent, p); + rb_insert_color(&orphan->rb, root); + return 0; +} + +static int dbg_find_check_orphan(struct rb_root *root, ino_t inum) +{ + struct check_orphan *o; + struct rb_node *p; + + p = root->rb_node; + while (p) { + o = rb_entry(p, struct check_orphan, rb); + if (inum < o->inum) + p = p->rb_left; + else if (inum > o->inum) + p = p->rb_right; + else + return 1; + } + return 0; +} + +static void dbg_free_check_tree(struct rb_root *root) +{ + struct rb_node *this = root->rb_node; + struct check_orphan *o; + + while (this) { + if (this->rb_left) { + this = this->rb_left; + continue; + } else if (this->rb_right) { + this = this->rb_right; + continue; + } + o = rb_entry(this, struct check_orphan, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &o->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(o); + } +} + +static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *priv) +{ + struct check_info *ci = priv; + ino_t inum; + int err; + + inum = key_inum(c, &zbr->key); + if (inum != ci->last_ino) { + /* Lowest node type is the inode node, so it comes first */ + if (key_type(c, &zbr->key) != UBIFS_INO_KEY) + ubifs_err("found orphan node ino %lu, type %d", inum, + key_type(c, &zbr->key)); + ci->last_ino = inum; + ci->tot_inos += 1; + err = ubifs_tnc_read_node(c, zbr, ci->node); + if (err) { + ubifs_err("node read failed, error %d", err); + return err; + } + if (ci->node->nlink == 0) + /* Must be recorded as an orphan */ + if (!dbg_find_check_orphan(&ci->root, inum) && + !dbg_find_orphan(c, inum)) { + ubifs_err("missing orphan, ino %lu", inum); + ci->missing += 1; + } + } + ci->leaf_cnt += 1; + return 0; +} + +static int dbg_read_orphans(struct check_info *ci, struct ubifs_scan_leb *sleb) +{ + struct ubifs_scan_node *snod; + struct ubifs_orph_node *orph; + ino_t inum; + int i, n, err; + + list_for_each_entry(snod, &sleb->nodes, list) { + cond_resched(); + if (snod->type != UBIFS_ORPH_NODE) + continue; + orph = snod->node; + n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3; + for (i = 0; i < n; i++) { + inum = le64_to_cpu(orph->inos[i]); + err = dbg_ins_check_orphan(&ci->root, inum); + if (err) + return err; + } + } + return 0; +} + +static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) +{ + int lnum, err = 0; + + /* Check no-orphans flag and skip this if no orphans */ + if (c->no_orphs) + return 0; + + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { + struct ubifs_scan_leb *sleb; + + sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); + if (IS_ERR(sleb)) { + err = PTR_ERR(sleb); + break; + } + + err = dbg_read_orphans(ci, sleb); + ubifs_scan_destroy(sleb); + if (err) + break; + } + + return err; +} + +static int dbg_check_orphans(struct ubifs_info *c) +{ + struct check_info ci; + int err; + + if (!(ubifs_chk_flags & UBIFS_CHK_ORPH)) + return 0; + + ci.last_ino = 0; + ci.tot_inos = 0; + ci.missing = 0; + ci.leaf_cnt = 0; + ci.root = RB_ROOT; + ci.node = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS); + if (!ci.node) { + ubifs_err("out of memory"); + return -ENOMEM; + } + + err = dbg_scan_orphans(c, &ci); + if (err) + goto out; + + err = dbg_walk_index(c, &dbg_orphan_check, NULL, &ci); + if (err) { + ubifs_err("cannot scan TNC, error %d", err); + goto out; + } + + if (ci.missing) { + ubifs_err("%lu missing orphan(s)", ci.missing); + err = -EINVAL; + goto out; + } + + dbg_cmt("last inode number is %lu", ci.last_ino); + dbg_cmt("total number of inodes is %lu", ci.tot_inos); + dbg_cmt("total number of leaf nodes is %llu", ci.leaf_cnt); + +out: + dbg_free_check_tree(&ci.root); + kfree(ci.node); + return err; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c new file mode 100644 index 000000000000..77d26c141cf6 --- /dev/null +++ b/fs/ubifs/recovery.c @@ -0,0 +1,1519 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements functions needed to recover from unclean un-mounts. + * When UBIFS is mounted, it checks a flag on the master node to determine if + * an un-mount was completed sucessfully. If not, the process of mounting + * incorparates additional checking and fixing of on-flash data structures. + * UBIFS always cleans away all remnants of an unclean un-mount, so that + * errors do not accumulate. However UBIFS defers recovery if it is mounted + * read-only, and the flash is not modified in that case. + */ + +#include <linux/crc32.h> +#include "ubifs.h" + +/** + * is_empty - determine whether a buffer is empty (contains all 0xff). + * @buf: buffer to clean + * @len: length of buffer + * + * This function returns %1 if the buffer is empty (contains all 0xff) otherwise + * %0 is returned. + */ +static int is_empty(void *buf, int len) +{ + uint8_t *p = buf; + int i; + + for (i = 0; i < len; i++) + if (*p++ != 0xff) + return 0; + return 1; +} + +/** + * get_master_node - get the last valid master node allowing for corruption. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @pbuf: buffer containing the LEB read, is returned here + * @mst: master node, if found, is returned here + * @cor: corruption, if found, is returned here + * + * This function allocates a buffer, reads the LEB into it, and finds and + * returns the last valid master node allowing for one area of corruption. + * The corrupt area, if there is one, must be consistent with the assumption + * that it is the result of an unclean unmount while the master node was being + * written. Under those circumstances, it is valid to use the previously written + * master node. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int get_master_node(const struct ubifs_info *c, int lnum, void **pbuf, + struct ubifs_mst_node **mst, void **cor) +{ + const int sz = c->mst_node_alsz; + int err, offs, len; + void *sbuf, *buf; + + sbuf = vmalloc(c->leb_size); + if (!sbuf) + return -ENOMEM; + + err = ubi_read(c->ubi, lnum, sbuf, 0, c->leb_size); + if (err && err != -EBADMSG) + goto out_free; + + /* Find the first position that is definitely not a node */ + offs = 0; + buf = sbuf; + len = c->leb_size; + while (offs + UBIFS_MST_NODE_SZ <= c->leb_size) { + struct ubifs_ch *ch = buf; + + if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) + break; + offs += sz; + buf += sz; + len -= sz; + } + /* See if there was a valid master node before that */ + if (offs) { + int ret; + + offs -= sz; + buf -= sz; + len += sz; + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1); + if (ret != SCANNED_A_NODE && offs) { + /* Could have been corruption so check one place back */ + offs -= sz; + buf -= sz; + len += sz; + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1); + if (ret != SCANNED_A_NODE) + /* + * We accept only one area of corruption because + * we are assuming that it was caused while + * trying to write a master node. + */ + goto out_err; + } + if (ret == SCANNED_A_NODE) { + struct ubifs_ch *ch = buf; + + if (ch->node_type != UBIFS_MST_NODE) + goto out_err; + dbg_rcvry("found a master node at %d:%d", lnum, offs); + *mst = buf; + offs += sz; + buf += sz; + len -= sz; + } + } + /* Check for corruption */ + if (offs < c->leb_size) { + if (!is_empty(buf, min_t(int, len, sz))) { + *cor = buf; + dbg_rcvry("found corruption at %d:%d", lnum, offs); + } + offs += sz; + buf += sz; + len -= sz; + } + /* Check remaining empty space */ + if (offs < c->leb_size) + if (!is_empty(buf, len)) + goto out_err; + *pbuf = sbuf; + return 0; + +out_err: + err = -EINVAL; +out_free: + vfree(sbuf); + *mst = NULL; + *cor = NULL; + return err; +} + +/** + * write_rcvrd_mst_node - write recovered master node. + * @c: UBIFS file-system description object + * @mst: master node + * + * This function returns %0 on success and a negative error code on failure. + */ +static int write_rcvrd_mst_node(struct ubifs_info *c, + struct ubifs_mst_node *mst) +{ + int err = 0, lnum = UBIFS_MST_LNUM, sz = c->mst_node_alsz; + uint32_t save_flags; + + dbg_rcvry("recovery"); + + save_flags = mst->flags; + mst->flags = cpu_to_le32(le32_to_cpu(mst->flags) | UBIFS_MST_RCVRY); + + ubifs_prepare_node(c, mst, UBIFS_MST_NODE_SZ, 1); + err = ubi_leb_change(c->ubi, lnum, mst, sz, UBI_SHORTTERM); + if (err) + goto out; + err = ubi_leb_change(c->ubi, lnum + 1, mst, sz, UBI_SHORTTERM); + if (err) + goto out; +out: + mst->flags = save_flags; + return err; +} + +/** + * ubifs_recover_master_node - recover the master node. + * @c: UBIFS file-system description object + * + * This function recovers the master node from corruption that may occur due to + * an unclean unmount. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_recover_master_node(struct ubifs_info *c) +{ + void *buf1 = NULL, *buf2 = NULL, *cor1 = NULL, *cor2 = NULL; + struct ubifs_mst_node *mst1 = NULL, *mst2 = NULL, *mst; + const int sz = c->mst_node_alsz; + int err, offs1, offs2; + + dbg_rcvry("recovery"); + + err = get_master_node(c, UBIFS_MST_LNUM, &buf1, &mst1, &cor1); + if (err) + goto out_free; + + err = get_master_node(c, UBIFS_MST_LNUM + 1, &buf2, &mst2, &cor2); + if (err) + goto out_free; + + if (mst1) { + offs1 = (void *)mst1 - buf1; + if ((le32_to_cpu(mst1->flags) & UBIFS_MST_RCVRY) && + (offs1 == 0 && !cor1)) { + /* + * mst1 was written by recovery at offset 0 with no + * corruption. + */ + dbg_rcvry("recovery recovery"); + mst = mst1; + } else if (mst2) { + offs2 = (void *)mst2 - buf2; + if (offs1 == offs2) { + /* Same offset, so must be the same */ + if (memcmp((void *)mst1 + UBIFS_CH_SZ, + (void *)mst2 + UBIFS_CH_SZ, + UBIFS_MST_NODE_SZ - UBIFS_CH_SZ)) + goto out_err; + mst = mst1; + } else if (offs2 + sz == offs1) { + /* 1st LEB was written, 2nd was not */ + if (cor1) + goto out_err; + mst = mst1; + } else if (offs1 == 0 && offs2 + sz >= c->leb_size) { + /* 1st LEB was unmapped and written, 2nd not */ + if (cor1) + goto out_err; + mst = mst1; + } else + goto out_err; + } else { + /* + * 2nd LEB was unmapped and about to be written, so + * there must be only one master node in the first LEB + * and no corruption. + */ + if (offs1 != 0 || cor1) + goto out_err; + mst = mst1; + } + } else { + if (!mst2) + goto out_err; + /* + * 1st LEB was unmapped and about to be written, so there must + * be no room left in 2nd LEB. + */ + offs2 = (void *)mst2 - buf2; + if (offs2 + sz + sz <= c->leb_size) + goto out_err; + mst = mst2; + } + + dbg_rcvry("recovered master node from LEB %d", + (mst == mst1 ? UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1)); + + memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ); + + if ((c->vfs_sb->s_flags & MS_RDONLY)) { + /* Read-only mode. Keep a copy for switching to rw mode */ + c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL); + if (!c->rcvrd_mst_node) { + err = -ENOMEM; + goto out_free; + } + memcpy(c->rcvrd_mst_node, c->mst_node, UBIFS_MST_NODE_SZ); + } else { + /* Write the recovered master node */ + c->max_sqnum = le64_to_cpu(mst->ch.sqnum) - 1; + err = write_rcvrd_mst_node(c, c->mst_node); + if (err) + goto out_free; + } + + vfree(buf2); + vfree(buf1); + + return 0; + +out_err: + err = -EINVAL; +out_free: + ubifs_err("failed to recover master node"); + if (mst1) { + dbg_err("dumping first master node"); + dbg_dump_node(c, mst1); + } + if (mst2) { + dbg_err("dumping second master node"); + dbg_dump_node(c, mst2); + } + vfree(buf2); + vfree(buf1); + return err; +} + +/** + * ubifs_write_rcvrd_mst_node - write the recovered master node. + * @c: UBIFS file-system description object + * + * This function writes the master node that was recovered during mounting in + * read-only mode and must now be written because we are remounting rw. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_write_rcvrd_mst_node(struct ubifs_info *c) +{ + int err; + + if (!c->rcvrd_mst_node) + return 0; + c->rcvrd_mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + err = write_rcvrd_mst_node(c, c->rcvrd_mst_node); + if (err) + return err; + kfree(c->rcvrd_mst_node); + c->rcvrd_mst_node = NULL; + return 0; +} + +/** + * is_last_write - determine if an offset was in the last write to a LEB. + * @c: UBIFS file-system description object + * @buf: buffer to check + * @offs: offset to check + * + * This function returns %1 if @offs was in the last write to the LEB whose data + * is in @buf, otherwise %0 is returned. The determination is made by checking + * for subsequent empty space starting from the next min_io_size boundary (or a + * bit less than the common header size if min_io_size is one). + */ +static int is_last_write(const struct ubifs_info *c, void *buf, int offs) +{ + int empty_offs; + int check_len; + uint8_t *p; + + if (c->min_io_size == 1) { + check_len = c->leb_size - offs; + p = buf + check_len; + for (; check_len > 0; check_len--) + if (*--p != 0xff) + break; + /* + * 'check_len' is the size of the corruption which cannot be + * more than the size of 1 node if it was caused by an unclean + * unmount. + */ + if (check_len > UBIFS_MAX_NODE_SZ) + return 0; + return 1; + } + + /* + * Round up to the next c->min_io_size boundary i.e. 'offs' is in the + * last wbuf written. After that should be empty space. + */ + empty_offs = ALIGN(offs + 1, c->min_io_size); + check_len = c->leb_size - empty_offs; + p = buf + empty_offs - offs; + + for (; check_len > 0; check_len--) + if (*p++ != 0xff) + return 0; + return 1; +} + +/** + * clean_buf - clean the data from an LEB sitting in a buffer. + * @c: UBIFS file-system description object + * @buf: buffer to clean + * @lnum: LEB number to clean + * @offs: offset from which to clean + * @len: length of buffer + * + * This function pads up to the next min_io_size boundary (if there is one) and + * sets empty space to all 0xff. @buf, @offs and @len are updated to the next + * min_io_size boundary (if there is one). + */ +static void clean_buf(const struct ubifs_info *c, void **buf, int lnum, + int *offs, int *len) +{ + int empty_offs, pad_len; + + lnum = lnum; + dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs); + + if (c->min_io_size == 1) { + memset(*buf, 0xff, c->leb_size - *offs); + return; + } + + ubifs_assert(!(*offs & 7)); + empty_offs = ALIGN(*offs, c->min_io_size); + pad_len = empty_offs - *offs; + ubifs_pad(c, *buf, pad_len); + *offs += pad_len; + *buf += pad_len; + *len -= pad_len; + memset(*buf, 0xff, c->leb_size - empty_offs); +} + +/** + * no_more_nodes - determine if there are no more nodes in a buffer. + * @c: UBIFS file-system description object + * @buf: buffer to check + * @len: length of buffer + * @lnum: LEB number of the LEB from which @buf was read + * @offs: offset from which @buf was read + * + * This function scans @buf for more nodes and returns %0 is a node is found and + * %1 if no more nodes are found. + */ +static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, + int lnum, int offs) +{ + int skip, next_offs = 0; + + if (len > UBIFS_DATA_NODE_SZ) { + struct ubifs_ch *ch = buf; + int dlen = le32_to_cpu(ch->len); + + if (ch->node_type == UBIFS_DATA_NODE && dlen >= UBIFS_CH_SZ && + dlen <= UBIFS_MAX_DATA_NODE_SZ) + /* The corrupt node looks like a data node */ + next_offs = ALIGN(offs + dlen, 8); + } + + if (c->min_io_size == 1) + skip = 8; + else + skip = ALIGN(offs + 1, c->min_io_size) - offs; + + offs += skip; + buf += skip; + len -= skip; + while (len > 8) { + struct ubifs_ch *ch = buf; + uint32_t magic = le32_to_cpu(ch->magic); + int ret; + + if (magic == UBIFS_NODE_MAGIC) { + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1); + if (ret == SCANNED_A_NODE || ret > 0) { + /* + * There is a small chance this is just data in + * a data node, so check that possibility. e.g. + * this is part of a file that itself contains + * a UBIFS image. + */ + if (next_offs && offs + le32_to_cpu(ch->len) <= + next_offs) + continue; + dbg_rcvry("unexpected node at %d:%d", lnum, + offs); + return 0; + } + } + offs += 8; + buf += 8; + len -= 8; + } + return 1; +} + +/** + * fix_unclean_leb - fix an unclean LEB. + * @c: UBIFS file-system description object + * @sleb: scanned LEB information + * @start: offset where scan started + */ +static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + int start) +{ + int lnum = sleb->lnum, endpt = start; + + /* Get the end offset of the last node we are keeping */ + if (!list_empty(&sleb->nodes)) { + struct ubifs_scan_node *snod; + + snod = list_entry(sleb->nodes.prev, + struct ubifs_scan_node, list); + endpt = snod->offs + snod->len; + } + + if ((c->vfs_sb->s_flags & MS_RDONLY) && !c->remounting_rw) { + /* Add to recovery list */ + struct ubifs_unclean_leb *ucleb; + + dbg_rcvry("need to fix LEB %d start %d endpt %d", + lnum, start, sleb->endpt); + ucleb = kzalloc(sizeof(struct ubifs_unclean_leb), GFP_NOFS); + if (!ucleb) + return -ENOMEM; + ucleb->lnum = lnum; + ucleb->endpt = endpt; + list_add_tail(&ucleb->list, &c->unclean_leb_list); + } else { + /* Write the fixed LEB back to flash */ + int err; + + dbg_rcvry("fixing LEB %d start %d endpt %d", + lnum, start, sleb->endpt); + if (endpt == 0) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } else { + int len = ALIGN(endpt, c->min_io_size); + + if (start) { + err = ubi_read(c->ubi, lnum, sleb->buf, 0, + start); + if (err) + return err; + } + /* Pad to min_io_size */ + if (len > endpt) { + int pad_len = len - ALIGN(endpt, 8); + + if (pad_len > 0) { + void *buf = sleb->buf + len - pad_len; + + ubifs_pad(c, buf, pad_len); + } + } + err = ubi_leb_change(c->ubi, lnum, sleb->buf, len, + UBI_UNKNOWN); + if (err) + return err; + } + } + return 0; +} + +/** + * drop_incomplete_group - drop nodes from an incomplete group. + * @sleb: scanned LEB information + * @offs: offset of dropped nodes is returned here + * + * This function returns %1 if nodes are dropped and %0 otherwise. + */ +static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs) +{ + int dropped = 0; + + while (!list_empty(&sleb->nodes)) { + struct ubifs_scan_node *snod; + struct ubifs_ch *ch; + + snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, + list); + ch = snod->node; + if (ch->group_type != UBIFS_IN_NODE_GROUP) + return dropped; + dbg_rcvry("dropping node at %d:%d", sleb->lnum, snod->offs); + *offs = snod->offs; + list_del(&snod->list); + kfree(snod); + sleb->nodes_cnt -= 1; + dropped = 1; + } + return dropped; +} + +/** + * ubifs_recover_leb - scan and recover a LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @offs: offset + * @sbuf: LEB-sized buffer to use + * @grouped: nodes may be grouped for recovery + * + * This function does a scan of a LEB, but caters for errors that might have + * been caused by the unclean unmount from which we are attempting to recover. + * + * This function returns %0 on success and a negative error code on failure. + */ +struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, + int offs, void *sbuf, int grouped) +{ + int err, len = c->leb_size - offs, need_clean = 0, quiet = 1; + int empty_chkd = 0, start = offs; + struct ubifs_scan_leb *sleb; + void *buf = sbuf + offs; + + dbg_rcvry("%d:%d", lnum, offs); + + sleb = ubifs_start_scan(c, lnum, offs, sbuf); + if (IS_ERR(sleb)) + return sleb; + + if (sleb->ecc) + need_clean = 1; + + while (len >= 8) { + int ret; + + dbg_scan("look at LEB %d:%d (%d bytes left)", + lnum, offs, len); + + cond_resched(); + + /* + * Scan quietly until there is an error from which we cannot + * recover + */ + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet); + + if (ret == SCANNED_A_NODE) { + /* A valid node, and not a padding node */ + struct ubifs_ch *ch = buf; + int node_len; + + err = ubifs_add_snod(c, sleb, buf, offs); + if (err) + goto error; + node_len = ALIGN(le32_to_cpu(ch->len), 8); + offs += node_len; + buf += node_len; + len -= node_len; + continue; + } + + if (ret > 0) { + /* Padding bytes or a valid padding node */ + offs += ret; + buf += ret; + len -= ret; + continue; + } + + if (ret == SCANNED_EMPTY_SPACE) { + if (!is_empty(buf, len)) { + if (!is_last_write(c, buf, offs)) + break; + clean_buf(c, &buf, lnum, &offs, &len); + need_clean = 1; + } + empty_chkd = 1; + break; + } + + if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE) + if (is_last_write(c, buf, offs)) { + clean_buf(c, &buf, lnum, &offs, &len); + need_clean = 1; + empty_chkd = 1; + break; + } + + if (ret == SCANNED_A_CORRUPT_NODE) + if (no_more_nodes(c, buf, len, lnum, offs)) { + clean_buf(c, &buf, lnum, &offs, &len); + need_clean = 1; + empty_chkd = 1; + break; + } + + if (quiet) { + /* Redo the last scan but noisily */ + quiet = 0; + continue; + } + + switch (ret) { + case SCANNED_GARBAGE: + dbg_err("garbage"); + goto corrupted; + case SCANNED_A_CORRUPT_NODE: + case SCANNED_A_BAD_PAD_NODE: + dbg_err("bad node"); + goto corrupted; + default: + dbg_err("unknown"); + goto corrupted; + } + } + + if (!empty_chkd && !is_empty(buf, len)) { + if (is_last_write(c, buf, offs)) { + clean_buf(c, &buf, lnum, &offs, &len); + need_clean = 1; + } else { + ubifs_err("corrupt empty space at LEB %d:%d", + lnum, offs); + goto corrupted; + } + } + + /* Drop nodes from incomplete group */ + if (grouped && drop_incomplete_group(sleb, &offs)) { + buf = sbuf + offs; + len = c->leb_size - offs; + clean_buf(c, &buf, lnum, &offs, &len); + need_clean = 1; + } + + if (offs % c->min_io_size) { + clean_buf(c, &buf, lnum, &offs, &len); + need_clean = 1; + } + + ubifs_end_scan(c, sleb, lnum, offs); + + if (need_clean) { + err = fix_unclean_leb(c, sleb, start); + if (err) + goto error; + } + + return sleb; + +corrupted: + ubifs_scanned_corruption(c, lnum, offs, buf); + err = -EUCLEAN; +error: + ubifs_err("LEB %d scanning failed", lnum); + ubifs_scan_destroy(sleb); + return ERR_PTR(err); +} + +/** + * get_cs_sqnum - get commit start sequence number. + * @c: UBIFS file-system description object + * @lnum: LEB number of commit start node + * @offs: offset of commit start node + * @cs_sqnum: commit start sequence number is returned here + * + * This function returns %0 on success and a negative error code on failure. + */ +static int get_cs_sqnum(struct ubifs_info *c, int lnum, int offs, + unsigned long long *cs_sqnum) +{ + struct ubifs_cs_node *cs_node = NULL; + int err, ret; + + dbg_rcvry("at %d:%d", lnum, offs); + cs_node = kmalloc(UBIFS_CS_NODE_SZ, GFP_KERNEL); + if (!cs_node) + return -ENOMEM; + if (c->leb_size - offs < UBIFS_CS_NODE_SZ) + goto out_err; + err = ubi_read(c->ubi, lnum, (void *)cs_node, offs, UBIFS_CS_NODE_SZ); + if (err && err != -EBADMSG) + goto out_free; + ret = ubifs_scan_a_node(c, cs_node, UBIFS_CS_NODE_SZ, lnum, offs, 0); + if (ret != SCANNED_A_NODE) { + dbg_err("Not a valid node"); + goto out_err; + } + if (cs_node->ch.node_type != UBIFS_CS_NODE) { + dbg_err("Node a CS node, type is %d", cs_node->ch.node_type); + goto out_err; + } + if (le64_to_cpu(cs_node->cmt_no) != c->cmt_no) { + dbg_err("CS node cmt_no %llu != current cmt_no %llu", + (unsigned long long)le64_to_cpu(cs_node->cmt_no), + c->cmt_no); + goto out_err; + } + *cs_sqnum = le64_to_cpu(cs_node->ch.sqnum); + dbg_rcvry("commit start sqnum %llu", *cs_sqnum); + kfree(cs_node); + return 0; + +out_err: + err = -EINVAL; +out_free: + ubifs_err("failed to get CS sqnum"); + kfree(cs_node); + return err; +} + +/** + * ubifs_recover_log_leb - scan and recover a log LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @offs: offset + * @sbuf: LEB-sized buffer to use + * + * This function does a scan of a LEB, but caters for errors that might have + * been caused by the unclean unmount from which we are attempting to recover. + * + * This function returns %0 on success and a negative error code on failure. + */ +struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, + int offs, void *sbuf) +{ + struct ubifs_scan_leb *sleb; + int next_lnum; + + dbg_rcvry("LEB %d", lnum); + next_lnum = lnum + 1; + if (next_lnum >= UBIFS_LOG_LNUM + c->log_lebs) + next_lnum = UBIFS_LOG_LNUM; + if (next_lnum != c->ltail_lnum) { + /* + * We can only recover at the end of the log, so check that the + * next log LEB is empty or out of date. + */ + sleb = ubifs_scan(c, next_lnum, 0, sbuf); + if (IS_ERR(sleb)) + return sleb; + if (sleb->nodes_cnt) { + struct ubifs_scan_node *snod; + unsigned long long cs_sqnum = c->cs_sqnum; + + snod = list_entry(sleb->nodes.next, + struct ubifs_scan_node, list); + if (cs_sqnum == 0) { + int err; + + err = get_cs_sqnum(c, lnum, offs, &cs_sqnum); + if (err) { + ubifs_scan_destroy(sleb); + return ERR_PTR(err); + } + } + if (snod->sqnum > cs_sqnum) { + ubifs_err("unrecoverable log corruption " + "in LEB %d", lnum); + ubifs_scan_destroy(sleb); + return ERR_PTR(-EUCLEAN); + } + } + ubifs_scan_destroy(sleb); + } + return ubifs_recover_leb(c, lnum, offs, sbuf, 0); +} + +/** + * recover_head - recover a head. + * @c: UBIFS file-system description object + * @lnum: LEB number of head to recover + * @offs: offset of head to recover + * @sbuf: LEB-sized buffer to use + * + * This function ensures that there is no data on the flash at a head location. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int recover_head(const struct ubifs_info *c, int lnum, int offs, + void *sbuf) +{ + int len, err, need_clean = 0; + + if (c->min_io_size > 1) + len = c->min_io_size; + else + len = 512; + if (offs + len > c->leb_size) + len = c->leb_size - offs; + + if (!len) + return 0; + + /* Read at the head location and check it is empty flash */ + err = ubi_read(c->ubi, lnum, sbuf, offs, len); + if (err) + need_clean = 1; + else { + uint8_t *p = sbuf; + + while (len--) + if (*p++ != 0xff) { + need_clean = 1; + break; + } + } + + if (need_clean) { + dbg_rcvry("cleaning head at %d:%d", lnum, offs); + if (offs == 0) + return ubifs_leb_unmap(c, lnum); + err = ubi_read(c->ubi, lnum, sbuf, 0, offs); + if (err) + return err; + return ubi_leb_change(c->ubi, lnum, sbuf, offs, UBI_UNKNOWN); + } + + return 0; +} + +/** + * ubifs_recover_inl_heads - recover index and LPT heads. + * @c: UBIFS file-system description object + * @sbuf: LEB-sized buffer to use + * + * This function ensures that there is no data on the flash at the index and + * LPT head locations. + * + * This deals with the recovery of a half-completed journal commit. UBIFS is + * careful never to overwrite the last version of the index or the LPT. Because + * the index and LPT are wandering trees, data from a half-completed commit will + * not be referenced anywhere in UBIFS. The data will be either in LEBs that are + * assumed to be empty and will be unmapped anyway before use, or in the index + * and LPT heads. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf) +{ + int err; + + ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY) || c->remounting_rw); + + dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs); + err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf); + if (err) + return err; + + dbg_rcvry("checking LPT head at %d:%d", c->nhead_lnum, c->nhead_offs); + err = recover_head(c, c->nhead_lnum, c->nhead_offs, sbuf); + if (err) + return err; + + return 0; +} + +/** + * clean_an_unclean_leb - read and write a LEB to remove corruption. + * @c: UBIFS file-system description object + * @ucleb: unclean LEB information + * @sbuf: LEB-sized buffer to use + * + * This function reads a LEB up to a point pre-determined by the mount recovery, + * checks the nodes, and writes the result back to the flash, thereby cleaning + * off any following corruption, or non-fatal ECC errors. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int clean_an_unclean_leb(const struct ubifs_info *c, + struct ubifs_unclean_leb *ucleb, void *sbuf) +{ + int err, lnum = ucleb->lnum, offs = 0, len = ucleb->endpt, quiet = 1; + void *buf = sbuf; + + dbg_rcvry("LEB %d len %d", lnum, len); + + if (len == 0) { + /* Nothing to read, just unmap it */ + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + return 0; + } + + err = ubi_read(c->ubi, lnum, buf, offs, len); + if (err && err != -EBADMSG) + return err; + + while (len >= 8) { + int ret; + + cond_resched(); + + /* Scan quietly until there is an error */ + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet); + + if (ret == SCANNED_A_NODE) { + /* A valid node, and not a padding node */ + struct ubifs_ch *ch = buf; + int node_len; + + node_len = ALIGN(le32_to_cpu(ch->len), 8); + offs += node_len; + buf += node_len; + len -= node_len; + continue; + } + + if (ret > 0) { + /* Padding bytes or a valid padding node */ + offs += ret; + buf += ret; + len -= ret; + continue; + } + + if (ret == SCANNED_EMPTY_SPACE) { + ubifs_err("unexpected empty space at %d:%d", + lnum, offs); + return -EUCLEAN; + } + + if (quiet) { + /* Redo the last scan but noisily */ + quiet = 0; + continue; + } + + ubifs_scanned_corruption(c, lnum, offs, buf); + return -EUCLEAN; + } + + /* Pad to min_io_size */ + len = ALIGN(ucleb->endpt, c->min_io_size); + if (len > ucleb->endpt) { + int pad_len = len - ALIGN(ucleb->endpt, 8); + + if (pad_len > 0) { + buf = c->sbuf + len - pad_len; + ubifs_pad(c, buf, pad_len); + } + } + + /* Write back the LEB atomically */ + err = ubi_leb_change(c->ubi, lnum, sbuf, len, UBI_UNKNOWN); + if (err) + return err; + + dbg_rcvry("cleaned LEB %d", lnum); + + return 0; +} + +/** + * ubifs_clean_lebs - clean LEBs recovered during read-only mount. + * @c: UBIFS file-system description object + * @sbuf: LEB-sized buffer to use + * + * This function cleans a LEB identified during recovery that needs to be + * written but was not because UBIFS was mounted read-only. This happens when + * remounting to read-write mode. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf) +{ + dbg_rcvry("recovery"); + while (!list_empty(&c->unclean_leb_list)) { + struct ubifs_unclean_leb *ucleb; + int err; + + ucleb = list_entry(c->unclean_leb_list.next, + struct ubifs_unclean_leb, list); + err = clean_an_unclean_leb(c, ucleb, sbuf); + if (err) + return err; + list_del(&ucleb->list); + kfree(ucleb); + } + return 0; +} + +/** + * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit. + * @c: UBIFS file-system description object + * + * Out-of-place garbage collection requires always one empty LEB with which to + * start garbage collection. The LEB number is recorded in c->gc_lnum and is + * written to the master node on unmounting. In the case of an unclean unmount + * the value of gc_lnum recorded in the master node is out of date and cannot + * be used. Instead, recovery must allocate an empty LEB for this purpose. + * However, there may not be enough empty space, in which case it must be + * possible to GC the dirtiest LEB into the GC head LEB. + * + * This function also runs the commit which causes the TNC updates from + * size-recovery and orphans to be written to the flash. That is important to + * ensure correct replay order for subsequent mounts. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_rcvry_gc_commit(struct ubifs_info *c) +{ + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; + struct ubifs_lprops lp; + int lnum, err; + + c->gc_lnum = -1; + if (wbuf->lnum == -1) { + dbg_rcvry("no GC head LEB"); + goto find_free; + } + /* + * See whether the used space in the dirtiest LEB fits in the GC head + * LEB. + */ + if (wbuf->offs == c->leb_size) { + dbg_rcvry("no room in GC head LEB"); + goto find_free; + } + err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2); + if (err) { + if (err == -ENOSPC) + dbg_err("could not find a dirty LEB"); + return err; + } + ubifs_assert(!(lp.flags & LPROPS_INDEX)); + lnum = lp.lnum; + if (lp.free + lp.dirty == c->leb_size) { + /* An empty LEB was returned */ + if (lp.free != c->leb_size) { + err = ubifs_change_one_lp(c, lnum, c->leb_size, + 0, 0, 0, 0); + if (err) + return err; + } + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + c->gc_lnum = lnum; + dbg_rcvry("allocated LEB %d for GC", lnum); + /* Run the commit */ + dbg_rcvry("committing"); + return ubifs_run_commit(c); + } + /* + * There was no empty LEB so the used space in the dirtiest LEB must fit + * in the GC head LEB. + */ + if (lp.free + lp.dirty < wbuf->offs) { + dbg_rcvry("LEB %d doesn't fit in GC head LEB %d:%d", + lnum, wbuf->lnum, wbuf->offs); + err = ubifs_return_leb(c, lnum); + if (err) + return err; + goto find_free; + } + /* + * We run the commit before garbage collection otherwise subsequent + * mounts will see the GC and orphan deletion in a different order. + */ + dbg_rcvry("committing"); + err = ubifs_run_commit(c); + if (err) + return err; + /* + * The data in the dirtiest LEB fits in the GC head LEB, so do the GC + * - use locking to keep 'ubifs_assert()' happy. + */ + dbg_rcvry("GC'ing LEB %d", lnum); + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + err = ubifs_garbage_collect_leb(c, &lp); + if (err >= 0) { + int err2 = ubifs_wbuf_sync_nolock(wbuf); + + if (err2) + err = err2; + } + mutex_unlock(&wbuf->io_mutex); + if (err < 0) { + dbg_err("GC failed, error %d", err); + if (err == -EAGAIN) + err = -EINVAL; + return err; + } + if (err != LEB_RETAINED) { + dbg_err("GC returned %d", err); + return -EINVAL; + } + err = ubifs_leb_unmap(c, c->gc_lnum); + if (err) + return err; + dbg_rcvry("allocated LEB %d for GC", lnum); + return 0; + +find_free: + /* + * There is no GC head LEB or the free space in the GC head LEB is too + * small. Allocate gc_lnum by calling 'ubifs_find_free_leb_for_idx()' so + * GC is not run. + */ + lnum = ubifs_find_free_leb_for_idx(c); + if (lnum < 0) { + dbg_err("could not find an empty LEB"); + return lnum; + } + /* And reset the index flag */ + err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, + LPROPS_INDEX, 0); + if (err) + return err; + c->gc_lnum = lnum; + dbg_rcvry("allocated LEB %d for GC", lnum); + /* Run the commit */ + dbg_rcvry("committing"); + return ubifs_run_commit(c); +} + +/** + * struct size_entry - inode size information for recovery. + * @rb: link in the RB-tree of sizes + * @inum: inode number + * @i_size: size on inode + * @d_size: maximum size based on data nodes + * @exists: indicates whether the inode exists + * @inode: inode if pinned in memory awaiting rw mode to fix it + */ +struct size_entry { + struct rb_node rb; + ino_t inum; + loff_t i_size; + loff_t d_size; + int exists; + struct inode *inode; +}; + +/** + * add_ino - add an entry to the size tree. + * @c: UBIFS file-system description object + * @inum: inode number + * @i_size: size on inode + * @d_size: maximum size based on data nodes + * @exists: indicates whether the inode exists + */ +static int add_ino(struct ubifs_info *c, ino_t inum, loff_t i_size, + loff_t d_size, int exists) +{ + struct rb_node **p = &c->size_tree.rb_node, *parent = NULL; + struct size_entry *e; + + while (*p) { + parent = *p; + e = rb_entry(parent, struct size_entry, rb); + if (inum < e->inum) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + e = kzalloc(sizeof(struct size_entry), GFP_KERNEL); + if (!e) + return -ENOMEM; + + e->inum = inum; + e->i_size = i_size; + e->d_size = d_size; + e->exists = exists; + + rb_link_node(&e->rb, parent, p); + rb_insert_color(&e->rb, &c->size_tree); + + return 0; +} + +/** + * find_ino - find an entry on the size tree. + * @c: UBIFS file-system description object + * @inum: inode number + */ +static struct size_entry *find_ino(struct ubifs_info *c, ino_t inum) +{ + struct rb_node *p = c->size_tree.rb_node; + struct size_entry *e; + + while (p) { + e = rb_entry(p, struct size_entry, rb); + if (inum < e->inum) + p = p->rb_left; + else if (inum > e->inum) + p = p->rb_right; + else + return e; + } + return NULL; +} + +/** + * remove_ino - remove an entry from the size tree. + * @c: UBIFS file-system description object + * @inum: inode number + */ +static void remove_ino(struct ubifs_info *c, ino_t inum) +{ + struct size_entry *e = find_ino(c, inum); + + if (!e) + return; + rb_erase(&e->rb, &c->size_tree); + kfree(e); +} + +/** + * ubifs_destroy_size_tree - free resources related to the size tree. + * @c: UBIFS file-system description object + */ +void ubifs_destroy_size_tree(struct ubifs_info *c) +{ + struct rb_node *this = c->size_tree.rb_node; + struct size_entry *e; + + while (this) { + if (this->rb_left) { + this = this->rb_left; + continue; + } else if (this->rb_right) { + this = this->rb_right; + continue; + } + e = rb_entry(this, struct size_entry, rb); + if (e->inode) + iput(e->inode); + this = rb_parent(this); + if (this) { + if (this->rb_left == &e->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(e); + } + c->size_tree = RB_ROOT; +} + +/** + * ubifs_recover_size_accum - accumulate inode sizes for recovery. + * @c: UBIFS file-system description object + * @key: node key + * @deletion: node is for a deletion + * @new_size: inode size + * + * This function has two purposes: + * 1) to ensure there are no data nodes that fall outside the inode size + * 2) to ensure there are no data nodes for inodes that do not exist + * To accomplish those purposes, a rb-tree is constructed containing an entry + * for each inode number in the journal that has not been deleted, and recording + * the size from the inode node, the maximum size of any data node (also altered + * by truncations) and a flag indicating a inode number for which no inode node + * was present in the journal. + * + * Note that there is still the possibility that there are data nodes that have + * been committed that are beyond the inode size, however the only way to find + * them would be to scan the entire index. Alternatively, some provision could + * be made to record the size of inodes at the start of commit, which would seem + * very cumbersome for a scenario that is quite unlikely and the only negative + * consequence of which is wasted space. + * + * This functions returns %0 on success and a negative error code on failure. + */ +int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key, + int deletion, loff_t new_size) +{ + ino_t inum = key_inum(c, key); + struct size_entry *e; + int err; + + switch (key_type(c, key)) { + case UBIFS_INO_KEY: + if (deletion) + remove_ino(c, inum); + else { + e = find_ino(c, inum); + if (e) { + e->i_size = new_size; + e->exists = 1; + } else { + err = add_ino(c, inum, new_size, 0, 1); + if (err) + return err; + } + } + break; + case UBIFS_DATA_KEY: + e = find_ino(c, inum); + if (e) { + if (new_size > e->d_size) + e->d_size = new_size; + } else { + err = add_ino(c, inum, 0, new_size, 0); + if (err) + return err; + } + break; + case UBIFS_TRUN_KEY: + e = find_ino(c, inum); + if (e) + e->d_size = new_size; + break; + } + return 0; +} + +/** + * fix_size_in_place - fix inode size in place on flash. + * @c: UBIFS file-system description object + * @e: inode size information for recovery + */ +static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e) +{ + struct ubifs_ino_node *ino = c->sbuf; + unsigned char *p; + union ubifs_key key; + int err, lnum, offs, len; + loff_t i_size; + uint32_t crc; + + /* Locate the inode node LEB number and offset */ + ino_key_init(c, &key, e->inum); + err = ubifs_tnc_locate(c, &key, ino, &lnum, &offs); + if (err) + goto out; + /* + * If the size recorded on the inode node is greater than the size that + * was calculated from nodes in the journal then don't change the inode. + */ + i_size = le64_to_cpu(ino->size); + if (i_size >= e->d_size) + return 0; + /* Read the LEB */ + err = ubi_read(c->ubi, lnum, c->sbuf, 0, c->leb_size); + if (err) + goto out; + /* Change the size field and recalculate the CRC */ + ino = c->sbuf + offs; + ino->size = cpu_to_le64(e->d_size); + len = le32_to_cpu(ino->ch.len); + crc = crc32(UBIFS_CRC32_INIT, (void *)ino + 8, len - 8); + ino->ch.crc = cpu_to_le32(crc); + /* Work out where data in the LEB ends and free space begins */ + p = c->sbuf; + len = c->leb_size - 1; + while (p[len] == 0xff) + len -= 1; + len = ALIGN(len + 1, c->min_io_size); + /* Atomically write the fixed LEB back again */ + err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN); + if (err) + goto out; + dbg_rcvry("inode %lu at %d:%d size %lld -> %lld ", e->inum, lnum, offs, + i_size, e->d_size); + return 0; + +out: + ubifs_warn("inode %lu failed to fix size %lld -> %lld error %d", + e->inum, e->i_size, e->d_size, err); + return err; +} + +/** + * ubifs_recover_size - recover inode size. + * @c: UBIFS file-system description object + * + * This function attempts to fix inode size discrepancies identified by the + * 'ubifs_recover_size_accum()' function. + * + * This functions returns %0 on success and a negative error code on failure. + */ +int ubifs_recover_size(struct ubifs_info *c) +{ + struct rb_node *this = rb_first(&c->size_tree); + + while (this) { + struct size_entry *e; + int err; + + e = rb_entry(this, struct size_entry, rb); + if (!e->exists) { + union ubifs_key key; + + ino_key_init(c, &key, e->inum); + err = ubifs_tnc_lookup(c, &key, c->sbuf); + if (err && err != -ENOENT) + return err; + if (err == -ENOENT) { + /* Remove data nodes that have no inode */ + dbg_rcvry("removing ino %lu", e->inum); + err = ubifs_tnc_remove_ino(c, e->inum); + if (err) + return err; + } else { + struct ubifs_ino_node *ino = c->sbuf; + + e->exists = 1; + e->i_size = le64_to_cpu(ino->size); + } + } + if (e->exists && e->i_size < e->d_size) { + if (!e->inode && (c->vfs_sb->s_flags & MS_RDONLY)) { + /* Fix the inode size and pin it in memory */ + struct inode *inode; + + inode = ubifs_iget(c->vfs_sb, e->inum); + if (IS_ERR(inode)) + return PTR_ERR(inode); + if (inode->i_size < e->d_size) { + dbg_rcvry("ino %lu size %lld -> %lld", + e->inum, e->d_size, + inode->i_size); + inode->i_size = e->d_size; + ubifs_inode(inode)->ui_size = e->d_size; + e->inode = inode; + this = rb_next(this); + continue; + } + iput(inode); + } else { + /* Fix the size in place */ + err = fix_size_in_place(c, e); + if (err) + return err; + if (e->inode) + iput(e->inode); + } + } + this = rb_next(this); + rb_erase(&e->rb, &c->size_tree); + kfree(e); + } + return 0; +} diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c new file mode 100644 index 000000000000..7399692af859 --- /dev/null +++ b/fs/ubifs/replay.c @@ -0,0 +1,1075 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file contains journal replay code. It runs when the file-system is being + * mounted and requires no locking. + * + * The larger is the journal, the longer it takes to scan it, so the longer it + * takes to mount UBIFS. This is why the journal has limited size which may be + * changed depending on the system requirements. But a larger journal gives + * faster I/O speed because it writes the index less frequently. So this is a + * trade-off. Also, the journal is indexed by the in-memory index (TNC), so the + * larger is the journal, the more memory its index may consume. + */ + +#include "ubifs.h" + +/* + * Replay flags. + * + * REPLAY_DELETION: node was deleted + * REPLAY_REF: node is a reference node + */ +enum { + REPLAY_DELETION = 1, + REPLAY_REF = 2, +}; + +/** + * struct replay_entry - replay tree entry. + * @lnum: logical eraseblock number of the node + * @offs: node offset + * @len: node length + * @sqnum: node sequence number + * @flags: replay flags + * @rb: links the replay tree + * @key: node key + * @nm: directory entry name + * @old_size: truncation old size + * @new_size: truncation new size + * @free: amount of free space in a bud + * @dirty: amount of dirty space in a bud from padding and deletion nodes + * + * UBIFS journal replay must compare node sequence numbers, which means it must + * build a tree of node information to insert into the TNC. + */ +struct replay_entry { + int lnum; + int offs; + int len; + unsigned long long sqnum; + int flags; + struct rb_node rb; + union ubifs_key key; + union { + struct qstr nm; + struct { + loff_t old_size; + loff_t new_size; + }; + struct { + int free; + int dirty; + }; + }; +}; + +/** + * struct bud_entry - entry in the list of buds to replay. + * @list: next bud in the list + * @bud: bud description object + * @free: free bytes in the bud + * @sqnum: reference node sequence number + */ +struct bud_entry { + struct list_head list; + struct ubifs_bud *bud; + int free; + unsigned long long sqnum; +}; + +/** + * set_bud_lprops - set free and dirty space used by a bud. + * @c: UBIFS file-system description object + * @r: replay entry of bud + */ +static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r) +{ + const struct ubifs_lprops *lp; + int err = 0, dirty; + + ubifs_get_lprops(c); + + lp = ubifs_lpt_lookup_dirty(c, r->lnum); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + dirty = lp->dirty; + if (r->offs == 0 && (lp->free != c->leb_size || lp->dirty != 0)) { + /* + * The LEB was added to the journal with a starting offset of + * zero which means the LEB must have been empty. The LEB + * property values should be lp->free == c->leb_size and + * lp->dirty == 0, but that is not the case. The reason is that + * the LEB was garbage collected. The garbage collector resets + * the free and dirty space without recording it anywhere except + * lprops, so if there is not a commit then lprops does not have + * that information next time the file system is mounted. + * + * We do not need to adjust free space because the scan has told + * us the exact value which is recorded in the replay entry as + * r->free. + * + * However we do need to subtract from the dirty space the + * amount of space that the garbage collector reclaimed, which + * is the whole LEB minus the amount of space that was free. + */ + dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum, + lp->free, lp->dirty); + dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum, + lp->free, lp->dirty); + dirty -= c->leb_size - lp->free; + /* + * If the replay order was perfect the dirty space would now be + * zero. The order is not perfect because the the journal heads + * race with eachother. This is not a problem but is does mean + * that the dirty space may temporarily exceed c->leb_size + * during the replay. + */ + if (dirty != 0) + dbg_msg("LEB %d lp: %d free %d dirty " + "replay: %d free %d dirty", r->lnum, lp->free, + lp->dirty, r->free, r->dirty); + } + lp = ubifs_change_lp(c, lp, r->free, dirty + r->dirty, + lp->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } +out: + ubifs_release_lprops(c); + return err; +} + +/** + * trun_remove_range - apply a replay entry for a truncation to the TNC. + * @c: UBIFS file-system description object + * @r: replay entry of truncation + */ +static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r) +{ + unsigned min_blk, max_blk; + union ubifs_key min_key, max_key; + ino_t ino; + + min_blk = r->new_size / UBIFS_BLOCK_SIZE; + if (r->new_size & (UBIFS_BLOCK_SIZE - 1)) + min_blk += 1; + + max_blk = r->old_size / UBIFS_BLOCK_SIZE; + if ((r->old_size & (UBIFS_BLOCK_SIZE - 1)) == 0) + max_blk -= 1; + + ino = key_inum(c, &r->key); + + data_key_init(c, &min_key, ino, min_blk); + data_key_init(c, &max_key, ino, max_blk); + + return ubifs_tnc_remove_range(c, &min_key, &max_key); +} + +/** + * apply_replay_entry - apply a replay entry to the TNC. + * @c: UBIFS file-system description object + * @r: replay entry to apply + * + * Apply a replay entry to the TNC. + */ +static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) +{ + int err, deletion = ((r->flags & REPLAY_DELETION) != 0); + + dbg_mnt("LEB %d:%d len %d flgs %d sqnum %llu %s", r->lnum, + r->offs, r->len, r->flags, r->sqnum, DBGKEY(&r->key)); + + /* Set c->replay_sqnum to help deal with dangling branches. */ + c->replay_sqnum = r->sqnum; + + if (r->flags & REPLAY_REF) + err = set_bud_lprops(c, r); + else if (is_hash_key(c, &r->key)) { + if (deletion) + err = ubifs_tnc_remove_nm(c, &r->key, &r->nm); + else + err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs, + r->len, &r->nm); + } else { + if (deletion) + switch (key_type(c, &r->key)) { + case UBIFS_INO_KEY: + { + ino_t inum = key_inum(c, &r->key); + + err = ubifs_tnc_remove_ino(c, inum); + break; + } + case UBIFS_TRUN_KEY: + err = trun_remove_range(c, r); + break; + default: + err = ubifs_tnc_remove(c, &r->key); + break; + } + else + err = ubifs_tnc_add(c, &r->key, r->lnum, r->offs, + r->len); + if (err) + return err; + + if (c->need_recovery) + err = ubifs_recover_size_accum(c, &r->key, deletion, + r->new_size); + } + + return err; +} + +/** + * destroy_replay_tree - destroy the replay. + * @c: UBIFS file-system description object + * + * Destroy the replay tree. + */ +static void destroy_replay_tree(struct ubifs_info *c) +{ + struct rb_node *this = c->replay_tree.rb_node; + struct replay_entry *r; + + while (this) { + if (this->rb_left) { + this = this->rb_left; + continue; + } else if (this->rb_right) { + this = this->rb_right; + continue; + } + r = rb_entry(this, struct replay_entry, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &r->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + if (is_hash_key(c, &r->key)) + kfree(r->nm.name); + kfree(r); + } + c->replay_tree = RB_ROOT; +} + +/** + * apply_replay_tree - apply the replay tree to the TNC. + * @c: UBIFS file-system description object + * + * Apply the replay tree. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +static int apply_replay_tree(struct ubifs_info *c) +{ + struct rb_node *this = rb_first(&c->replay_tree); + + while (this) { + struct replay_entry *r; + int err; + + cond_resched(); + + r = rb_entry(this, struct replay_entry, rb); + err = apply_replay_entry(c, r); + if (err) + return err; + this = rb_next(this); + } + return 0; +} + +/** + * insert_node - insert a node to the replay tree. + * @c: UBIFS file-system description object + * @lnum: node logical eraseblock number + * @offs: node offset + * @len: node length + * @key: node key + * @sqnum: sequence number + * @deletion: non-zero if this is a deletion + * @used: number of bytes in use in a LEB + * @old_size: truncation old size + * @new_size: truncation new size + * + * This function inserts a scanned non-direntry node to the replay tree. The + * replay tree is an RB-tree containing @struct replay_entry elements which are + * indexed by the sequence number. The replay tree is applied at the very end + * of the replay process. Since the tree is sorted in sequence number order, + * the older modifications are applied first. This function returns zero in + * case of success and a negative error code in case of failure. + */ +static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, + union ubifs_key *key, unsigned long long sqnum, + int deletion, int *used, loff_t old_size, + loff_t new_size) +{ + struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL; + struct replay_entry *r; + + if (key_inum(c, key) >= c->highest_inum) + c->highest_inum = key_inum(c, key); + + dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); + while (*p) { + parent = *p; + r = rb_entry(parent, struct replay_entry, rb); + if (sqnum < r->sqnum) { + p = &(*p)->rb_left; + continue; + } else if (sqnum > r->sqnum) { + p = &(*p)->rb_right; + continue; + } + ubifs_err("duplicate sqnum in replay"); + return -EINVAL; + } + + r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); + if (!r) + return -ENOMEM; + + if (!deletion) + *used += ALIGN(len, 8); + r->lnum = lnum; + r->offs = offs; + r->len = len; + r->sqnum = sqnum; + r->flags = (deletion ? REPLAY_DELETION : 0); + r->old_size = old_size; + r->new_size = new_size; + key_copy(c, key, &r->key); + + rb_link_node(&r->rb, parent, p); + rb_insert_color(&r->rb, &c->replay_tree); + return 0; +} + +/** + * insert_dent - insert a directory entry node into the replay tree. + * @c: UBIFS file-system description object + * @lnum: node logical eraseblock number + * @offs: node offset + * @len: node length + * @key: node key + * @name: directory entry name + * @nlen: directory entry name length + * @sqnum: sequence number + * @deletion: non-zero if this is a deletion + * @used: number of bytes in use in a LEB + * + * This function inserts a scanned directory entry node to the replay tree. + * Returns zero in case of success and a negative error code in case of + * failure. + * + * This function is also used for extended attribute entries because they are + * implemented as directory entry nodes. + */ +static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len, + union ubifs_key *key, const char *name, int nlen, + unsigned long long sqnum, int deletion, int *used) +{ + struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL; + struct replay_entry *r; + char *nbuf; + + if (key_inum(c, key) >= c->highest_inum) + c->highest_inum = key_inum(c, key); + + dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); + while (*p) { + parent = *p; + r = rb_entry(parent, struct replay_entry, rb); + if (sqnum < r->sqnum) { + p = &(*p)->rb_left; + continue; + } + if (sqnum > r->sqnum) { + p = &(*p)->rb_right; + continue; + } + ubifs_err("duplicate sqnum in replay"); + return -EINVAL; + } + + r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); + if (!r) + return -ENOMEM; + nbuf = kmalloc(nlen + 1, GFP_KERNEL); + if (!nbuf) { + kfree(r); + return -ENOMEM; + } + + if (!deletion) + *used += ALIGN(len, 8); + r->lnum = lnum; + r->offs = offs; + r->len = len; + r->sqnum = sqnum; + r->nm.len = nlen; + memcpy(nbuf, name, nlen); + nbuf[nlen] = '\0'; + r->nm.name = nbuf; + r->flags = (deletion ? REPLAY_DELETION : 0); + key_copy(c, key, &r->key); + + ubifs_assert(!*p); + rb_link_node(&r->rb, parent, p); + rb_insert_color(&r->rb, &c->replay_tree); + return 0; +} + +/** + * ubifs_validate_entry - validate directory or extended attribute entry node. + * @c: UBIFS file-system description object + * @dent: the node to validate + * + * This function validates directory or extended attribute entry node @dent. + * Returns zero if the node is all right and a %-EINVAL if not. + */ +int ubifs_validate_entry(struct ubifs_info *c, + const struct ubifs_dent_node *dent) +{ + int key_type = key_type_flash(c, dent->key); + int nlen = le16_to_cpu(dent->nlen); + + if (le32_to_cpu(dent->ch.len) != nlen + UBIFS_DENT_NODE_SZ + 1 || + dent->type >= UBIFS_ITYPES_CNT || + nlen > UBIFS_MAX_NLEN || dent->name[nlen] != 0 || + strnlen(dent->name, nlen) != nlen || + le64_to_cpu(dent->inum) > MAX_INUM) { + ubifs_err("bad %s node", key_type == UBIFS_DENT_KEY ? + "directory entry" : "extended attribute entry"); + return -EINVAL; + } + + if (key_type != UBIFS_DENT_KEY && key_type != UBIFS_XENT_KEY) { + ubifs_err("bad key type %d", key_type); + return -EINVAL; + } + + return 0; +} + +/** + * replay_bud - replay a bud logical eraseblock. + * @c: UBIFS file-system description object + * @lnum: bud logical eraseblock number to replay + * @offs: bud start offset + * @jhead: journal head to which this bud belongs + * @free: amount of free space in the bud is returned here + * @dirty: amount of dirty space from padding and deletion nodes is returned + * here + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, + int *free, int *dirty) +{ + int err = 0, used = 0; + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + struct ubifs_bud *bud; + + dbg_mnt("replay bud LEB %d, head %d", lnum, jhead); + if (c->need_recovery) + sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD); + else + sleb = ubifs_scan(c, lnum, offs, c->sbuf); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + + /* + * The bud does not have to start from offset zero - the beginning of + * the 'lnum' LEB may contain previously committed data. One of the + * things we have to do in replay is to correctly update lprops with + * newer information about this LEB. + * + * At this point lprops thinks that this LEB has 'c->leb_size - offs' + * bytes of free space because it only contain information about + * committed data. + * + * But we know that real amount of free space is 'c->leb_size - + * sleb->endpt', and the space in the 'lnum' LEB between 'offs' and + * 'sleb->endpt' is used by bud data. We have to correctly calculate + * how much of these data are dirty and update lprops with this + * information. + * + * The dirt in that LEB region is comprised of padding nodes, deletion + * nodes, truncation nodes and nodes which are obsoleted by subsequent + * nodes in this LEB. So instead of calculating clean space, we + * calculate used space ('used' variable). + */ + + list_for_each_entry(snod, &sleb->nodes, list) { + int deletion = 0; + + cond_resched(); + + if (snod->sqnum >= SQNUM_WATERMARK) { + ubifs_err("file system's life ended"); + goto out_dump; + } + + if (snod->sqnum > c->max_sqnum) + c->max_sqnum = snod->sqnum; + + switch (snod->type) { + case UBIFS_INO_NODE: + { + struct ubifs_ino_node *ino = snod->node; + loff_t new_size = le64_to_cpu(ino->size); + + if (le32_to_cpu(ino->nlink) == 0) + deletion = 1; + err = insert_node(c, lnum, snod->offs, snod->len, + &snod->key, snod->sqnum, deletion, + &used, 0, new_size); + break; + } + case UBIFS_DATA_NODE: + { + struct ubifs_data_node *dn = snod->node; + loff_t new_size = le32_to_cpu(dn->size) + + key_block(c, &snod->key) * + UBIFS_BLOCK_SIZE; + + err = insert_node(c, lnum, snod->offs, snod->len, + &snod->key, snod->sqnum, deletion, + &used, 0, new_size); + break; + } + case UBIFS_DENT_NODE: + case UBIFS_XENT_NODE: + { + struct ubifs_dent_node *dent = snod->node; + + err = ubifs_validate_entry(c, dent); + if (err) + goto out_dump; + + err = insert_dent(c, lnum, snod->offs, snod->len, + &snod->key, dent->name, + le16_to_cpu(dent->nlen), snod->sqnum, + !le64_to_cpu(dent->inum), &used); + break; + } + case UBIFS_TRUN_NODE: + { + struct ubifs_trun_node *trun = snod->node; + loff_t old_size = le64_to_cpu(trun->old_size); + loff_t new_size = le64_to_cpu(trun->new_size); + union ubifs_key key; + + /* Validate truncation node */ + if (old_size < 0 || old_size > c->max_inode_sz || + new_size < 0 || new_size > c->max_inode_sz || + old_size <= new_size) { + ubifs_err("bad truncation node"); + goto out_dump; + } + + /* + * Create a fake truncation key just to use the same + * functions which expect nodes to have keys. + */ + trun_key_init(c, &key, le32_to_cpu(trun->inum)); + err = insert_node(c, lnum, snod->offs, snod->len, + &key, snod->sqnum, 1, &used, + old_size, new_size); + break; + } + default: + ubifs_err("unexpected node type %d in bud LEB %d:%d", + snod->type, lnum, snod->offs); + err = -EINVAL; + goto out_dump; + } + if (err) + goto out; + } + + bud = ubifs_search_bud(c, lnum); + if (!bud) + BUG(); + + ubifs_assert(sleb->endpt - offs >= used); + ubifs_assert(sleb->endpt % c->min_io_size == 0); + + if (sleb->endpt + c->min_io_size <= c->leb_size && + !(c->vfs_sb->s_flags & MS_RDONLY)) + err = ubifs_wbuf_seek_nolock(&c->jheads[jhead].wbuf, lnum, + sleb->endpt, UBI_SHORTTERM); + + *dirty = sleb->endpt - offs - used; + *free = c->leb_size - sleb->endpt; + +out: + ubifs_scan_destroy(sleb); + return err; + +out_dump: + ubifs_err("bad node is at LEB %d:%d", lnum, snod->offs); + dbg_dump_node(c, snod->node); + ubifs_scan_destroy(sleb); + return -EINVAL; +} + +/** + * insert_ref_node - insert a reference node to the replay tree. + * @c: UBIFS file-system description object + * @lnum: node logical eraseblock number + * @offs: node offset + * @sqnum: sequence number + * @free: amount of free space in bud + * @dirty: amount of dirty space from padding and deletion nodes + * + * This function inserts a reference node to the replay tree and returns zero + * in case of success ort a negative error code in case of failure. + */ +static int insert_ref_node(struct ubifs_info *c, int lnum, int offs, + unsigned long long sqnum, int free, int dirty) +{ + struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL; + struct replay_entry *r; + + dbg_mnt("add ref LEB %d:%d", lnum, offs); + while (*p) { + parent = *p; + r = rb_entry(parent, struct replay_entry, rb); + if (sqnum < r->sqnum) { + p = &(*p)->rb_left; + continue; + } else if (sqnum > r->sqnum) { + p = &(*p)->rb_right; + continue; + } + ubifs_err("duplicate sqnum in replay tree"); + return -EINVAL; + } + + r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); + if (!r) + return -ENOMEM; + + r->lnum = lnum; + r->offs = offs; + r->sqnum = sqnum; + r->flags = REPLAY_REF; + r->free = free; + r->dirty = dirty; + + rb_link_node(&r->rb, parent, p); + rb_insert_color(&r->rb, &c->replay_tree); + return 0; +} + +/** + * replay_buds - replay all buds. + * @c: UBIFS file-system description object + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +static int replay_buds(struct ubifs_info *c) +{ + struct bud_entry *b; + int err, uninitialized_var(free), uninitialized_var(dirty); + + list_for_each_entry(b, &c->replay_buds, list) { + err = replay_bud(c, b->bud->lnum, b->bud->start, b->bud->jhead, + &free, &dirty); + if (err) + return err; + err = insert_ref_node(c, b->bud->lnum, b->bud->start, b->sqnum, + free, dirty); + if (err) + return err; + } + + return 0; +} + +/** + * destroy_bud_list - destroy the list of buds to replay. + * @c: UBIFS file-system description object + */ +static void destroy_bud_list(struct ubifs_info *c) +{ + struct bud_entry *b; + + while (!list_empty(&c->replay_buds)) { + b = list_entry(c->replay_buds.next, struct bud_entry, list); + list_del(&b->list); + kfree(b); + } +} + +/** + * add_replay_bud - add a bud to the list of buds to replay. + * @c: UBIFS file-system description object + * @lnum: bud logical eraseblock number to replay + * @offs: bud start offset + * @jhead: journal head to which this bud belongs + * @sqnum: reference node sequence number + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +static int add_replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, + unsigned long long sqnum) +{ + struct ubifs_bud *bud; + struct bud_entry *b; + + dbg_mnt("add replay bud LEB %d:%d, head %d", lnum, offs, jhead); + + bud = kmalloc(sizeof(struct ubifs_bud), GFP_KERNEL); + if (!bud) + return -ENOMEM; + + b = kmalloc(sizeof(struct bud_entry), GFP_KERNEL); + if (!b) { + kfree(bud); + return -ENOMEM; + } + + bud->lnum = lnum; + bud->start = offs; + bud->jhead = jhead; + ubifs_add_bud(c, bud); + + b->bud = bud; + b->sqnum = sqnum; + list_add_tail(&b->list, &c->replay_buds); + + return 0; +} + +/** + * validate_ref - validate a reference node. + * @c: UBIFS file-system description object + * @ref: the reference node to validate + * @ref_lnum: LEB number of the reference node + * @ref_offs: reference node offset + * + * This function returns %1 if a bud reference already exists for the LEB. %0 is + * returned if the reference node is new, otherwise %-EINVAL is returned if + * validation failed. + */ +static int validate_ref(struct ubifs_info *c, const struct ubifs_ref_node *ref) +{ + struct ubifs_bud *bud; + int lnum = le32_to_cpu(ref->lnum); + unsigned int offs = le32_to_cpu(ref->offs); + unsigned int jhead = le32_to_cpu(ref->jhead); + + /* + * ref->offs may point to the end of LEB when the journal head points + * to the end of LEB and we write reference node for it during commit. + * So this is why we require 'offs > c->leb_size'. + */ + if (jhead >= c->jhead_cnt || lnum >= c->leb_cnt || + lnum < c->main_first || offs > c->leb_size || + offs & (c->min_io_size - 1)) + return -EINVAL; + + /* Make sure we have not already looked at this bud */ + bud = ubifs_search_bud(c, lnum); + if (bud) { + if (bud->jhead == jhead && bud->start <= offs) + return 1; + ubifs_err("bud at LEB %d:%d was already referred", lnum, offs); + return -EINVAL; + } + + return 0; +} + +/** + * replay_log_leb - replay a log logical eraseblock. + * @c: UBIFS file-system description object + * @lnum: log logical eraseblock to replay + * @offs: offset to start replaying from + * @sbuf: scan buffer + * + * This function replays a log LEB and returns zero in case of success, %1 if + * this is the last LEB in the log, and a negative error code in case of + * failure. + */ +static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) +{ + int err; + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + const struct ubifs_cs_node *node; + + dbg_mnt("replay log LEB %d:%d", lnum, offs); + sleb = ubifs_scan(c, lnum, offs, sbuf); + if (IS_ERR(sleb)) { + if (c->need_recovery) + sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + } + + if (sleb->nodes_cnt == 0) { + err = 1; + goto out; + } + + node = sleb->buf; + + snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list); + if (c->cs_sqnum == 0) { + /* + * This is the first log LEB we are looking at, make sure that + * the first node is a commit start node. Also record its + * sequence number so that UBIFS can determine where the log + * ends, because all nodes which were have higher sequence + * numbers. + */ + if (snod->type != UBIFS_CS_NODE) { + dbg_err("first log node at LEB %d:%d is not CS node", + lnum, offs); + goto out_dump; + } + if (le64_to_cpu(node->cmt_no) != c->cmt_no) { + dbg_err("first CS node at LEB %d:%d has wrong " + "commit number %llu expected %llu", + lnum, offs, + (unsigned long long)le64_to_cpu(node->cmt_no), + c->cmt_no); + goto out_dump; + } + + c->cs_sqnum = le64_to_cpu(node->ch.sqnum); + dbg_mnt("commit start sqnum %llu", c->cs_sqnum); + } + + if (snod->sqnum < c->cs_sqnum) { + /* + * This means that we reached end of log and now + * look to the older log data, which was already + * committed but the eraseblock was not erased (UBIFS + * only unmaps it). So this basically means we have to + * exit with "end of log" code. + */ + err = 1; + goto out; + } + + /* Make sure the first node sits at offset zero of the LEB */ + if (snod->offs != 0) { + dbg_err("first node is not at zero offset"); + goto out_dump; + } + + list_for_each_entry(snod, &sleb->nodes, list) { + + cond_resched(); + + if (snod->sqnum >= SQNUM_WATERMARK) { + ubifs_err("file system's life ended"); + goto out_dump; + } + + if (snod->sqnum < c->cs_sqnum) { + dbg_err("bad sqnum %llu, commit sqnum %llu", + snod->sqnum, c->cs_sqnum); + goto out_dump; + } + + if (snod->sqnum > c->max_sqnum) + c->max_sqnum = snod->sqnum; + + switch (snod->type) { + case UBIFS_REF_NODE: { + const struct ubifs_ref_node *ref = snod->node; + + err = validate_ref(c, ref); + if (err == 1) + break; /* Already have this bud */ + if (err) + goto out_dump; + + err = add_replay_bud(c, le32_to_cpu(ref->lnum), + le32_to_cpu(ref->offs), + le32_to_cpu(ref->jhead), + snod->sqnum); + if (err) + goto out; + + break; + } + case UBIFS_CS_NODE: + /* Make sure it sits at the beginning of LEB */ + if (snod->offs != 0) { + ubifs_err("unexpected node in log"); + goto out_dump; + } + break; + default: + ubifs_err("unexpected node in log"); + goto out_dump; + } + } + + if (sleb->endpt || c->lhead_offs >= c->leb_size) { + c->lhead_lnum = lnum; + c->lhead_offs = sleb->endpt; + } + + err = !sleb->endpt; +out: + ubifs_scan_destroy(sleb); + return err; + +out_dump: + ubifs_err("log error detected while replying the log at LEB %d:%d", + lnum, offs + snod->offs); + dbg_dump_node(c, snod->node); + ubifs_scan_destroy(sleb); + return -EINVAL; +} + +/** + * take_ihead - update the status of the index head in lprops to 'taken'. + * @c: UBIFS file-system description object + * + * This function returns the amount of free space in the index head LEB or a + * negative error code. + */ +static int take_ihead(struct ubifs_info *c) +{ + const struct ubifs_lprops *lp; + int err, free; + + ubifs_get_lprops(c); + + lp = ubifs_lpt_lookup_dirty(c, c->ihead_lnum); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + free = lp->free; + + lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, + lp->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + err = free; +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_replay_journal - replay journal. + * @c: UBIFS file-system description object + * + * This function scans the journal, replays and cleans it up. It makes sure all + * memory data structures related to uncommitted journal are built (dirty TNC + * tree, tree of buds, modified lprops, etc). + */ +int ubifs_replay_journal(struct ubifs_info *c) +{ + int err, i, lnum, offs, free; + void *sbuf = NULL; + + BUILD_BUG_ON(UBIFS_TRUN_KEY > 5); + + /* Update the status of the index head in lprops to 'taken' */ + free = take_ihead(c); + if (free < 0) + return free; /* Error code */ + + if (c->ihead_offs != c->leb_size - free) { + ubifs_err("bad index head LEB %d:%d", c->ihead_lnum, + c->ihead_offs); + return -EINVAL; + } + + sbuf = vmalloc(c->leb_size); + if (!sbuf) + return -ENOMEM; + + dbg_mnt("start replaying the journal"); + + c->replaying = 1; + + lnum = c->ltail_lnum = c->lhead_lnum; + offs = c->lhead_offs; + + for (i = 0; i < c->log_lebs; i++, lnum++) { + if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) { + /* + * The log is logically circular, we reached the last + * LEB, switch to the first one. + */ + lnum = UBIFS_LOG_LNUM; + offs = 0; + } + err = replay_log_leb(c, lnum, offs, sbuf); + if (err == 1) + /* We hit the end of the log */ + break; + if (err) + goto out; + offs = 0; + } + + err = replay_buds(c); + if (err) + goto out; + + err = apply_replay_tree(c); + if (err) + goto out; + + ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery); + dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, " + "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum, + c->highest_inum); +out: + destroy_replay_tree(c); + destroy_bud_list(c); + vfree(sbuf); + c->replaying = 0; + return err; +} diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c new file mode 100644 index 000000000000..2bf753b38889 --- /dev/null +++ b/fs/ubifs/sb.c @@ -0,0 +1,629 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS superblock. The superblock is stored at the first + * LEB of the volume and is never changed by UBIFS. Only user-space tools may + * change it. The superblock node mostly contains geometry information. + */ + +#include "ubifs.h" +#include <linux/random.h> + +/* + * Default journal size in logical eraseblocks as a percent of total + * flash size. + */ +#define DEFAULT_JNL_PERCENT 5 + +/* Default maximum journal size in bytes */ +#define DEFAULT_MAX_JNL (32*1024*1024) + +/* Default indexing tree fanout */ +#define DEFAULT_FANOUT 8 + +/* Default number of data journal heads */ +#define DEFAULT_JHEADS_CNT 1 + +/* Default positions of different LEBs in the main area */ +#define DEFAULT_IDX_LEB 0 +#define DEFAULT_DATA_LEB 1 +#define DEFAULT_GC_LEB 2 + +/* Default number of LEB numbers in LPT's save table */ +#define DEFAULT_LSAVE_CNT 256 + +/* Default reserved pool size as a percent of maximum free space */ +#define DEFAULT_RP_PERCENT 5 + +/* The default maximum size of reserved pool in bytes */ +#define DEFAULT_MAX_RP_SIZE (5*1024*1024) + +/* Default time granularity in nanoseconds */ +#define DEFAULT_TIME_GRAN 1000000000 + +/** + * create_default_filesystem - format empty UBI volume. + * @c: UBIFS file-system description object + * + * This function creates default empty file-system. Returns zero in case of + * success and a negative error code in case of failure. + */ +static int create_default_filesystem(struct ubifs_info *c) +{ + struct ubifs_sb_node *sup; + struct ubifs_mst_node *mst; + struct ubifs_idx_node *idx; + struct ubifs_branch *br; + struct ubifs_ino_node *ino; + struct ubifs_cs_node *cs; + union ubifs_key key; + int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first; + int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0; + int min_leb_cnt = UBIFS_MIN_LEB_CNT; + uint64_t tmp64, main_bytes; + + /* Some functions called from here depend on the @c->key_len filed */ + c->key_len = UBIFS_SK_LEN; + + /* + * First of all, we have to calculate default file-system geometry - + * log size, journal size, etc. + */ + if (c->leb_cnt < 0x7FFFFFFF / DEFAULT_JNL_PERCENT) + /* We can first multiply then divide and have no overflow */ + jnl_lebs = c->leb_cnt * DEFAULT_JNL_PERCENT / 100; + else + jnl_lebs = (c->leb_cnt / 100) * DEFAULT_JNL_PERCENT; + + if (jnl_lebs < UBIFS_MIN_JNL_LEBS) + jnl_lebs = UBIFS_MIN_JNL_LEBS; + if (jnl_lebs * c->leb_size > DEFAULT_MAX_JNL) + jnl_lebs = DEFAULT_MAX_JNL / c->leb_size; + + /* + * The log should be large enough to fit reference nodes for all bud + * LEBs. Because buds do not have to start from the beginning of LEBs + * (half of the LEB may contain committed data), the log should + * generally be larger, make it twice as large. + */ + tmp = 2 * (c->ref_node_alsz * jnl_lebs) + c->leb_size - 1; + log_lebs = tmp / c->leb_size; + /* Plus one LEB reserved for commit */ + log_lebs += 1; + if (c->leb_cnt - min_leb_cnt > 8) { + /* And some extra space to allow writes while committing */ + log_lebs += 1; + min_leb_cnt += 1; + } + + max_buds = jnl_lebs - log_lebs; + if (max_buds < UBIFS_MIN_BUD_LEBS) + max_buds = UBIFS_MIN_BUD_LEBS; + + /* + * Orphan nodes are stored in a separate area. One node can store a lot + * of orphan inode numbers, but when new orphan comes we just add a new + * orphan node. At some point the nodes are consolidated into one + * orphan node. + */ + orph_lebs = UBIFS_MIN_ORPH_LEBS; +#ifdef CONFIG_UBIFS_FS_DEBUG + if (c->leb_cnt - min_leb_cnt > 1) + /* + * For debugging purposes it is better to have at least 2 + * orphan LEBs, because the orphan subsystem would need to do + * consolidations and would be stressed more. + */ + orph_lebs += 1; +#endif + + main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS - log_lebs; + main_lebs -= orph_lebs; + + lpt_first = UBIFS_LOG_LNUM + log_lebs; + c->lsave_cnt = DEFAULT_LSAVE_CNT; + c->max_leb_cnt = c->leb_cnt; + err = ubifs_create_dflt_lpt(c, &main_lebs, lpt_first, &lpt_lebs, + &big_lpt); + if (err) + return err; + + dbg_gen("LEB Properties Tree created (LEBs %d-%d)", lpt_first, + lpt_first + lpt_lebs - 1); + + main_first = c->leb_cnt - main_lebs; + + /* Create default superblock */ + tmp = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size); + sup = kzalloc(tmp, GFP_KERNEL); + if (!sup) + return -ENOMEM; + + tmp64 = (uint64_t)max_buds * c->leb_size; + if (big_lpt) + sup_flags |= UBIFS_FLG_BIGLPT; + + sup->ch.node_type = UBIFS_SB_NODE; + sup->key_hash = UBIFS_KEY_HASH_R5; + sup->flags = cpu_to_le32(sup_flags); + sup->min_io_size = cpu_to_le32(c->min_io_size); + sup->leb_size = cpu_to_le32(c->leb_size); + sup->leb_cnt = cpu_to_le32(c->leb_cnt); + sup->max_leb_cnt = cpu_to_le32(c->max_leb_cnt); + sup->max_bud_bytes = cpu_to_le64(tmp64); + sup->log_lebs = cpu_to_le32(log_lebs); + sup->lpt_lebs = cpu_to_le32(lpt_lebs); + sup->orph_lebs = cpu_to_le32(orph_lebs); + sup->jhead_cnt = cpu_to_le32(DEFAULT_JHEADS_CNT); + sup->fanout = cpu_to_le32(DEFAULT_FANOUT); + sup->lsave_cnt = cpu_to_le32(c->lsave_cnt); + sup->fmt_version = cpu_to_le32(UBIFS_FORMAT_VERSION); + sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO); + sup->time_gran = cpu_to_le32(DEFAULT_TIME_GRAN); + + generate_random_uuid(sup->uuid); + + main_bytes = (uint64_t)main_lebs * c->leb_size; + tmp64 = main_bytes * DEFAULT_RP_PERCENT; + do_div(tmp64, 100); + if (tmp64 > DEFAULT_MAX_RP_SIZE) + tmp64 = DEFAULT_MAX_RP_SIZE; + sup->rp_size = cpu_to_le64(tmp64); + + err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM); + kfree(sup); + if (err) + return err; + + dbg_gen("default superblock created at LEB 0:0"); + + /* Create default master node */ + mst = kzalloc(c->mst_node_alsz, GFP_KERNEL); + if (!mst) + return -ENOMEM; + + mst->ch.node_type = UBIFS_MST_NODE; + mst->log_lnum = cpu_to_le32(UBIFS_LOG_LNUM); + mst->highest_inum = cpu_to_le64(UBIFS_FIRST_INO); + mst->cmt_no = 0; + mst->root_lnum = cpu_to_le32(main_first + DEFAULT_IDX_LEB); + mst->root_offs = 0; + tmp = ubifs_idx_node_sz(c, 1); + mst->root_len = cpu_to_le32(tmp); + mst->gc_lnum = cpu_to_le32(main_first + DEFAULT_GC_LEB); + mst->ihead_lnum = cpu_to_le32(main_first + DEFAULT_IDX_LEB); + mst->ihead_offs = cpu_to_le32(ALIGN(tmp, c->min_io_size)); + mst->index_size = cpu_to_le64(ALIGN(tmp, 8)); + mst->lpt_lnum = cpu_to_le32(c->lpt_lnum); + mst->lpt_offs = cpu_to_le32(c->lpt_offs); + mst->nhead_lnum = cpu_to_le32(c->nhead_lnum); + mst->nhead_offs = cpu_to_le32(c->nhead_offs); + mst->ltab_lnum = cpu_to_le32(c->ltab_lnum); + mst->ltab_offs = cpu_to_le32(c->ltab_offs); + mst->lsave_lnum = cpu_to_le32(c->lsave_lnum); + mst->lsave_offs = cpu_to_le32(c->lsave_offs); + mst->lscan_lnum = cpu_to_le32(main_first); + mst->empty_lebs = cpu_to_le32(main_lebs - 2); + mst->idx_lebs = cpu_to_le32(1); + mst->leb_cnt = cpu_to_le32(c->leb_cnt); + + /* Calculate lprops statistics */ + tmp64 = main_bytes; + tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size); + tmp64 -= ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size); + mst->total_free = cpu_to_le64(tmp64); + + tmp64 = ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size); + ino_waste = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size) - + UBIFS_INO_NODE_SZ; + tmp64 += ino_waste; + tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), 8); + mst->total_dirty = cpu_to_le64(tmp64); + + /* The indexing LEB does not contribute to dark space */ + tmp64 = (c->main_lebs - 1) * c->dark_wm; + mst->total_dark = cpu_to_le64(tmp64); + + mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ); + + err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0, + UBI_UNKNOWN); + if (err) { + kfree(mst); + return err; + } + err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1, 0, + UBI_UNKNOWN); + kfree(mst); + if (err) + return err; + + dbg_gen("default master node created at LEB %d:0", UBIFS_MST_LNUM); + + /* Create the root indexing node */ + tmp = ubifs_idx_node_sz(c, 1); + idx = kzalloc(ALIGN(tmp, c->min_io_size), GFP_KERNEL); + if (!idx) + return -ENOMEM; + + c->key_fmt = UBIFS_SIMPLE_KEY_FMT; + c->key_hash = key_r5_hash; + + idx->ch.node_type = UBIFS_IDX_NODE; + idx->child_cnt = cpu_to_le16(1); + ino_key_init(c, &key, UBIFS_ROOT_INO); + br = ubifs_idx_branch(c, idx, 0); + key_write_idx(c, &key, &br->key); + br->lnum = cpu_to_le32(main_first + DEFAULT_DATA_LEB); + br->len = cpu_to_le32(UBIFS_INO_NODE_SZ); + err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0, + UBI_UNKNOWN); + kfree(idx); + if (err) + return err; + + dbg_gen("default root indexing node created LEB %d:0", + main_first + DEFAULT_IDX_LEB); + + /* Create default root inode */ + tmp = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size); + ino = kzalloc(tmp, GFP_KERNEL); + if (!ino) + return -ENOMEM; + + ino_key_init_flash(c, &ino->key, UBIFS_ROOT_INO); + ino->ch.node_type = UBIFS_INO_NODE; + ino->creat_sqnum = cpu_to_le64(++c->max_sqnum); + ino->nlink = cpu_to_le32(2); + tmp = cpu_to_le64(CURRENT_TIME_SEC.tv_sec); + ino->atime_sec = tmp; + ino->ctime_sec = tmp; + ino->mtime_sec = tmp; + ino->atime_nsec = 0; + ino->ctime_nsec = 0; + ino->mtime_nsec = 0; + ino->mode = cpu_to_le32(S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO); + ino->size = cpu_to_le64(UBIFS_INO_NODE_SZ); + + /* Set compression enabled by default */ + ino->flags = cpu_to_le32(UBIFS_COMPR_FL); + + err = ubifs_write_node(c, ino, UBIFS_INO_NODE_SZ, + main_first + DEFAULT_DATA_LEB, 0, + UBI_UNKNOWN); + kfree(ino); + if (err) + return err; + + dbg_gen("root inode created at LEB %d:0", + main_first + DEFAULT_DATA_LEB); + + /* + * The first node in the log has to be the commit start node. This is + * always the case during normal file-system operation. Write a fake + * commit start node to the log. + */ + tmp = ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size); + cs = kzalloc(tmp, GFP_KERNEL); + if (!cs) + return -ENOMEM; + + cs->ch.node_type = UBIFS_CS_NODE; + err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, + 0, UBI_UNKNOWN); + kfree(cs); + + ubifs_msg("default file-system created"); + return 0; +} + +/** + * validate_sb - validate superblock node. + * @c: UBIFS file-system description object + * @sup: superblock node + * + * This function validates superblock node @sup. Since most of data was read + * from the superblock and stored in @c, the function validates fields in @c + * instead. Returns zero in case of success and %-EINVAL in case of validation + * failure. + */ +static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) +{ + long long max_bytes; + int err = 1, min_leb_cnt; + + if (!c->key_hash) { + err = 2; + goto failed; + } + + if (sup->key_fmt != UBIFS_SIMPLE_KEY_FMT) { + err = 3; + goto failed; + } + + if (le32_to_cpu(sup->min_io_size) != c->min_io_size) { + ubifs_err("min. I/O unit mismatch: %d in superblock, %d real", + le32_to_cpu(sup->min_io_size), c->min_io_size); + goto failed; + } + + if (le32_to_cpu(sup->leb_size) != c->leb_size) { + ubifs_err("LEB size mismatch: %d in superblock, %d real", + le32_to_cpu(sup->leb_size), c->leb_size); + goto failed; + } + + if (c->log_lebs < UBIFS_MIN_LOG_LEBS || + c->lpt_lebs < UBIFS_MIN_LPT_LEBS || + c->orph_lebs < UBIFS_MIN_ORPH_LEBS || + c->main_lebs < UBIFS_MIN_MAIN_LEBS) { + err = 4; + goto failed; + } + + /* + * Calculate minimum allowed amount of main area LEBs. This is very + * similar to %UBIFS_MIN_LEB_CNT, but we take into account real what we + * have just read from the superblock. + */ + min_leb_cnt = UBIFS_SB_LEBS + UBIFS_MST_LEBS + c->log_lebs; + min_leb_cnt += c->lpt_lebs + c->orph_lebs + c->jhead_cnt + 6; + + if (c->leb_cnt < min_leb_cnt || c->leb_cnt > c->vi.size) { + ubifs_err("bad LEB count: %d in superblock, %d on UBI volume, " + "%d minimum required", c->leb_cnt, c->vi.size, + min_leb_cnt); + goto failed; + } + + if (c->max_leb_cnt < c->leb_cnt) { + ubifs_err("max. LEB count %d less than LEB count %d", + c->max_leb_cnt, c->leb_cnt); + goto failed; + } + + if (c->main_lebs < UBIFS_MIN_MAIN_LEBS) { + err = 7; + goto failed; + } + + if (c->max_bud_bytes < (long long)c->leb_size * UBIFS_MIN_BUD_LEBS || + c->max_bud_bytes > (long long)c->leb_size * c->main_lebs) { + err = 8; + goto failed; + } + + if (c->jhead_cnt < NONDATA_JHEADS_CNT + 1 || + c->jhead_cnt > NONDATA_JHEADS_CNT + UBIFS_MAX_JHEADS) { + err = 9; + goto failed; + } + + if (c->fanout < UBIFS_MIN_FANOUT || + ubifs_idx_node_sz(c, c->fanout) > c->leb_size) { + err = 10; + goto failed; + } + + if (c->lsave_cnt < 0 || (c->lsave_cnt > DEFAULT_LSAVE_CNT && + c->lsave_cnt > c->max_leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS - + c->log_lebs - c->lpt_lebs - c->orph_lebs)) { + err = 11; + goto failed; + } + + if (UBIFS_SB_LEBS + UBIFS_MST_LEBS + c->log_lebs + c->lpt_lebs + + c->orph_lebs + c->main_lebs != c->leb_cnt) { + err = 12; + goto failed; + } + + if (c->default_compr < 0 || c->default_compr >= UBIFS_COMPR_TYPES_CNT) { + err = 13; + goto failed; + } + + max_bytes = c->main_lebs * (long long)c->leb_size; + if (c->rp_size < 0 || max_bytes < c->rp_size) { + err = 14; + goto failed; + } + + if (le32_to_cpu(sup->time_gran) > 1000000000 || + le32_to_cpu(sup->time_gran) < 1) { + err = 15; + goto failed; + } + + return 0; + +failed: + ubifs_err("bad superblock, error %d", err); + dbg_dump_node(c, sup); + return -EINVAL; +} + +/** + * ubifs_read_sb_node - read superblock node. + * @c: UBIFS file-system description object + * + * This function returns a pointer to the superblock node or a negative error + * code. + */ +struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c) +{ + struct ubifs_sb_node *sup; + int err; + + sup = kmalloc(ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size), GFP_NOFS); + if (!sup) + return ERR_PTR(-ENOMEM); + + err = ubifs_read_node(c, sup, UBIFS_SB_NODE, UBIFS_SB_NODE_SZ, + UBIFS_SB_LNUM, 0); + if (err) { + kfree(sup); + return ERR_PTR(err); + } + + return sup; +} + +/** + * ubifs_write_sb_node - write superblock node. + * @c: UBIFS file-system description object + * @sup: superblock node read with 'ubifs_read_sb_node()' + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup) +{ + int len = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size); + + ubifs_prepare_node(c, sup, UBIFS_SB_NODE_SZ, 1); + return ubifs_leb_change(c, UBIFS_SB_LNUM, sup, len, UBI_LONGTERM); +} + +/** + * ubifs_read_superblock - read superblock. + * @c: UBIFS file-system description object + * + * This function finds, reads and checks the superblock. If an empty UBI volume + * is being mounted, this function creates default superblock. Returns zero in + * case of success, and a negative error code in case of failure. + */ +int ubifs_read_superblock(struct ubifs_info *c) +{ + int err, sup_flags; + struct ubifs_sb_node *sup; + + if (c->empty) { + err = create_default_filesystem(c); + if (err) + return err; + } + + sup = ubifs_read_sb_node(c); + if (IS_ERR(sup)) + return PTR_ERR(sup); + + /* + * The software supports all previous versions but not future versions, + * due to the unavailability of time-travelling equipment. + */ + c->fmt_version = le32_to_cpu(sup->fmt_version); + if (c->fmt_version > UBIFS_FORMAT_VERSION) { + ubifs_err("on-flash format version is %d, but software only " + "supports up to version %d", c->fmt_version, + UBIFS_FORMAT_VERSION); + err = -EINVAL; + goto out; + } + + if (c->fmt_version < 3) { + ubifs_err("on-flash format version %d is not supported", + c->fmt_version); + err = -EINVAL; + goto out; + } + + switch (sup->key_hash) { + case UBIFS_KEY_HASH_R5: + c->key_hash = key_r5_hash; + c->key_hash_type = UBIFS_KEY_HASH_R5; + break; + + case UBIFS_KEY_HASH_TEST: + c->key_hash = key_test_hash; + c->key_hash_type = UBIFS_KEY_HASH_TEST; + break; + }; + + c->key_fmt = sup->key_fmt; + + switch (c->key_fmt) { + case UBIFS_SIMPLE_KEY_FMT: + c->key_len = UBIFS_SK_LEN; + break; + default: + ubifs_err("unsupported key format"); + err = -EINVAL; + goto out; + } + + c->leb_cnt = le32_to_cpu(sup->leb_cnt); + c->max_leb_cnt = le32_to_cpu(sup->max_leb_cnt); + c->max_bud_bytes = le64_to_cpu(sup->max_bud_bytes); + c->log_lebs = le32_to_cpu(sup->log_lebs); + c->lpt_lebs = le32_to_cpu(sup->lpt_lebs); + c->orph_lebs = le32_to_cpu(sup->orph_lebs); + c->jhead_cnt = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT; + c->fanout = le32_to_cpu(sup->fanout); + c->lsave_cnt = le32_to_cpu(sup->lsave_cnt); + c->default_compr = le16_to_cpu(sup->default_compr); + c->rp_size = le64_to_cpu(sup->rp_size); + c->rp_uid = le32_to_cpu(sup->rp_uid); + c->rp_gid = le32_to_cpu(sup->rp_gid); + sup_flags = le32_to_cpu(sup->flags); + + c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran); + + memcpy(&c->uuid, &sup->uuid, 16); + + c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT); + + /* Automatically increase file system size to the maximum size */ + c->old_leb_cnt = c->leb_cnt; + if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) { + c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size); + if (c->vfs_sb->s_flags & MS_RDONLY) + dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs", + c->old_leb_cnt, c->leb_cnt); + else { + dbg_mnt("Auto resizing (sb) from %d LEBs to %d LEBs", + c->old_leb_cnt, c->leb_cnt); + sup->leb_cnt = cpu_to_le32(c->leb_cnt); + err = ubifs_write_sb_node(c, sup); + if (err) + goto out; + c->old_leb_cnt = c->leb_cnt; + } + } + + c->log_bytes = (long long)c->log_lebs * c->leb_size; + c->log_last = UBIFS_LOG_LNUM + c->log_lebs - 1; + c->lpt_first = UBIFS_LOG_LNUM + c->log_lebs; + c->lpt_last = c->lpt_first + c->lpt_lebs - 1; + c->orph_first = c->lpt_last + 1; + c->orph_last = c->orph_first + c->orph_lebs - 1; + c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS; + c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs; + c->main_first = c->leb_cnt - c->main_lebs; + c->report_rp_size = ubifs_reported_space(c, c->rp_size); + + err = validate_sb(c, sup); +out: + kfree(sup); + return err; +} diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c new file mode 100644 index 000000000000..acf5c5fffc60 --- /dev/null +++ b/fs/ubifs/scan.c @@ -0,0 +1,362 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements the scan which is a general-purpose function for + * determining what nodes are in an eraseblock. The scan is used to replay the + * journal, to do garbage collection. for the TNC in-the-gaps method, and by + * debugging functions. + */ + +#include "ubifs.h" + +/** + * scan_padding_bytes - scan for padding bytes. + * @buf: buffer to scan + * @len: length of buffer + * + * This function returns the number of padding bytes on success and + * %SCANNED_GARBAGE on failure. + */ +static int scan_padding_bytes(void *buf, int len) +{ + int pad_len = 0, max_pad_len = min_t(int, UBIFS_PAD_NODE_SZ, len); + uint8_t *p = buf; + + dbg_scan("not a node"); + + while (pad_len < max_pad_len && *p++ == UBIFS_PADDING_BYTE) + pad_len += 1; + + if (!pad_len || (pad_len & 7)) + return SCANNED_GARBAGE; + + dbg_scan("%d padding bytes", pad_len); + + return pad_len; +} + +/** + * ubifs_scan_a_node - scan for a node or padding. + * @c: UBIFS file-system description object + * @buf: buffer to scan + * @len: length of buffer + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * @quiet: print no messages + * + * This function returns a scanning code to indicate what was scanned. + */ +int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, + int offs, int quiet) +{ + struct ubifs_ch *ch = buf; + uint32_t magic; + + magic = le32_to_cpu(ch->magic); + + if (magic == 0xFFFFFFFF) { + dbg_scan("hit empty space"); + return SCANNED_EMPTY_SPACE; + } + + if (magic != UBIFS_NODE_MAGIC) + return scan_padding_bytes(buf, len); + + if (len < UBIFS_CH_SZ) + return SCANNED_GARBAGE; + + dbg_scan("scanning %s", dbg_ntype(ch->node_type)); + + if (ubifs_check_node(c, buf, lnum, offs, quiet)) + return SCANNED_A_CORRUPT_NODE; + + if (ch->node_type == UBIFS_PAD_NODE) { + struct ubifs_pad_node *pad = buf; + int pad_len = le32_to_cpu(pad->pad_len); + int node_len = le32_to_cpu(ch->len); + + /* Validate the padding node */ + if (pad_len < 0 || + offs + node_len + pad_len > c->leb_size) { + if (!quiet) { + ubifs_err("bad pad node at LEB %d:%d", + lnum, offs); + dbg_dump_node(c, pad); + } + return SCANNED_A_BAD_PAD_NODE; + } + + /* Make the node pads to 8-byte boundary */ + if ((node_len + pad_len) & 7) { + if (!quiet) { + dbg_err("bad padding length %d - %d", + offs, offs + node_len + pad_len); + } + return SCANNED_A_BAD_PAD_NODE; + } + + dbg_scan("%d bytes padded, offset now %d", + pad_len, ALIGN(offs + node_len + pad_len, 8)); + + return node_len + pad_len; + } + + return SCANNED_A_NODE; +} + +/** + * ubifs_start_scan - create LEB scanning information at start of scan. + * @c: UBIFS file-system description object + * @lnum: logical eraseblock number + * @offs: offset to start at (usually zero) + * @sbuf: scan buffer (must be c->leb_size) + * + * This function returns %0 on success and a negative error code on failure. + */ +struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, + int offs, void *sbuf) +{ + struct ubifs_scan_leb *sleb; + int err; + + dbg_scan("scan LEB %d:%d", lnum, offs); + + sleb = kzalloc(sizeof(struct ubifs_scan_leb), GFP_NOFS); + if (!sleb) + return ERR_PTR(-ENOMEM); + + sleb->lnum = lnum; + INIT_LIST_HEAD(&sleb->nodes); + sleb->buf = sbuf; + + err = ubi_read(c->ubi, lnum, sbuf + offs, offs, c->leb_size - offs); + if (err && err != -EBADMSG) { + ubifs_err("cannot read %d bytes from LEB %d:%d," + " error %d", c->leb_size - offs, lnum, offs, err); + kfree(sleb); + return ERR_PTR(err); + } + + if (err == -EBADMSG) + sleb->ecc = 1; + + return sleb; +} + +/** + * ubifs_end_scan - update LEB scanning information at end of scan. + * @c: UBIFS file-system description object + * @sleb: scanning information + * @lnum: logical eraseblock number + * @offs: offset to start at (usually zero) + * + * This function returns %0 on success and a negative error code on failure. + */ +void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, + int lnum, int offs) +{ + lnum = lnum; + dbg_scan("stop scanning LEB %d at offset %d", lnum, offs); + ubifs_assert(offs % c->min_io_size == 0); + + sleb->endpt = ALIGN(offs, c->min_io_size); +} + +/** + * ubifs_add_snod - add a scanned node to LEB scanning information. + * @c: UBIFS file-system description object + * @sleb: scanning information + * @buf: buffer containing node + * @offs: offset of node on flash + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, + void *buf, int offs) +{ + struct ubifs_ch *ch = buf; + struct ubifs_ino_node *ino = buf; + struct ubifs_scan_node *snod; + + snod = kzalloc(sizeof(struct ubifs_scan_node), GFP_NOFS); + if (!snod) + return -ENOMEM; + + snod->sqnum = le64_to_cpu(ch->sqnum); + snod->type = ch->node_type; + snod->offs = offs; + snod->len = le32_to_cpu(ch->len); + snod->node = buf; + + switch (ch->node_type) { + case UBIFS_INO_NODE: + case UBIFS_DENT_NODE: + case UBIFS_XENT_NODE: + case UBIFS_DATA_NODE: + case UBIFS_TRUN_NODE: + /* + * The key is in the same place in all keyed + * nodes. + */ + key_read(c, &ino->key, &snod->key); + break; + } + list_add_tail(&snod->list, &sleb->nodes); + sleb->nodes_cnt += 1; + return 0; +} + +/** + * ubifs_scanned_corruption - print information after UBIFS scanned corruption. + * @c: UBIFS file-system description object + * @lnum: LEB number of corruption + * @offs: offset of corruption + * @buf: buffer containing corruption + */ +void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, + void *buf) +{ + int len; + + ubifs_err("corrupted data at LEB %d:%d", lnum, offs); + if (dbg_failure_mode) + return; + len = c->leb_size - offs; + if (len > 4096) + len = 4096; + dbg_err("first %d bytes from LEB %d:%d", len, lnum, offs); + print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 4, buf, len, 1); +} + +/** + * ubifs_scan - scan a logical eraseblock. + * @c: UBIFS file-system description object + * @lnum: logical eraseblock number + * @offs: offset to start at (usually zero) + * @sbuf: scan buffer (must be c->leb_size) + * + * This function scans LEB number @lnum and returns complete information about + * its contents. Returns an error code in case of failure. + */ +struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, + int offs, void *sbuf) +{ + void *buf = sbuf + offs; + int err, len = c->leb_size - offs; + struct ubifs_scan_leb *sleb; + + sleb = ubifs_start_scan(c, lnum, offs, sbuf); + if (IS_ERR(sleb)) + return sleb; + + while (len >= 8) { + struct ubifs_ch *ch = buf; + int node_len, ret; + + dbg_scan("look at LEB %d:%d (%d bytes left)", + lnum, offs, len); + + cond_resched(); + + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0); + + if (ret > 0) { + /* Padding bytes or a valid padding node */ + offs += ret; + buf += ret; + len -= ret; + continue; + } + + if (ret == SCANNED_EMPTY_SPACE) + /* Empty space is checked later */ + break; + + switch (ret) { + case SCANNED_GARBAGE: + dbg_err("garbage"); + goto corrupted; + case SCANNED_A_NODE: + break; + case SCANNED_A_CORRUPT_NODE: + case SCANNED_A_BAD_PAD_NODE: + dbg_err("bad node"); + goto corrupted; + default: + dbg_err("unknown"); + goto corrupted; + } + + err = ubifs_add_snod(c, sleb, buf, offs); + if (err) + goto error; + + node_len = ALIGN(le32_to_cpu(ch->len), 8); + offs += node_len; + buf += node_len; + len -= node_len; + } + + if (offs % c->min_io_size) + goto corrupted; + + ubifs_end_scan(c, sleb, lnum, offs); + + for (; len > 4; offs += 4, buf = buf + 4, len -= 4) + if (*(uint32_t *)buf != 0xffffffff) + break; + for (; len; offs++, buf++, len--) + if (*(uint8_t *)buf != 0xff) { + ubifs_err("corrupt empty space at LEB %d:%d", + lnum, offs); + goto corrupted; + } + + return sleb; + +corrupted: + ubifs_scanned_corruption(c, lnum, offs, buf); + err = -EUCLEAN; +error: + ubifs_err("LEB %d scanning failed", lnum); + ubifs_scan_destroy(sleb); + return ERR_PTR(err); +} + +/** + * ubifs_scan_destroy - destroy LEB scanning information. + * @sleb: scanning information to free + */ +void ubifs_scan_destroy(struct ubifs_scan_leb *sleb) +{ + struct ubifs_scan_node *node; + struct list_head *head; + + head = &sleb->nodes; + while (!list_empty(head)) { + node = list_entry(head->next, struct ubifs_scan_node, list); + list_del(&node->list); + kfree(node); + } + kfree(sleb); +} diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c new file mode 100644 index 000000000000..f248533841a2 --- /dev/null +++ b/fs/ubifs/shrinker.c @@ -0,0 +1,322 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS shrinker which evicts clean znodes from the TNC + * tree when Linux VM needs more RAM. + * + * We do not implement any LRU lists to find oldest znodes to free because it + * would add additional overhead to the file system fast paths. So the shrinker + * just walks the TNC tree when searching for znodes to free. + * + * If the root of a TNC sub-tree is clean and old enough, then the children are + * also clean and old enough. So the shrinker walks the TNC in level order and + * dumps entire sub-trees. + * + * The age of znodes is just the time-stamp when they were last looked at. + * The current shrinker first tries to evict old znodes, then young ones. + * + * Since the shrinker is global, it has to protect against races with FS + * un-mounts, which is done by the 'ubifs_infos_lock' and 'c->umount_mutex'. + */ + +#include "ubifs.h" + +/* List of all UBIFS file-system instances */ +LIST_HEAD(ubifs_infos); + +/* + * We number each shrinker run and record the number on the ubifs_info structure + * so that we can easily work out which ubifs_info structures have already been + * done by the current run. + */ +static unsigned int shrinker_run_no; + +/* Protects 'ubifs_infos' list */ +DEFINE_SPINLOCK(ubifs_infos_lock); + +/* Global clean znode counter (for all mounted UBIFS instances) */ +atomic_long_t ubifs_clean_zn_cnt; + +/** + * shrink_tnc - shrink TNC tree. + * @c: UBIFS file-system description object + * @nr: number of znodes to free + * @age: the age of znodes to free + * @contention: if any contention, this is set to %1 + * + * This function traverses TNC tree and frees clean znodes. It does not free + * clean znodes which younger then @age. Returns number of freed znodes. + */ +static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention) +{ + int total_freed = 0; + struct ubifs_znode *znode, *zprev; + int time = get_seconds(); + + ubifs_assert(mutex_is_locked(&c->umount_mutex)); + ubifs_assert(mutex_is_locked(&c->tnc_mutex)); + + if (!c->zroot.znode || atomic_long_read(&c->clean_zn_cnt) == 0) + return 0; + + /* + * Traverse the TNC tree in levelorder manner, so that it is possible + * to destroy large sub-trees. Indeed, if a znode is old, then all its + * children are older or of the same age. + * + * Note, we are holding 'c->tnc_mutex', so we do not have to lock the + * 'c->space_lock' when _reading_ 'c->clean_zn_cnt', because it is + * changed only when the 'c->tnc_mutex' is held. + */ + zprev = NULL; + znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); + while (znode && total_freed < nr && + atomic_long_read(&c->clean_zn_cnt) > 0) { + int freed; + + /* + * If the znode is clean, but it is in the 'c->cnext' list, this + * means that this znode has just been written to flash as a + * part of commit and was marked clean. They will be removed + * from the list at end commit. We cannot change the list, + * because it is not protected by any mutex (design decision to + * make commit really independent and parallel to main I/O). So + * we just skip these znodes. + * + * Note, the 'clean_zn_cnt' counters are not updated until + * after the commit, so the UBIFS shrinker does not report + * the znodes which are in the 'c->cnext' list as freeable. + * + * Also note, if the root of a sub-tree is not in 'c->cnext', + * then the whole sub-tree is not in 'c->cnext' as well, so it + * is safe to dump whole sub-tree. + */ + + if (znode->cnext) { + /* + * Very soon these znodes will be removed from the list + * and become freeable. + */ + *contention = 1; + } else if (!ubifs_zn_dirty(znode) && + abs(time - znode->time) >= age) { + if (znode->parent) + znode->parent->zbranch[znode->iip].znode = NULL; + else + c->zroot.znode = NULL; + + freed = ubifs_destroy_tnc_subtree(znode); + atomic_long_sub(freed, &ubifs_clean_zn_cnt); + atomic_long_sub(freed, &c->clean_zn_cnt); + ubifs_assert(atomic_long_read(&c->clean_zn_cnt) >= 0); + total_freed += freed; + znode = zprev; + } + + if (unlikely(!c->zroot.znode)) + break; + + zprev = znode; + znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); + cond_resched(); + } + + return total_freed; +} + +/** + * shrink_tnc_trees - shrink UBIFS TNC trees. + * @nr: number of znodes to free + * @age: the age of znodes to free + * @contention: if any contention, this is set to %1 + * + * This function walks the list of mounted UBIFS file-systems and frees clean + * znodes which are older then @age, until at least @nr znodes are freed. + * Returns the number of freed znodes. + */ +static int shrink_tnc_trees(int nr, int age, int *contention) +{ + struct ubifs_info *c; + struct list_head *p; + unsigned int run_no; + int freed = 0; + + spin_lock(&ubifs_infos_lock); + do { + run_no = ++shrinker_run_no; + } while (run_no == 0); + /* Iterate over all mounted UBIFS file-systems and try to shrink them */ + p = ubifs_infos.next; + while (p != &ubifs_infos) { + c = list_entry(p, struct ubifs_info, infos_list); + /* + * We move the ones we do to the end of the list, so we stop + * when we see one we have already done. + */ + if (c->shrinker_run_no == run_no) + break; + if (!mutex_trylock(&c->umount_mutex)) { + /* Some un-mount is in progress, try next FS */ + *contention = 1; + p = p->next; + continue; + } + /* + * We're holding 'c->umount_mutex', so the file-system won't go + * away. + */ + if (!mutex_trylock(&c->tnc_mutex)) { + mutex_unlock(&c->umount_mutex); + *contention = 1; + p = p->next; + continue; + } + spin_unlock(&ubifs_infos_lock); + /* + * OK, now we have TNC locked, the file-system cannot go away - + * it is safe to reap the cache. + */ + c->shrinker_run_no = run_no; + freed += shrink_tnc(c, nr, age, contention); + mutex_unlock(&c->tnc_mutex); + spin_lock(&ubifs_infos_lock); + /* Get the next list element before we move this one */ + p = p->next; + /* + * Move this one to the end of the list to provide some + * fairness. + */ + list_del(&c->infos_list); + list_add_tail(&c->infos_list, &ubifs_infos); + mutex_unlock(&c->umount_mutex); + if (freed >= nr) + break; + } + spin_unlock(&ubifs_infos_lock); + return freed; +} + +/** + * kick_a_thread - kick a background thread to start commit. + * + * This function kicks a background thread to start background commit. Returns + * %-1 if a thread was kicked or there is another reason to assume the memory + * will soon be freed or become freeable. If there are no dirty znodes, returns + * %0. + */ +static int kick_a_thread(void) +{ + int i; + struct ubifs_info *c; + + /* + * Iterate over all mounted UBIFS file-systems and find out if there is + * already an ongoing commit operation there. If no, then iterate for + * the second time and initiate background commit. + */ + spin_lock(&ubifs_infos_lock); + for (i = 0; i < 2; i++) { + list_for_each_entry(c, &ubifs_infos, infos_list) { + long dirty_zn_cnt; + + if (!mutex_trylock(&c->umount_mutex)) { + /* + * Some un-mount is in progress, it will + * certainly free memory, so just return. + */ + spin_unlock(&ubifs_infos_lock); + return -1; + } + + dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt); + + if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN || + c->ro_media) { + mutex_unlock(&c->umount_mutex); + continue; + } + + if (c->cmt_state != COMMIT_RESTING) { + spin_unlock(&ubifs_infos_lock); + mutex_unlock(&c->umount_mutex); + return -1; + } + + if (i == 1) { + list_del(&c->infos_list); + list_add_tail(&c->infos_list, &ubifs_infos); + spin_unlock(&ubifs_infos_lock); + + ubifs_request_bg_commit(c); + mutex_unlock(&c->umount_mutex); + return -1; + } + mutex_unlock(&c->umount_mutex); + } + } + spin_unlock(&ubifs_infos_lock); + + return 0; +} + +int ubifs_shrinker(int nr, gfp_t gfp_mask) +{ + int freed, contention = 0; + long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); + + if (nr == 0) + return clean_zn_cnt; + + if (!clean_zn_cnt) { + /* + * No clean znodes, nothing to reap. All we can do in this case + * is to kick background threads to start commit, which will + * probably make clean znodes which, in turn, will be freeable. + * And we return -1 which means will make VM call us again + * later. + */ + dbg_tnc("no clean znodes, kick a thread"); + return kick_a_thread(); + } + + freed = shrink_tnc_trees(nr, OLD_ZNODE_AGE, &contention); + if (freed >= nr) + goto out; + + dbg_tnc("not enough old znodes, try to free young ones"); + freed += shrink_tnc_trees(nr - freed, YOUNG_ZNODE_AGE, &contention); + if (freed >= nr) + goto out; + + dbg_tnc("not enough young znodes, free all"); + freed += shrink_tnc_trees(nr - freed, 0, &contention); + + if (!freed && contention) { + dbg_tnc("freed nothing, but contention"); + return -1; + } + +out: + dbg_tnc("%d znodes were freed, requested %d", freed, nr); + return freed; +} diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c new file mode 100644 index 000000000000..9a9220333b3b --- /dev/null +++ b/fs/ubifs/super.c @@ -0,0 +1,1968 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS initialization and VFS superblock operations. Some + * initialization stuff which is rather large and complex is placed at + * corresponding subsystems, but most of it is here. + */ + +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/kthread.h> +#include <linux/parser.h> +#include <linux/seq_file.h> +#include <linux/mount.h> +#include "ubifs.h" + +/* Slab cache for UBIFS inodes */ +struct kmem_cache *ubifs_inode_slab; + +/* UBIFS TNC shrinker description */ +static struct shrinker ubifs_shrinker_info = { + .shrink = ubifs_shrinker, + .seeks = DEFAULT_SEEKS, +}; + +/** + * validate_inode - validate inode. + * @c: UBIFS file-system description object + * @inode: the inode to validate + * + * This is a helper function for 'ubifs_iget()' which validates various fields + * of a newly built inode to make sure they contain sane values and prevent + * possible vulnerabilities. Returns zero if the inode is all right and + * a non-zero error code if not. + */ +static int validate_inode(struct ubifs_info *c, const struct inode *inode) +{ + int err; + const struct ubifs_inode *ui = ubifs_inode(inode); + + if (inode->i_size > c->max_inode_sz) { + ubifs_err("inode is too large (%lld)", + (long long)inode->i_size); + return 1; + } + + if (ui->compr_type < 0 || ui->compr_type >= UBIFS_COMPR_TYPES_CNT) { + ubifs_err("unknown compression type %d", ui->compr_type); + return 2; + } + + if (ui->xattr_names + ui->xattr_cnt > XATTR_LIST_MAX) + return 3; + + if (ui->data_len < 0 || ui->data_len > UBIFS_MAX_INO_DATA) + return 4; + + if (ui->xattr && (inode->i_mode & S_IFMT) != S_IFREG) + return 5; + + if (!ubifs_compr_present(ui->compr_type)) { + ubifs_warn("inode %lu uses '%s' compression, but it was not " + "compiled in", inode->i_ino, + ubifs_compr_name(ui->compr_type)); + } + + err = dbg_check_dir_size(c, inode); + return err; +} + +struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) +{ + int err; + union ubifs_key key; + struct ubifs_ino_node *ino; + struct ubifs_info *c = sb->s_fs_info; + struct inode *inode; + struct ubifs_inode *ui; + + dbg_gen("inode %lu", inum); + + inode = iget_locked(sb, inum); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + ui = ubifs_inode(inode); + + ino = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS); + if (!ino) { + err = -ENOMEM; + goto out; + } + + ino_key_init(c, &key, inode->i_ino); + + err = ubifs_tnc_lookup(c, &key, ino); + if (err) + goto out_ino; + + inode->i_flags |= (S_NOCMTIME | S_NOATIME); + inode->i_nlink = le32_to_cpu(ino->nlink); + inode->i_uid = le32_to_cpu(ino->uid); + inode->i_gid = le32_to_cpu(ino->gid); + inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec); + inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec); + inode->i_mtime.tv_sec = (int64_t)le64_to_cpu(ino->mtime_sec); + inode->i_mtime.tv_nsec = le32_to_cpu(ino->mtime_nsec); + inode->i_ctime.tv_sec = (int64_t)le64_to_cpu(ino->ctime_sec); + inode->i_ctime.tv_nsec = le32_to_cpu(ino->ctime_nsec); + inode->i_mode = le32_to_cpu(ino->mode); + inode->i_size = le64_to_cpu(ino->size); + + ui->data_len = le32_to_cpu(ino->data_len); + ui->flags = le32_to_cpu(ino->flags); + ui->compr_type = le16_to_cpu(ino->compr_type); + ui->creat_sqnum = le64_to_cpu(ino->creat_sqnum); + ui->xattr_cnt = le32_to_cpu(ino->xattr_cnt); + ui->xattr_size = le32_to_cpu(ino->xattr_size); + ui->xattr_names = le32_to_cpu(ino->xattr_names); + ui->synced_i_size = ui->ui_size = inode->i_size; + + ui->xattr = (ui->flags & UBIFS_XATTR_FL) ? 1 : 0; + + err = validate_inode(c, inode); + if (err) + goto out_invalid; + + /* Disable read-ahead */ + inode->i_mapping->backing_dev_info = &c->bdi; + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_mapping->a_ops = &ubifs_file_address_operations; + inode->i_op = &ubifs_file_inode_operations; + inode->i_fop = &ubifs_file_operations; + if (ui->xattr) { + ui->data = kmalloc(ui->data_len + 1, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_ino; + } + memcpy(ui->data, ino->data, ui->data_len); + ((char *)ui->data)[ui->data_len] = '\0'; + } else if (ui->data_len != 0) { + err = 10; + goto out_invalid; + } + break; + case S_IFDIR: + inode->i_op = &ubifs_dir_inode_operations; + inode->i_fop = &ubifs_dir_operations; + if (ui->data_len != 0) { + err = 11; + goto out_invalid; + } + break; + case S_IFLNK: + inode->i_op = &ubifs_symlink_inode_operations; + if (ui->data_len <= 0 || ui->data_len > UBIFS_MAX_INO_DATA) { + err = 12; + goto out_invalid; + } + ui->data = kmalloc(ui->data_len + 1, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_ino; + } + memcpy(ui->data, ino->data, ui->data_len); + ((char *)ui->data)[ui->data_len] = '\0'; + break; + case S_IFBLK: + case S_IFCHR: + { + dev_t rdev; + union ubifs_dev_desc *dev; + + ui->data = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_ino; + } + + dev = (union ubifs_dev_desc *)ino->data; + if (ui->data_len == sizeof(dev->new)) + rdev = new_decode_dev(le32_to_cpu(dev->new)); + else if (ui->data_len == sizeof(dev->huge)) + rdev = huge_decode_dev(le64_to_cpu(dev->huge)); + else { + err = 13; + goto out_invalid; + } + memcpy(ui->data, ino->data, ui->data_len); + inode->i_op = &ubifs_file_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + break; + } + case S_IFSOCK: + case S_IFIFO: + inode->i_op = &ubifs_file_inode_operations; + init_special_inode(inode, inode->i_mode, 0); + if (ui->data_len != 0) { + err = 14; + goto out_invalid; + } + break; + default: + err = 15; + goto out_invalid; + } + + kfree(ino); + ubifs_set_inode_flags(inode); + unlock_new_inode(inode); + return inode; + +out_invalid: + ubifs_err("inode %lu validation failed, error %d", inode->i_ino, err); + dbg_dump_node(c, ino); + dbg_dump_inode(c, inode); + err = -EINVAL; +out_ino: + kfree(ino); +out: + ubifs_err("failed to read inode %lu, error %d", inode->i_ino, err); + iget_failed(inode); + return ERR_PTR(err); +} + +static struct inode *ubifs_alloc_inode(struct super_block *sb) +{ + struct ubifs_inode *ui; + + ui = kmem_cache_alloc(ubifs_inode_slab, GFP_NOFS); + if (!ui) + return NULL; + + memset((void *)ui + sizeof(struct inode), 0, + sizeof(struct ubifs_inode) - sizeof(struct inode)); + mutex_init(&ui->ui_mutex); + spin_lock_init(&ui->ui_lock); + return &ui->vfs_inode; +}; + +static void ubifs_destroy_inode(struct inode *inode) +{ + struct ubifs_inode *ui = ubifs_inode(inode); + + kfree(ui->data); + kmem_cache_free(ubifs_inode_slab, inode); +} + +/* + * Note, Linux write-back code calls this without 'i_mutex'. + */ +static int ubifs_write_inode(struct inode *inode, int wait) +{ + int err = 0; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + + ubifs_assert(!ui->xattr); + if (is_bad_inode(inode)) + return 0; + + mutex_lock(&ui->ui_mutex); + /* + * Due to races between write-back forced by budgeting + * (see 'sync_some_inodes()') and pdflush write-back, the inode may + * have already been synchronized, do not do this again. This might + * also happen if it was synchronized in an VFS operation, e.g. + * 'ubifs_link()'. + */ + if (!ui->dirty) { + mutex_unlock(&ui->ui_mutex); + return 0; + } + + /* + * As an optimization, do not write orphan inodes to the media just + * because this is not needed. + */ + dbg_gen("inode %lu, mode %#x, nlink %u", + inode->i_ino, (int)inode->i_mode, inode->i_nlink); + if (inode->i_nlink) { + err = ubifs_jnl_write_inode(c, inode); + if (err) + ubifs_err("can't write inode %lu, error %d", + inode->i_ino, err); + } + + ui->dirty = 0; + mutex_unlock(&ui->ui_mutex); + ubifs_release_dirty_inode_budget(c, ui); + return err; +} + +static void ubifs_delete_inode(struct inode *inode) +{ + int err; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + + if (ui->xattr) + /* + * Extended attribute inode deletions are fully handled in + * 'ubifs_removexattr()'. These inodes are special and have + * limited usage, so there is nothing to do here. + */ + goto out; + + dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); + ubifs_assert(!atomic_read(&inode->i_count)); + ubifs_assert(inode->i_nlink == 0); + + truncate_inode_pages(&inode->i_data, 0); + if (is_bad_inode(inode)) + goto out; + + ui->ui_size = inode->i_size = 0; + err = ubifs_jnl_delete_inode(c, inode); + if (err) + /* + * Worst case we have a lost orphan inode wasting space, so a + * simple error message is OK here. + */ + ubifs_err("can't delete inode %lu, error %d", + inode->i_ino, err); + +out: + if (ui->dirty) + ubifs_release_dirty_inode_budget(c, ui); + clear_inode(inode); +} + +static void ubifs_dirty_inode(struct inode *inode) +{ + struct ubifs_inode *ui = ubifs_inode(inode); + + ubifs_assert(mutex_is_locked(&ui->ui_mutex)); + if (!ui->dirty) { + ui->dirty = 1; + dbg_gen("inode %lu", inode->i_ino); + } +} + +static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct ubifs_info *c = dentry->d_sb->s_fs_info; + unsigned long long free; + __le32 *uuid = (__le32 *)c->uuid; + + free = ubifs_get_free_space(c); + dbg_gen("free space %lld bytes (%lld blocks)", + free, free >> UBIFS_BLOCK_SHIFT); + + buf->f_type = UBIFS_SUPER_MAGIC; + buf->f_bsize = UBIFS_BLOCK_SIZE; + buf->f_blocks = c->block_cnt; + buf->f_bfree = free >> UBIFS_BLOCK_SHIFT; + if (free > c->report_rp_size) + buf->f_bavail = (free - c->report_rp_size) >> UBIFS_BLOCK_SHIFT; + else + buf->f_bavail = 0; + buf->f_files = 0; + buf->f_ffree = 0; + buf->f_namelen = UBIFS_MAX_NLEN; + buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]); + buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]); + return 0; +} + +static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt) +{ + struct ubifs_info *c = mnt->mnt_sb->s_fs_info; + + if (c->mount_opts.unmount_mode == 2) + seq_printf(s, ",fast_unmount"); + else if (c->mount_opts.unmount_mode == 1) + seq_printf(s, ",norm_unmount"); + + return 0; +} + +static int ubifs_sync_fs(struct super_block *sb, int wait) +{ + struct ubifs_info *c = sb->s_fs_info; + int i, ret = 0, err; + + if (c->jheads) + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err && !ret) + ret = err; + } + /* + * We ought to call sync for c->ubi but it does not have one. If it had + * it would in turn call mtd->sync, however mtd operations are + * synchronous anyway, so we don't lose any sleep here. + */ + return ret; +} + +/** + * init_constants_early - initialize UBIFS constants. + * @c: UBIFS file-system description object + * + * This function initialize UBIFS constants which do not need the superblock to + * be read. It also checks that the UBI volume satisfies basic UBIFS + * requirements. Returns zero in case of success and a negative error code in + * case of failure. + */ +static int init_constants_early(struct ubifs_info *c) +{ + if (c->vi.corrupted) { + ubifs_warn("UBI volume is corrupted - read-only mode"); + c->ro_media = 1; + } + + if (c->di.ro_mode) { + ubifs_msg("read-only UBI device"); + c->ro_media = 1; + } + + if (c->vi.vol_type == UBI_STATIC_VOLUME) { + ubifs_msg("static UBI volume - read-only mode"); + c->ro_media = 1; + } + + c->leb_cnt = c->vi.size; + c->leb_size = c->vi.usable_leb_size; + c->half_leb_size = c->leb_size / 2; + c->min_io_size = c->di.min_io_size; + c->min_io_shift = fls(c->min_io_size) - 1; + + if (c->leb_size < UBIFS_MIN_LEB_SZ) { + ubifs_err("too small LEBs (%d bytes), min. is %d bytes", + c->leb_size, UBIFS_MIN_LEB_SZ); + return -EINVAL; + } + + if (c->leb_cnt < UBIFS_MIN_LEB_CNT) { + ubifs_err("too few LEBs (%d), min. is %d", + c->leb_cnt, UBIFS_MIN_LEB_CNT); + return -EINVAL; + } + + if (!is_power_of_2(c->min_io_size)) { + ubifs_err("bad min. I/O size %d", c->min_io_size); + return -EINVAL; + } + + /* + * UBIFS aligns all node to 8-byte boundary, so to make function in + * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is + * less than 8. + */ + if (c->min_io_size < 8) { + c->min_io_size = 8; + c->min_io_shift = 3; + } + + c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size); + c->mst_node_alsz = ALIGN(UBIFS_MST_NODE_SZ, c->min_io_size); + + /* + * Initialize node length ranges which are mostly needed for node + * length validation. + */ + c->ranges[UBIFS_PAD_NODE].len = UBIFS_PAD_NODE_SZ; + c->ranges[UBIFS_SB_NODE].len = UBIFS_SB_NODE_SZ; + c->ranges[UBIFS_MST_NODE].len = UBIFS_MST_NODE_SZ; + c->ranges[UBIFS_REF_NODE].len = UBIFS_REF_NODE_SZ; + c->ranges[UBIFS_TRUN_NODE].len = UBIFS_TRUN_NODE_SZ; + c->ranges[UBIFS_CS_NODE].len = UBIFS_CS_NODE_SZ; + + c->ranges[UBIFS_INO_NODE].min_len = UBIFS_INO_NODE_SZ; + c->ranges[UBIFS_INO_NODE].max_len = UBIFS_MAX_INO_NODE_SZ; + c->ranges[UBIFS_ORPH_NODE].min_len = + UBIFS_ORPH_NODE_SZ + sizeof(__le64); + c->ranges[UBIFS_ORPH_NODE].max_len = c->leb_size; + c->ranges[UBIFS_DENT_NODE].min_len = UBIFS_DENT_NODE_SZ; + c->ranges[UBIFS_DENT_NODE].max_len = UBIFS_MAX_DENT_NODE_SZ; + c->ranges[UBIFS_XENT_NODE].min_len = UBIFS_XENT_NODE_SZ; + c->ranges[UBIFS_XENT_NODE].max_len = UBIFS_MAX_XENT_NODE_SZ; + c->ranges[UBIFS_DATA_NODE].min_len = UBIFS_DATA_NODE_SZ; + c->ranges[UBIFS_DATA_NODE].max_len = UBIFS_MAX_DATA_NODE_SZ; + /* + * Minimum indexing node size is amended later when superblock is + * read and the key length is known. + */ + c->ranges[UBIFS_IDX_NODE].min_len = UBIFS_IDX_NODE_SZ + UBIFS_BRANCH_SZ; + /* + * Maximum indexing node size is amended later when superblock is + * read and the fanout is known. + */ + c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX; + + /* + * Initialize dead and dark LEB space watermarks. + * + * Dead space is the space which cannot be used. Its watermark is + * equivalent to min. I/O unit or minimum node size if it is greater + * then min. I/O unit. + * + * Dark space is the space which might be used, or might not, depending + * on which node should be written to the LEB. Its watermark is + * equivalent to maximum UBIFS node size. + */ + c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size); + c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size); + + /* + * Calculate how many bytes would be wasted at the end of LEB if it was + * fully filled with data nodes of maximum size. This is used in + * calculations when reporting free space. + */ + c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ; + return 0; +} + +/** + * bud_wbuf_callback - bud LEB write-buffer synchronization call-back. + * @c: UBIFS file-system description object + * @lnum: LEB the write-buffer was synchronized to + * @free: how many free bytes left in this LEB + * @pad: how many bytes were padded + * + * This is a callback function which is called by the I/O unit when the + * write-buffer is synchronized. We need this to correctly maintain space + * accounting in bud logical eraseblocks. This function returns zero in case of + * success and a negative error code in case of failure. + * + * This function actually belongs to the journal, but we keep it here because + * we want to keep it static. + */ +static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad) +{ + return ubifs_update_one_lp(c, lnum, free, pad, 0, 0); +} + +/* + * init_constants_late - initialize UBIFS constants. + * @c: UBIFS file-system description object + * + * This is a helper function which initializes various UBIFS constants after + * the superblock has been read. It also checks various UBIFS parameters and + * makes sure they are all right. Returns zero in case of success and a + * negative error code in case of failure. + */ +static int init_constants_late(struct ubifs_info *c) +{ + int tmp, err; + uint64_t tmp64; + + c->main_bytes = (long long)c->main_lebs * c->leb_size; + c->max_znode_sz = sizeof(struct ubifs_znode) + + c->fanout * sizeof(struct ubifs_zbranch); + + tmp = ubifs_idx_node_sz(c, 1); + c->ranges[UBIFS_IDX_NODE].min_len = tmp; + c->min_idx_node_sz = ALIGN(tmp, 8); + + tmp = ubifs_idx_node_sz(c, c->fanout); + c->ranges[UBIFS_IDX_NODE].max_len = tmp; + c->max_idx_node_sz = ALIGN(tmp, 8); + + /* Make sure LEB size is large enough to fit full commit */ + tmp = UBIFS_CS_NODE_SZ + UBIFS_REF_NODE_SZ * c->jhead_cnt; + tmp = ALIGN(tmp, c->min_io_size); + if (tmp > c->leb_size) { + dbg_err("too small LEB size %d, at least %d needed", + c->leb_size, tmp); + return -EINVAL; + } + + /* + * Make sure that the log is large enough to fit reference nodes for + * all buds plus one reserved LEB. + */ + tmp64 = c->max_bud_bytes; + tmp = do_div(tmp64, c->leb_size); + c->max_bud_cnt = tmp64 + !!tmp; + tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1); + tmp /= c->leb_size; + tmp += 1; + if (c->log_lebs < tmp) { + dbg_err("too small log %d LEBs, required min. %d LEBs", + c->log_lebs, tmp); + return -EINVAL; + } + + /* + * When budgeting we assume worst-case scenarios when the pages are not + * be compressed and direntries are of the maximum size. + * + * Note, data, which may be stored in inodes is budgeted separately, so + * it is not included into 'c->inode_budget'. + */ + c->page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE; + c->inode_budget = UBIFS_INO_NODE_SZ; + c->dent_budget = UBIFS_MAX_DENT_NODE_SZ; + + /* + * When the amount of flash space used by buds becomes + * 'c->max_bud_bytes', UBIFS just blocks all writers and starts commit. + * The writers are unblocked when the commit is finished. To avoid + * writers to be blocked UBIFS initiates background commit in advance, + * when number of bud bytes becomes above the limit defined below. + */ + c->bg_bud_bytes = (c->max_bud_bytes * 13) >> 4; + + /* + * Ensure minimum journal size. All the bytes in the journal heads are + * considered to be used, when calculating the current journal usage. + * Consequently, if the journal is too small, UBIFS will treat it as + * always full. + */ + tmp64 = (uint64_t)(c->jhead_cnt + 1) * c->leb_size + 1; + if (c->bg_bud_bytes < tmp64) + c->bg_bud_bytes = tmp64; + if (c->max_bud_bytes < tmp64 + c->leb_size) + c->max_bud_bytes = tmp64 + c->leb_size; + + err = ubifs_calc_lpt_geom(c); + if (err) + return err; + + c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); + + /* + * Calculate total amount of FS blocks. This number is not used + * internally because it does not make much sense for UBIFS, but it is + * necessary to report something for the 'statfs()' call. + * + * Subtract the LEB reserved for GC, the LEB which is reserved for + * deletions, and assume only one journal head is available. + */ + tmp64 = c->main_lebs - 2 - c->jhead_cnt + 1; + tmp64 *= (uint64_t)c->leb_size - c->leb_overhead; + tmp64 = ubifs_reported_space(c, tmp64); + c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; + + return 0; +} + +/** + * take_gc_lnum - reserve GC LEB. + * @c: UBIFS file-system description object + * + * This function ensures that the LEB reserved for garbage collection is + * unmapped and is marked as "taken" in lprops. We also have to set free space + * to LEB size and dirty space to zero, because lprops may contain out-of-date + * information if the file-system was un-mounted before it has been committed. + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +static int take_gc_lnum(struct ubifs_info *c) +{ + int err; + + if (c->gc_lnum == -1) { + ubifs_err("no LEB for GC"); + return -EINVAL; + } + + err = ubifs_leb_unmap(c, c->gc_lnum); + if (err) + return err; + + /* And we have to tell lprops that this LEB is taken */ + err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0, + LPROPS_TAKEN, 0, 0); + return err; +} + +/** + * alloc_wbufs - allocate write-buffers. + * @c: UBIFS file-system description object + * + * This helper function allocates and initializes UBIFS write-buffers. Returns + * zero in case of success and %-ENOMEM in case of failure. + */ +static int alloc_wbufs(struct ubifs_info *c) +{ + int i, err; + + c->jheads = kzalloc(c->jhead_cnt * sizeof(struct ubifs_jhead), + GFP_KERNEL); + if (!c->jheads) + return -ENOMEM; + + /* Initialize journal heads */ + for (i = 0; i < c->jhead_cnt; i++) { + INIT_LIST_HEAD(&c->jheads[i].buds_list); + err = ubifs_wbuf_init(c, &c->jheads[i].wbuf); + if (err) + return err; + + c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback; + c->jheads[i].wbuf.jhead = i; + } + + c->jheads[BASEHD].wbuf.dtype = UBI_SHORTTERM; + /* + * Garbage Collector head likely contains long-term data and + * does not need to be synchronized by timer. + */ + c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM; + c->jheads[GCHD].wbuf.timeout = 0; + + return 0; +} + +/** + * free_wbufs - free write-buffers. + * @c: UBIFS file-system description object + */ +static void free_wbufs(struct ubifs_info *c) +{ + int i; + + if (c->jheads) { + for (i = 0; i < c->jhead_cnt; i++) { + kfree(c->jheads[i].wbuf.buf); + kfree(c->jheads[i].wbuf.inodes); + } + kfree(c->jheads); + c->jheads = NULL; + } +} + +/** + * free_orphans - free orphans. + * @c: UBIFS file-system description object + */ +static void free_orphans(struct ubifs_info *c) +{ + struct ubifs_orphan *orph; + + while (c->orph_dnext) { + orph = c->orph_dnext; + c->orph_dnext = orph->dnext; + list_del(&orph->list); + kfree(orph); + } + + while (!list_empty(&c->orph_list)) { + orph = list_entry(c->orph_list.next, struct ubifs_orphan, list); + list_del(&orph->list); + kfree(orph); + dbg_err("orphan list not empty at unmount"); + } + + vfree(c->orph_buf); + c->orph_buf = NULL; +} + +/** + * free_buds - free per-bud objects. + * @c: UBIFS file-system description object + */ +static void free_buds(struct ubifs_info *c) +{ + struct rb_node *this = c->buds.rb_node; + struct ubifs_bud *bud; + + while (this) { + if (this->rb_left) + this = this->rb_left; + else if (this->rb_right) + this = this->rb_right; + else { + bud = rb_entry(this, struct ubifs_bud, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &bud->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(bud); + } + } +} + +/** + * check_volume_empty - check if the UBI volume is empty. + * @c: UBIFS file-system description object + * + * This function checks if the UBIFS volume is empty by looking if its LEBs are + * mapped or not. The result of checking is stored in the @c->empty variable. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +static int check_volume_empty(struct ubifs_info *c) +{ + int lnum, err; + + c->empty = 1; + for (lnum = 0; lnum < c->leb_cnt; lnum++) { + err = ubi_is_mapped(c->ubi, lnum); + if (unlikely(err < 0)) + return err; + if (err == 1) { + c->empty = 0; + break; + } + + cond_resched(); + } + + return 0; +} + +/* + * UBIFS mount options. + * + * Opt_fast_unmount: do not run a journal commit before un-mounting + * Opt_norm_unmount: run a journal commit before un-mounting + * Opt_err: just end of array marker + */ +enum { + Opt_fast_unmount, + Opt_norm_unmount, + Opt_err, +}; + +static const match_table_t tokens = { + {Opt_fast_unmount, "fast_unmount"}, + {Opt_norm_unmount, "norm_unmount"}, + {Opt_err, NULL}, +}; + +/** + * ubifs_parse_options - parse mount parameters. + * @c: UBIFS file-system description object + * @options: parameters to parse + * @is_remount: non-zero if this is FS re-mount + * + * This function parses UBIFS mount options and returns zero in case success + * and a negative error code in case of failure. + */ +static int ubifs_parse_options(struct ubifs_info *c, char *options, + int is_remount) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + + if (!options) + return 0; + + while ((p = strsep(&options, ","))) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_fast_unmount: + c->mount_opts.unmount_mode = 2; + c->fast_unmount = 1; + break; + case Opt_norm_unmount: + c->mount_opts.unmount_mode = 1; + c->fast_unmount = 0; + break; + default: + ubifs_err("unrecognized mount option \"%s\" " + "or missing value", p); + return -EINVAL; + } + } + + return 0; +} + +/** + * destroy_journal - destroy journal data structures. + * @c: UBIFS file-system description object + * + * This function destroys journal data structures including those that may have + * been created by recovery functions. + */ +static void destroy_journal(struct ubifs_info *c) +{ + while (!list_empty(&c->unclean_leb_list)) { + struct ubifs_unclean_leb *ucleb; + + ucleb = list_entry(c->unclean_leb_list.next, + struct ubifs_unclean_leb, list); + list_del(&ucleb->list); + kfree(ucleb); + } + while (!list_empty(&c->old_buds)) { + struct ubifs_bud *bud; + + bud = list_entry(c->old_buds.next, struct ubifs_bud, list); + list_del(&bud->list); + kfree(bud); + } + ubifs_destroy_idx_gc(c); + ubifs_destroy_size_tree(c); + ubifs_tnc_close(c); + free_buds(c); +} + +/** + * mount_ubifs - mount UBIFS file-system. + * @c: UBIFS file-system description object + * + * This function mounts UBIFS file system. Returns zero in case of success and + * a negative error code in case of failure. + * + * Note, the function does not de-allocate resources it it fails half way + * through, and the caller has to do this instead. + */ +static int mount_ubifs(struct ubifs_info *c) +{ + struct super_block *sb = c->vfs_sb; + int err, mounted_read_only = (sb->s_flags & MS_RDONLY); + long long x; + size_t sz; + + err = init_constants_early(c); + if (err) + return err; + +#ifdef CONFIG_UBIFS_FS_DEBUG + c->dbg_buf = vmalloc(c->leb_size); + if (!c->dbg_buf) + return -ENOMEM; +#endif + + err = check_volume_empty(c); + if (err) + goto out_free; + + if (c->empty && (mounted_read_only || c->ro_media)) { + /* + * This UBI volume is empty, and read-only, or the file system + * is mounted read-only - we cannot format it. + */ + ubifs_err("can't format empty UBI volume: read-only %s", + c->ro_media ? "UBI volume" : "mount"); + err = -EROFS; + goto out_free; + } + + if (c->ro_media && !mounted_read_only) { + ubifs_err("cannot mount read-write - read-only media"); + err = -EROFS; + goto out_free; + } + + /* + * The requirement for the buffer is that it should fit indexing B-tree + * height amount of integers. We assume the height if the TNC tree will + * never exceed 64. + */ + err = -ENOMEM; + c->bottom_up_buf = kmalloc(BOTTOM_UP_HEIGHT * sizeof(int), GFP_KERNEL); + if (!c->bottom_up_buf) + goto out_free; + + c->sbuf = vmalloc(c->leb_size); + if (!c->sbuf) + goto out_free; + + if (!mounted_read_only) { + c->ileb_buf = vmalloc(c->leb_size); + if (!c->ileb_buf) + goto out_free; + } + + err = ubifs_read_superblock(c); + if (err) + goto out_free; + + /* + * Make sure the compressor which is set as the default on in the + * superblock was actually compiled in. + */ + if (!ubifs_compr_present(c->default_compr)) { + ubifs_warn("'%s' compressor is set by superblock, but not " + "compiled in", ubifs_compr_name(c->default_compr)); + c->default_compr = UBIFS_COMPR_NONE; + } + + dbg_failure_mode_registration(c); + + err = init_constants_late(c); + if (err) + goto out_dereg; + + sz = ALIGN(c->max_idx_node_sz, c->min_io_size); + sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size); + c->cbuf = kmalloc(sz, GFP_NOFS); + if (!c->cbuf) { + err = -ENOMEM; + goto out_dereg; + } + + sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); + if (!mounted_read_only) { + err = alloc_wbufs(c); + if (err) + goto out_cbuf; + + /* Create background thread */ + c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); + if (!c->bgt) + c->bgt = ERR_PTR(-EINVAL); + if (IS_ERR(c->bgt)) { + err = PTR_ERR(c->bgt); + c->bgt = NULL; + ubifs_err("cannot spawn \"%s\", error %d", + c->bgt_name, err); + goto out_wbufs; + } + wake_up_process(c->bgt); + } + + err = ubifs_read_master(c); + if (err) + goto out_master; + + if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) { + ubifs_msg("recovery needed"); + c->need_recovery = 1; + if (!mounted_read_only) { + err = ubifs_recover_inl_heads(c, c->sbuf); + if (err) + goto out_master; + } + } else if (!mounted_read_only) { + /* + * Set the "dirty" flag so that if we reboot uncleanly we + * will notice this immediately on the next mount. + */ + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + err = ubifs_write_master(c); + if (err) + goto out_master; + } + + err = ubifs_lpt_init(c, 1, !mounted_read_only); + if (err) + goto out_lpt; + + err = dbg_check_idx_size(c, c->old_idx_sz); + if (err) + goto out_lpt; + + err = ubifs_replay_journal(c); + if (err) + goto out_journal; + + err = ubifs_mount_orphans(c, c->need_recovery, mounted_read_only); + if (err) + goto out_orphans; + + if (!mounted_read_only) { + int lnum; + + /* Check for enough free space */ + if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) { + ubifs_err("insufficient available space"); + err = -EINVAL; + goto out_orphans; + } + + /* Check for enough log space */ + lnum = c->lhead_lnum + 1; + if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) + lnum = UBIFS_LOG_LNUM; + if (lnum == c->ltail_lnum) { + err = ubifs_consolidate_log(c); + if (err) + goto out_orphans; + } + + if (c->need_recovery) { + err = ubifs_recover_size(c); + if (err) + goto out_orphans; + err = ubifs_rcvry_gc_commit(c); + } else + err = take_gc_lnum(c); + if (err) + goto out_orphans; + + err = dbg_check_lprops(c); + if (err) + goto out_orphans; + } else if (c->need_recovery) { + err = ubifs_recover_size(c); + if (err) + goto out_orphans; + } + + spin_lock(&ubifs_infos_lock); + list_add_tail(&c->infos_list, &ubifs_infos); + spin_unlock(&ubifs_infos_lock); + + if (c->need_recovery) { + if (mounted_read_only) + ubifs_msg("recovery deferred"); + else { + c->need_recovery = 0; + ubifs_msg("recovery completed"); + } + } + + err = dbg_check_filesystem(c); + if (err) + goto out_infos; + + ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", + c->vi.ubi_num, c->vi.vol_id, c->vi.name); + if (mounted_read_only) + ubifs_msg("mounted read-only"); + x = (long long)c->main_lebs * c->leb_size; + ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)", + x, x >> 10, x >> 20, c->main_lebs); + x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; + ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)", + x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); + ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); + ubifs_msg("media format %d, latest format %d", + c->fmt_version, UBIFS_FORMAT_VERSION); + + dbg_msg("compiled on: " __DATE__ " at " __TIME__); + dbg_msg("min. I/O unit size: %d bytes", c->min_io_size); + dbg_msg("LEB size: %d bytes (%d KiB)", + c->leb_size, c->leb_size / 1024); + dbg_msg("data journal heads: %d", + c->jhead_cnt - NONDATA_JHEADS_CNT); + dbg_msg("UUID: %02X%02X%02X%02X-%02X%02X" + "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X", + c->uuid[0], c->uuid[1], c->uuid[2], c->uuid[3], + c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7], + c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11], + c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]); + dbg_msg("fast unmount: %d", c->fast_unmount); + dbg_msg("big_lpt %d", c->big_lpt); + dbg_msg("log LEBs: %d (%d - %d)", + c->log_lebs, UBIFS_LOG_LNUM, c->log_last); + dbg_msg("LPT area LEBs: %d (%d - %d)", + c->lpt_lebs, c->lpt_first, c->lpt_last); + dbg_msg("orphan area LEBs: %d (%d - %d)", + c->orph_lebs, c->orph_first, c->orph_last); + dbg_msg("main area LEBs: %d (%d - %d)", + c->main_lebs, c->main_first, c->leb_cnt - 1); + dbg_msg("index LEBs: %d", c->lst.idx_lebs); + dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)", + c->old_idx_sz, c->old_idx_sz >> 10, c->old_idx_sz >> 20); + dbg_msg("key hash type: %d", c->key_hash_type); + dbg_msg("tree fanout: %d", c->fanout); + dbg_msg("reserved GC LEB: %d", c->gc_lnum); + dbg_msg("first main LEB: %d", c->main_first); + dbg_msg("dead watermark: %d", c->dead_wm); + dbg_msg("dark watermark: %d", c->dark_wm); + x = (long long)c->main_lebs * c->dark_wm; + dbg_msg("max. dark space: %lld (%lld KiB, %lld MiB)", + x, x >> 10, x >> 20); + dbg_msg("maximum bud bytes: %lld (%lld KiB, %lld MiB)", + c->max_bud_bytes, c->max_bud_bytes >> 10, + c->max_bud_bytes >> 20); + dbg_msg("BG commit bud bytes: %lld (%lld KiB, %lld MiB)", + c->bg_bud_bytes, c->bg_bud_bytes >> 10, + c->bg_bud_bytes >> 20); + dbg_msg("current bud bytes %lld (%lld KiB, %lld MiB)", + c->bud_bytes, c->bud_bytes >> 10, c->bud_bytes >> 20); + dbg_msg("max. seq. number: %llu", c->max_sqnum); + dbg_msg("commit number: %llu", c->cmt_no); + + return 0; + +out_infos: + spin_lock(&ubifs_infos_lock); + list_del(&c->infos_list); + spin_unlock(&ubifs_infos_lock); +out_orphans: + free_orphans(c); +out_journal: + destroy_journal(c); +out_lpt: + ubifs_lpt_free(c, 0); +out_master: + kfree(c->mst_node); + kfree(c->rcvrd_mst_node); + if (c->bgt) + kthread_stop(c->bgt); +out_wbufs: + free_wbufs(c); +out_cbuf: + kfree(c->cbuf); +out_dereg: + dbg_failure_mode_deregistration(c); +out_free: + vfree(c->ileb_buf); + vfree(c->sbuf); + kfree(c->bottom_up_buf); + UBIFS_DBG(vfree(c->dbg_buf)); + return err; +} + +/** + * ubifs_umount - un-mount UBIFS file-system. + * @c: UBIFS file-system description object + * + * Note, this function is called to free allocated resourced when un-mounting, + * as well as free resources when an error occurred while we were half way + * through mounting (error path cleanup function). So it has to make sure the + * resource was actually allocated before freeing it. + */ +static void ubifs_umount(struct ubifs_info *c) +{ + dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num, + c->vi.vol_id); + + spin_lock(&ubifs_infos_lock); + list_del(&c->infos_list); + spin_unlock(&ubifs_infos_lock); + + if (c->bgt) + kthread_stop(c->bgt); + + destroy_journal(c); + free_wbufs(c); + free_orphans(c); + ubifs_lpt_free(c, 0); + + kfree(c->cbuf); + kfree(c->rcvrd_mst_node); + kfree(c->mst_node); + vfree(c->sbuf); + kfree(c->bottom_up_buf); + UBIFS_DBG(vfree(c->dbg_buf)); + vfree(c->ileb_buf); + dbg_failure_mode_deregistration(c); +} + +/** + * ubifs_remount_rw - re-mount in read-write mode. + * @c: UBIFS file-system description object + * + * UBIFS avoids allocating many unnecessary resources when mounted in read-only + * mode. This function allocates the needed resources and re-mounts UBIFS in + * read-write mode. + */ +static int ubifs_remount_rw(struct ubifs_info *c) +{ + int err, lnum; + + if (c->ro_media) + return -EINVAL; + + mutex_lock(&c->umount_mutex); + c->remounting_rw = 1; + + /* Check for enough free space */ + if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) { + ubifs_err("insufficient available space"); + err = -EINVAL; + goto out; + } + + if (c->old_leb_cnt != c->leb_cnt) { + struct ubifs_sb_node *sup; + + sup = ubifs_read_sb_node(c); + if (IS_ERR(sup)) { + err = PTR_ERR(sup); + goto out; + } + sup->leb_cnt = cpu_to_le32(c->leb_cnt); + err = ubifs_write_sb_node(c, sup); + if (err) + goto out; + } + + if (c->need_recovery) { + ubifs_msg("completing deferred recovery"); + err = ubifs_write_rcvrd_mst_node(c); + if (err) + goto out; + err = ubifs_recover_size(c); + if (err) + goto out; + err = ubifs_clean_lebs(c, c->sbuf); + if (err) + goto out; + err = ubifs_recover_inl_heads(c, c->sbuf); + if (err) + goto out; + } + + if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) { + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + err = ubifs_write_master(c); + if (err) + goto out; + } + + c->ileb_buf = vmalloc(c->leb_size); + if (!c->ileb_buf) { + err = -ENOMEM; + goto out; + } + + err = ubifs_lpt_init(c, 0, 1); + if (err) + goto out; + + err = alloc_wbufs(c); + if (err) + goto out; + + ubifs_create_buds_lists(c); + + /* Create background thread */ + c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); + if (!c->bgt) + c->bgt = ERR_PTR(-EINVAL); + if (IS_ERR(c->bgt)) { + err = PTR_ERR(c->bgt); + c->bgt = NULL; + ubifs_err("cannot spawn \"%s\", error %d", + c->bgt_name, err); + return err; + } + wake_up_process(c->bgt); + + c->orph_buf = vmalloc(c->leb_size); + if (!c->orph_buf) + return -ENOMEM; + + /* Check for enough log space */ + lnum = c->lhead_lnum + 1; + if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) + lnum = UBIFS_LOG_LNUM; + if (lnum == c->ltail_lnum) { + err = ubifs_consolidate_log(c); + if (err) + goto out; + } + + if (c->need_recovery) + err = ubifs_rcvry_gc_commit(c); + else + err = take_gc_lnum(c); + if (err) + goto out; + + if (c->need_recovery) { + c->need_recovery = 0; + ubifs_msg("deferred recovery completed"); + } + + dbg_gen("re-mounted read-write"); + c->vfs_sb->s_flags &= ~MS_RDONLY; + c->remounting_rw = 0; + mutex_unlock(&c->umount_mutex); + return 0; + +out: + vfree(c->orph_buf); + c->orph_buf = NULL; + if (c->bgt) { + kthread_stop(c->bgt); + c->bgt = NULL; + } + free_wbufs(c); + vfree(c->ileb_buf); + c->ileb_buf = NULL; + ubifs_lpt_free(c, 1); + c->remounting_rw = 0; + mutex_unlock(&c->umount_mutex); + return err; +} + +/** + * commit_on_unmount - commit the journal when un-mounting. + * @c: UBIFS file-system description object + * + * This function is called during un-mounting and it commits the journal unless + * the "fast unmount" mode is enabled. It also avoids committing the journal if + * it contains too few data. + * + * Sometimes recovery requires the journal to be committed at least once, and + * this function takes care about this. + */ +static void commit_on_unmount(struct ubifs_info *c) +{ + if (!c->fast_unmount) { + long long bud_bytes; + + spin_lock(&c->buds_lock); + bud_bytes = c->bud_bytes; + spin_unlock(&c->buds_lock); + if (bud_bytes > c->leb_size) + ubifs_run_commit(c); + } +} + +/** + * ubifs_remount_ro - re-mount in read-only mode. + * @c: UBIFS file-system description object + * + * We rely on VFS to have stopped writing. Possibly the background thread could + * be running a commit, however kthread_stop will wait in that case. + */ +static void ubifs_remount_ro(struct ubifs_info *c) +{ + int i, err; + + ubifs_assert(!c->need_recovery); + commit_on_unmount(c); + + mutex_lock(&c->umount_mutex); + if (c->bgt) { + kthread_stop(c->bgt); + c->bgt = NULL; + } + + for (i = 0; i < c->jhead_cnt; i++) { + ubifs_wbuf_sync(&c->jheads[i].wbuf); + del_timer_sync(&c->jheads[i].wbuf.timer); + } + + if (!c->ro_media) { + c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); + c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); + err = ubifs_write_master(c); + if (err) + ubifs_ro_mode(c, err); + } + + ubifs_destroy_idx_gc(c); + free_wbufs(c); + vfree(c->orph_buf); + c->orph_buf = NULL; + vfree(c->ileb_buf); + c->ileb_buf = NULL; + ubifs_lpt_free(c, 1); + mutex_unlock(&c->umount_mutex); +} + +static void ubifs_put_super(struct super_block *sb) +{ + int i; + struct ubifs_info *c = sb->s_fs_info; + + ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num, + c->vi.vol_id); + /* + * The following asserts are only valid if there has not been a failure + * of the media. For example, there will be dirty inodes if we failed + * to write them back because of I/O errors. + */ + ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0); + ubifs_assert(c->budg_idx_growth == 0); + ubifs_assert(c->budg_dd_growth == 0); + ubifs_assert(c->budg_data_growth == 0); + + /* + * The 'c->umount_lock' prevents races between UBIFS memory shrinker + * and file system un-mount. Namely, it prevents the shrinker from + * picking this superblock for shrinking - it will be just skipped if + * the mutex is locked. + */ + mutex_lock(&c->umount_mutex); + if (!(c->vfs_sb->s_flags & MS_RDONLY)) { + /* + * First of all kill the background thread to make sure it does + * not interfere with un-mounting and freeing resources. + */ + if (c->bgt) { + kthread_stop(c->bgt); + c->bgt = NULL; + } + + /* Synchronize write-buffers */ + if (c->jheads) + for (i = 0; i < c->jhead_cnt; i++) { + ubifs_wbuf_sync(&c->jheads[i].wbuf); + del_timer_sync(&c->jheads[i].wbuf.timer); + } + + /* + * On fatal errors c->ro_media is set to 1, in which case we do + * not write the master node. + */ + if (!c->ro_media) { + /* + * We are being cleanly unmounted which means the + * orphans were killed - indicate this in the master + * node. Also save the reserved GC LEB number. + */ + int err; + + c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); + c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); + err = ubifs_write_master(c); + if (err) + /* + * Recovery will attempt to fix the master area + * next mount, so we just print a message and + * continue to unmount normally. + */ + ubifs_err("failed to write master node, " + "error %d", err); + } + } + + ubifs_umount(c); + bdi_destroy(&c->bdi); + ubi_close_volume(c->ubi); + mutex_unlock(&c->umount_mutex); + kfree(c); +} + +static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) +{ + int err; + struct ubifs_info *c = sb->s_fs_info; + + dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags); + + err = ubifs_parse_options(c, data, 1); + if (err) { + ubifs_err("invalid or unknown remount parameter"); + return err; + } + if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { + err = ubifs_remount_rw(c); + if (err) + return err; + } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) + ubifs_remount_ro(c); + + return 0; +} + +struct super_operations ubifs_super_operations = { + .alloc_inode = ubifs_alloc_inode, + .destroy_inode = ubifs_destroy_inode, + .put_super = ubifs_put_super, + .write_inode = ubifs_write_inode, + .delete_inode = ubifs_delete_inode, + .statfs = ubifs_statfs, + .dirty_inode = ubifs_dirty_inode, + .remount_fs = ubifs_remount_fs, + .show_options = ubifs_show_options, + .sync_fs = ubifs_sync_fs, +}; + +/** + * open_ubi - parse UBI device name string and open the UBI device. + * @name: UBI volume name + * @mode: UBI volume open mode + * + * There are several ways to specify UBI volumes when mounting UBIFS: + * o ubiX_Y - UBI device number X, volume Y; + * o ubiY - UBI device number 0, volume Y; + * o ubiX:NAME - mount UBI device X, volume with name NAME; + * o ubi:NAME - mount UBI device 0, volume with name NAME. + * + * Alternative '!' separator may be used instead of ':' (because some shells + * like busybox may interpret ':' as an NFS host name separator). This function + * returns ubi volume object in case of success and a negative error code in + * case of failure. + */ +static struct ubi_volume_desc *open_ubi(const char *name, int mode) +{ + int dev, vol; + char *endptr; + + if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i') + return ERR_PTR(-EINVAL); + + /* ubi:NAME method */ + if ((name[3] == ':' || name[3] == '!') && name[4] != '\0') + return ubi_open_volume_nm(0, name + 4, mode); + + if (!isdigit(name[3])) + return ERR_PTR(-EINVAL); + + dev = simple_strtoul(name + 3, &endptr, 0); + + /* ubiY method */ + if (*endptr == '\0') + return ubi_open_volume(0, dev, mode); + + /* ubiX_Y method */ + if (*endptr == '_' && isdigit(endptr[1])) { + vol = simple_strtoul(endptr + 1, &endptr, 0); + if (*endptr != '\0') + return ERR_PTR(-EINVAL); + return ubi_open_volume(dev, vol, mode); + } + + /* ubiX:NAME method */ + if ((*endptr == ':' || *endptr == '!') && endptr[1] != '\0') + return ubi_open_volume_nm(dev, ++endptr, mode); + + return ERR_PTR(-EINVAL); +} + +static int ubifs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct ubi_volume_desc *ubi = sb->s_fs_info; + struct ubifs_info *c; + struct inode *root; + int err; + + c = kzalloc(sizeof(struct ubifs_info), GFP_KERNEL); + if (!c) + return -ENOMEM; + + spin_lock_init(&c->cnt_lock); + spin_lock_init(&c->cs_lock); + spin_lock_init(&c->buds_lock); + spin_lock_init(&c->space_lock); + spin_lock_init(&c->orphan_lock); + init_rwsem(&c->commit_sem); + mutex_init(&c->lp_mutex); + mutex_init(&c->tnc_mutex); + mutex_init(&c->log_mutex); + mutex_init(&c->mst_mutex); + mutex_init(&c->umount_mutex); + init_waitqueue_head(&c->cmt_wq); + c->buds = RB_ROOT; + c->old_idx = RB_ROOT; + c->size_tree = RB_ROOT; + c->orph_tree = RB_ROOT; + INIT_LIST_HEAD(&c->infos_list); + INIT_LIST_HEAD(&c->idx_gc); + INIT_LIST_HEAD(&c->replay_list); + INIT_LIST_HEAD(&c->replay_buds); + INIT_LIST_HEAD(&c->uncat_list); + INIT_LIST_HEAD(&c->empty_list); + INIT_LIST_HEAD(&c->freeable_list); + INIT_LIST_HEAD(&c->frdi_idx_list); + INIT_LIST_HEAD(&c->unclean_leb_list); + INIT_LIST_HEAD(&c->old_buds); + INIT_LIST_HEAD(&c->orph_list); + INIT_LIST_HEAD(&c->orph_new); + + c->highest_inum = UBIFS_FIRST_INO; + c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; + + ubi_get_volume_info(ubi, &c->vi); + ubi_get_device_info(c->vi.ubi_num, &c->di); + + /* Re-open the UBI device in read-write mode */ + c->ubi = ubi_open_volume(c->vi.ubi_num, c->vi.vol_id, UBI_READWRITE); + if (IS_ERR(c->ubi)) { + err = PTR_ERR(c->ubi); + goto out_free; + } + + /* + * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For + * UBIFS, I/O is not deferred, it is done immediately in readpage, + * which means the user would have to wait not just for their own I/O + * but the read-ahead I/O as well i.e. completely pointless. + * + * Read-ahead will be disabled because @c->bdi.ra_pages is 0. + */ + c->bdi.capabilities = BDI_CAP_MAP_COPY; + c->bdi.unplug_io_fn = default_unplug_io_fn; + err = bdi_init(&c->bdi); + if (err) + goto out_close; + + err = ubifs_parse_options(c, data, 0); + if (err) + goto out_bdi; + + c->vfs_sb = sb; + + sb->s_fs_info = c; + sb->s_magic = UBIFS_SUPER_MAGIC; + sb->s_blocksize = UBIFS_BLOCK_SIZE; + sb->s_blocksize_bits = UBIFS_BLOCK_SHIFT; + sb->s_dev = c->vi.cdev; + sb->s_maxbytes = c->max_inode_sz = key_max_inode_size(c); + if (c->max_inode_sz > MAX_LFS_FILESIZE) + sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE; + sb->s_op = &ubifs_super_operations; + + mutex_lock(&c->umount_mutex); + err = mount_ubifs(c); + if (err) { + ubifs_assert(err < 0); + goto out_unlock; + } + + /* Read the root inode */ + root = ubifs_iget(sb, UBIFS_ROOT_INO); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto out_umount; + } + + sb->s_root = d_alloc_root(root); + if (!sb->s_root) + goto out_iput; + + mutex_unlock(&c->umount_mutex); + + return 0; + +out_iput: + iput(root); +out_umount: + ubifs_umount(c); +out_unlock: + mutex_unlock(&c->umount_mutex); +out_bdi: + bdi_destroy(&c->bdi); +out_close: + ubi_close_volume(c->ubi); +out_free: + kfree(c); + return err; +} + +static int sb_test(struct super_block *sb, void *data) +{ + dev_t *dev = data; + + return sb->s_dev == *dev; +} + +static int sb_set(struct super_block *sb, void *data) +{ + dev_t *dev = data; + + sb->s_dev = *dev; + return 0; +} + +static int ubifs_get_sb(struct file_system_type *fs_type, int flags, + const char *name, void *data, struct vfsmount *mnt) +{ + struct ubi_volume_desc *ubi; + struct ubi_volume_info vi; + struct super_block *sb; + int err; + + dbg_gen("name %s, flags %#x", name, flags); + + /* + * Get UBI device number and volume ID. Mount it read-only so far + * because this might be a new mount point, and UBI allows only one + * read-write user at a time. + */ + ubi = open_ubi(name, UBI_READONLY); + if (IS_ERR(ubi)) { + ubifs_err("cannot open \"%s\", error %d", + name, (int)PTR_ERR(ubi)); + return PTR_ERR(ubi); + } + ubi_get_volume_info(ubi, &vi); + + dbg_gen("opened ubi%d_%d", vi.ubi_num, vi.vol_id); + + sb = sget(fs_type, &sb_test, &sb_set, &vi.cdev); + if (IS_ERR(sb)) { + err = PTR_ERR(sb); + goto out_close; + } + + if (sb->s_root) { + /* A new mount point for already mounted UBIFS */ + dbg_gen("this ubi volume is already mounted"); + if ((flags ^ sb->s_flags) & MS_RDONLY) { + err = -EBUSY; + goto out_deact; + } + } else { + sb->s_flags = flags; + /* + * Pass 'ubi' to 'fill_super()' in sb->s_fs_info where it is + * replaced by 'c'. + */ + sb->s_fs_info = ubi; + err = ubifs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + if (err) + goto out_deact; + /* We do not support atime */ + sb->s_flags |= MS_ACTIVE | MS_NOATIME; + } + + /* 'fill_super()' opens ubi again so we must close it here */ + ubi_close_volume(ubi); + + return simple_set_mnt(mnt, sb); + +out_deact: + up_write(&sb->s_umount); + deactivate_super(sb); +out_close: + ubi_close_volume(ubi); + return err; +} + +static void ubifs_kill_sb(struct super_block *sb) +{ + struct ubifs_info *c = sb->s_fs_info; + + /* + * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()' + * in order to be outside BKL. + */ + if (sb->s_root && !(sb->s_flags & MS_RDONLY)) + commit_on_unmount(c); + /* The un-mount routine is actually done in put_super() */ + generic_shutdown_super(sb); +} + +static struct file_system_type ubifs_fs_type = { + .name = "ubifs", + .owner = THIS_MODULE, + .get_sb = ubifs_get_sb, + .kill_sb = ubifs_kill_sb +}; + +/* + * Inode slab cache constructor. + */ +static void inode_slab_ctor(void *obj) +{ + struct ubifs_inode *ui = obj; + inode_init_once(&ui->vfs_inode); +} + +static int __init ubifs_init(void) +{ + int err; + + BUILD_BUG_ON(sizeof(struct ubifs_ch) != 24); + + /* Make sure node sizes are 8-byte aligned */ + BUILD_BUG_ON(UBIFS_CH_SZ & 7); + BUILD_BUG_ON(UBIFS_INO_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_DENT_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_XENT_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_DATA_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_SB_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MST_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_REF_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_CS_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_ORPH_NODE_SZ & 7); + + BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MAX_NODE_SZ & 7); + BUILD_BUG_ON(MIN_WRITE_SZ & 7); + + /* Check min. node size */ + BUILD_BUG_ON(UBIFS_INO_NODE_SZ < MIN_WRITE_SZ); + BUILD_BUG_ON(UBIFS_DENT_NODE_SZ < MIN_WRITE_SZ); + BUILD_BUG_ON(UBIFS_XENT_NODE_SZ < MIN_WRITE_SZ); + BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ < MIN_WRITE_SZ); + + BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ > UBIFS_MAX_NODE_SZ); + BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ > UBIFS_MAX_NODE_SZ); + BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ > UBIFS_MAX_NODE_SZ); + BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ > UBIFS_MAX_NODE_SZ); + + /* Defined node sizes */ + BUILD_BUG_ON(UBIFS_SB_NODE_SZ != 4096); + BUILD_BUG_ON(UBIFS_MST_NODE_SZ != 512); + BUILD_BUG_ON(UBIFS_INO_NODE_SZ != 160); + BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64); + + /* + * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to + * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2. + */ + if (PAGE_CACHE_SIZE < UBIFS_BLOCK_SIZE) { + ubifs_err("VFS page cache size is %u bytes, but UBIFS requires" + " at least 4096 bytes", + (unsigned int)PAGE_CACHE_SIZE); + return -EINVAL; + } + + err = register_filesystem(&ubifs_fs_type); + if (err) { + ubifs_err("cannot register file system, error %d", err); + return err; + } + + err = -ENOMEM; + ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab", + sizeof(struct ubifs_inode), 0, + SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT, + &inode_slab_ctor); + if (!ubifs_inode_slab) + goto out_reg; + + register_shrinker(&ubifs_shrinker_info); + + err = ubifs_compressors_init(); + if (err) + goto out_compr; + + return 0; + +out_compr: + unregister_shrinker(&ubifs_shrinker_info); + kmem_cache_destroy(ubifs_inode_slab); +out_reg: + unregister_filesystem(&ubifs_fs_type); + return err; +} +/* late_initcall to let compressors initialize first */ +late_initcall(ubifs_init); + +static void __exit ubifs_exit(void) +{ + ubifs_assert(list_empty(&ubifs_infos)); + ubifs_assert(atomic_long_read(&ubifs_clean_zn_cnt) == 0); + + ubifs_compressors_exit(); + unregister_shrinker(&ubifs_shrinker_info); + kmem_cache_destroy(ubifs_inode_slab); + unregister_filesystem(&ubifs_fs_type); +} +module_exit(ubifs_exit); + +MODULE_LICENSE("GPL"); +MODULE_VERSION(__stringify(UBIFS_VERSION)); +MODULE_AUTHOR("Artem Bityutskiy, Adrian Hunter"); +MODULE_DESCRIPTION("UBIFS - UBI File System"); diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c new file mode 100644 index 000000000000..7634c5970887 --- /dev/null +++ b/fs/ubifs/tnc.c @@ -0,0 +1,2962 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements TNC (Tree Node Cache) which caches indexing nodes of + * the UBIFS B-tree. + * + * At the moment the locking rules of the TNC tree are quite simple and + * straightforward. We just have a mutex and lock it when we traverse the + * tree. If a znode is not in memory, we read it from flash while still having + * the mutex locked. + */ + +#include <linux/crc32.h> +#include "ubifs.h" + +/* + * Returned codes of 'matches_name()' and 'fallible_matches_name()' functions. + * @NAME_LESS: name corresponding to the first argument is less than second + * @NAME_MATCHES: names match + * @NAME_GREATER: name corresponding to the second argument is greater than + * first + * @NOT_ON_MEDIA: node referred by zbranch does not exist on the media + * + * These constants were introduce to improve readability. + */ +enum { + NAME_LESS = 0, + NAME_MATCHES = 1, + NAME_GREATER = 2, + NOT_ON_MEDIA = 3, +}; + +/** + * insert_old_idx - record an index node obsoleted since the last commit start. + * @c: UBIFS file-system description object + * @lnum: LEB number of obsoleted index node + * @offs: offset of obsoleted index node + * + * Returns %0 on success, and a negative error code on failure. + * + * For recovery, there must always be a complete intact version of the index on + * flash at all times. That is called the "old index". It is the index as at the + * time of the last successful commit. Many of the index nodes in the old index + * may be dirty, but they must not be erased until the next successful commit + * (at which point that index becomes the old index). + * + * That means that the garbage collection and the in-the-gaps method of + * committing must be able to determine if an index node is in the old index. + * Most of the old index nodes can be found by looking up the TNC using the + * 'lookup_znode()' function. However, some of the old index nodes may have + * been deleted from the current index or may have been changed so much that + * they cannot be easily found. In those cases, an entry is added to an RB-tree. + * That is what this function does. The RB-tree is ordered by LEB number and + * offset because they uniquely identify the old index node. + */ +static int insert_old_idx(struct ubifs_info *c, int lnum, int offs) +{ + struct ubifs_old_idx *old_idx, *o; + struct rb_node **p, *parent = NULL; + + old_idx = kmalloc(sizeof(struct ubifs_old_idx), GFP_NOFS); + if (unlikely(!old_idx)) + return -ENOMEM; + old_idx->lnum = lnum; + old_idx->offs = offs; + + p = &c->old_idx.rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct ubifs_old_idx, rb); + if (lnum < o->lnum) + p = &(*p)->rb_left; + else if (lnum > o->lnum) + p = &(*p)->rb_right; + else if (offs < o->offs) + p = &(*p)->rb_left; + else if (offs > o->offs) + p = &(*p)->rb_right; + else { + ubifs_err("old idx added twice!"); + kfree(old_idx); + return 0; + } + } + rb_link_node(&old_idx->rb, parent, p); + rb_insert_color(&old_idx->rb, &c->old_idx); + return 0; +} + +/** + * insert_old_idx_znode - record a znode obsoleted since last commit start. + * @c: UBIFS file-system description object + * @znode: znode of obsoleted index node + * + * Returns %0 on success, and a negative error code on failure. + */ +int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode) +{ + if (znode->parent) { + struct ubifs_zbranch *zbr; + + zbr = &znode->parent->zbranch[znode->iip]; + if (zbr->len) + return insert_old_idx(c, zbr->lnum, zbr->offs); + } else + if (c->zroot.len) + return insert_old_idx(c, c->zroot.lnum, + c->zroot.offs); + return 0; +} + +/** + * ins_clr_old_idx_znode - record a znode obsoleted since last commit start. + * @c: UBIFS file-system description object + * @znode: znode of obsoleted index node + * + * Returns %0 on success, and a negative error code on failure. + */ +static int ins_clr_old_idx_znode(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + int err; + + if (znode->parent) { + struct ubifs_zbranch *zbr; + + zbr = &znode->parent->zbranch[znode->iip]; + if (zbr->len) { + err = insert_old_idx(c, zbr->lnum, zbr->offs); + if (err) + return err; + zbr->lnum = 0; + zbr->offs = 0; + zbr->len = 0; + } + } else + if (c->zroot.len) { + err = insert_old_idx(c, c->zroot.lnum, c->zroot.offs); + if (err) + return err; + c->zroot.lnum = 0; + c->zroot.offs = 0; + c->zroot.len = 0; + } + return 0; +} + +/** + * destroy_old_idx - destroy the old_idx RB-tree. + * @c: UBIFS file-system description object + * + * During start commit, the old_idx RB-tree is used to avoid overwriting index + * nodes that were in the index last commit but have since been deleted. This + * is necessary for recovery i.e. the old index must be kept intact until the + * new index is successfully written. The old-idx RB-tree is used for the + * in-the-gaps method of writing index nodes and is destroyed every commit. + */ +void destroy_old_idx(struct ubifs_info *c) +{ + struct rb_node *this = c->old_idx.rb_node; + struct ubifs_old_idx *old_idx; + + while (this) { + if (this->rb_left) { + this = this->rb_left; + continue; + } else if (this->rb_right) { + this = this->rb_right; + continue; + } + old_idx = rb_entry(this, struct ubifs_old_idx, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &old_idx->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(old_idx); + } + c->old_idx = RB_ROOT; +} + +/** + * copy_znode - copy a dirty znode. + * @c: UBIFS file-system description object + * @znode: znode to copy + * + * A dirty znode being committed may not be changed, so it is copied. + */ +static struct ubifs_znode *copy_znode(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + struct ubifs_znode *zn; + + zn = kmalloc(c->max_znode_sz, GFP_NOFS); + if (unlikely(!zn)) + return ERR_PTR(-ENOMEM); + + memcpy(zn, znode, c->max_znode_sz); + zn->cnext = NULL; + __set_bit(DIRTY_ZNODE, &zn->flags); + __clear_bit(COW_ZNODE, &zn->flags); + + ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags)); + __set_bit(OBSOLETE_ZNODE, &znode->flags); + + if (znode->level != 0) { + int i; + const int n = zn->child_cnt; + + /* The children now have new parent */ + for (i = 0; i < n; i++) { + struct ubifs_zbranch *zbr = &zn->zbranch[i]; + + if (zbr->znode) + zbr->znode->parent = zn; + } + } + + atomic_long_inc(&c->dirty_zn_cnt); + return zn; +} + +/** + * add_idx_dirt - add dirt due to a dirty znode. + * @c: UBIFS file-system description object + * @lnum: LEB number of index node + * @dirt: size of index node + * + * This function updates lprops dirty space and the new size of the index. + */ +static int add_idx_dirt(struct ubifs_info *c, int lnum, int dirt) +{ + c->calc_idx_sz -= ALIGN(dirt, 8); + return ubifs_add_dirt(c, lnum, dirt); +} + +/** + * dirty_cow_znode - ensure a znode is not being committed. + * @c: UBIFS file-system description object + * @zbr: branch of znode to check + * + * Returns dirtied znode on success or negative error code on failure. + */ +static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c, + struct ubifs_zbranch *zbr) +{ + struct ubifs_znode *znode = zbr->znode; + struct ubifs_znode *zn; + int err; + + if (!test_bit(COW_ZNODE, &znode->flags)) { + /* znode is not being committed */ + if (!test_and_set_bit(DIRTY_ZNODE, &znode->flags)) { + atomic_long_inc(&c->dirty_zn_cnt); + atomic_long_dec(&c->clean_zn_cnt); + atomic_long_dec(&ubifs_clean_zn_cnt); + err = add_idx_dirt(c, zbr->lnum, zbr->len); + if (unlikely(err)) + return ERR_PTR(err); + } + return znode; + } + + zn = copy_znode(c, znode); + if (unlikely(IS_ERR(zn))) + return zn; + + if (zbr->len) { + err = insert_old_idx(c, zbr->lnum, zbr->offs); + if (unlikely(err)) + return ERR_PTR(err); + err = add_idx_dirt(c, zbr->lnum, zbr->len); + } else + err = 0; + + zbr->znode = zn; + zbr->lnum = 0; + zbr->offs = 0; + zbr->len = 0; + + if (unlikely(err)) + return ERR_PTR(err); + return zn; +} + +/** + * lnc_add - add a leaf node to the leaf node cache. + * @c: UBIFS file-system description object + * @zbr: zbranch of leaf node + * @node: leaf node + * + * Leaf nodes are non-index nodes directory entry nodes or data nodes. The + * purpose of the leaf node cache is to save re-reading the same leaf node over + * and over again. Most things are cached by VFS, however the file system must + * cache directory entries for readdir and for resolving hash collisions. The + * present implementation of the leaf node cache is extremely simple, and + * allows for error returns that are not used but that may be needed if a more + * complex implementation is created. + * + * Note, this function does not add the @node object to LNC directly, but + * allocates a copy of the object and adds the copy to LNC. The reason for this + * is that @node has been allocated outside of the TNC subsystem and will be + * used with @c->tnc_mutex unlock upon return from the TNC subsystem. But LNC + * may be changed at any time, e.g. freed by the shrinker. + */ +static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr, + const void *node) +{ + int err; + void *lnc_node; + const struct ubifs_dent_node *dent = node; + + ubifs_assert(!zbr->leaf); + ubifs_assert(zbr->len != 0); + ubifs_assert(is_hash_key(c, &zbr->key)); + + err = ubifs_validate_entry(c, dent); + if (err) { + dbg_dump_stack(); + dbg_dump_node(c, dent); + return err; + } + + lnc_node = kmalloc(zbr->len, GFP_NOFS); + if (!lnc_node) + /* We don't have to have the cache, so no error */ + return 0; + + memcpy(lnc_node, node, zbr->len); + zbr->leaf = lnc_node; + return 0; +} + + /** + * lnc_add_directly - add a leaf node to the leaf-node-cache. + * @c: UBIFS file-system description object + * @zbr: zbranch of leaf node + * @node: leaf node + * + * This function is similar to 'lnc_add()', but it does not create a copy of + * @node but inserts @node to TNC directly. + */ +static int lnc_add_directly(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *node) +{ + int err; + + ubifs_assert(!zbr->leaf); + ubifs_assert(zbr->len != 0); + + err = ubifs_validate_entry(c, node); + if (err) { + dbg_dump_stack(); + dbg_dump_node(c, node); + return err; + } + + zbr->leaf = node; + return 0; +} + +/** + * lnc_free - remove a leaf node from the leaf node cache. + * @zbr: zbranch of leaf node + * @node: leaf node + */ +static void lnc_free(struct ubifs_zbranch *zbr) +{ + if (!zbr->leaf) + return; + kfree(zbr->leaf); + zbr->leaf = NULL; +} + +/** + * tnc_read_node_nm - read a "hashed" leaf node. + * @c: UBIFS file-system description object + * @zbr: key and position of the node + * @node: node is returned here + * + * This function reads a "hashed" node defined by @zbr from the leaf node cache + * (in it is there) or from the hash media, in which case the node is also + * added to LNC. Returns zero in case of success or a negative negative error + * code in case of failure. + */ +static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *node) +{ + int err; + + ubifs_assert(is_hash_key(c, &zbr->key)); + + if (zbr->leaf) { + /* Read from the leaf node cache */ + ubifs_assert(zbr->len != 0); + memcpy(node, zbr->leaf, zbr->len); + return 0; + } + + err = ubifs_tnc_read_node(c, zbr, node); + if (err) + return err; + + /* Add the node to the leaf node cache */ + err = lnc_add(c, zbr, node); + return err; +} + +/** + * try_read_node - read a node if it is a node. + * @c: UBIFS file-system description object + * @buf: buffer to read to + * @type: node type + * @len: node length (not aligned) + * @lnum: LEB number of node to read + * @offs: offset of node to read + * + * This function tries to read a node of known type and length, checks it and + * stores it in @buf. This function returns %1 if a node is present and %0 if + * a node is not present. A negative error code is returned for I/O errors. + * This function performs that same function as ubifs_read_node except that + * it does not require that there is actually a node present and instead + * the return code indicates if a node was read. + */ +static int try_read_node(const struct ubifs_info *c, void *buf, int type, + int len, int lnum, int offs) +{ + int err, node_len; + struct ubifs_ch *ch = buf; + uint32_t crc, node_crc; + + dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len); + + err = ubi_read(c->ubi, lnum, buf, offs, len); + if (err) { + ubifs_err("cannot read node type %d from LEB %d:%d, error %d", + type, lnum, offs, err); + return err; + } + + if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) + return 0; + + if (ch->node_type != type) + return 0; + + node_len = le32_to_cpu(ch->len); + if (node_len != len) + return 0; + + crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); + node_crc = le32_to_cpu(ch->crc); + if (crc != node_crc) + return 0; + + return 1; +} + +/** + * fallible_read_node - try to read a leaf node. + * @c: UBIFS file-system description object + * @key: key of node to read + * @zbr: position of node + * @node: node returned + * + * This function tries to read a node and returns %1 if the node is read, %0 + * if the node is not present, and a negative error code in the case of error. + */ +static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_zbranch *zbr, void *node) +{ + int ret; + + dbg_tnc("LEB %d:%d, key %s", zbr->lnum, zbr->offs, DBGKEY(key)); + + ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum, + zbr->offs); + if (ret == 1) { + union ubifs_key node_key; + struct ubifs_dent_node *dent = node; + + /* All nodes have key in the same place */ + key_read(c, &dent->key, &node_key); + if (keys_cmp(c, key, &node_key) != 0) + ret = 0; + } + if (ret == 0 && c->replaying) + dbg_mnt("dangling branch LEB %d:%d len %d, key %s", + zbr->lnum, zbr->offs, zbr->len, DBGKEY(key)); + return ret; +} + +/** + * matches_name - determine if a direntry or xattr entry matches a given name. + * @c: UBIFS file-system description object + * @zbr: zbranch of dent + * @nm: name to match + * + * This function checks if xentry/direntry referred by zbranch @zbr matches name + * @nm. Returns %NAME_MATCHES if it does, %NAME_LESS if the name referred by + * @zbr is less than @nm, and %NAME_GREATER if it is greater than @nm. In case + * of failure, a negative error code is returned. + */ +static int matches_name(struct ubifs_info *c, struct ubifs_zbranch *zbr, + const struct qstr *nm) +{ + struct ubifs_dent_node *dent; + int nlen, err; + + /* If possible, match against the dent in the leaf node cache */ + if (!zbr->leaf) { + dent = kmalloc(zbr->len, GFP_NOFS); + if (!dent) + return -ENOMEM; + + err = ubifs_tnc_read_node(c, zbr, dent); + if (err) + goto out_free; + + /* Add the node to the leaf node cache */ + err = lnc_add_directly(c, zbr, dent); + if (err) + goto out_free; + } else + dent = zbr->leaf; + + nlen = le16_to_cpu(dent->nlen); + err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len)); + if (err == 0) { + if (nlen == nm->len) + return NAME_MATCHES; + else if (nlen < nm->len) + return NAME_LESS; + else + return NAME_GREATER; + } else if (err < 0) + return NAME_LESS; + else + return NAME_GREATER; + +out_free: + kfree(dent); + return err; +} + +/** + * get_znode - get a TNC znode that may not be loaded yet. + * @c: UBIFS file-system description object + * @znode: parent znode + * @n: znode branch slot number + * + * This function returns the znode or a negative error code. + */ +static struct ubifs_znode *get_znode(struct ubifs_info *c, + struct ubifs_znode *znode, int n) +{ + struct ubifs_zbranch *zbr; + + zbr = &znode->zbranch[n]; + if (zbr->znode) + znode = zbr->znode; + else + znode = ubifs_load_znode(c, zbr, znode, n); + return znode; +} + +/** + * tnc_next - find next TNC entry. + * @c: UBIFS file-system description object + * @zn: znode is passed and returned here + * @n: znode branch slot number is passed and returned here + * + * This function returns %0 if the next TNC entry is found, %-ENOENT if there is + * no next entry, or a negative error code otherwise. + */ +static int tnc_next(struct ubifs_info *c, struct ubifs_znode **zn, int *n) +{ + struct ubifs_znode *znode = *zn; + int nn = *n; + + nn += 1; + if (nn < znode->child_cnt) { + *n = nn; + return 0; + } + while (1) { + struct ubifs_znode *zp; + + zp = znode->parent; + if (!zp) + return -ENOENT; + nn = znode->iip + 1; + znode = zp; + if (nn < znode->child_cnt) { + znode = get_znode(c, znode, nn); + if (IS_ERR(znode)) + return PTR_ERR(znode); + while (znode->level != 0) { + znode = get_znode(c, znode, 0); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + nn = 0; + break; + } + } + *zn = znode; + *n = nn; + return 0; +} + +/** + * tnc_prev - find previous TNC entry. + * @c: UBIFS file-system description object + * @zn: znode is returned here + * @n: znode branch slot number is passed and returned here + * + * This function returns %0 if the previous TNC entry is found, %-ENOENT if + * there is no next entry, or a negative error code otherwise. + */ +static int tnc_prev(struct ubifs_info *c, struct ubifs_znode **zn, int *n) +{ + struct ubifs_znode *znode = *zn; + int nn = *n; + + if (nn > 0) { + *n = nn - 1; + return 0; + } + while (1) { + struct ubifs_znode *zp; + + zp = znode->parent; + if (!zp) + return -ENOENT; + nn = znode->iip - 1; + znode = zp; + if (nn >= 0) { + znode = get_znode(c, znode, nn); + if (IS_ERR(znode)) + return PTR_ERR(znode); + while (znode->level != 0) { + nn = znode->child_cnt - 1; + znode = get_znode(c, znode, nn); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + nn = znode->child_cnt - 1; + break; + } + } + *zn = znode; + *n = nn; + return 0; +} + +/** + * resolve_collision - resolve a collision. + * @c: UBIFS file-system description object + * @key: key of a directory or extended attribute entry + * @zn: znode is returned here + * @n: zbranch number is passed and returned here + * @nm: name of the entry + * + * This function is called for "hashed" keys to make sure that the found key + * really corresponds to the looked up node (directory or extended attribute + * entry). It returns %1 and sets @zn and @n if the collision is resolved. + * %0 is returned if @nm is not found and @zn and @n are set to the previous + * entry, i.e. to the entry after which @nm could follow if it were in TNC. + * This means that @n may be set to %-1 if the leftmost key in @zn is the + * previous one. A negative error code is returned on failures. + */ +static int resolve_collision(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_znode **zn, int *n, + const struct qstr *nm) +{ + int err; + + err = matches_name(c, &(*zn)->zbranch[*n], nm); + if (unlikely(err < 0)) + return err; + if (err == NAME_MATCHES) + return 1; + + if (err == NAME_GREATER) { + /* Look left */ + while (1) { + err = tnc_prev(c, zn, n); + if (err == -ENOENT) { + ubifs_assert(*n == 0); + *n = -1; + return 0; + } + if (err < 0) + return err; + if (keys_cmp(c, &(*zn)->zbranch[*n].key, key)) { + /* + * We have found the branch after which we would + * like to insert, but inserting in this znode + * may still be wrong. Consider the following 3 + * znodes, in the case where we are resolving a + * collision with Key2. + * + * znode zp + * ---------------------- + * level 1 | Key0 | Key1 | + * ----------------------- + * | | + * znode za | | znode zb + * ------------ ------------ + * level 0 | Key0 | | Key2 | + * ------------ ------------ + * + * The lookup finds Key2 in znode zb. Lets say + * there is no match and the name is greater so + * we look left. When we find Key0, we end up + * here. If we return now, we will insert into + * znode za at slot n = 1. But that is invalid + * according to the parent's keys. Key2 must + * be inserted into znode zb. + * + * Note, this problem is not relevant for the + * case when we go right, because + * 'tnc_insert()' would correct the parent key. + */ + if (*n == (*zn)->child_cnt - 1) { + err = tnc_next(c, zn, n); + if (err) { + /* Should be impossible */ + ubifs_assert(0); + if (err == -ENOENT) + err = -EINVAL; + return err; + } + ubifs_assert(*n == 0); + *n = -1; + } + return 0; + } + err = matches_name(c, &(*zn)->zbranch[*n], nm); + if (err < 0) + return err; + if (err == NAME_LESS) + return 0; + if (err == NAME_MATCHES) + return 1; + ubifs_assert(err == NAME_GREATER); + } + } else { + int nn = *n; + struct ubifs_znode *znode = *zn; + + /* Look right */ + while (1) { + err = tnc_next(c, &znode, &nn); + if (err == -ENOENT) + return 0; + if (err < 0) + return err; + if (keys_cmp(c, &znode->zbranch[nn].key, key)) + return 0; + err = matches_name(c, &znode->zbranch[nn], nm); + if (err < 0) + return err; + if (err == NAME_GREATER) + return 0; + *zn = znode; + *n = nn; + if (err == NAME_MATCHES) + return 1; + ubifs_assert(err == NAME_LESS); + } + } +} + +/** + * fallible_matches_name - determine if a dent matches a given name. + * @c: UBIFS file-system description object + * @zbr: zbranch of dent + * @nm: name to match + * + * This is a "fallible" version of 'matches_name()' function which does not + * panic if the direntry/xentry referred by @zbr does not exist on the media. + * + * This function checks if xentry/direntry referred by zbranch @zbr matches name + * @nm. Returns %NAME_MATCHES it does, %NAME_LESS if the name referred by @zbr + * is less than @nm, %NAME_GREATER if it is greater than @nm, and @NOT_ON_MEDIA + * if xentry/direntry referred by @zbr does not exist on the media. A negative + * error code is returned in case of failure. + */ +static int fallible_matches_name(struct ubifs_info *c, + struct ubifs_zbranch *zbr, + const struct qstr *nm) +{ + struct ubifs_dent_node *dent; + int nlen, err; + + /* If possible, match against the dent in the leaf node cache */ + if (!zbr->leaf) { + dent = kmalloc(zbr->len, GFP_NOFS); + if (!dent) + return -ENOMEM; + + err = fallible_read_node(c, &zbr->key, zbr, dent); + if (err < 0) + goto out_free; + if (err == 0) { + /* The node was not present */ + err = NOT_ON_MEDIA; + goto out_free; + } + ubifs_assert(err == 1); + + err = lnc_add_directly(c, zbr, dent); + if (err) + goto out_free; + } else + dent = zbr->leaf; + + nlen = le16_to_cpu(dent->nlen); + err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len)); + if (err == 0) { + if (nlen == nm->len) + return NAME_MATCHES; + else if (nlen < nm->len) + return NAME_LESS; + else + return NAME_GREATER; + } else if (err < 0) + return NAME_LESS; + else + return NAME_GREATER; + +out_free: + kfree(dent); + return err; +} + +/** + * fallible_resolve_collision - resolve a collision even if nodes are missing. + * @c: UBIFS file-system description object + * @key: key + * @zn: znode is returned here + * @n: branch number is passed and returned here + * @nm: name of directory entry + * @adding: indicates caller is adding a key to the TNC + * + * This is a "fallible" version of the 'resolve_collision()' function which + * does not panic if one of the nodes referred to by TNC does not exist on the + * media. This may happen when replaying the journal if a deleted node was + * Garbage-collected and the commit was not done. A branch that refers to a node + * that is not present is called a dangling branch. The following are the return + * codes for this function: + * o if @nm was found, %1 is returned and @zn and @n are set to the found + * branch; + * o if we are @adding and @nm was not found, %0 is returned; + * o if we are not @adding and @nm was not found, but a dangling branch was + * found, then %1 is returned and @zn and @n are set to the dangling branch; + * o a negative error code is returned in case of failure. + */ +static int fallible_resolve_collision(struct ubifs_info *c, + const union ubifs_key *key, + struct ubifs_znode **zn, int *n, + const struct qstr *nm, int adding) +{ + struct ubifs_znode *o_znode = NULL, *znode = *zn; + int uninitialized_var(o_n), err, cmp, unsure = 0, nn = *n; + + cmp = fallible_matches_name(c, &znode->zbranch[nn], nm); + if (unlikely(cmp < 0)) + return cmp; + if (cmp == NAME_MATCHES) + return 1; + if (cmp == NOT_ON_MEDIA) { + o_znode = znode; + o_n = nn; + /* + * We are unlucky and hit a dangling branch straight away. + * Now we do not really know where to go to find the needed + * branch - to the left or to the right. Well, let's try left. + */ + unsure = 1; + } else if (!adding) + unsure = 1; /* Remove a dangling branch wherever it is */ + + if (cmp == NAME_GREATER || unsure) { + /* Look left */ + while (1) { + err = tnc_prev(c, zn, n); + if (err == -ENOENT) { + ubifs_assert(*n == 0); + *n = -1; + break; + } + if (err < 0) + return err; + if (keys_cmp(c, &(*zn)->zbranch[*n].key, key)) { + /* See comments in 'resolve_collision()' */ + if (*n == (*zn)->child_cnt - 1) { + err = tnc_next(c, zn, n); + if (err) { + /* Should be impossible */ + ubifs_assert(0); + if (err == -ENOENT) + err = -EINVAL; + return err; + } + ubifs_assert(*n == 0); + *n = -1; + } + break; + } + err = fallible_matches_name(c, &(*zn)->zbranch[*n], nm); + if (err < 0) + return err; + if (err == NAME_MATCHES) + return 1; + if (err == NOT_ON_MEDIA) { + o_znode = *zn; + o_n = *n; + continue; + } + if (!adding) + continue; + if (err == NAME_LESS) + break; + else + unsure = 0; + } + } + + if (cmp == NAME_LESS || unsure) { + /* Look right */ + *zn = znode; + *n = nn; + while (1) { + err = tnc_next(c, &znode, &nn); + if (err == -ENOENT) + break; + if (err < 0) + return err; + if (keys_cmp(c, &znode->zbranch[nn].key, key)) + break; + err = fallible_matches_name(c, &znode->zbranch[nn], nm); + if (err < 0) + return err; + if (err == NAME_GREATER) + break; + *zn = znode; + *n = nn; + if (err == NAME_MATCHES) + return 1; + if (err == NOT_ON_MEDIA) { + o_znode = znode; + o_n = nn; + } + } + } + + /* Never match a dangling branch when adding */ + if (adding || !o_znode) + return 0; + + dbg_mnt("dangling match LEB %d:%d len %d %s", + o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs, + o_znode->zbranch[o_n].len, DBGKEY(key)); + *zn = o_znode; + *n = o_n; + return 1; +} + +/** + * matches_position - determine if a zbranch matches a given position. + * @zbr: zbranch of dent + * @lnum: LEB number of dent to match + * @offs: offset of dent to match + * + * This function returns %1 if @lnum:@offs matches, and %0 otherwise. + */ +static int matches_position(struct ubifs_zbranch *zbr, int lnum, int offs) +{ + if (zbr->lnum == lnum && zbr->offs == offs) + return 1; + else + return 0; +} + +/** + * resolve_collision_directly - resolve a collision directly. + * @c: UBIFS file-system description object + * @key: key of directory entry + * @zn: znode is passed and returned here + * @n: zbranch number is passed and returned here + * @lnum: LEB number of dent node to match + * @offs: offset of dent node to match + * + * This function is used for "hashed" keys to make sure the found directory or + * extended attribute entry node is what was looked for. It is used when the + * flash address of the right node is known (@lnum:@offs) which makes it much + * easier to resolve collisions (no need to read entries and match full + * names). This function returns %1 and sets @zn and @n if the collision is + * resolved, %0 if @lnum:@offs is not found and @zn and @n are set to the + * previous directory entry. Otherwise a negative error code is returned. + */ +static int resolve_collision_directly(struct ubifs_info *c, + const union ubifs_key *key, + struct ubifs_znode **zn, int *n, + int lnum, int offs) +{ + struct ubifs_znode *znode; + int nn, err; + + znode = *zn; + nn = *n; + if (matches_position(&znode->zbranch[nn], lnum, offs)) + return 1; + + /* Look left */ + while (1) { + err = tnc_prev(c, &znode, &nn); + if (err == -ENOENT) + break; + if (err < 0) + return err; + if (keys_cmp(c, &znode->zbranch[nn].key, key)) + break; + if (matches_position(&znode->zbranch[nn], lnum, offs)) { + *zn = znode; + *n = nn; + return 1; + } + } + + /* Look right */ + znode = *zn; + nn = *n; + while (1) { + err = tnc_next(c, &znode, &nn); + if (err == -ENOENT) + return 0; + if (err < 0) + return err; + if (keys_cmp(c, &znode->zbranch[nn].key, key)) + return 0; + *zn = znode; + *n = nn; + if (matches_position(&znode->zbranch[nn], lnum, offs)) + return 1; + } +} + +/** + * dirty_cow_bottom_up - dirty a znode and its ancestors. + * @c: UBIFS file-system description object + * @znode: znode to dirty + * + * If we do not have a unique key that resides in a znode, then we cannot + * dirty that znode from the top down (i.e. by using lookup_level0_dirty) + * This function records the path back to the last dirty ancestor, and then + * dirties the znodes on that path. + */ +static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + struct ubifs_znode *zp; + int *path = c->bottom_up_buf, p = 0; + + ubifs_assert(c->zroot.znode); + ubifs_assert(znode); + if (c->zroot.znode->level > BOTTOM_UP_HEIGHT) { + kfree(c->bottom_up_buf); + c->bottom_up_buf = kmalloc(c->zroot.znode->level * sizeof(int), + GFP_NOFS); + if (!c->bottom_up_buf) + return ERR_PTR(-ENOMEM); + path = c->bottom_up_buf; + } + if (c->zroot.znode->level) { + /* Go up until parent is dirty */ + while (1) { + int n; + + zp = znode->parent; + if (!zp) + break; + n = znode->iip; + ubifs_assert(p < c->zroot.znode->level); + path[p++] = n; + if (!zp->cnext && ubifs_zn_dirty(znode)) + break; + znode = zp; + } + } + + /* Come back down, dirtying as we go */ + while (1) { + struct ubifs_zbranch *zbr; + + zp = znode->parent; + if (zp) { + ubifs_assert(path[p - 1] >= 0); + ubifs_assert(path[p - 1] < zp->child_cnt); + zbr = &zp->zbranch[path[--p]]; + znode = dirty_cow_znode(c, zbr); + } else { + ubifs_assert(znode == c->zroot.znode); + znode = dirty_cow_znode(c, &c->zroot); + } + if (unlikely(IS_ERR(znode)) || !p) + break; + ubifs_assert(path[p - 1] >= 0); + ubifs_assert(path[p - 1] < znode->child_cnt); + znode = znode->zbranch[path[p - 1]].znode; + } + + return znode; +} + +/** + * ubifs_lookup_level0 - search for zero-level znode. + * @c: UBIFS file-system description object + * @key: key to lookup + * @zn: znode is returned here + * @n: znode branch slot number is returned here + * + * This function looks up the TNC tree and search for zero-level znode which + * refers key @key. The found zero-level znode is returned in @zn. There are 3 + * cases: + * o exact match, i.e. the found zero-level znode contains key @key, then %1 + * is returned and slot number of the matched branch is stored in @n; + * o not exact match, which means that zero-level znode does not contain + * @key, then %0 is returned and slot number of the closed branch is stored + * in @n; + * o @key is so small that it is even less than the lowest key of the + * leftmost zero-level node, then %0 is returned and %0 is stored in @n. + * + * Note, when the TNC tree is traversed, some znodes may be absent, then this + * function reads corresponding indexing nodes and inserts them to TNC. In + * case of failure, a negative error code is returned. + */ +int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_znode **zn, int *n) +{ + int err, exact; + struct ubifs_znode *znode; + unsigned long time = get_seconds(); + + dbg_tnc("search key %s", DBGKEY(key)); + + znode = c->zroot.znode; + if (unlikely(!znode)) { + znode = ubifs_load_znode(c, &c->zroot, NULL, 0); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + znode->time = time; + + while (1) { + struct ubifs_zbranch *zbr; + + exact = ubifs_search_zbranch(c, znode, key, n); + + if (znode->level == 0) + break; + + if (*n < 0) + *n = 0; + zbr = &znode->zbranch[*n]; + + if (zbr->znode) { + znode->time = time; + znode = zbr->znode; + continue; + } + + /* znode is not in TNC cache, load it from the media */ + znode = ubifs_load_znode(c, zbr, znode, *n); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + *zn = znode; + if (exact || !is_hash_key(c, key) || *n != -1) { + dbg_tnc("found %d, lvl %d, n %d", exact, znode->level, *n); + return exact; + } + + /* + * Here is a tricky place. We have not found the key and this is a + * "hashed" key, which may collide. The rest of the code deals with + * situations like this: + * + * | 3 | 5 | + * / \ + * | 3 | 5 | | 6 | 7 | (x) + * + * Or more a complex example: + * + * | 1 | 5 | + * / \ + * | 1 | 3 | | 5 | 8 | + * \ / + * | 5 | 5 | | 6 | 7 | (x) + * + * In the examples, if we are looking for key "5", we may reach nodes + * marked with "(x)". In this case what we have do is to look at the + * left and see if there is "5" key there. If there is, we have to + * return it. + * + * Note, this whole situation is possible because we allow to have + * elements which are equivalent to the next key in the parent in the + * children of current znode. For example, this happens if we split a + * znode like this: | 3 | 5 | 5 | 6 | 7 |, which results in something + * like this: + * | 3 | 5 | + * / \ + * | 3 | 5 | | 5 | 6 | 7 | + * ^ + * And this becomes what is at the first "picture" after key "5" marked + * with "^" is removed. What could be done is we could prohibit + * splitting in the middle of the colliding sequence. Also, when + * removing the leftmost key, we would have to correct the key of the + * parent node, which would introduce additional complications. Namely, + * if we changed the the leftmost key of the parent znode, the garbage + * collector would be unable to find it (GC is doing this when GC'ing + * indexing LEBs). Although we already have an additional RB-tree where + * we save such changed znodes (see 'ins_clr_old_idx_znode()') until + * after the commit. But anyway, this does not look easy to implement + * so we did not try this. + */ + err = tnc_prev(c, &znode, n); + if (err == -ENOENT) { + dbg_tnc("found 0, lvl %d, n -1", znode->level); + *n = -1; + return 0; + } + if (unlikely(err < 0)) + return err; + if (keys_cmp(c, key, &znode->zbranch[*n].key)) { + dbg_tnc("found 0, lvl %d, n -1", znode->level); + *n = -1; + return 0; + } + + dbg_tnc("found 1, lvl %d, n %d", znode->level, *n); + *zn = znode; + return 1; +} + +/** + * lookup_level0_dirty - search for zero-level znode dirtying. + * @c: UBIFS file-system description object + * @key: key to lookup + * @zn: znode is returned here + * @n: znode branch slot number is returned here + * + * This function looks up the TNC tree and search for zero-level znode which + * refers key @key. The found zero-level znode is returned in @zn. There are 3 + * cases: + * o exact match, i.e. the found zero-level znode contains key @key, then %1 + * is returned and slot number of the matched branch is stored in @n; + * o not exact match, which means that zero-level znode does not contain @key + * then %0 is returned and slot number of the closed branch is stored in + * @n; + * o @key is so small that it is even less than the lowest key of the + * leftmost zero-level node, then %0 is returned and %-1 is stored in @n. + * + * Additionally all znodes in the path from the root to the located zero-level + * znode are marked as dirty. + * + * Note, when the TNC tree is traversed, some znodes may be absent, then this + * function reads corresponding indexing nodes and inserts them to TNC. In + * case of failure, a negative error code is returned. + */ +static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_znode **zn, int *n) +{ + int err, exact; + struct ubifs_znode *znode; + unsigned long time = get_seconds(); + + dbg_tnc("search and dirty key %s", DBGKEY(key)); + + znode = c->zroot.znode; + if (unlikely(!znode)) { + znode = ubifs_load_znode(c, &c->zroot, NULL, 0); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + znode = dirty_cow_znode(c, &c->zroot); + if (IS_ERR(znode)) + return PTR_ERR(znode); + + znode->time = time; + + while (1) { + struct ubifs_zbranch *zbr; + + exact = ubifs_search_zbranch(c, znode, key, n); + + if (znode->level == 0) + break; + + if (*n < 0) + *n = 0; + zbr = &znode->zbranch[*n]; + + if (zbr->znode) { + znode->time = time; + znode = dirty_cow_znode(c, zbr); + if (IS_ERR(znode)) + return PTR_ERR(znode); + continue; + } + + /* znode is not in TNC cache, load it from the media */ + znode = ubifs_load_znode(c, zbr, znode, *n); + if (IS_ERR(znode)) + return PTR_ERR(znode); + znode = dirty_cow_znode(c, zbr); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + *zn = znode; + if (exact || !is_hash_key(c, key) || *n != -1) { + dbg_tnc("found %d, lvl %d, n %d", exact, znode->level, *n); + return exact; + } + + /* + * See huge comment at 'lookup_level0_dirty()' what is the rest of the + * code. + */ + err = tnc_prev(c, &znode, n); + if (err == -ENOENT) { + *n = -1; + dbg_tnc("found 0, lvl %d, n -1", znode->level); + return 0; + } + if (unlikely(err < 0)) + return err; + if (keys_cmp(c, key, &znode->zbranch[*n].key)) { + *n = -1; + dbg_tnc("found 0, lvl %d, n -1", znode->level); + return 0; + } + + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + dbg_tnc("found 1, lvl %d, n %d", znode->level, *n); + *zn = znode; + return 1; +} + +/** + * maybe_leb_gced - determine if a LEB may have been garbage collected. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @gc_seq1: garbage collection sequence number + * + * This function determines if @lnum may have been garbage collected since + * sequence number @gc_seq1. If it may have been then %1 is returned, otherwise + * %0 is returned. + */ +static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1) +{ + int gc_seq2, gced_lnum; + + gced_lnum = c->gced_lnum; + smp_rmb(); + gc_seq2 = c->gc_seq; + /* Same seq means no GC */ + if (gc_seq1 == gc_seq2) + return 0; + /* Different by more than 1 means we don't know */ + if (gc_seq1 + 1 != gc_seq2) + return 1; + /* + * We have seen the sequence number has increased by 1. Now we need to + * be sure we read the right LEB number, so read it again. + */ + smp_rmb(); + if (gced_lnum != c->gced_lnum) + return 1; + /* Finally we can check lnum */ + if (gced_lnum == lnum) + return 1; + return 0; +} + +/** + * ubifs_tnc_locate - look up a file-system node and return it and its location. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * @lnum: LEB number is returned here + * @offs: offset is returned here + * + * This function look up and reads node with key @key. The caller has to make + * sure the @node buffer is large enough to fit the node. Returns zero in case + * of success, %-ENOENT if the node was not found, and a negative error code in + * case of failure. The node location can be returned in @lnum and @offs. + */ +int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, + void *node, int *lnum, int *offs) +{ + int found, n, err, safely = 0, gc_seq1; + struct ubifs_znode *znode; + struct ubifs_zbranch zbr, *zt; + +again: + mutex_lock(&c->tnc_mutex); + found = ubifs_lookup_level0(c, key, &znode, &n); + if (!found) { + err = -ENOENT; + goto out; + } else if (found < 0) { + err = found; + goto out; + } + zt = &znode->zbranch[n]; + if (lnum) { + *lnum = zt->lnum; + *offs = zt->offs; + } + if (is_hash_key(c, key)) { + /* + * In this case the leaf node cache gets used, so we pass the + * address of the zbranch and keep the mutex locked + */ + err = tnc_read_node_nm(c, zt, node); + goto out; + } + if (safely) { + err = ubifs_tnc_read_node(c, zt, node); + goto out; + } + /* Drop the TNC mutex prematurely and race with garbage collection */ + zbr = znode->zbranch[n]; + gc_seq1 = c->gc_seq; + mutex_unlock(&c->tnc_mutex); + + if (ubifs_get_wbuf(c, zbr.lnum)) { + /* We do not GC journal heads */ + err = ubifs_tnc_read_node(c, &zbr, node); + return err; + } + + err = fallible_read_node(c, key, &zbr, node); + if (err <= 0 || maybe_leb_gced(c, zbr.lnum, gc_seq1)) { + /* + * The node may have been GC'ed out from under us so try again + * while keeping the TNC mutex locked. + */ + safely = 1; + goto again; + } + return 0; + +out: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * do_lookup_nm- look up a "hashed" node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * @nm: node name + * + * This function look up and reads a node which contains name hash in the key. + * Since the hash may have collisions, there may be many nodes with the same + * key, so we have to sequentially look to all of them until the needed one is + * found. This function returns zero in case of success, %-ENOENT if the node + * was not found, and a negative error code in case of failure. + */ +static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, + void *node, const struct qstr *nm) +{ + int found, n, err; + struct ubifs_znode *znode; + + dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key)); + mutex_lock(&c->tnc_mutex); + found = ubifs_lookup_level0(c, key, &znode, &n); + if (!found) { + err = -ENOENT; + goto out_unlock; + } else if (found < 0) { + err = found; + goto out_unlock; + } + + ubifs_assert(n >= 0); + + err = resolve_collision(c, key, &znode, &n, nm); + dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n); + if (unlikely(err < 0)) + goto out_unlock; + if (err == 0) { + err = -ENOENT; + goto out_unlock; + } + + err = tnc_read_node_nm(c, &znode->zbranch[n], node); + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_lookup_nm - look up a "hashed" node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * @nm: node name + * + * This function look up and reads a node which contains name hash in the key. + * Since the hash may have collisions, there may be many nodes with the same + * key, so we have to sequentially look to all of them until the needed one is + * found. This function returns zero in case of success, %-ENOENT if the node + * was not found, and a negative error code in case of failure. + */ +int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, + void *node, const struct qstr *nm) +{ + int err, len; + const struct ubifs_dent_node *dent = node; + + /* + * We assume that in most of the cases there are no name collisions and + * 'ubifs_tnc_lookup()' returns us the right direntry. + */ + err = ubifs_tnc_lookup(c, key, node); + if (err) + return err; + + len = le16_to_cpu(dent->nlen); + if (nm->len == len && !memcmp(dent->name, nm->name, len)) + return 0; + + /* + * Unluckily, there are hash collisions and we have to iterate over + * them look at each direntry with colliding name hash sequentially. + */ + return do_lookup_nm(c, key, node, nm); +} + +/** + * correct_parent_keys - correct parent znodes' keys. + * @c: UBIFS file-system description object + * @znode: znode to correct parent znodes for + * + * This is a helper function for 'tnc_insert()'. When the key of the leftmost + * zbranch changes, keys of parent znodes have to be corrected. This helper + * function is called in such situations and corrects the keys if needed. + */ +static void correct_parent_keys(const struct ubifs_info *c, + struct ubifs_znode *znode) +{ + union ubifs_key *key, *key1; + + ubifs_assert(znode->parent); + ubifs_assert(znode->iip == 0); + + key = &znode->zbranch[0].key; + key1 = &znode->parent->zbranch[0].key; + + while (keys_cmp(c, key, key1) < 0) { + key_copy(c, key, key1); + znode = znode->parent; + znode->alt = 1; + if (!znode->parent || znode->iip) + break; + key1 = &znode->parent->zbranch[0].key; + } +} + +/** + * insert_zbranch - insert a zbranch into a znode. + * @znode: znode into which to insert + * @zbr: zbranch to insert + * @n: slot number to insert to + * + * This is a helper function for 'tnc_insert()'. UBIFS does not allow "gaps" in + * znode's array of zbranches and keeps zbranches consolidated, so when a new + * zbranch has to be inserted to the @znode->zbranches[]' array at the @n-th + * slot, zbranches starting from @n have to be moved right. + */ +static void insert_zbranch(struct ubifs_znode *znode, + const struct ubifs_zbranch *zbr, int n) +{ + int i; + + ubifs_assert(ubifs_zn_dirty(znode)); + + if (znode->level) { + for (i = znode->child_cnt; i > n; i--) { + znode->zbranch[i] = znode->zbranch[i - 1]; + if (znode->zbranch[i].znode) + znode->zbranch[i].znode->iip = i; + } + if (zbr->znode) + zbr->znode->iip = n; + } else + for (i = znode->child_cnt; i > n; i--) + znode->zbranch[i] = znode->zbranch[i - 1]; + + znode->zbranch[n] = *zbr; + znode->child_cnt += 1; + + /* + * After inserting at slot zero, the lower bound of the key range of + * this znode may have changed. If this znode is subsequently split + * then the upper bound of the key range may change, and furthermore + * it could change to be lower than the original lower bound. If that + * happens, then it will no longer be possible to find this znode in the + * TNC using the key from the index node on flash. That is bad because + * if it is not found, we will assume it is obsolete and may overwrite + * it. Then if there is an unclean unmount, we will start using the + * old index which will be broken. + * + * So we first mark znodes that have insertions at slot zero, and then + * if they are split we add their lnum/offs to the old_idx tree. + */ + if (n == 0) + znode->alt = 1; +} + +/** + * tnc_insert - insert a node into TNC. + * @c: UBIFS file-system description object + * @znode: znode to insert into + * @zbr: branch to insert + * @n: slot number to insert new zbranch to + * + * This function inserts a new node described by @zbr into znode @znode. If + * znode does not have a free slot for new zbranch, it is split. Parent znodes + * are splat as well if needed. Returns zero in case of success or a negative + * error code in case of failure. + */ +static int tnc_insert(struct ubifs_info *c, struct ubifs_znode *znode, + struct ubifs_zbranch *zbr, int n) +{ + struct ubifs_znode *zn, *zi, *zp; + int i, keep, move, appending = 0; + union ubifs_key *key = &zbr->key; + + ubifs_assert(n >= 0 && n <= c->fanout); + + /* Implement naive insert for now */ +again: + zp = znode->parent; + if (znode->child_cnt < c->fanout) { + ubifs_assert(n != c->fanout); + dbg_tnc("inserted at %d level %d, key %s", n, znode->level, + DBGKEY(key)); + + insert_zbranch(znode, zbr, n); + + /* Ensure parent's key is correct */ + if (n == 0 && zp && znode->iip == 0) + correct_parent_keys(c, znode); + + return 0; + } + + /* + * Unfortunately, @znode does not have more empty slots and we have to + * split it. + */ + dbg_tnc("splitting level %d, key %s", znode->level, DBGKEY(key)); + + if (znode->alt) + /* + * We can no longer be sure of finding this znode by key, so we + * record it in the old_idx tree. + */ + ins_clr_old_idx_znode(c, znode); + + zn = kzalloc(c->max_znode_sz, GFP_NOFS); + if (!zn) + return -ENOMEM; + zn->parent = zp; + zn->level = znode->level; + + /* Decide where to split */ + if (znode->level == 0 && n == c->fanout && + key_type(c, key) == UBIFS_DATA_KEY) { + union ubifs_key *key1; + + /* + * If this is an inode which is being appended - do not split + * it because no other zbranches can be inserted between + * zbranches of consecutive data nodes anyway. + */ + key1 = &znode->zbranch[n - 1].key; + if (key_inum(c, key1) == key_inum(c, key) && + key_type(c, key1) == UBIFS_DATA_KEY && + key_block(c, key1) == key_block(c, key) - 1) + appending = 1; + } + + if (appending) { + keep = c->fanout; + move = 0; + } else { + keep = (c->fanout + 1) / 2; + move = c->fanout - keep; + } + + /* + * Although we don't at present, we could look at the neighbors and see + * if we can move some zbranches there. + */ + + if (n < keep) { + /* Insert into existing znode */ + zi = znode; + move += 1; + keep -= 1; + } else { + /* Insert into new znode */ + zi = zn; + n -= keep; + /* Re-parent */ + if (zn->level != 0) + zbr->znode->parent = zn; + } + + __set_bit(DIRTY_ZNODE, &zn->flags); + atomic_long_inc(&c->dirty_zn_cnt); + + zn->child_cnt = move; + znode->child_cnt = keep; + + dbg_tnc("moving %d, keeping %d", move, keep); + + /* Move zbranch */ + for (i = 0; i < move; i++) { + zn->zbranch[i] = znode->zbranch[keep + i]; + /* Re-parent */ + if (zn->level != 0) + if (zn->zbranch[i].znode) { + zn->zbranch[i].znode->parent = zn; + zn->zbranch[i].znode->iip = i; + } + } + + /* Insert new key and branch */ + dbg_tnc("inserting at %d level %d, key %s", n, zn->level, DBGKEY(key)); + + insert_zbranch(zi, zbr, n); + + /* Insert new znode (produced by spitting) into the parent */ + if (zp) { + i = n; + /* Locate insertion point */ + n = znode->iip + 1; + if (appending && n != c->fanout) + appending = 0; + + if (i == 0 && zi == znode && znode->iip == 0) + correct_parent_keys(c, znode); + + /* Tail recursion */ + zbr->key = zn->zbranch[0].key; + zbr->znode = zn; + zbr->lnum = 0; + zbr->offs = 0; + zbr->len = 0; + znode = zp; + + goto again; + } + + /* We have to split root znode */ + dbg_tnc("creating new zroot at level %d", znode->level + 1); + + zi = kzalloc(c->max_znode_sz, GFP_NOFS); + if (!zi) + return -ENOMEM; + + zi->child_cnt = 2; + zi->level = znode->level + 1; + + __set_bit(DIRTY_ZNODE, &zi->flags); + atomic_long_inc(&c->dirty_zn_cnt); + + zi->zbranch[0].key = znode->zbranch[0].key; + zi->zbranch[0].znode = znode; + zi->zbranch[0].lnum = c->zroot.lnum; + zi->zbranch[0].offs = c->zroot.offs; + zi->zbranch[0].len = c->zroot.len; + zi->zbranch[1].key = zn->zbranch[0].key; + zi->zbranch[1].znode = zn; + + c->zroot.lnum = 0; + c->zroot.offs = 0; + c->zroot.len = 0; + c->zroot.znode = zi; + + zn->parent = zi; + zn->iip = 1; + znode->parent = zi; + znode->iip = 0; + + return 0; +} + +/** + * ubifs_tnc_add - add a node to TNC. + * @c: UBIFS file-system description object + * @key: key to add + * @lnum: LEB number of node + * @offs: node offset + * @len: node length + * + * This function adds a node with key @key to TNC. The node may be new or it may + * obsolete some existing one. Returns %0 on success or negative error code on + * failure. + */ +int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, + int offs, int len) +{ + int found, n, err = 0; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnc("%d:%d, len %d, key %s", lnum, offs, len, DBGKEY(key)); + found = lookup_level0_dirty(c, key, &znode, &n); + if (!found) { + struct ubifs_zbranch zbr; + + zbr.znode = NULL; + zbr.lnum = lnum; + zbr.offs = offs; + zbr.len = len; + key_copy(c, key, &zbr.key); + err = tnc_insert(c, znode, &zbr, n + 1); + } else if (found == 1) { + struct ubifs_zbranch *zbr = &znode->zbranch[n]; + + lnc_free(zbr); + err = ubifs_add_dirt(c, zbr->lnum, zbr->len); + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + } else + err = found; + if (!err) + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + + return err; +} + +/** + * ubifs_tnc_replace - replace a node in the TNC only if the old node is found. + * @c: UBIFS file-system description object + * @key: key to add + * @old_lnum: LEB number of old node + * @old_offs: old node offset + * @lnum: LEB number of node + * @offs: node offset + * @len: node length + * + * This function replaces a node with key @key in the TNC only if the old node + * is found. This function is called by garbage collection when node are moved. + * Returns %0 on success or negative error code on failure. + */ +int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key, + int old_lnum, int old_offs, int lnum, int offs, int len) +{ + int found, n, err = 0; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnc("old LEB %d:%d, new LEB %d:%d, len %d, key %s", old_lnum, + old_offs, lnum, offs, len, DBGKEY(key)); + found = lookup_level0_dirty(c, key, &znode, &n); + if (found < 0) { + err = found; + goto out_unlock; + } + + if (found == 1) { + struct ubifs_zbranch *zbr = &znode->zbranch[n]; + + found = 0; + if (zbr->lnum == old_lnum && zbr->offs == old_offs) { + lnc_free(zbr); + err = ubifs_add_dirt(c, zbr->lnum, zbr->len); + if (err) + goto out_unlock; + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + found = 1; + } else if (is_hash_key(c, key)) { + found = resolve_collision_directly(c, key, &znode, &n, + old_lnum, old_offs); + dbg_tnc("rc returned %d, znode %p, n %d, LEB %d:%d", + found, znode, n, old_lnum, old_offs); + if (found < 0) { + err = found; + goto out_unlock; + } + + if (found) { + /* Ensure the znode is dirtied */ + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, + znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + } + zbr = &znode->zbranch[n]; + lnc_free(zbr); + err = ubifs_add_dirt(c, zbr->lnum, + zbr->len); + if (err) + goto out_unlock; + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + } + } + } + + if (!found) + err = ubifs_add_dirt(c, lnum, len); + + if (!err) + err = dbg_check_tnc(c, 0); + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_add_nm - add a "hashed" node to TNC. + * @c: UBIFS file-system description object + * @key: key to add + * @lnum: LEB number of node + * @offs: node offset + * @len: node length + * @nm: node name + * + * This is the same as 'ubifs_tnc_add()' but it should be used with keys which + * may have collisions, like directory entry keys. + */ +int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, + int lnum, int offs, int len, const struct qstr *nm) +{ + int found, n, err = 0; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnc("LEB %d:%d, name '%.*s', key %s", lnum, offs, nm->len, nm->name, + DBGKEY(key)); + found = lookup_level0_dirty(c, key, &znode, &n); + if (found < 0) { + err = found; + goto out_unlock; + } + + if (found == 1) { + if (c->replaying) + found = fallible_resolve_collision(c, key, &znode, &n, + nm, 1); + else + found = resolve_collision(c, key, &znode, &n, nm); + dbg_tnc("rc returned %d, znode %p, n %d", found, znode, n); + if (found < 0) { + err = found; + goto out_unlock; + } + + /* Ensure the znode is dirtied */ + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + } + + if (found == 1) { + struct ubifs_zbranch *zbr = &znode->zbranch[n]; + + lnc_free(zbr); + err = ubifs_add_dirt(c, zbr->lnum, zbr->len); + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + goto out_unlock; + } + } + + if (!found) { + struct ubifs_zbranch zbr; + + zbr.znode = NULL; + zbr.lnum = lnum; + zbr.offs = offs; + zbr.len = len; + key_copy(c, key, &zbr.key); + err = tnc_insert(c, znode, &zbr, n + 1); + if (err) + goto out_unlock; + if (c->replaying) { + /* + * We did not find it in the index so there may be a + * dangling branch still in the index. So we remove it + * by passing 'ubifs_tnc_remove_nm()' the same key but + * an unmatchable name. + */ + struct qstr noname = { .len = 0, .name = "" }; + + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + if (err) + return err; + return ubifs_tnc_remove_nm(c, key, &noname); + } + } + +out_unlock: + if (!err) + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * tnc_delete - delete a znode form TNC. + * @c: UBIFS file-system description object + * @znode: znode to delete from + * @n: zbranch slot number to delete + * + * This function deletes a leaf node from @n-th slot of @znode. Returns zero in + * case of success and a negative error code in case of failure. + */ +static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n) +{ + struct ubifs_zbranch *zbr; + struct ubifs_znode *zp; + int i, err; + + /* Delete without merge for now */ + ubifs_assert(znode->level == 0); + ubifs_assert(n >= 0 && n < c->fanout); + dbg_tnc("deleting %s", DBGKEY(&znode->zbranch[n].key)); + + zbr = &znode->zbranch[n]; + lnc_free(zbr); + + err = ubifs_add_dirt(c, zbr->lnum, zbr->len); + if (err) { + dbg_dump_znode(c, znode); + return err; + } + + /* We do not "gap" zbranch slots */ + for (i = n; i < znode->child_cnt - 1; i++) + znode->zbranch[i] = znode->zbranch[i + 1]; + znode->child_cnt -= 1; + + if (znode->child_cnt > 0) + return 0; + + /* + * This was the last zbranch, we have to delete this znode from the + * parent. + */ + + do { + ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags)); + ubifs_assert(ubifs_zn_dirty(znode)); + + zp = znode->parent; + n = znode->iip; + + atomic_long_dec(&c->dirty_zn_cnt); + + err = insert_old_idx_znode(c, znode); + if (err) + return err; + + if (znode->cnext) { + __set_bit(OBSOLETE_ZNODE, &znode->flags); + atomic_long_inc(&c->clean_zn_cnt); + atomic_long_inc(&ubifs_clean_zn_cnt); + } else + kfree(znode); + znode = zp; + } while (znode->child_cnt == 1); /* while removing last child */ + + /* Remove from znode, entry n - 1 */ + znode->child_cnt -= 1; + ubifs_assert(znode->level != 0); + for (i = n; i < znode->child_cnt; i++) { + znode->zbranch[i] = znode->zbranch[i + 1]; + if (znode->zbranch[i].znode) + znode->zbranch[i].znode->iip = i; + } + + /* + * If this is the root and it has only 1 child then + * collapse the tree. + */ + if (!znode->parent) { + while (znode->child_cnt == 1 && znode->level != 0) { + zp = znode; + zbr = &znode->zbranch[0]; + znode = get_znode(c, znode, 0); + if (IS_ERR(znode)) + return PTR_ERR(znode); + znode = dirty_cow_znode(c, zbr); + if (IS_ERR(znode)) + return PTR_ERR(znode); + znode->parent = NULL; + znode->iip = 0; + if (c->zroot.len) { + err = insert_old_idx(c, c->zroot.lnum, + c->zroot.offs); + if (err) + return err; + } + c->zroot.lnum = zbr->lnum; + c->zroot.offs = zbr->offs; + c->zroot.len = zbr->len; + c->zroot.znode = znode; + ubifs_assert(!test_bit(OBSOLETE_ZNODE, + &zp->flags)); + ubifs_assert(test_bit(DIRTY_ZNODE, &zp->flags)); + atomic_long_dec(&c->dirty_zn_cnt); + + if (zp->cnext) { + __set_bit(OBSOLETE_ZNODE, &zp->flags); + atomic_long_inc(&c->clean_zn_cnt); + atomic_long_inc(&ubifs_clean_zn_cnt); + } else + kfree(zp); + } + } + + return 0; +} + +/** + * ubifs_tnc_remove - remove an index entry of a node. + * @c: UBIFS file-system description object + * @key: key of node + * + * Returns %0 on success or negative error code on failure. + */ +int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key) +{ + int found, n, err = 0; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnc("key %s", DBGKEY(key)); + found = lookup_level0_dirty(c, key, &znode, &n); + if (found < 0) { + err = found; + goto out_unlock; + } + if (found == 1) + err = tnc_delete(c, znode, n); + if (!err) + err = dbg_check_tnc(c, 0); + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_remove_nm - remove an index entry for a "hashed" node. + * @c: UBIFS file-system description object + * @key: key of node + * @nm: directory entry name + * + * Returns %0 on success or negative error code on failure. + */ +int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, + const struct qstr *nm) +{ + int n, err; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnc("%.*s, key %s", nm->len, nm->name, DBGKEY(key)); + err = lookup_level0_dirty(c, key, &znode, &n); + if (err < 0) + goto out_unlock; + + if (err) { + if (c->replaying) + err = fallible_resolve_collision(c, key, &znode, &n, + nm, 0); + else + err = resolve_collision(c, key, &znode, &n, nm); + dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n); + if (err < 0) + goto out_unlock; + if (err) { + /* Ensure the znode is dirtied */ + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + } + err = tnc_delete(c, znode, n); + } + } + +out_unlock: + if (!err) + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * key_in_range - determine if a key falls within a range of keys. + * @c: UBIFS file-system description object + * @key: key to check + * @from_key: lowest key in range + * @to_key: highest key in range + * + * This function returns %1 if the key is in range and %0 otherwise. + */ +static int key_in_range(struct ubifs_info *c, union ubifs_key *key, + union ubifs_key *from_key, union ubifs_key *to_key) +{ + if (keys_cmp(c, key, from_key) < 0) + return 0; + if (keys_cmp(c, key, to_key) > 0) + return 0; + return 1; +} + +/** + * ubifs_tnc_remove_range - remove index entries in range. + * @c: UBIFS file-system description object + * @from_key: lowest key to remove + * @to_key: highest key to remove + * + * This function removes index entries starting at @from_key and ending at + * @to_key. This function returns zero in case of success and a negative error + * code in case of failure. + */ +int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key, + union ubifs_key *to_key) +{ + int i, n, k, err = 0; + struct ubifs_znode *znode; + union ubifs_key *key; + + mutex_lock(&c->tnc_mutex); + while (1) { + /* Find first level 0 znode that contains keys to remove */ + err = ubifs_lookup_level0(c, from_key, &znode, &n); + if (err < 0) + goto out_unlock; + + if (err) + key = from_key; + else { + err = tnc_next(c, &znode, &n); + if (err == -ENOENT) { + err = 0; + goto out_unlock; + } + if (err < 0) + goto out_unlock; + key = &znode->zbranch[n].key; + if (!key_in_range(c, key, from_key, to_key)) { + err = 0; + goto out_unlock; + } + } + + /* Ensure the znode is dirtied */ + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + } + + /* Remove all keys in range except the first */ + for (i = n + 1, k = 0; i < znode->child_cnt; i++, k++) { + key = &znode->zbranch[i].key; + if (!key_in_range(c, key, from_key, to_key)) + break; + lnc_free(&znode->zbranch[i]); + err = ubifs_add_dirt(c, znode->zbranch[i].lnum, + znode->zbranch[i].len); + if (err) { + dbg_dump_znode(c, znode); + goto out_unlock; + } + dbg_tnc("removing %s", DBGKEY(key)); + } + if (k) { + for (i = n + 1 + k; i < znode->child_cnt; i++) + znode->zbranch[i - k] = znode->zbranch[i]; + znode->child_cnt -= k; + } + + /* Now delete the first */ + err = tnc_delete(c, znode, n); + if (err) + goto out_unlock; + } + +out_unlock: + if (!err) + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_remove_ino - remove an inode from TNC. + * @c: UBIFS file-system description object + * @inum: inode number to remove + * + * This function remove inode @inum and all the extended attributes associated + * with the anode from TNC and returns zero in case of success or a negative + * error code in case of failure. + */ +int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) +{ + union ubifs_key key1, key2; + struct ubifs_dent_node *xent, *pxent = NULL; + struct qstr nm = { .name = NULL }; + + dbg_tnc("ino %lu", inum); + + /* + * Walk all extended attribute entries and remove them together with + * corresponding extended attribute inodes. + */ + lowest_xent_key(c, &key1, inum); + while (1) { + ino_t xattr_inum; + int err; + + xent = ubifs_tnc_next_ent(c, &key1, &nm); + if (IS_ERR(xent)) { + err = PTR_ERR(xent); + if (err == -ENOENT) + break; + return err; + } + + xattr_inum = le64_to_cpu(xent->inum); + dbg_tnc("xent '%s', ino %lu", xent->name, xattr_inum); + + nm.name = xent->name; + nm.len = le16_to_cpu(xent->nlen); + err = ubifs_tnc_remove_nm(c, &key1, &nm); + if (err) { + kfree(xent); + return err; + } + + lowest_ino_key(c, &key1, xattr_inum); + highest_ino_key(c, &key2, xattr_inum); + err = ubifs_tnc_remove_range(c, &key1, &key2); + if (err) { + kfree(xent); + return err; + } + + kfree(pxent); + pxent = xent; + key_read(c, &xent->key, &key1); + } + + kfree(pxent); + lowest_ino_key(c, &key1, inum); + highest_ino_key(c, &key2, inum); + + return ubifs_tnc_remove_range(c, &key1, &key2); +} + +/** + * ubifs_tnc_next_ent - walk directory or extended attribute entries. + * @c: UBIFS file-system description object + * @key: key of last entry + * @nm: name of last entry found or %NULL + * + * This function finds and reads the next directory or extended attribute entry + * after the given key (@key) if there is one. @nm is used to resolve + * collisions. + * + * If the name of the current entry is not known and only the key is known, + * @nm->name has to be %NULL. In this case the semantics of this function is a + * little bit different and it returns the entry corresponding to this key, not + * the next one. If the key was not found, the closest "right" entry is + * returned. + * + * If the fist entry has to be found, @key has to contain the lowest possible + * key value for this inode and @name has to be %NULL. + * + * This function returns the found directory or extended attribute entry node + * in case of success, %-ENOENT is returned if no entry was found, and a + * negative error code is returned in case of failure. + */ +struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, + union ubifs_key *key, + const struct qstr *nm) +{ + int n, err, type = key_type(c, key); + struct ubifs_znode *znode; + struct ubifs_dent_node *dent; + struct ubifs_zbranch *zbr; + union ubifs_key *dkey; + + dbg_tnc("%s %s", nm->name ? (char *)nm->name : "(lowest)", DBGKEY(key)); + ubifs_assert(is_hash_key(c, key)); + + mutex_lock(&c->tnc_mutex); + err = ubifs_lookup_level0(c, key, &znode, &n); + if (unlikely(err < 0)) + goto out_unlock; + + if (nm->name) { + if (err) { + /* Handle collisions */ + err = resolve_collision(c, key, &znode, &n, nm); + dbg_tnc("rc returned %d, znode %p, n %d", + err, znode, n); + if (unlikely(err < 0)) + goto out_unlock; + } + + /* Now find next entry */ + err = tnc_next(c, &znode, &n); + if (unlikely(err)) + goto out_unlock; + } else { + /* + * The full name of the entry was not given, in which case the + * behavior of this function is a little different and it + * returns current entry, not the next one. + */ + if (!err) { + /* + * However, the given key does not exist in the TNC + * tree and @znode/@n variables contain the closest + * "preceding" element. Switch to the next one. + */ + err = tnc_next(c, &znode, &n); + if (err) + goto out_unlock; + } + } + + zbr = &znode->zbranch[n]; + dent = kmalloc(zbr->len, GFP_NOFS); + if (unlikely(!dent)) { + err = -ENOMEM; + goto out_unlock; + } + + /* + * The above 'tnc_next()' call could lead us to the next inode, check + * this. + */ + dkey = &zbr->key; + if (key_inum(c, dkey) != key_inum(c, key) || + key_type(c, dkey) != type) { + err = -ENOENT; + goto out_free; + } + + err = tnc_read_node_nm(c, zbr, dent); + if (unlikely(err)) + goto out_free; + + mutex_unlock(&c->tnc_mutex); + return dent; + +out_free: + kfree(dent); +out_unlock: + mutex_unlock(&c->tnc_mutex); + return ERR_PTR(err); +} + +/** + * tnc_destroy_cnext - destroy left-over obsolete znodes from a failed commit. + * @c: UBIFS file-system description object + * + * Destroy left-over obsolete znodes from a failed commit. + */ +static void tnc_destroy_cnext(struct ubifs_info *c) +{ + struct ubifs_znode *cnext; + + if (!c->cnext) + return; + ubifs_assert(c->cmt_state == COMMIT_BROKEN); + cnext = c->cnext; + do { + struct ubifs_znode *znode = cnext; + + cnext = cnext->cnext; + if (test_bit(OBSOLETE_ZNODE, &znode->flags)) + kfree(znode); + } while (cnext && cnext != c->cnext); +} + +/** + * ubifs_tnc_close - close TNC subsystem and free all related resources. + * @c: UBIFS file-system description object + */ +void ubifs_tnc_close(struct ubifs_info *c) +{ + long clean_freed; + + tnc_destroy_cnext(c); + if (c->zroot.znode) { + clean_freed = ubifs_destroy_tnc_subtree(c->zroot.znode); + atomic_long_sub(clean_freed, &ubifs_clean_zn_cnt); + } + kfree(c->gap_lebs); + kfree(c->ilebs); + destroy_old_idx(c); +} + +/** + * left_znode - get the znode to the left. + * @c: UBIFS file-system description object + * @znode: znode + * + * This function returns a pointer to the znode to the left of @znode or NULL if + * there is not one. A negative error code is returned on failure. + */ +static struct ubifs_znode *left_znode(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + int level = znode->level; + + while (1) { + int n = znode->iip - 1; + + /* Go up until we can go left */ + znode = znode->parent; + if (!znode) + return NULL; + if (n >= 0) { + /* Now go down the rightmost branch to 'level' */ + znode = get_znode(c, znode, n); + if (IS_ERR(znode)) + return znode; + while (znode->level != level) { + n = znode->child_cnt - 1; + znode = get_znode(c, znode, n); + if (IS_ERR(znode)) + return znode; + } + break; + } + } + return znode; +} + +/** + * right_znode - get the znode to the right. + * @c: UBIFS file-system description object + * @znode: znode + * + * This function returns a pointer to the znode to the right of @znode or NULL + * if there is not one. A negative error code is returned on failure. + */ +static struct ubifs_znode *right_znode(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + int level = znode->level; + + while (1) { + int n = znode->iip + 1; + + /* Go up until we can go right */ + znode = znode->parent; + if (!znode) + return NULL; + if (n < znode->child_cnt) { + /* Now go down the leftmost branch to 'level' */ + znode = get_znode(c, znode, n); + if (IS_ERR(znode)) + return znode; + while (znode->level != level) { + znode = get_znode(c, znode, 0); + if (IS_ERR(znode)) + return znode; + } + break; + } + } + return znode; +} + +/** + * lookup_znode - find a particular indexing node from TNC. + * @c: UBIFS file-system description object + * @key: index node key to lookup + * @level: index node level + * @lnum: index node LEB number + * @offs: index node offset + * + * This function searches an indexing node by its first key @key and its + * address @lnum:@offs. It looks up the indexing tree by pulling all indexing + * nodes it traverses to TNC. This function is called fro indexing nodes which + * were found on the media by scanning, for example when garbage-collecting or + * when doing in-the-gaps commit. This means that the indexing node which is + * looked for does not have to have exactly the same leftmost key @key, because + * the leftmost key may have been changed, in which case TNC will contain a + * dirty znode which still refers the same @lnum:@offs. This function is clever + * enough to recognize such indexing nodes. + * + * Note, if a znode was deleted or changed too much, then this function will + * not find it. For situations like this UBIFS has the old index RB-tree + * (indexed by @lnum:@offs). + * + * This function returns a pointer to the znode found or %NULL if it is not + * found. A negative error code is returned on failure. + */ +static struct ubifs_znode *lookup_znode(struct ubifs_info *c, + union ubifs_key *key, int level, + int lnum, int offs) +{ + struct ubifs_znode *znode, *zn; + int n, nn; + + /* + * The arguments have probably been read off flash, so don't assume + * they are valid. + */ + if (level < 0) + return ERR_PTR(-EINVAL); + + /* Get the root znode */ + znode = c->zroot.znode; + if (!znode) { + znode = ubifs_load_znode(c, &c->zroot, NULL, 0); + if (IS_ERR(znode)) + return znode; + } + /* Check if it is the one we are looking for */ + if (c->zroot.lnum == lnum && c->zroot.offs == offs) + return znode; + /* Descend to the parent level i.e. (level + 1) */ + if (level >= znode->level) + return NULL; + while (1) { + ubifs_search_zbranch(c, znode, key, &n); + if (n < 0) { + /* + * We reached a znode where the leftmost key is greater + * than the key we are searching for. This is the same + * situation as the one described in a huge comment at + * the end of the 'ubifs_lookup_level0()' function. And + * for exactly the same reasons we have to try to look + * left before giving up. + */ + znode = left_znode(c, znode); + if (!znode) + return NULL; + if (IS_ERR(znode)) + return znode; + ubifs_search_zbranch(c, znode, key, &n); + ubifs_assert(n >= 0); + } + if (znode->level == level + 1) + break; + znode = get_znode(c, znode, n); + if (IS_ERR(znode)) + return znode; + } + /* Check if the child is the one we are looking for */ + if (znode->zbranch[n].lnum == lnum && znode->zbranch[n].offs == offs) + return get_znode(c, znode, n); + /* If the key is unique, there is nowhere else to look */ + if (!is_hash_key(c, key)) + return NULL; + /* + * The key is not unique and so may be also in the znodes to either + * side. + */ + zn = znode; + nn = n; + /* Look left */ + while (1) { + /* Move one branch to the left */ + if (n) + n -= 1; + else { + znode = left_znode(c, znode); + if (!znode) + break; + if (IS_ERR(znode)) + return znode; + n = znode->child_cnt - 1; + } + /* Check it */ + if (znode->zbranch[n].lnum == lnum && + znode->zbranch[n].offs == offs) + return get_znode(c, znode, n); + /* Stop if the key is less than the one we are looking for */ + if (keys_cmp(c, &znode->zbranch[n].key, key) < 0) + break; + } + /* Back to the middle */ + znode = zn; + n = nn; + /* Look right */ + while (1) { + /* Move one branch to the right */ + if (++n >= znode->child_cnt) { + znode = right_znode(c, znode); + if (!znode) + break; + if (IS_ERR(znode)) + return znode; + n = 0; + } + /* Check it */ + if (znode->zbranch[n].lnum == lnum && + znode->zbranch[n].offs == offs) + return get_znode(c, znode, n); + /* Stop if the key is greater than the one we are looking for */ + if (keys_cmp(c, &znode->zbranch[n].key, key) > 0) + break; + } + return NULL; +} + +/** + * is_idx_node_in_tnc - determine if an index node is in the TNC. + * @c: UBIFS file-system description object + * @key: key of index node + * @level: index node level + * @lnum: LEB number of index node + * @offs: offset of index node + * + * This function returns %0 if the index node is not referred to in the TNC, %1 + * if the index node is referred to in the TNC and the corresponding znode is + * dirty, %2 if an index node is referred to in the TNC and the corresponding + * znode is clean, and a negative error code in case of failure. + * + * Note, the @key argument has to be the key of the first child. Also note, + * this function relies on the fact that 0:0 is never a valid LEB number and + * offset for a main-area node. + */ +int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs) +{ + struct ubifs_znode *znode; + + znode = lookup_znode(c, key, level, lnum, offs); + if (!znode) + return 0; + if (IS_ERR(znode)) + return PTR_ERR(znode); + + return ubifs_zn_dirty(znode) ? 1 : 2; +} + +/** + * is_leaf_node_in_tnc - determine if a non-indexing not is in the TNC. + * @c: UBIFS file-system description object + * @key: node key + * @lnum: node LEB number + * @offs: node offset + * + * This function returns %1 if the node is referred to in the TNC, %0 if it is + * not, and a negative error code in case of failure. + * + * Note, this function relies on the fact that 0:0 is never a valid LEB number + * and offset for a main-area node. + */ +static int is_leaf_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, + int lnum, int offs) +{ + struct ubifs_zbranch *zbr; + struct ubifs_znode *znode, *zn; + int n, found, err, nn; + const int unique = !is_hash_key(c, key); + + found = ubifs_lookup_level0(c, key, &znode, &n); + if (found < 0) + return found; /* Error code */ + if (!found) + return 0; + zbr = &znode->zbranch[n]; + if (lnum == zbr->lnum && offs == zbr->offs) + return 1; /* Found it */ + if (unique) + return 0; + /* + * Because the key is not unique, we have to look left + * and right as well + */ + zn = znode; + nn = n; + /* Look left */ + while (1) { + err = tnc_prev(c, &znode, &n); + if (err == -ENOENT) + break; + if (err) + return err; + if (keys_cmp(c, key, &znode->zbranch[n].key)) + break; + zbr = &znode->zbranch[n]; + if (lnum == zbr->lnum && offs == zbr->offs) + return 1; /* Found it */ + } + /* Look right */ + znode = zn; + n = nn; + while (1) { + err = tnc_next(c, &znode, &n); + if (err) { + if (err == -ENOENT) + return 0; + return err; + } + if (keys_cmp(c, key, &znode->zbranch[n].key)) + break; + zbr = &znode->zbranch[n]; + if (lnum == zbr->lnum && offs == zbr->offs) + return 1; /* Found it */ + } + return 0; +} + +/** + * ubifs_tnc_has_node - determine whether a node is in the TNC. + * @c: UBIFS file-system description object + * @key: node key + * @level: index node level (if it is an index node) + * @lnum: node LEB number + * @offs: node offset + * @is_idx: non-zero if the node is an index node + * + * This function returns %1 if the node is in the TNC, %0 if it is not, and a + * negative error code in case of failure. For index nodes, @key has to be the + * key of the first child. An index node is considered to be in the TNC only if + * the corresponding znode is clean or has not been loaded. + */ +int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs, int is_idx) +{ + int err; + + mutex_lock(&c->tnc_mutex); + if (is_idx) { + err = is_idx_node_in_tnc(c, key, level, lnum, offs); + if (err < 0) + goto out_unlock; + if (err == 1) + /* The index node was found but it was dirty */ + err = 0; + else if (err == 2) + /* The index node was found and it was clean */ + err = 1; + else + BUG_ON(err != 0); + } else + err = is_leaf_node_in_tnc(c, key, lnum, offs); + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_dirty_idx_node - dirty an index node. + * @c: UBIFS file-system description object + * @key: index node key + * @level: index node level + * @lnum: index node LEB number + * @offs: index node offset + * + * This function loads and dirties an index node so that it can be garbage + * collected. The @key argument has to be the key of the first child. This + * function relies on the fact that 0:0 is never a valid LEB number and offset + * for a main-area node. Returns %0 on success and a negative error code on + * failure. + */ +int ubifs_dirty_idx_node(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs) +{ + struct ubifs_znode *znode; + int err = 0; + + mutex_lock(&c->tnc_mutex); + znode = lookup_znode(c, key, level, lnum, offs); + if (!znode) + goto out_unlock; + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c new file mode 100644 index 000000000000..8ac76b1c2d55 --- /dev/null +++ b/fs/ubifs/tnc_commit.c @@ -0,0 +1,1102 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* This file implements TNC functions for committing */ + +#include "ubifs.h" + +/** + * make_idx_node - make an index node for fill-the-gaps method of TNC commit. + * @c: UBIFS file-system description object + * @idx: buffer in which to place new index node + * @znode: znode from which to make new index node + * @lnum: LEB number where new index node will be written + * @offs: offset where new index node will be written + * @len: length of new index node + */ +static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx, + struct ubifs_znode *znode, int lnum, int offs, int len) +{ + struct ubifs_znode *zp; + int i, err; + + /* Make index node */ + idx->ch.node_type = UBIFS_IDX_NODE; + idx->child_cnt = cpu_to_le16(znode->child_cnt); + idx->level = cpu_to_le16(znode->level); + for (i = 0; i < znode->child_cnt; i++) { + struct ubifs_branch *br = ubifs_idx_branch(c, idx, i); + struct ubifs_zbranch *zbr = &znode->zbranch[i]; + + key_write_idx(c, &zbr->key, &br->key); + br->lnum = cpu_to_le32(zbr->lnum); + br->offs = cpu_to_le32(zbr->offs); + br->len = cpu_to_le32(zbr->len); + if (!zbr->lnum || !zbr->len) { + ubifs_err("bad ref in znode"); + dbg_dump_znode(c, znode); + if (zbr->znode) + dbg_dump_znode(c, zbr->znode); + } + } + ubifs_prepare_node(c, idx, len, 0); + +#ifdef CONFIG_UBIFS_FS_DEBUG + znode->lnum = lnum; + znode->offs = offs; + znode->len = len; +#endif + + err = insert_old_idx_znode(c, znode); + + /* Update the parent */ + zp = znode->parent; + if (zp) { + struct ubifs_zbranch *zbr; + + zbr = &zp->zbranch[znode->iip]; + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + } else { + c->zroot.lnum = lnum; + c->zroot.offs = offs; + c->zroot.len = len; + } + c->calc_idx_sz += ALIGN(len, 8); + + atomic_long_dec(&c->dirty_zn_cnt); + + ubifs_assert(ubifs_zn_dirty(znode)); + ubifs_assert(test_bit(COW_ZNODE, &znode->flags)); + + __clear_bit(DIRTY_ZNODE, &znode->flags); + __clear_bit(COW_ZNODE, &znode->flags); + + return err; +} + +/** + * fill_gap - make index nodes in gaps in dirty index LEBs. + * @c: UBIFS file-system description object + * @lnum: LEB number that gap appears in + * @gap_start: offset of start of gap + * @gap_end: offset of end of gap + * @dirt: adds dirty space to this + * + * This function returns the number of index nodes written into the gap. + */ +static int fill_gap(struct ubifs_info *c, int lnum, int gap_start, int gap_end, + int *dirt) +{ + int len, gap_remains, gap_pos, written, pad_len; + + ubifs_assert((gap_start & 7) == 0); + ubifs_assert((gap_end & 7) == 0); + ubifs_assert(gap_end >= gap_start); + + gap_remains = gap_end - gap_start; + if (!gap_remains) + return 0; + gap_pos = gap_start; + written = 0; + while (c->enext) { + len = ubifs_idx_node_sz(c, c->enext->child_cnt); + if (len < gap_remains) { + struct ubifs_znode *znode = c->enext; + const int alen = ALIGN(len, 8); + int err; + + ubifs_assert(alen <= gap_remains); + err = make_idx_node(c, c->ileb_buf + gap_pos, znode, + lnum, gap_pos, len); + if (err) + return err; + gap_remains -= alen; + gap_pos += alen; + c->enext = znode->cnext; + if (c->enext == c->cnext) + c->enext = NULL; + written += 1; + } else + break; + } + if (gap_end == c->leb_size) { + c->ileb_len = ALIGN(gap_pos, c->min_io_size); + /* Pad to end of min_io_size */ + pad_len = c->ileb_len - gap_pos; + } else + /* Pad to end of gap */ + pad_len = gap_remains; + dbg_gc("LEB %d:%d to %d len %d nodes written %d wasted bytes %d", + lnum, gap_start, gap_end, gap_end - gap_start, written, pad_len); + ubifs_pad(c, c->ileb_buf + gap_pos, pad_len); + *dirt += pad_len; + return written; +} + +/** + * find_old_idx - find an index node obsoleted since the last commit start. + * @c: UBIFS file-system description object + * @lnum: LEB number of obsoleted index node + * @offs: offset of obsoleted index node + * + * Returns %1 if found and %0 otherwise. + */ +static int find_old_idx(struct ubifs_info *c, int lnum, int offs) +{ + struct ubifs_old_idx *o; + struct rb_node *p; + + p = c->old_idx.rb_node; + while (p) { + o = rb_entry(p, struct ubifs_old_idx, rb); + if (lnum < o->lnum) + p = p->rb_left; + else if (lnum > o->lnum) + p = p->rb_right; + else if (offs < o->offs) + p = p->rb_left; + else if (offs > o->offs) + p = p->rb_right; + else + return 1; + } + return 0; +} + +/** + * is_idx_node_in_use - determine if an index node can be overwritten. + * @c: UBIFS file-system description object + * @key: key of index node + * @level: index node level + * @lnum: LEB number of index node + * @offs: offset of index node + * + * If @key / @lnum / @offs identify an index node that was not part of the old + * index, then this function returns %0 (obsolete). Else if the index node was + * part of the old index but is now dirty %1 is returned, else if it is clean %2 + * is returned. A negative error code is returned on failure. + */ +static int is_idx_node_in_use(struct ubifs_info *c, union ubifs_key *key, + int level, int lnum, int offs) +{ + int ret; + + ret = is_idx_node_in_tnc(c, key, level, lnum, offs); + if (ret < 0) + return ret; /* Error code */ + if (ret == 0) + if (find_old_idx(c, lnum, offs)) + return 1; + return ret; +} + +/** + * layout_leb_in_gaps - layout index nodes using in-the-gaps method. + * @c: UBIFS file-system description object + * @p: return LEB number here + * + * This function lays out new index nodes for dirty znodes using in-the-gaps + * method of TNC commit. + * This function merely puts the next znode into the next gap, making no attempt + * to try to maximise the number of znodes that fit. + * This function returns the number of index nodes written into the gaps, or a + * negative error code on failure. + */ +static int layout_leb_in_gaps(struct ubifs_info *c, int *p) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + int lnum, dirt = 0, gap_start, gap_end, err, written, tot_written; + + tot_written = 0; + /* Get an index LEB with lots of obsolete index nodes */ + lnum = ubifs_find_dirty_idx_leb(c); + if (lnum < 0) + /* + * There also may be dirt in the index head that could be + * filled, however we do not check there at present. + */ + return lnum; /* Error code */ + *p = lnum; + dbg_gc("LEB %d", lnum); + /* + * Scan the index LEB. We use the generic scan for this even though + * it is more comprehensive and less efficient than is needed for this + * purpose. + */ + sleb = ubifs_scan(c, lnum, 0, c->ileb_buf); + c->ileb_len = 0; + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + gap_start = 0; + list_for_each_entry(snod, &sleb->nodes, list) { + struct ubifs_idx_node *idx; + int in_use, level; + + ubifs_assert(snod->type == UBIFS_IDX_NODE); + idx = snod->node; + key_read(c, ubifs_idx_key(c, idx), &snod->key); + level = le16_to_cpu(idx->level); + /* Determine if the index node is in use (not obsolete) */ + in_use = is_idx_node_in_use(c, &snod->key, level, lnum, + snod->offs); + if (in_use < 0) { + ubifs_scan_destroy(sleb); + return in_use; /* Error code */ + } + if (in_use) { + if (in_use == 1) + dirt += ALIGN(snod->len, 8); + /* + * The obsolete index nodes form gaps that can be + * overwritten. This gap has ended because we have + * found an index node that is still in use + * i.e. not obsolete + */ + gap_end = snod->offs; + /* Try to fill gap */ + written = fill_gap(c, lnum, gap_start, gap_end, &dirt); + if (written < 0) { + ubifs_scan_destroy(sleb); + return written; /* Error code */ + } + tot_written += written; + gap_start = ALIGN(snod->offs + snod->len, 8); + } + } + ubifs_scan_destroy(sleb); + c->ileb_len = c->leb_size; + gap_end = c->leb_size; + /* Try to fill gap */ + written = fill_gap(c, lnum, gap_start, gap_end, &dirt); + if (written < 0) + return written; /* Error code */ + tot_written += written; + if (tot_written == 0) { + struct ubifs_lprops lp; + + dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written); + err = ubifs_read_one_lp(c, lnum, &lp); + if (err) + return err; + if (lp.free == c->leb_size) { + /* + * We must have snatched this LEB from the idx_gc list + * so we need to correct the free and dirty space. + */ + err = ubifs_change_one_lp(c, lnum, + c->leb_size - c->ileb_len, + dirt, 0, 0, 0); + if (err) + return err; + } + return 0; + } + err = ubifs_change_one_lp(c, lnum, c->leb_size - c->ileb_len, dirt, + 0, 0, 0); + if (err) + return err; + err = ubifs_leb_change(c, lnum, c->ileb_buf, c->ileb_len, + UBI_SHORTTERM); + if (err) + return err; + dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written); + return tot_written; +} + +/** + * get_leb_cnt - calculate the number of empty LEBs needed to commit. + * @c: UBIFS file-system description object + * @cnt: number of znodes to commit + * + * This function returns the number of empty LEBs needed to commit @cnt znodes + * to the current index head. The number is not exact and may be more than + * needed. + */ +static int get_leb_cnt(struct ubifs_info *c, int cnt) +{ + int d; + + /* Assume maximum index node size (i.e. overestimate space needed) */ + cnt -= (c->leb_size - c->ihead_offs) / c->max_idx_node_sz; + if (cnt < 0) + cnt = 0; + d = c->leb_size / c->max_idx_node_sz; + return DIV_ROUND_UP(cnt, d); +} + +/** + * layout_in_gaps - in-the-gaps method of committing TNC. + * @c: UBIFS file-system description object + * @cnt: number of dirty znodes to commit. + * + * This function lays out new index nodes for dirty znodes using in-the-gaps + * method of TNC commit. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int layout_in_gaps(struct ubifs_info *c, int cnt) +{ + int err, leb_needed_cnt, written, *p; + + dbg_gc("%d znodes to write", cnt); + + c->gap_lebs = kmalloc(sizeof(int) * (c->lst.idx_lebs + 1), GFP_NOFS); + if (!c->gap_lebs) + return -ENOMEM; + + p = c->gap_lebs; + do { + ubifs_assert(p < c->gap_lebs + sizeof(int) * c->lst.idx_lebs); + written = layout_leb_in_gaps(c, p); + if (written < 0) { + err = written; + if (err != -ENOSPC) { + kfree(c->gap_lebs); + c->gap_lebs = NULL; + return err; + } + if (!dbg_force_in_the_gaps_enabled) { + /* + * Do not print scary warnings if the debugging + * option which forces in-the-gaps is enabled. + */ + ubifs_err("out of space"); + spin_lock(&c->space_lock); + dbg_dump_budg(c); + spin_unlock(&c->space_lock); + dbg_dump_lprops(c); + } + /* Try to commit anyway */ + err = 0; + break; + } + p++; + cnt -= written; + leb_needed_cnt = get_leb_cnt(c, cnt); + dbg_gc("%d znodes remaining, need %d LEBs, have %d", cnt, + leb_needed_cnt, c->ileb_cnt); + } while (leb_needed_cnt > c->ileb_cnt); + + *p = -1; + return 0; +} + +/** + * layout_in_empty_space - layout index nodes in empty space. + * @c: UBIFS file-system description object + * + * This function lays out new index nodes for dirty znodes using empty LEBs. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int layout_in_empty_space(struct ubifs_info *c) +{ + struct ubifs_znode *znode, *cnext, *zp; + int lnum, offs, len, next_len, buf_len, buf_offs, used, avail; + int wlen, blen, err; + + cnext = c->enext; + if (!cnext) + return 0; + + lnum = c->ihead_lnum; + buf_offs = c->ihead_offs; + + buf_len = ubifs_idx_node_sz(c, c->fanout); + buf_len = ALIGN(buf_len, c->min_io_size); + used = 0; + avail = buf_len; + + /* Ensure there is enough room for first write */ + next_len = ubifs_idx_node_sz(c, cnext->child_cnt); + if (buf_offs + next_len > c->leb_size) + lnum = -1; + + while (1) { + znode = cnext; + + len = ubifs_idx_node_sz(c, znode->child_cnt); + + /* Determine the index node position */ + if (lnum == -1) { + if (c->ileb_nxt >= c->ileb_cnt) { + ubifs_err("out of space"); + return -ENOSPC; + } + lnum = c->ilebs[c->ileb_nxt++]; + buf_offs = 0; + used = 0; + avail = buf_len; + } + + offs = buf_offs + used; + +#ifdef CONFIG_UBIFS_FS_DEBUG + znode->lnum = lnum; + znode->offs = offs; + znode->len = len; +#endif + + /* Update the parent */ + zp = znode->parent; + if (zp) { + struct ubifs_zbranch *zbr; + int i; + + i = znode->iip; + zbr = &zp->zbranch[i]; + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + } else { + c->zroot.lnum = lnum; + c->zroot.offs = offs; + c->zroot.len = len; + } + c->calc_idx_sz += ALIGN(len, 8); + + /* + * Once lprops is updated, we can decrease the dirty znode count + * but it is easier to just do it here. + */ + atomic_long_dec(&c->dirty_zn_cnt); + + /* + * Calculate the next index node length to see if there is + * enough room for it + */ + cnext = znode->cnext; + if (cnext == c->cnext) + next_len = 0; + else + next_len = ubifs_idx_node_sz(c, cnext->child_cnt); + + if (c->min_io_size == 1) { + buf_offs += ALIGN(len, 8); + if (next_len) { + if (buf_offs + next_len <= c->leb_size) + continue; + err = ubifs_update_one_lp(c, lnum, 0, + c->leb_size - buf_offs, 0, 0); + if (err) + return err; + lnum = -1; + continue; + } + err = ubifs_update_one_lp(c, lnum, + c->leb_size - buf_offs, 0, 0, 0); + if (err) + return err; + break; + } + + /* Update buffer positions */ + wlen = used + len; + used += ALIGN(len, 8); + avail -= ALIGN(len, 8); + + if (next_len != 0 && + buf_offs + used + next_len <= c->leb_size && + avail > 0) + continue; + + if (avail <= 0 && next_len && + buf_offs + used + next_len <= c->leb_size) + blen = buf_len; + else + blen = ALIGN(wlen, c->min_io_size); + + /* The buffer is full or there are no more znodes to do */ + buf_offs += blen; + if (next_len) { + if (buf_offs + next_len > c->leb_size) { + err = ubifs_update_one_lp(c, lnum, + c->leb_size - buf_offs, blen - used, + 0, 0); + if (err) + return err; + lnum = -1; + } + used -= blen; + if (used < 0) + used = 0; + avail = buf_len - used; + continue; + } + err = ubifs_update_one_lp(c, lnum, c->leb_size - buf_offs, + blen - used, 0, 0); + if (err) + return err; + break; + } + +#ifdef CONFIG_UBIFS_FS_DEBUG + c->new_ihead_lnum = lnum; + c->new_ihead_offs = buf_offs; +#endif + + return 0; +} + +/** + * layout_commit - determine positions of index nodes to commit. + * @c: UBIFS file-system description object + * @no_space: indicates that insufficient empty LEBs were allocated + * @cnt: number of znodes to commit + * + * Calculate and update the positions of index nodes to commit. If there were + * an insufficient number of empty LEBs allocated, then index nodes are placed + * into the gaps created by obsolete index nodes in non-empty index LEBs. For + * this purpose, an obsolete index node is one that was not in the index as at + * the end of the last commit. To write "in-the-gaps" requires that those index + * LEBs are updated atomically in-place. + */ +static int layout_commit(struct ubifs_info *c, int no_space, int cnt) +{ + int err; + + if (no_space) { + err = layout_in_gaps(c, cnt); + if (err) + return err; + } + err = layout_in_empty_space(c); + return err; +} + +/** + * find_first_dirty - find first dirty znode. + * @znode: znode to begin searching from + */ +static struct ubifs_znode *find_first_dirty(struct ubifs_znode *znode) +{ + int i, cont; + + if (!znode) + return NULL; + + while (1) { + if (znode->level == 0) { + if (ubifs_zn_dirty(znode)) + return znode; + return NULL; + } + cont = 0; + for (i = 0; i < znode->child_cnt; i++) { + struct ubifs_zbranch *zbr = &znode->zbranch[i]; + + if (zbr->znode && ubifs_zn_dirty(zbr->znode)) { + znode = zbr->znode; + cont = 1; + break; + } + } + if (!cont) { + if (ubifs_zn_dirty(znode)) + return znode; + return NULL; + } + } +} + +/** + * find_next_dirty - find next dirty znode. + * @znode: znode to begin searching from + */ +static struct ubifs_znode *find_next_dirty(struct ubifs_znode *znode) +{ + int n = znode->iip + 1; + + znode = znode->parent; + if (!znode) + return NULL; + for (; n < znode->child_cnt; n++) { + struct ubifs_zbranch *zbr = &znode->zbranch[n]; + + if (zbr->znode && ubifs_zn_dirty(zbr->znode)) + return find_first_dirty(zbr->znode); + } + return znode; +} + +/** + * get_znodes_to_commit - create list of dirty znodes to commit. + * @c: UBIFS file-system description object + * + * This function returns the number of znodes to commit. + */ +static int get_znodes_to_commit(struct ubifs_info *c) +{ + struct ubifs_znode *znode, *cnext; + int cnt = 0; + + c->cnext = find_first_dirty(c->zroot.znode); + znode = c->enext = c->cnext; + if (!znode) { + dbg_cmt("no znodes to commit"); + return 0; + } + cnt += 1; + while (1) { + ubifs_assert(!test_bit(COW_ZNODE, &znode->flags)); + __set_bit(COW_ZNODE, &znode->flags); + znode->alt = 0; + cnext = find_next_dirty(znode); + if (!cnext) { + znode->cnext = c->cnext; + break; + } + znode->cnext = cnext; + znode = cnext; + cnt += 1; + } + dbg_cmt("committing %d znodes", cnt); + ubifs_assert(cnt == atomic_long_read(&c->dirty_zn_cnt)); + return cnt; +} + +/** + * alloc_idx_lebs - allocate empty LEBs to be used to commit. + * @c: UBIFS file-system description object + * @cnt: number of znodes to commit + * + * This function returns %-ENOSPC if it cannot allocate a sufficient number of + * empty LEBs. %0 is returned on success, otherwise a negative error code + * is returned. + */ +static int alloc_idx_lebs(struct ubifs_info *c, int cnt) +{ + int i, leb_cnt, lnum; + + c->ileb_cnt = 0; + c->ileb_nxt = 0; + leb_cnt = get_leb_cnt(c, cnt); + dbg_cmt("need about %d empty LEBS for TNC commit", leb_cnt); + if (!leb_cnt) + return 0; + c->ilebs = kmalloc(leb_cnt * sizeof(int), GFP_NOFS); + if (!c->ilebs) + return -ENOMEM; + for (i = 0; i < leb_cnt; i++) { + lnum = ubifs_find_free_leb_for_idx(c); + if (lnum < 0) + return lnum; + c->ilebs[c->ileb_cnt++] = lnum; + dbg_cmt("LEB %d", lnum); + } + if (dbg_force_in_the_gaps()) + return -ENOSPC; + return 0; +} + +/** + * free_unused_idx_lebs - free unused LEBs that were allocated for the commit. + * @c: UBIFS file-system description object + * + * It is possible that we allocate more empty LEBs for the commit than we need. + * This functions frees the surplus. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int free_unused_idx_lebs(struct ubifs_info *c) +{ + int i, err = 0, lnum, er; + + for (i = c->ileb_nxt; i < c->ileb_cnt; i++) { + lnum = c->ilebs[i]; + dbg_cmt("LEB %d", lnum); + er = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, + LPROPS_INDEX | LPROPS_TAKEN, 0); + if (!err) + err = er; + } + return err; +} + +/** + * free_idx_lebs - free unused LEBs after commit end. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +static int free_idx_lebs(struct ubifs_info *c) +{ + int err; + + err = free_unused_idx_lebs(c); + kfree(c->ilebs); + c->ilebs = NULL; + return err; +} + +/** + * ubifs_tnc_start_commit - start TNC commit. + * @c: UBIFS file-system description object + * @zroot: new index root position is returned here + * + * This function prepares the list of indexing nodes to commit and lays out + * their positions on flash. If there is not enough free space it uses the + * in-gap commit method. Returns zero in case of success and a negative error + * code in case of failure. + */ +int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot) +{ + int err = 0, cnt; + + mutex_lock(&c->tnc_mutex); + err = dbg_check_tnc(c, 1); + if (err) + goto out; + cnt = get_znodes_to_commit(c); + if (cnt != 0) { + int no_space = 0; + + err = alloc_idx_lebs(c, cnt); + if (err == -ENOSPC) + no_space = 1; + else if (err) + goto out_free; + err = layout_commit(c, no_space, cnt); + if (err) + goto out_free; + ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0); + err = free_unused_idx_lebs(c); + if (err) + goto out; + } + destroy_old_idx(c); + memcpy(zroot, &c->zroot, sizeof(struct ubifs_zbranch)); + + err = ubifs_save_dirty_idx_lnums(c); + if (err) + goto out; + + spin_lock(&c->space_lock); + /* + * Although we have not finished committing yet, update size of the + * committed index ('c->old_idx_sz') and zero out the index growth + * budget. It is OK to do this now, because we've reserved all the + * space which is needed to commit the index, and it is save for the + * budgeting subsystem to assume the index is already committed, + * even though it is not. + */ + c->old_idx_sz = c->calc_idx_sz; + c->budg_uncommitted_idx = 0; + spin_unlock(&c->space_lock); + mutex_unlock(&c->tnc_mutex); + + dbg_cmt("number of index LEBs %d", c->lst.idx_lebs); + dbg_cmt("size of index %llu", c->calc_idx_sz); + return err; + +out_free: + free_idx_lebs(c); +out: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * write_index - write index nodes. + * @c: UBIFS file-system description object + * + * This function writes the index nodes whose positions were laid out in the + * layout_in_empty_space function. + */ +static int write_index(struct ubifs_info *c) +{ + struct ubifs_idx_node *idx; + struct ubifs_znode *znode, *cnext; + int i, lnum, offs, len, next_len, buf_len, buf_offs, used; + int avail, wlen, err, lnum_pos = 0; + + cnext = c->enext; + if (!cnext) + return 0; + + /* + * Always write index nodes to the index head so that index nodes and + * other types of nodes are never mixed in the same erase block. + */ + lnum = c->ihead_lnum; + buf_offs = c->ihead_offs; + + /* Allocate commit buffer */ + buf_len = ALIGN(c->max_idx_node_sz, c->min_io_size); + used = 0; + avail = buf_len; + + /* Ensure there is enough room for first write */ + next_len = ubifs_idx_node_sz(c, cnext->child_cnt); + if (buf_offs + next_len > c->leb_size) { + err = ubifs_update_one_lp(c, lnum, LPROPS_NC, 0, 0, + LPROPS_TAKEN); + if (err) + return err; + lnum = -1; + } + + while (1) { + cond_resched(); + + znode = cnext; + idx = c->cbuf + used; + + /* Make index node */ + idx->ch.node_type = UBIFS_IDX_NODE; + idx->child_cnt = cpu_to_le16(znode->child_cnt); + idx->level = cpu_to_le16(znode->level); + for (i = 0; i < znode->child_cnt; i++) { + struct ubifs_branch *br = ubifs_idx_branch(c, idx, i); + struct ubifs_zbranch *zbr = &znode->zbranch[i]; + + key_write_idx(c, &zbr->key, &br->key); + br->lnum = cpu_to_le32(zbr->lnum); + br->offs = cpu_to_le32(zbr->offs); + br->len = cpu_to_le32(zbr->len); + if (!zbr->lnum || !zbr->len) { + ubifs_err("bad ref in znode"); + dbg_dump_znode(c, znode); + if (zbr->znode) + dbg_dump_znode(c, zbr->znode); + } + } + len = ubifs_idx_node_sz(c, znode->child_cnt); + ubifs_prepare_node(c, idx, len, 0); + + /* Determine the index node position */ + if (lnum == -1) { + lnum = c->ilebs[lnum_pos++]; + buf_offs = 0; + used = 0; + avail = buf_len; + } + offs = buf_offs + used; + +#ifdef CONFIG_UBIFS_FS_DEBUG + if (lnum != znode->lnum || offs != znode->offs || + len != znode->len) { + ubifs_err("inconsistent znode posn"); + return -EINVAL; + } +#endif + + /* Grab some stuff from znode while we still can */ + cnext = znode->cnext; + + ubifs_assert(ubifs_zn_dirty(znode)); + ubifs_assert(test_bit(COW_ZNODE, &znode->flags)); + + /* + * It is important that other threads should see %DIRTY_ZNODE + * flag cleared before %COW_ZNODE. Specifically, it matters in + * the 'dirty_cow_znode()' function. This is the reason for the + * first barrier. Also, we want the bit changes to be seen to + * other threads ASAP, to avoid unnecesarry copying, which is + * the reason for the second barrier. + */ + clear_bit(DIRTY_ZNODE, &znode->flags); + smp_mb__before_clear_bit(); + clear_bit(COW_ZNODE, &znode->flags); + smp_mb__after_clear_bit(); + + /* Do not access znode from this point on */ + + /* Update buffer positions */ + wlen = used + len; + used += ALIGN(len, 8); + avail -= ALIGN(len, 8); + + /* + * Calculate the next index node length to see if there is + * enough room for it + */ + if (cnext == c->cnext) + next_len = 0; + else + next_len = ubifs_idx_node_sz(c, cnext->child_cnt); + + if (c->min_io_size == 1) { + /* + * Write the prepared index node immediately if there is + * no minimum IO size + */ + err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, + wlen, UBI_SHORTTERM); + if (err) + return err; + buf_offs += ALIGN(wlen, 8); + if (next_len) { + used = 0; + avail = buf_len; + if (buf_offs + next_len > c->leb_size) { + err = ubifs_update_one_lp(c, lnum, + LPROPS_NC, 0, 0, LPROPS_TAKEN); + if (err) + return err; + lnum = -1; + } + continue; + } + } else { + int blen, nxt_offs = buf_offs + used + next_len; + + if (next_len && nxt_offs <= c->leb_size) { + if (avail > 0) + continue; + else + blen = buf_len; + } else { + wlen = ALIGN(wlen, 8); + blen = ALIGN(wlen, c->min_io_size); + ubifs_pad(c, c->cbuf + wlen, blen - wlen); + } + /* + * The buffer is full or there are no more znodes + * to do + */ + err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, + blen, UBI_SHORTTERM); + if (err) + return err; + buf_offs += blen; + if (next_len) { + if (nxt_offs > c->leb_size) { + err = ubifs_update_one_lp(c, lnum, + LPROPS_NC, 0, 0, LPROPS_TAKEN); + if (err) + return err; + lnum = -1; + } + used -= blen; + if (used < 0) + used = 0; + avail = buf_len - used; + memmove(c->cbuf, c->cbuf + blen, used); + continue; + } + } + break; + } + +#ifdef CONFIG_UBIFS_FS_DEBUG + if (lnum != c->new_ihead_lnum || buf_offs != c->new_ihead_offs) { + ubifs_err("inconsistent ihead"); + return -EINVAL; + } +#endif + + c->ihead_lnum = lnum; + c->ihead_offs = buf_offs; + + return 0; +} + +/** + * free_obsolete_znodes - free obsolete znodes. + * @c: UBIFS file-system description object + * + * At the end of commit end, obsolete znodes are freed. + */ +static void free_obsolete_znodes(struct ubifs_info *c) +{ + struct ubifs_znode *znode, *cnext; + + cnext = c->cnext; + do { + znode = cnext; + cnext = znode->cnext; + if (test_bit(OBSOLETE_ZNODE, &znode->flags)) + kfree(znode); + else { + znode->cnext = NULL; + atomic_long_inc(&c->clean_zn_cnt); + atomic_long_inc(&ubifs_clean_zn_cnt); + } + } while (cnext != c->cnext); +} + +/** + * return_gap_lebs - return LEBs used by the in-gap commit method. + * @c: UBIFS file-system description object + * + * This function clears the "taken" flag for the LEBs which were used by the + * "commit in-the-gaps" method. + */ +static int return_gap_lebs(struct ubifs_info *c) +{ + int *p, err; + + if (!c->gap_lebs) + return 0; + + dbg_cmt(""); + for (p = c->gap_lebs; *p != -1; p++) { + err = ubifs_change_one_lp(c, *p, LPROPS_NC, LPROPS_NC, 0, + LPROPS_TAKEN, 0); + if (err) + return err; + } + + kfree(c->gap_lebs); + c->gap_lebs = NULL; + return 0; +} + +/** + * ubifs_tnc_end_commit - update the TNC for commit end. + * @c: UBIFS file-system description object + * + * Write the dirty znodes. + */ +int ubifs_tnc_end_commit(struct ubifs_info *c) +{ + int err; + + if (!c->cnext) + return 0; + + err = return_gap_lebs(c); + if (err) + return err; + + err = write_index(c); + if (err) + return err; + + mutex_lock(&c->tnc_mutex); + + dbg_cmt("TNC height is %d", c->zroot.znode->level + 1); + + free_obsolete_znodes(c); + + c->cnext = NULL; + kfree(c->ilebs); + c->ilebs = NULL; + + mutex_unlock(&c->tnc_mutex); + + return 0; +} diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c new file mode 100644 index 000000000000..a25c1cc1f8d9 --- /dev/null +++ b/fs/ubifs/tnc_misc.c @@ -0,0 +1,494 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file contains miscelanious TNC-related functions shared betweend + * different files. This file does not form any logically separate TNC + * sub-system. The file was created because there is a lot of TNC code and + * putting it all in one file would make that file too big and unreadable. + */ + +#include "ubifs.h" + +/** + * ubifs_tnc_levelorder_next - next TNC tree element in levelorder traversal. + * @zr: root of the subtree to traverse + * @znode: previous znode + * + * This function implements levelorder TNC traversal. The LNC is ignored. + * Returns the next element or %NULL if @znode is already the last one. + */ +struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr, + struct ubifs_znode *znode) +{ + int level, iip, level_search = 0; + struct ubifs_znode *zn; + + ubifs_assert(zr); + + if (unlikely(!znode)) + return zr; + + if (unlikely(znode == zr)) { + if (znode->level == 0) + return NULL; + return ubifs_tnc_find_child(zr, 0); + } + + level = znode->level; + + iip = znode->iip; + while (1) { + ubifs_assert(znode->level <= zr->level); + + /* + * First walk up until there is a znode with next branch to + * look at. + */ + while (znode->parent != zr && iip >= znode->parent->child_cnt) { + znode = znode->parent; + iip = znode->iip; + } + + if (unlikely(znode->parent == zr && + iip >= znode->parent->child_cnt)) { + /* This level is done, switch to the lower one */ + level -= 1; + if (level_search || level < 0) + /* + * We were already looking for znode at lower + * level ('level_search'). As we are here + * again, it just does not exist. Or all levels + * were finished ('level < 0'). + */ + return NULL; + + level_search = 1; + iip = -1; + znode = ubifs_tnc_find_child(zr, 0); + ubifs_assert(znode); + } + + /* Switch to the next index */ + zn = ubifs_tnc_find_child(znode->parent, iip + 1); + if (!zn) { + /* No more children to look at, we have walk up */ + iip = znode->parent->child_cnt; + continue; + } + + /* Walk back down to the level we came from ('level') */ + while (zn->level != level) { + znode = zn; + zn = ubifs_tnc_find_child(zn, 0); + if (!zn) { + /* + * This path is not too deep so it does not + * reach 'level'. Try next path. + */ + iip = znode->iip; + break; + } + } + + if (zn) { + ubifs_assert(zn->level >= 0); + return zn; + } + } +} + +/** + * ubifs_search_zbranch - search znode branch. + * @c: UBIFS file-system description object + * @znode: znode to search in + * @key: key to search for + * @n: znode branch slot number is returned here + * + * This is a helper function which search branch with key @key in @znode using + * binary search. The result of the search may be: + * o exact match, then %1 is returned, and the slot number of the branch is + * stored in @n; + * o no exact match, then %0 is returned and the slot number of the left + * closest branch is returned in @n; the slot if all keys in this znode are + * greater than @key, then %-1 is returned in @n. + */ +int ubifs_search_zbranch(const struct ubifs_info *c, + const struct ubifs_znode *znode, + const union ubifs_key *key, int *n) +{ + int beg = 0, end = znode->child_cnt, uninitialized_var(mid); + int uninitialized_var(cmp); + const struct ubifs_zbranch *zbr = &znode->zbranch[0]; + + ubifs_assert(end > beg); + + while (end > beg) { + mid = (beg + end) >> 1; + cmp = keys_cmp(c, key, &zbr[mid].key); + if (cmp > 0) + beg = mid + 1; + else if (cmp < 0) + end = mid; + else { + *n = mid; + return 1; + } + } + + *n = end - 1; + + /* The insert point is after *n */ + ubifs_assert(*n >= -1 && *n < znode->child_cnt); + if (*n == -1) + ubifs_assert(keys_cmp(c, key, &zbr[0].key) < 0); + else + ubifs_assert(keys_cmp(c, key, &zbr[*n].key) > 0); + if (*n + 1 < znode->child_cnt) + ubifs_assert(keys_cmp(c, key, &zbr[*n + 1].key) < 0); + + return 0; +} + +/** + * ubifs_tnc_postorder_first - find first znode to do postorder tree traversal. + * @znode: znode to start at (root of the sub-tree to traverse) + * + * Find the lowest leftmost znode in a subtree of the TNC tree. The LNC is + * ignored. + */ +struct ubifs_znode *ubifs_tnc_postorder_first(struct ubifs_znode *znode) +{ + if (unlikely(!znode)) + return NULL; + + while (znode->level > 0) { + struct ubifs_znode *child; + + child = ubifs_tnc_find_child(znode, 0); + if (!child) + return znode; + znode = child; + } + + return znode; +} + +/** + * ubifs_tnc_postorder_next - next TNC tree element in postorder traversal. + * @znode: previous znode + * + * This function implements postorder TNC traversal. The LNC is ignored. + * Returns the next element or %NULL if @znode is already the last one. + */ +struct ubifs_znode *ubifs_tnc_postorder_next(struct ubifs_znode *znode) +{ + struct ubifs_znode *zn; + + ubifs_assert(znode); + if (unlikely(!znode->parent)) + return NULL; + + /* Switch to the next index in the parent */ + zn = ubifs_tnc_find_child(znode->parent, znode->iip + 1); + if (!zn) + /* This is in fact the last child, return parent */ + return znode->parent; + + /* Go to the first znode in this new subtree */ + return ubifs_tnc_postorder_first(zn); +} + +/** + * ubifs_destroy_tnc_subtree - destroy all znodes connected to a subtree. + * @znode: znode defining subtree to destroy + * + * This function destroys subtree of the TNC tree. Returns number of clean + * znodes in the subtree. + */ +long ubifs_destroy_tnc_subtree(struct ubifs_znode *znode) +{ + struct ubifs_znode *zn = ubifs_tnc_postorder_first(znode); + long clean_freed = 0; + int n; + + ubifs_assert(zn); + while (1) { + for (n = 0; n < zn->child_cnt; n++) { + if (!zn->zbranch[n].znode) + continue; + + if (zn->level > 0 && + !ubifs_zn_dirty(zn->zbranch[n].znode)) + clean_freed += 1; + + cond_resched(); + kfree(zn->zbranch[n].znode); + } + + if (zn == znode) { + if (!ubifs_zn_dirty(zn)) + clean_freed += 1; + kfree(zn); + return clean_freed; + } + + zn = ubifs_tnc_postorder_next(zn); + } +} + +/** + * read_znode - read an indexing node from flash and fill znode. + * @c: UBIFS file-system description object + * @lnum: LEB of the indexing node to read + * @offs: node offset + * @len: node length + * @znode: znode to read to + * + * This function reads an indexing node from the flash media and fills znode + * with the read data. Returns zero in case of success and a negative error + * code in case of failure. The read indexing node is validated and if anything + * is wrong with it, this function prints complaint messages and returns + * %-EINVAL. + */ +static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, + struct ubifs_znode *znode) +{ + int i, err, type, cmp; + struct ubifs_idx_node *idx; + + idx = kmalloc(c->max_idx_node_sz, GFP_NOFS); + if (!idx) + return -ENOMEM; + + err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs); + if (err < 0) { + kfree(idx); + return err; + } + + znode->child_cnt = le16_to_cpu(idx->child_cnt); + znode->level = le16_to_cpu(idx->level); + + dbg_tnc("LEB %d:%d, level %d, %d branch", + lnum, offs, znode->level, znode->child_cnt); + + if (znode->child_cnt > c->fanout || znode->level > UBIFS_MAX_LEVELS) { + dbg_err("current fanout %d, branch count %d", + c->fanout, znode->child_cnt); + dbg_err("max levels %d, znode level %d", + UBIFS_MAX_LEVELS, znode->level); + err = 1; + goto out_dump; + } + + for (i = 0; i < znode->child_cnt; i++) { + const struct ubifs_branch *br = ubifs_idx_branch(c, idx, i); + struct ubifs_zbranch *zbr = &znode->zbranch[i]; + + key_read(c, &br->key, &zbr->key); + zbr->lnum = le32_to_cpu(br->lnum); + zbr->offs = le32_to_cpu(br->offs); + zbr->len = le32_to_cpu(br->len); + zbr->znode = NULL; + + /* Validate branch */ + + if (zbr->lnum < c->main_first || + zbr->lnum >= c->leb_cnt || zbr->offs < 0 || + zbr->offs + zbr->len > c->leb_size || zbr->offs & 7) { + dbg_err("bad branch %d", i); + err = 2; + goto out_dump; + } + + switch (key_type(c, &zbr->key)) { + case UBIFS_INO_KEY: + case UBIFS_DATA_KEY: + case UBIFS_DENT_KEY: + case UBIFS_XENT_KEY: + break; + default: + dbg_msg("bad key type at slot %d: %s", i, + DBGKEY(&zbr->key)); + err = 3; + goto out_dump; + } + + if (znode->level) + continue; + + type = key_type(c, &zbr->key); + if (c->ranges[type].max_len == 0) { + if (zbr->len != c->ranges[type].len) { + dbg_err("bad target node (type %d) length (%d)", + type, zbr->len); + dbg_err("have to be %d", c->ranges[type].len); + err = 4; + goto out_dump; + } + } else if (zbr->len < c->ranges[type].min_len || + zbr->len > c->ranges[type].max_len) { + dbg_err("bad target node (type %d) length (%d)", + type, zbr->len); + dbg_err("have to be in range of %d-%d", + c->ranges[type].min_len, + c->ranges[type].max_len); + err = 5; + goto out_dump; + } + } + + /* + * Ensure that the next key is greater or equivalent to the + * previous one. + */ + for (i = 0; i < znode->child_cnt - 1; i++) { + const union ubifs_key *key1, *key2; + + key1 = &znode->zbranch[i].key; + key2 = &znode->zbranch[i + 1].key; + + cmp = keys_cmp(c, key1, key2); + if (cmp > 0) { + dbg_err("bad key order (keys %d and %d)", i, i + 1); + err = 6; + goto out_dump; + } else if (cmp == 0 && !is_hash_key(c, key1)) { + /* These can only be keys with colliding hash */ + dbg_err("keys %d and %d are not hashed but equivalent", + i, i + 1); + err = 7; + goto out_dump; + } + } + + kfree(idx); + return 0; + +out_dump: + ubifs_err("bad indexing node at LEB %d:%d, error %d", lnum, offs, err); + dbg_dump_node(c, idx); + kfree(idx); + return -EINVAL; +} + +/** + * ubifs_load_znode - load znode to TNC cache. + * @c: UBIFS file-system description object + * @zbr: znode branch + * @parent: znode's parent + * @iip: index in parent + * + * This function loads znode pointed to by @zbr into the TNC cache and + * returns pointer to it in case of success and a negative error code in case + * of failure. + */ +struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c, + struct ubifs_zbranch *zbr, + struct ubifs_znode *parent, int iip) +{ + int err; + struct ubifs_znode *znode; + + ubifs_assert(!zbr->znode); + /* + * A slab cache is not presently used for znodes because the znode size + * depends on the fanout which is stored in the superblock. + */ + znode = kzalloc(c->max_znode_sz, GFP_NOFS); + if (!znode) + return ERR_PTR(-ENOMEM); + + err = read_znode(c, zbr->lnum, zbr->offs, zbr->len, znode); + if (err) + goto out; + + atomic_long_inc(&c->clean_zn_cnt); + + /* + * Increment the global clean znode counter as well. It is OK that + * global and per-FS clean znode counters may be inconsistent for some + * short time (because we might be preempted at this point), the global + * one is only used in shrinker. + */ + atomic_long_inc(&ubifs_clean_zn_cnt); + + zbr->znode = znode; + znode->parent = parent; + znode->time = get_seconds(); + znode->iip = iip; + + return znode; + +out: + kfree(znode); + return ERR_PTR(err); +} + +/** + * ubifs_tnc_read_node - read a leaf node from the flash media. + * @c: UBIFS file-system description object + * @zbr: key and position of the node + * @node: node is returned here + * + * This function reads a node defined by @zbr from the flash media. Returns + * zero in case of success or a negative negative error code in case of + * failure. + */ +int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *node) +{ + union ubifs_key key1, *key = &zbr->key; + int err, type = key_type(c, key); + struct ubifs_wbuf *wbuf; + + /* + * 'zbr' has to point to on-flash node. The node may sit in a bud and + * may even be in a write buffer, so we have to take care about this. + */ + wbuf = ubifs_get_wbuf(c, zbr->lnum); + if (wbuf) + err = ubifs_read_node_wbuf(wbuf, node, type, zbr->len, + zbr->lnum, zbr->offs); + else + err = ubifs_read_node(c, node, type, zbr->len, zbr->lnum, + zbr->offs); + + if (err) { + dbg_tnc("key %s", DBGKEY(key)); + return err; + } + + /* Make sure the key of the read node is correct */ + key_read(c, key, &key1); + if (memcmp(node + UBIFS_KEY_OFFSET, &key1, c->key_len)) { + ubifs_err("bad key in node at LEB %d:%d", + zbr->lnum, zbr->offs); + dbg_tnc("looked for key %s found node's key %s", + DBGKEY(key), DBGKEY1(&key1)); + dbg_dump_node(c, node); + return -EINVAL; + } + + return 0; +} diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h new file mode 100644 index 000000000000..a9ecbd9af20d --- /dev/null +++ b/fs/ubifs/ubifs-media.h @@ -0,0 +1,745 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file describes UBIFS on-flash format and contains definitions of all the + * relevant data structures and constants. + * + * All UBIFS on-flash objects are stored in the form of nodes. All nodes start + * with the UBIFS node magic number and have the same common header. Nodes + * always sit at 8-byte aligned positions on the media and node header sizes are + * also 8-byte aligned (except for the indexing node and the padding node). + */ + +#ifndef __UBIFS_MEDIA_H__ +#define __UBIFS_MEDIA_H__ + +/* UBIFS node magic number (must not have the padding byte first or last) */ +#define UBIFS_NODE_MAGIC 0x06101831 + +/* UBIFS on-flash format version */ +#define UBIFS_FORMAT_VERSION 4 + +/* Minimum logical eraseblock size in bytes */ +#define UBIFS_MIN_LEB_SZ (15*1024) + +/* Initial CRC32 value used when calculating CRC checksums */ +#define UBIFS_CRC32_INIT 0xFFFFFFFFU + +/* + * UBIFS does not try to compress data if its length is less than the below + * constant. + */ +#define UBIFS_MIN_COMPR_LEN 128 + +/* Root inode number */ +#define UBIFS_ROOT_INO 1 + +/* Lowest inode number used for regular inodes (not UBIFS-only internal ones) */ +#define UBIFS_FIRST_INO 64 + +/* + * Maximum file name and extended attribute length (must be a multiple of 8, + * minus 1). + */ +#define UBIFS_MAX_NLEN 255 + +/* Maximum number of data journal heads */ +#define UBIFS_MAX_JHEADS 1 + +/* + * Size of UBIFS data block. Note, UBIFS is not a block oriented file-system, + * which means that it does not treat the underlying media as consisting of + * blocks like in case of hard drives. Do not be confused. UBIFS block is just + * the maximum amount of data which one data node can have or which can be + * attached to an inode node. + */ +#define UBIFS_BLOCK_SIZE 4096 +#define UBIFS_BLOCK_SHIFT 12 +#define UBIFS_BLOCK_MASK 0x00000FFF + +/* UBIFS padding byte pattern (must not be first or last byte of node magic) */ +#define UBIFS_PADDING_BYTE 0xCE + +/* Maximum possible key length */ +#define UBIFS_MAX_KEY_LEN 16 + +/* Key length ("simple" format) */ +#define UBIFS_SK_LEN 8 + +/* Minimum index tree fanout */ +#define UBIFS_MIN_FANOUT 3 + +/* Maximum number of levels in UBIFS indexing B-tree */ +#define UBIFS_MAX_LEVELS 512 + +/* Maximum amount of data attached to an inode in bytes */ +#define UBIFS_MAX_INO_DATA UBIFS_BLOCK_SIZE + +/* LEB Properties Tree fanout (must be power of 2) and fanout shift */ +#define UBIFS_LPT_FANOUT 4 +#define UBIFS_LPT_FANOUT_SHIFT 2 + +/* LEB Properties Tree bit field sizes */ +#define UBIFS_LPT_CRC_BITS 16 +#define UBIFS_LPT_CRC_BYTES 2 +#define UBIFS_LPT_TYPE_BITS 4 + +/* The key is always at the same position in all keyed nodes */ +#define UBIFS_KEY_OFFSET offsetof(struct ubifs_ino_node, key) + +/* + * LEB Properties Tree node types. + * + * UBIFS_LPT_PNODE: LPT leaf node (contains LEB properties) + * UBIFS_LPT_NNODE: LPT internal node + * UBIFS_LPT_LTAB: LPT's own lprops table + * UBIFS_LPT_LSAVE: LPT's save table (big model only) + * UBIFS_LPT_NODE_CNT: count of LPT node types + * UBIFS_LPT_NOT_A_NODE: all ones (15 for 4 bits) is never a valid node type + */ +enum { + UBIFS_LPT_PNODE, + UBIFS_LPT_NNODE, + UBIFS_LPT_LTAB, + UBIFS_LPT_LSAVE, + UBIFS_LPT_NODE_CNT, + UBIFS_LPT_NOT_A_NODE = (1 << UBIFS_LPT_TYPE_BITS) - 1, +}; + +/* + * UBIFS inode types. + * + * UBIFS_ITYPE_REG: regular file + * UBIFS_ITYPE_DIR: directory + * UBIFS_ITYPE_LNK: soft link + * UBIFS_ITYPE_BLK: block device node + * UBIFS_ITYPE_CHR: character device node + * UBIFS_ITYPE_FIFO: fifo + * UBIFS_ITYPE_SOCK: socket + * UBIFS_ITYPES_CNT: count of supported file types + */ +enum { + UBIFS_ITYPE_REG, + UBIFS_ITYPE_DIR, + UBIFS_ITYPE_LNK, + UBIFS_ITYPE_BLK, + UBIFS_ITYPE_CHR, + UBIFS_ITYPE_FIFO, + UBIFS_ITYPE_SOCK, + UBIFS_ITYPES_CNT, +}; + +/* + * Supported key hash functions. + * + * UBIFS_KEY_HASH_R5: R5 hash + * UBIFS_KEY_HASH_TEST: test hash which just returns first 4 bytes of the name + */ +enum { + UBIFS_KEY_HASH_R5, + UBIFS_KEY_HASH_TEST, +}; + +/* + * Supported key formats. + * + * UBIFS_SIMPLE_KEY_FMT: simple key format + */ +enum { + UBIFS_SIMPLE_KEY_FMT, +}; + +/* + * The simple key format uses 29 bits for storing UBIFS block number and hash + * value. + */ +#define UBIFS_S_KEY_BLOCK_BITS 29 +#define UBIFS_S_KEY_BLOCK_MASK 0x1FFFFFFF +#define UBIFS_S_KEY_HASH_BITS UBIFS_S_KEY_BLOCK_BITS +#define UBIFS_S_KEY_HASH_MASK UBIFS_S_KEY_BLOCK_MASK + +/* + * Key types. + * + * UBIFS_INO_KEY: inode node key + * UBIFS_DATA_KEY: data node key + * UBIFS_DENT_KEY: directory entry node key + * UBIFS_XENT_KEY: extended attribute entry key + * UBIFS_KEY_TYPES_CNT: number of supported key types + */ +enum { + UBIFS_INO_KEY, + UBIFS_DATA_KEY, + UBIFS_DENT_KEY, + UBIFS_XENT_KEY, + UBIFS_KEY_TYPES_CNT, +}; + +/* Count of LEBs reserved for the superblock area */ +#define UBIFS_SB_LEBS 1 +/* Count of LEBs reserved for the master area */ +#define UBIFS_MST_LEBS 2 + +/* First LEB of the superblock area */ +#define UBIFS_SB_LNUM 0 +/* First LEB of the master area */ +#define UBIFS_MST_LNUM (UBIFS_SB_LNUM + UBIFS_SB_LEBS) +/* First LEB of the log area */ +#define UBIFS_LOG_LNUM (UBIFS_MST_LNUM + UBIFS_MST_LEBS) + +/* + * The below constants define the absolute minimum values for various UBIFS + * media areas. Many of them actually depend of flash geometry and the FS + * configuration (number of journal heads, orphan LEBs, etc). This means that + * the smallest volume size which can be used for UBIFS cannot be pre-defined + * by these constants. The file-system that meets the below limitation will not + * necessarily mount. UBIFS does run-time calculations and validates the FS + * size. + */ + +/* Minimum number of logical eraseblocks in the log */ +#define UBIFS_MIN_LOG_LEBS 2 +/* Minimum number of bud logical eraseblocks (one for each head) */ +#define UBIFS_MIN_BUD_LEBS 3 +/* Minimum number of journal logical eraseblocks */ +#define UBIFS_MIN_JNL_LEBS (UBIFS_MIN_LOG_LEBS + UBIFS_MIN_BUD_LEBS) +/* Minimum number of LPT area logical eraseblocks */ +#define UBIFS_MIN_LPT_LEBS 2 +/* Minimum number of orphan area logical eraseblocks */ +#define UBIFS_MIN_ORPH_LEBS 1 +/* + * Minimum number of main area logical eraseblocks (buds, 3 for the index, 1 + * for GC, 1 for deletions, and at least 1 for committed data). + */ +#define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 6) + +/* Minimum number of logical eraseblocks */ +#define UBIFS_MIN_LEB_CNT (UBIFS_SB_LEBS + UBIFS_MST_LEBS + \ + UBIFS_MIN_LOG_LEBS + UBIFS_MIN_LPT_LEBS + \ + UBIFS_MIN_ORPH_LEBS + UBIFS_MIN_MAIN_LEBS) + +/* Node sizes (N.B. these are guaranteed to be multiples of 8) */ +#define UBIFS_CH_SZ sizeof(struct ubifs_ch) +#define UBIFS_INO_NODE_SZ sizeof(struct ubifs_ino_node) +#define UBIFS_DATA_NODE_SZ sizeof(struct ubifs_data_node) +#define UBIFS_DENT_NODE_SZ sizeof(struct ubifs_dent_node) +#define UBIFS_TRUN_NODE_SZ sizeof(struct ubifs_trun_node) +#define UBIFS_PAD_NODE_SZ sizeof(struct ubifs_pad_node) +#define UBIFS_SB_NODE_SZ sizeof(struct ubifs_sb_node) +#define UBIFS_MST_NODE_SZ sizeof(struct ubifs_mst_node) +#define UBIFS_REF_NODE_SZ sizeof(struct ubifs_ref_node) +#define UBIFS_IDX_NODE_SZ sizeof(struct ubifs_idx_node) +#define UBIFS_CS_NODE_SZ sizeof(struct ubifs_cs_node) +#define UBIFS_ORPH_NODE_SZ sizeof(struct ubifs_orph_node) +/* Extended attribute entry nodes are identical to directory entry nodes */ +#define UBIFS_XENT_NODE_SZ UBIFS_DENT_NODE_SZ +/* Only this does not have to be multiple of 8 bytes */ +#define UBIFS_BRANCH_SZ sizeof(struct ubifs_branch) + +/* Maximum node sizes (N.B. these are guaranteed to be multiples of 8) */ +#define UBIFS_MAX_DATA_NODE_SZ (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE) +#define UBIFS_MAX_INO_NODE_SZ (UBIFS_INO_NODE_SZ + UBIFS_MAX_INO_DATA) +#define UBIFS_MAX_DENT_NODE_SZ (UBIFS_DENT_NODE_SZ + UBIFS_MAX_NLEN + 1) +#define UBIFS_MAX_XENT_NODE_SZ UBIFS_MAX_DENT_NODE_SZ + +/* The largest UBIFS node */ +#define UBIFS_MAX_NODE_SZ UBIFS_MAX_INO_NODE_SZ + +/* + * On-flash inode flags. + * + * UBIFS_COMPR_FL: use compression for this inode + * UBIFS_SYNC_FL: I/O on this inode has to be synchronous + * UBIFS_IMMUTABLE_FL: inode is immutable + * UBIFS_APPEND_FL: writes to the inode may only append data + * UBIFS_DIRSYNC_FL: I/O on this directory inode has to be synchronous + * UBIFS_XATTR_FL: this inode is the inode for an extended attribute value + * + * Note, these are on-flash flags which correspond to ioctl flags + * (@FS_COMPR_FL, etc). They have the same values now, but generally, do not + * have to be the same. + */ +enum { + UBIFS_COMPR_FL = 0x01, + UBIFS_SYNC_FL = 0x02, + UBIFS_IMMUTABLE_FL = 0x04, + UBIFS_APPEND_FL = 0x08, + UBIFS_DIRSYNC_FL = 0x10, + UBIFS_XATTR_FL = 0x20, +}; + +/* Inode flag bits used by UBIFS */ +#define UBIFS_FL_MASK 0x0000001F + +/* + * UBIFS compression algorithms. + * + * UBIFS_COMPR_NONE: no compression + * UBIFS_COMPR_LZO: LZO compression + * UBIFS_COMPR_ZLIB: ZLIB compression + * UBIFS_COMPR_TYPES_CNT: count of supported compression types + */ +enum { + UBIFS_COMPR_NONE, + UBIFS_COMPR_LZO, + UBIFS_COMPR_ZLIB, + UBIFS_COMPR_TYPES_CNT, +}; + +/* + * UBIFS node types. + * + * UBIFS_INO_NODE: inode node + * UBIFS_DATA_NODE: data node + * UBIFS_DENT_NODE: directory entry node + * UBIFS_XENT_NODE: extended attribute node + * UBIFS_TRUN_NODE: truncation node + * UBIFS_PAD_NODE: padding node + * UBIFS_SB_NODE: superblock node + * UBIFS_MST_NODE: master node + * UBIFS_REF_NODE: LEB reference node + * UBIFS_IDX_NODE: index node + * UBIFS_CS_NODE: commit start node + * UBIFS_ORPH_NODE: orphan node + * UBIFS_NODE_TYPES_CNT: count of supported node types + * + * Note, we index arrays by these numbers, so keep them low and contiguous. + * Node type constants for inodes, direntries and so on have to be the same as + * corresponding key type constants. + */ +enum { + UBIFS_INO_NODE, + UBIFS_DATA_NODE, + UBIFS_DENT_NODE, + UBIFS_XENT_NODE, + UBIFS_TRUN_NODE, + UBIFS_PAD_NODE, + UBIFS_SB_NODE, + UBIFS_MST_NODE, + UBIFS_REF_NODE, + UBIFS_IDX_NODE, + UBIFS_CS_NODE, + UBIFS_ORPH_NODE, + UBIFS_NODE_TYPES_CNT, +}; + +/* + * Master node flags. + * + * UBIFS_MST_DIRTY: rebooted uncleanly - master node is dirty + * UBIFS_MST_NO_ORPHS: no orphan inodes present + * UBIFS_MST_RCVRY: written by recovery + */ +enum { + UBIFS_MST_DIRTY = 1, + UBIFS_MST_NO_ORPHS = 2, + UBIFS_MST_RCVRY = 4, +}; + +/* + * Node group type (used by recovery to recover whole group or none). + * + * UBIFS_NO_NODE_GROUP: this node is not part of a group + * UBIFS_IN_NODE_GROUP: this node is a part of a group + * UBIFS_LAST_OF_NODE_GROUP: this node is the last in a group + */ +enum { + UBIFS_NO_NODE_GROUP = 0, + UBIFS_IN_NODE_GROUP, + UBIFS_LAST_OF_NODE_GROUP, +}; + +/* + * Superblock flags. + * + * UBIFS_FLG_BIGLPT: if "big" LPT model is used if set + */ +enum { + UBIFS_FLG_BIGLPT = 0x02, +}; + +/** + * struct ubifs_ch - common header node. + * @magic: UBIFS node magic number (%UBIFS_NODE_MAGIC) + * @crc: CRC-32 checksum of the node header + * @sqnum: sequence number + * @len: full node length + * @node_type: node type + * @group_type: node group type + * @padding: reserved for future, zeroes + * + * Every UBIFS node starts with this common part. If the node has a key, the + * key always goes next. + */ +struct ubifs_ch { + __le32 magic; + __le32 crc; + __le64 sqnum; + __le32 len; + __u8 node_type; + __u8 group_type; + __u8 padding[2]; +} __attribute__ ((packed)); + +/** + * union ubifs_dev_desc - device node descriptor. + * @new: new type device descriptor + * @huge: huge type device descriptor + * + * This data structure describes major/minor numbers of a device node. In an + * inode is a device node then its data contains an object of this type. UBIFS + * uses standard Linux "new" and "huge" device node encodings. + */ +union ubifs_dev_desc { + __le32 new; + __le64 huge; +} __attribute__ ((packed)); + +/** + * struct ubifs_ino_node - inode node. + * @ch: common header + * @key: node key + * @creat_sqnum: sequence number at time of creation + * @size: inode size in bytes (amount of uncompressed data) + * @atime_sec: access time seconds + * @ctime_sec: creation time seconds + * @mtime_sec: modification time seconds + * @atime_nsec: access time nanoseconds + * @ctime_nsec: creation time nanoseconds + * @mtime_nsec: modification time nanoseconds + * @nlink: number of hard links + * @uid: owner ID + * @gid: group ID + * @mode: access flags + * @flags: per-inode flags (%UBIFS_COMPR_FL, %UBIFS_SYNC_FL, etc) + * @data_len: inode data length + * @xattr_cnt: count of extended attributes this inode has + * @xattr_size: summarized size of all extended attributes in bytes + * @padding1: reserved for future, zeroes + * @xattr_names: sum of lengths of all extended attribute names belonging to + * this inode + * @compr_type: compression type used for this inode + * @padding2: reserved for future, zeroes + * @data: data attached to the inode + * + * Note, even though inode compression type is defined by @compr_type, some + * nodes of this inode may be compressed with different compressor - this + * happens if compression type is changed while the inode already has data + * nodes. But @compr_type will be use for further writes to the inode. + * + * Note, do not forget to amend 'zero_ino_node_unused()' function when changing + * the padding fields. + */ +struct ubifs_ino_node { + struct ubifs_ch ch; + __u8 key[UBIFS_MAX_KEY_LEN]; + __le64 creat_sqnum; + __le64 size; + __le64 atime_sec; + __le64 ctime_sec; + __le64 mtime_sec; + __le32 atime_nsec; + __le32 ctime_nsec; + __le32 mtime_nsec; + __le32 nlink; + __le32 uid; + __le32 gid; + __le32 mode; + __le32 flags; + __le32 data_len; + __le32 xattr_cnt; + __le32 xattr_size; + __u8 padding1[4]; /* Watch 'zero_ino_node_unused()' if changing! */ + __le32 xattr_names; + __le16 compr_type; + __u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */ + __u8 data[]; +} __attribute__ ((packed)); + +/** + * struct ubifs_dent_node - directory entry node. + * @ch: common header + * @key: node key + * @inum: target inode number + * @padding1: reserved for future, zeroes + * @type: type of the target inode (%UBIFS_ITYPE_REG, %UBIFS_ITYPE_DIR, etc) + * @nlen: name length + * @padding2: reserved for future, zeroes + * @name: zero-terminated name + * + * Note, do not forget to amend 'zero_dent_node_unused()' function when + * changing the padding fields. + */ +struct ubifs_dent_node { + struct ubifs_ch ch; + __u8 key[UBIFS_MAX_KEY_LEN]; + __le64 inum; + __u8 padding1; + __u8 type; + __le16 nlen; + __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */ + __u8 name[]; +} __attribute__ ((packed)); + +/** + * struct ubifs_data_node - data node. + * @ch: common header + * @key: node key + * @size: uncompressed data size in bytes + * @compr_type: compression type (%UBIFS_COMPR_NONE, %UBIFS_COMPR_LZO, etc) + * @padding: reserved for future, zeroes + * @data: data + * + * Note, do not forget to amend 'zero_data_node_unused()' function when + * changing the padding fields. + */ +struct ubifs_data_node { + struct ubifs_ch ch; + __u8 key[UBIFS_MAX_KEY_LEN]; + __le32 size; + __le16 compr_type; + __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */ + __u8 data[]; +} __attribute__ ((packed)); + +/** + * struct ubifs_trun_node - truncation node. + * @ch: common header + * @inum: truncated inode number + * @padding: reserved for future, zeroes + * @old_size: size before truncation + * @new_size: size after truncation + * + * This node exists only in the journal and never goes to the main area. Note, + * do not forget to amend 'zero_trun_node_unused()' function when changing the + * padding fields. + */ +struct ubifs_trun_node { + struct ubifs_ch ch; + __le32 inum; + __u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */ + __le64 old_size; + __le64 new_size; +} __attribute__ ((packed)); + +/** + * struct ubifs_pad_node - padding node. + * @ch: common header + * @pad_len: how many bytes after this node are unused (because padded) + * @padding: reserved for future, zeroes + */ +struct ubifs_pad_node { + struct ubifs_ch ch; + __le32 pad_len; +} __attribute__ ((packed)); + +/** + * struct ubifs_sb_node - superblock node. + * @ch: common header + * @padding: reserved for future, zeroes + * @key_hash: type of hash function used in keys + * @key_fmt: format of the key + * @flags: file-system flags (%UBIFS_FLG_BIGLPT, etc) + * @min_io_size: minimal input/output unit size + * @leb_size: logical eraseblock size in bytes + * @leb_cnt: count of LEBs used by file-system + * @max_leb_cnt: maximum count of LEBs used by file-system + * @max_bud_bytes: maximum amount of data stored in buds + * @log_lebs: log size in logical eraseblocks + * @lpt_lebs: number of LEBs used for lprops table + * @orph_lebs: number of LEBs used for recording orphans + * @jhead_cnt: count of journal heads + * @fanout: tree fanout (max. number of links per indexing node) + * @lsave_cnt: number of LEB numbers in LPT's save table + * @fmt_version: UBIFS on-flash format version + * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) + * @padding1: reserved for future, zeroes + * @rp_uid: reserve pool UID + * @rp_gid: reserve pool GID + * @rp_size: size of the reserved pool in bytes + * @padding2: reserved for future, zeroes + * @time_gran: time granularity in nanoseconds + * @uuid: UUID generated when the file system image was created + */ +struct ubifs_sb_node { + struct ubifs_ch ch; + __u8 padding[2]; + __u8 key_hash; + __u8 key_fmt; + __le32 flags; + __le32 min_io_size; + __le32 leb_size; + __le32 leb_cnt; + __le32 max_leb_cnt; + __le64 max_bud_bytes; + __le32 log_lebs; + __le32 lpt_lebs; + __le32 orph_lebs; + __le32 jhead_cnt; + __le32 fanout; + __le32 lsave_cnt; + __le32 fmt_version; + __le16 default_compr; + __u8 padding1[2]; + __le32 rp_uid; + __le32 rp_gid; + __le64 rp_size; + __le32 time_gran; + __u8 uuid[16]; + __u8 padding2[3972]; +} __attribute__ ((packed)); + +/** + * struct ubifs_mst_node - master node. + * @ch: common header + * @highest_inum: highest inode number in the committed index + * @cmt_no: commit number + * @flags: various flags (%UBIFS_MST_DIRTY, etc) + * @log_lnum: start of the log + * @root_lnum: LEB number of the root indexing node + * @root_offs: offset within @root_lnum + * @root_len: root indexing node length + * @gc_lnum: LEB reserved for garbage collection (%-1 value means the LEB was + * not reserved and should be reserved on mount) + * @ihead_lnum: LEB number of index head + * @ihead_offs: offset of index head + * @index_size: size of index on flash + * @total_free: total free space in bytes + * @total_dirty: total dirty space in bytes + * @total_used: total used space in bytes (includes only data LEBs) + * @total_dead: total dead space in bytes (includes only data LEBs) + * @total_dark: total dark space in bytes (includes only data LEBs) + * @lpt_lnum: LEB number of LPT root nnode + * @lpt_offs: offset of LPT root nnode + * @nhead_lnum: LEB number of LPT head + * @nhead_offs: offset of LPT head + * @ltab_lnum: LEB number of LPT's own lprops table + * @ltab_offs: offset of LPT's own lprops table + * @lsave_lnum: LEB number of LPT's save table (big model only) + * @lsave_offs: offset of LPT's save table (big model only) + * @lscan_lnum: LEB number of last LPT scan + * @empty_lebs: number of empty logical eraseblocks + * @idx_lebs: number of indexing logical eraseblocks + * @leb_cnt: count of LEBs used by file-system + * @padding: reserved for future, zeroes + */ +struct ubifs_mst_node { + struct ubifs_ch ch; + __le64 highest_inum; + __le64 cmt_no; + __le32 flags; + __le32 log_lnum; + __le32 root_lnum; + __le32 root_offs; + __le32 root_len; + __le32 gc_lnum; + __le32 ihead_lnum; + __le32 ihead_offs; + __le64 index_size; + __le64 total_free; + __le64 total_dirty; + __le64 total_used; + __le64 total_dead; + __le64 total_dark; + __le32 lpt_lnum; + __le32 lpt_offs; + __le32 nhead_lnum; + __le32 nhead_offs; + __le32 ltab_lnum; + __le32 ltab_offs; + __le32 lsave_lnum; + __le32 lsave_offs; + __le32 lscan_lnum; + __le32 empty_lebs; + __le32 idx_lebs; + __le32 leb_cnt; + __u8 padding[344]; +} __attribute__ ((packed)); + +/** + * struct ubifs_ref_node - logical eraseblock reference node. + * @ch: common header + * @lnum: the referred logical eraseblock number + * @offs: start offset in the referred LEB + * @jhead: journal head number + * @padding: reserved for future, zeroes + */ +struct ubifs_ref_node { + struct ubifs_ch ch; + __le32 lnum; + __le32 offs; + __le32 jhead; + __u8 padding[28]; +} __attribute__ ((packed)); + +/** + * struct ubifs_branch - key/reference/length branch + * @lnum: LEB number of the target node + * @offs: offset within @lnum + * @len: target node length + * @key: key + */ +struct ubifs_branch { + __le32 lnum; + __le32 offs; + __le32 len; + __u8 key[]; +} __attribute__ ((packed)); + +/** + * struct ubifs_idx_node - indexing node. + * @ch: common header + * @child_cnt: number of child index nodes + * @level: tree level + * @branches: LEB number / offset / length / key branches + */ +struct ubifs_idx_node { + struct ubifs_ch ch; + __le16 child_cnt; + __le16 level; + __u8 branches[]; +} __attribute__ ((packed)); + +/** + * struct ubifs_cs_node - commit start node. + * @ch: common header + * @cmt_no: commit number + */ +struct ubifs_cs_node { + struct ubifs_ch ch; + __le64 cmt_no; +} __attribute__ ((packed)); + +/** + * struct ubifs_orph_node - orphan node. + * @ch: common header + * @cmt_no: commit number (also top bit is set on the last node of the commit) + * @inos: inode numbers of orphans + */ +struct ubifs_orph_node { + struct ubifs_ch ch; + __le64 cmt_no; + __le64 inos[]; +} __attribute__ ((packed)); + +#endif /* __UBIFS_MEDIA_H__ */ diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h new file mode 100644 index 000000000000..17c620b93eec --- /dev/null +++ b/fs/ubifs/ubifs.h @@ -0,0 +1,1668 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +#ifndef __UBIFS_H__ +#define __UBIFS_H__ + +#include <asm/div64.h> +#include <linux/statfs.h> +#include <linux/fs.h> +#include <linux/err.h> +#include <linux/sched.h> +#include <linux/vmalloc.h> +#include <linux/spinlock.h> +#include <linux/mutex.h> +#include <linux/rwsem.h> +#include <linux/mtd/ubi.h> +#include <linux/pagemap.h> +#include <linux/backing-dev.h> +#include "ubifs-media.h" + +/* Version of this UBIFS implementation */ +#define UBIFS_VERSION 1 + +/* Normal UBIFS messages */ +#define ubifs_msg(fmt, ...) \ + printk(KERN_NOTICE "UBIFS: " fmt "\n", ##__VA_ARGS__) +/* UBIFS error messages */ +#define ubifs_err(fmt, ...) \ + printk(KERN_ERR "UBIFS error (pid %d): %s: " fmt "\n", current->pid, \ + __func__, ##__VA_ARGS__) +/* UBIFS warning messages */ +#define ubifs_warn(fmt, ...) \ + printk(KERN_WARNING "UBIFS warning (pid %d): %s: " fmt "\n", \ + current->pid, __func__, ##__VA_ARGS__) + +/* UBIFS file system VFS magic number */ +#define UBIFS_SUPER_MAGIC 0x24051905 + +/* Number of UBIFS blocks per VFS page */ +#define UBIFS_BLOCKS_PER_PAGE (PAGE_CACHE_SIZE / UBIFS_BLOCK_SIZE) +#define UBIFS_BLOCKS_PER_PAGE_SHIFT (PAGE_CACHE_SHIFT - UBIFS_BLOCK_SHIFT) + +/* "File system end of life" sequence number watermark */ +#define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL +#define SQNUM_WATERMARK 0xFFFFFFFFFF000000ULL + +/* Minimum amount of data UBIFS writes to the flash */ +#define MIN_WRITE_SZ (UBIFS_DATA_NODE_SZ + 8) + +/* + * Currently we do not support inode number overlapping and re-using, so this + * watermark defines dangerous inode number level. This should be fixed later, + * although it is difficult to exceed current limit. Another option is to use + * 64-bit inode numbers, but this means more overhead. + */ +#define INUM_WARN_WATERMARK 0xFFF00000 +#define INUM_WATERMARK 0xFFFFFF00 + +/* Largest key size supported in this implementation */ +#define CUR_MAX_KEY_LEN UBIFS_SK_LEN + +/* Maximum number of entries in each LPT (LEB category) heap */ +#define LPT_HEAP_SZ 256 + +/* + * Background thread name pattern. The numbers are UBI device and volume + * numbers. + */ +#define BGT_NAME_PATTERN "ubifs_bgt%d_%d" + +/* Default write-buffer synchronization timeout (5 secs) */ +#define DEFAULT_WBUF_TIMEOUT (5 * HZ) + +/* Maximum possible inode number (only 32-bit inodes are supported now) */ +#define MAX_INUM 0xFFFFFFFF + +/* Number of non-data journal heads */ +#define NONDATA_JHEADS_CNT 2 + +/* Garbage collector head */ +#define GCHD 0 +/* Base journal head number */ +#define BASEHD 1 +/* First "general purpose" journal head */ +#define DATAHD 2 + +/* 'No change' value for 'ubifs_change_lp()' */ +#define LPROPS_NC 0x80000001 + +/* + * There is no notion of truncation key because truncation nodes do not exist + * in TNC. However, when replaying, it is handy to introduce fake "truncation" + * keys for truncation nodes because the code becomes simpler. So we define + * %UBIFS_TRUN_KEY type. + */ +#define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT + +/* + * How much a directory entry/extended attribute entry adds to the parent/host + * inode. + */ +#define CALC_DENT_SIZE(name_len) ALIGN(UBIFS_DENT_NODE_SZ + (name_len) + 1, 8) + +/* How much an extended attribute adds to the host inode */ +#define CALC_XATTR_BYTES(data_len) ALIGN(UBIFS_INO_NODE_SZ + (data_len) + 1, 8) + +/* + * Znodes which were not touched for 'OLD_ZNODE_AGE' seconds are considered + * "old", and znode which were touched last 'YOUNG_ZNODE_AGE' seconds ago are + * considered "young". This is used by shrinker when selecting znode to trim + * off. + */ +#define OLD_ZNODE_AGE 20 +#define YOUNG_ZNODE_AGE 5 + +/* + * Some compressors, like LZO, may end up with more data then the input buffer. + * So UBIFS always allocates larger output buffer, to be sure the compressor + * will not corrupt memory in case of worst case compression. + */ +#define WORST_COMPR_FACTOR 2 + +/* Maximum expected tree height for use by bottom_up_buf */ +#define BOTTOM_UP_HEIGHT 64 + +/* + * Lockdep classes for UBIFS inode @ui_mutex. + */ +enum { + WB_MUTEX_1 = 0, + WB_MUTEX_2 = 1, + WB_MUTEX_3 = 2, +}; + +/* + * Znode flags (actually, bit numbers which store the flags). + * + * DIRTY_ZNODE: znode is dirty + * COW_ZNODE: znode is being committed and a new instance of this znode has to + * be created before changing this znode + * OBSOLETE_ZNODE: znode is obsolete, which means it was deleted, but it is + * still in the commit list and the ongoing commit operation + * will commit it, and delete this znode after it is done + */ +enum { + DIRTY_ZNODE = 0, + COW_ZNODE = 1, + OBSOLETE_ZNODE = 2, +}; + +/* + * Commit states. + * + * COMMIT_RESTING: commit is not wanted + * COMMIT_BACKGROUND: background commit has been requested + * COMMIT_REQUIRED: commit is required + * COMMIT_RUNNING_BACKGROUND: background commit is running + * COMMIT_RUNNING_REQUIRED: commit is running and it is required + * COMMIT_BROKEN: commit failed + */ +enum { + COMMIT_RESTING = 0, + COMMIT_BACKGROUND, + COMMIT_REQUIRED, + COMMIT_RUNNING_BACKGROUND, + COMMIT_RUNNING_REQUIRED, + COMMIT_BROKEN, +}; + +/* + * 'ubifs_scan_a_node()' return values. + * + * SCANNED_GARBAGE: scanned garbage + * SCANNED_EMPTY_SPACE: scanned empty space + * SCANNED_A_NODE: scanned a valid node + * SCANNED_A_CORRUPT_NODE: scanned a corrupted node + * SCANNED_A_BAD_PAD_NODE: scanned a padding node with invalid pad length + * + * Greater than zero means: 'scanned that number of padding bytes' + */ +enum { + SCANNED_GARBAGE = 0, + SCANNED_EMPTY_SPACE = -1, + SCANNED_A_NODE = -2, + SCANNED_A_CORRUPT_NODE = -3, + SCANNED_A_BAD_PAD_NODE = -4, +}; + +/* + * LPT cnode flag bits. + * + * DIRTY_CNODE: cnode is dirty + * COW_CNODE: cnode is being committed and must be copied before writing + * OBSOLETE_CNODE: cnode is being committed and has been copied (or deleted), + * so it can (and must) be freed when the commit is finished + */ +enum { + DIRTY_CNODE = 0, + COW_CNODE = 1, + OBSOLETE_CNODE = 2, +}; + +/* + * Dirty flag bits (lpt_drty_flgs) for LPT special nodes. + * + * LTAB_DIRTY: ltab node is dirty + * LSAVE_DIRTY: lsave node is dirty + */ +enum { + LTAB_DIRTY = 1, + LSAVE_DIRTY = 2, +}; + +/* + * Return codes used by the garbage collector. + * @LEB_FREED: the logical eraseblock was freed and is ready to use + * @LEB_FREED_IDX: indexing LEB was freed and can be used only after the commit + * @LEB_RETAINED: the logical eraseblock was freed and retained for GC purposes + */ +enum { + LEB_FREED, + LEB_FREED_IDX, + LEB_RETAINED, +}; + +/** + * struct ubifs_old_idx - index node obsoleted since last commit start. + * @rb: rb-tree node + * @lnum: LEB number of obsoleted index node + * @offs: offset of obsoleted index node + */ +struct ubifs_old_idx { + struct rb_node rb; + int lnum; + int offs; +}; + +/* The below union makes it easier to deal with keys */ +union ubifs_key { + uint8_t u8[CUR_MAX_KEY_LEN]; + uint32_t u32[CUR_MAX_KEY_LEN/4]; + uint64_t u64[CUR_MAX_KEY_LEN/8]; + __le32 j32[CUR_MAX_KEY_LEN/4]; +}; + +/** + * struct ubifs_scan_node - UBIFS scanned node information. + * @list: list of scanned nodes + * @key: key of node scanned (if it has one) + * @sqnum: sequence number + * @type: type of node scanned + * @offs: offset with LEB of node scanned + * @len: length of node scanned + * @node: raw node + */ +struct ubifs_scan_node { + struct list_head list; + union ubifs_key key; + unsigned long long sqnum; + int type; + int offs; + int len; + void *node; +}; + +/** + * struct ubifs_scan_leb - UBIFS scanned LEB information. + * @lnum: logical eraseblock number + * @nodes_cnt: number of nodes scanned + * @nodes: list of struct ubifs_scan_node + * @endpt: end point (and therefore the start of empty space) + * @ecc: read returned -EBADMSG + * @buf: buffer containing entire LEB scanned + */ +struct ubifs_scan_leb { + int lnum; + int nodes_cnt; + struct list_head nodes; + int endpt; + int ecc; + void *buf; +}; + +/** + * struct ubifs_gced_idx_leb - garbage-collected indexing LEB. + * @list: list + * @lnum: LEB number + * @unmap: OK to unmap this LEB + * + * This data structure is used to temporary store garbage-collected indexing + * LEBs - they are not released immediately, but only after the next commit. + * This is needed to guarantee recoverability. + */ +struct ubifs_gced_idx_leb { + struct list_head list; + int lnum; + int unmap; +}; + +/** + * struct ubifs_inode - UBIFS in-memory inode description. + * @vfs_inode: VFS inode description object + * @creat_sqnum: sequence number at time of creation + * @del_cmtno: commit number corresponding to the time the inode was deleted, + * protected by @c->commit_sem; + * @xattr_size: summarized size of all extended attributes in bytes + * @xattr_cnt: count of extended attributes this inode has + * @xattr_names: sum of lengths of all extended attribute names belonging to + * this inode + * @dirty: non-zero if the inode is dirty + * @xattr: non-zero if this is an extended attribute inode + * @ui_mutex: serializes inode write-back with the rest of VFS operations, + * serializes "clean <-> dirty" state changes, protects @dirty, + * @ui_size, and @xattr_size + * @ui_lock: protects @synced_i_size + * @synced_i_size: synchronized size of inode, i.e. the value of inode size + * currently stored on the flash; used only for regular file + * inodes + * @ui_size: inode size used by UBIFS when writing to flash + * @flags: inode flags (@UBIFS_COMPR_FL, etc) + * @compr_type: default compression type used for this inode + * @data_len: length of the data attached to the inode + * @data: inode's data + * + * @ui_mutex exists for two main reasons. At first it prevents inodes from + * being written back while UBIFS changing them, being in the middle of an VFS + * operation. This way UBIFS makes sure the inode fields are consistent. For + * example, in 'ubifs_rename()' we change 3 inodes simultaneously, and + * write-back must not write any of them before we have finished. + * + * The second reason is budgeting - UBIFS has to budget all operations. If an + * operation is going to mark an inode dirty, it has to allocate budget for + * this. It cannot just mark it dirty because there is no guarantee there will + * be enough flash space to write the inode back later. This means UBIFS has + * to have full control over inode "clean <-> dirty" transitions (and pages + * actually). But unfortunately, VFS marks inodes dirty in many places, and it + * does not ask the file-system if it is allowed to do so (there is a notifier, + * but it is not enough), i.e., there is no mechanism to synchronize with this. + * So UBIFS has its own inode dirty flag and its own mutex to serialize + * "clean <-> dirty" transitions. + * + * The @synced_i_size field is used to make sure we never write pages which are + * beyond last synchronized inode size. See 'ubifs_writepage()' for more + * information. + * + * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses + * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot + * make sure @inode->i_size is always changed under @ui_mutex, because it + * cannot call 'vmtruncate()' with @ui_mutex locked, because it would deadlock + * with 'ubifs_writepage()' (see file.c). All the other inode fields are + * changed under @ui_mutex, so they do not need "shadow" fields. Note, one + * could consider to rework locking and base it on "shadow" fields. + */ +struct ubifs_inode { + struct inode vfs_inode; + unsigned long long creat_sqnum; + unsigned long long del_cmtno; + unsigned int xattr_size; + unsigned int xattr_cnt; + unsigned int xattr_names; + unsigned int dirty:1; + unsigned int xattr:1; + struct mutex ui_mutex; + spinlock_t ui_lock; + loff_t synced_i_size; + loff_t ui_size; + int flags; + int compr_type; + int data_len; + void *data; +}; + +/** + * struct ubifs_unclean_leb - records a LEB recovered under read-only mode. + * @list: list + * @lnum: LEB number of recovered LEB + * @endpt: offset where recovery ended + * + * This structure records a LEB identified during recovery that needs to be + * cleaned but was not because UBIFS was mounted read-only. The information + * is used to clean the LEB when remounting to read-write mode. + */ +struct ubifs_unclean_leb { + struct list_head list; + int lnum; + int endpt; +}; + +/* + * LEB properties flags. + * + * LPROPS_UNCAT: not categorized + * LPROPS_DIRTY: dirty > 0, not index + * LPROPS_DIRTY_IDX: dirty + free > UBIFS_CH_SZ and index + * LPROPS_FREE: free > 0, not empty, not index + * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs + * LPROPS_EMPTY: LEB is empty, not taken + * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken + * LPROPS_FRDI_IDX: free + dirty == leb_size and index, may be taken + * LPROPS_CAT_MASK: mask for the LEB categories above + * LPROPS_TAKEN: LEB was taken (this flag is not saved on the media) + * LPROPS_INDEX: LEB contains indexing nodes (this flag also exists on flash) + */ +enum { + LPROPS_UNCAT = 0, + LPROPS_DIRTY = 1, + LPROPS_DIRTY_IDX = 2, + LPROPS_FREE = 3, + LPROPS_HEAP_CNT = 3, + LPROPS_EMPTY = 4, + LPROPS_FREEABLE = 5, + LPROPS_FRDI_IDX = 6, + LPROPS_CAT_MASK = 15, + LPROPS_TAKEN = 16, + LPROPS_INDEX = 32, +}; + +/** + * struct ubifs_lprops - logical eraseblock properties. + * @free: amount of free space in bytes + * @dirty: amount of dirty space in bytes + * @flags: LEB properties flags (see above) + * @lnum: LEB number + * @list: list of same-category lprops (for LPROPS_EMPTY and LPROPS_FREEABLE) + * @hpos: heap position in heap of same-category lprops (other categories) + */ +struct ubifs_lprops { + int free; + int dirty; + int flags; + int lnum; + union { + struct list_head list; + int hpos; + }; +}; + +/** + * struct ubifs_lpt_lprops - LPT logical eraseblock properties. + * @free: amount of free space in bytes + * @dirty: amount of dirty space in bytes + * @tgc: trivial GC flag (1 => unmap after commit end) + * @cmt: commit flag (1 => reserved for commit) + */ +struct ubifs_lpt_lprops { + int free; + int dirty; + unsigned tgc : 1; + unsigned cmt : 1; +}; + +/** + * struct ubifs_lp_stats - statistics of eraseblocks in the main area. + * @empty_lebs: number of empty LEBs + * @taken_empty_lebs: number of taken LEBs + * @idx_lebs: number of indexing LEBs + * @total_free: total free space in bytes + * @total_dirty: total dirty space in bytes + * @total_used: total used space in bytes (includes only data LEBs) + * @total_dead: total dead space in bytes (includes only data LEBs) + * @total_dark: total dark space in bytes (includes only data LEBs) + * + * N.B. total_dirty and total_used are different to other total_* fields, + * because they account _all_ LEBs, not just data LEBs. + * + * 'taken_empty_lebs' counts the LEBs that are in the transient state of having + * been 'taken' for use but not yet written to. 'taken_empty_lebs' is needed + * to account correctly for gc_lnum, otherwise 'empty_lebs' could be used + * by itself (in which case 'unused_lebs' would be a better name). In the case + * of gc_lnum, it is 'taken' at mount time or whenever a LEB is retained by GC, + * but unlike other empty LEBs that are 'taken', it may not be written straight + * away (i.e. before the next commit start or unmount), so either gc_lnum must + * be specially accounted for, or the current approach followed i.e. count it + * under 'taken_empty_lebs'. + */ +struct ubifs_lp_stats { + int empty_lebs; + int taken_empty_lebs; + int idx_lebs; + long long total_free; + long long total_dirty; + long long total_used; + long long total_dead; + long long total_dark; +}; + +struct ubifs_nnode; + +/** + * struct ubifs_cnode - LEB Properties Tree common node. + * @parent: parent nnode + * @cnext: next cnode to commit + * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE) + * @iip: index in parent + * @level: level in the tree (zero for pnodes, greater than zero for nnodes) + * @num: node number + */ +struct ubifs_cnode { + struct ubifs_nnode *parent; + struct ubifs_cnode *cnext; + unsigned long flags; + int iip; + int level; + int num; +}; + +/** + * struct ubifs_pnode - LEB Properties Tree leaf node. + * @parent: parent nnode + * @cnext: next cnode to commit + * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE) + * @iip: index in parent + * @level: level in the tree (always zero for pnodes) + * @num: node number + * @lprops: LEB properties array + */ +struct ubifs_pnode { + struct ubifs_nnode *parent; + struct ubifs_cnode *cnext; + unsigned long flags; + int iip; + int level; + int num; + struct ubifs_lprops lprops[UBIFS_LPT_FANOUT]; +}; + +/** + * struct ubifs_nbranch - LEB Properties Tree internal node branch. + * @lnum: LEB number of child + * @offs: offset of child + * @nnode: nnode child + * @pnode: pnode child + * @cnode: cnode child + */ +struct ubifs_nbranch { + int lnum; + int offs; + union { + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + struct ubifs_cnode *cnode; + }; +}; + +/** + * struct ubifs_nnode - LEB Properties Tree internal node. + * @parent: parent nnode + * @cnext: next cnode to commit + * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE) + * @iip: index in parent + * @level: level in the tree (always greater than zero for nnodes) + * @num: node number + * @nbranch: branches to child nodes + */ +struct ubifs_nnode { + struct ubifs_nnode *parent; + struct ubifs_cnode *cnext; + unsigned long flags; + int iip; + int level; + int num; + struct ubifs_nbranch nbranch[UBIFS_LPT_FANOUT]; +}; + +/** + * struct ubifs_lpt_heap - heap of categorized lprops. + * @arr: heap array + * @cnt: number in heap + * @max_cnt: maximum number allowed in heap + * + * There are %LPROPS_HEAP_CNT heaps. + */ +struct ubifs_lpt_heap { + struct ubifs_lprops **arr; + int cnt; + int max_cnt; +}; + +/* + * Return codes for LPT scan callback function. + * + * LPT_SCAN_CONTINUE: continue scanning + * LPT_SCAN_ADD: add the LEB properties scanned to the tree in memory + * LPT_SCAN_STOP: stop scanning + */ +enum { + LPT_SCAN_CONTINUE = 0, + LPT_SCAN_ADD = 1, + LPT_SCAN_STOP = 2, +}; + +struct ubifs_info; + +/* Callback used by the 'ubifs_lpt_scan_nolock()' function */ +typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c, + const struct ubifs_lprops *lprops, + int in_tree, void *data); + +/** + * struct ubifs_wbuf - UBIFS write-buffer. + * @c: UBIFS file-system description object + * @buf: write-buffer (of min. flash I/O unit size) + * @lnum: logical eraseblock number the write-buffer points to + * @offs: write-buffer offset in this logical eraseblock + * @avail: number of bytes available in the write-buffer + * @used: number of used bytes in the write-buffer + * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM, + * %UBI_UNKNOWN) + * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep + * up by 'mutex_lock_nested()). + * @sync_callback: write-buffer synchronization callback + * @io_mutex: serializes write-buffer I/O + * @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes + * fields + * @timer: write-buffer timer + * @timeout: timer expire interval in jiffies + * @need_sync: it is set if its timer expired and needs sync + * @next_ino: points to the next position of the following inode number + * @inodes: stores the inode numbers of the nodes which are in wbuf + * + * The write-buffer synchronization callback is called when the write-buffer is + * synchronized in order to notify how much space was wasted due to + * write-buffer padding and how much free space is left in the LEB. + * + * Note: the fields @buf, @lnum, @offs, @avail and @used can be read under + * spin-lock or mutex because they are written under both mutex and spin-lock. + * @buf is appended to under mutex but overwritten under both mutex and + * spin-lock. Thus the data between @buf and @buf + @used can be read under + * spinlock. + */ +struct ubifs_wbuf { + struct ubifs_info *c; + void *buf; + int lnum; + int offs; + int avail; + int used; + int dtype; + int jhead; + int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); + struct mutex io_mutex; + spinlock_t lock; + struct timer_list timer; + int timeout; + int need_sync; + int next_ino; + ino_t *inodes; +}; + +/** + * struct ubifs_bud - bud logical eraseblock. + * @lnum: logical eraseblock number + * @start: where the (uncommitted) bud data starts + * @jhead: journal head number this bud belongs to + * @list: link in the list buds belonging to the same journal head + * @rb: link in the tree of all buds + */ +struct ubifs_bud { + int lnum; + int start; + int jhead; + struct list_head list; + struct rb_node rb; +}; + +/** + * struct ubifs_jhead - journal head. + * @wbuf: head's write-buffer + * @buds_list: list of bud LEBs belonging to this journal head + * + * Note, the @buds list is protected by the @c->buds_lock. + */ +struct ubifs_jhead { + struct ubifs_wbuf wbuf; + struct list_head buds_list; +}; + +/** + * struct ubifs_zbranch - key/coordinate/length branch stored in znodes. + * @key: key + * @znode: znode address in memory + * @lnum: LEB number of the indexing node + * @offs: offset of the indexing node within @lnum + * @len: target node length + */ +struct ubifs_zbranch { + union ubifs_key key; + union { + struct ubifs_znode *znode; + void *leaf; + }; + int lnum; + int offs; + int len; +}; + +/** + * struct ubifs_znode - in-memory representation of an indexing node. + * @parent: parent znode or NULL if it is the root + * @cnext: next znode to commit + * @flags: znode flags (%DIRTY_ZNODE, %COW_ZNODE or %OBSOLETE_ZNODE) + * @time: last access time (seconds) + * @level: level of the entry in the TNC tree + * @child_cnt: count of child znodes + * @iip: index in parent's zbranch array + * @alt: lower bound of key range has altered i.e. child inserted at slot 0 + * @lnum: LEB number of the corresponding indexing node + * @offs: offset of the corresponding indexing node + * @len: length of the corresponding indexing node + * @zbranch: array of znode branches (@c->fanout elements) + */ +struct ubifs_znode { + struct ubifs_znode *parent; + struct ubifs_znode *cnext; + unsigned long flags; + unsigned long time; + int level; + int child_cnt; + int iip; + int alt; +#ifdef CONFIG_UBIFS_FS_DEBUG + int lnum, offs, len; +#endif + struct ubifs_zbranch zbranch[]; +}; + +/** + * struct ubifs_node_range - node length range description data structure. + * @len: fixed node length + * @min_len: minimum possible node length + * @max_len: maximum possible node length + * + * If @max_len is %0, the node has fixed length @len. + */ +struct ubifs_node_range { + union { + int len; + int min_len; + }; + int max_len; +}; + +/** + * struct ubifs_compressor - UBIFS compressor description structure. + * @compr_type: compressor type (%UBIFS_COMPR_LZO, etc) + * @cc: cryptoapi compressor handle + * @comp_mutex: mutex used during compression + * @decomp_mutex: mutex used during decompression + * @name: compressor name + * @capi_name: cryptoapi compressor name + */ +struct ubifs_compressor { + int compr_type; + struct crypto_comp *cc; + struct mutex *comp_mutex; + struct mutex *decomp_mutex; + const char *name; + const char *capi_name; +}; + +/** + * struct ubifs_budget_req - budget requirements of an operation. + * + * @fast: non-zero if the budgeting should try to acquire budget quickly and + * should not try to call write-back + * @recalculate: non-zero if @idx_growth, @data_growth, and @dd_growth fields + * have to be re-calculated + * @new_page: non-zero if the operation adds a new page + * @dirtied_page: non-zero if the operation makes a page dirty + * @new_dent: non-zero if the operation adds a new directory entry + * @mod_dent: non-zero if the operation removes or modifies an existing + * directory entry + * @new_ino: non-zero if the operation adds a new inode + * @new_ino_d: now much data newly created inode contains + * @dirtied_ino: how many inodes the operation makes dirty + * @dirtied_ino_d: now much data dirtied inode contains + * @idx_growth: how much the index will supposedly grow + * @data_growth: how much new data the operation will supposedly add + * @dd_growth: how much data that makes other data dirty the operation will + * supposedly add + * + * @idx_growth, @data_growth and @dd_growth are not used in budget request. The + * budgeting subsystem caches index and data growth values there to avoid + * re-calculating them when the budget is released. However, if @idx_growth is + * %-1, it is calculated by the release function using other fields. + * + * An inode may contain 4KiB of data at max., thus the widths of @new_ino_d + * is 13 bits, and @dirtied_ino_d - 15, because up to 4 inodes may be made + * dirty by the re-name operation. + * + * Note, UBIFS aligns node lengths to 8-bytes boundary, so the requester has to + * make sure the amount of inode data which contribute to @new_ino_d and + * @dirtied_ino_d fields are aligned. + */ +struct ubifs_budget_req { + unsigned int fast:1; + unsigned int recalculate:1; +#ifndef UBIFS_DEBUG + unsigned int new_page:1; + unsigned int dirtied_page:1; + unsigned int new_dent:1; + unsigned int mod_dent:1; + unsigned int new_ino:1; + unsigned int new_ino_d:13; + unsigned int dirtied_ino:4; + unsigned int dirtied_ino_d:15; +#else + /* Not bit-fields to check for overflows */ + unsigned int new_page; + unsigned int dirtied_page; + unsigned int new_dent; + unsigned int mod_dent; + unsigned int new_ino; + unsigned int new_ino_d; + unsigned int dirtied_ino; + unsigned int dirtied_ino_d; +#endif + int idx_growth; + int data_growth; + int dd_growth; +}; + +/** + * struct ubifs_orphan - stores the inode number of an orphan. + * @rb: rb-tree node of rb-tree of orphans sorted by inode number + * @list: list head of list of orphans in order added + * @new_list: list head of list of orphans added since the last commit + * @cnext: next orphan to commit + * @dnext: next orphan to delete + * @inum: inode number + * @new: %1 => added since the last commit, otherwise %0 + */ +struct ubifs_orphan { + struct rb_node rb; + struct list_head list; + struct list_head new_list; + struct ubifs_orphan *cnext; + struct ubifs_orphan *dnext; + ino_t inum; + int new; +}; + +/** + * struct ubifs_mount_opts - UBIFS-specific mount options information. + * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast) + */ +struct ubifs_mount_opts { + unsigned int unmount_mode:2; +}; + +/** + * struct ubifs_info - UBIFS file-system description data structure + * (per-superblock). + * @vfs_sb: VFS @struct super_block object + * @bdi: backing device info object to make VFS happy and disable read-ahead + * + * @highest_inum: highest used inode number + * @max_sqnum: current global sequence number + * @cmt_no: commit number of the last successfully completed commit, protected + * by @commit_sem + * @cnt_lock: protects @highest_inum and @max_sqnum counters + * @fmt_version: UBIFS on-flash format version + * @uuid: UUID from super block + * + * @lhead_lnum: log head logical eraseblock number + * @lhead_offs: log head offset + * @ltail_lnum: log tail logical eraseblock number (offset is always 0) + * @log_mutex: protects the log, @lhead_lnum, @lhead_offs, @ltail_lnum, and + * @bud_bytes + * @min_log_bytes: minimum required number of bytes in the log + * @cmt_bud_bytes: used during commit to temporarily amount of bytes in + * committed buds + * + * @buds: tree of all buds indexed by bud LEB number + * @bud_bytes: how many bytes of flash is used by buds + * @buds_lock: protects the @buds tree, @bud_bytes, and per-journal head bud + * lists + * @jhead_cnt: count of journal heads + * @jheads: journal heads (head zero is base head) + * @max_bud_bytes: maximum number of bytes allowed in buds + * @bg_bud_bytes: number of bud bytes when background commit is initiated + * @old_buds: buds to be released after commit ends + * @max_bud_cnt: maximum number of buds + * + * @commit_sem: synchronizes committer with other processes + * @cmt_state: commit state + * @cs_lock: commit state lock + * @cmt_wq: wait queue to sleep on if the log is full and a commit is running + * @fast_unmount: do not run journal commit before un-mounting + * @big_lpt: flag that LPT is too big to write whole during commit + * @check_lpt_free: flag that indicates LPT GC may be needed + * @nospace: non-zero if the file-system does not have flash space (used as + * optimization) + * @nospace_rp: the same as @nospace, but additionally means that even reserved + * pool is full + * + * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and + * @calc_idx_sz + * @zroot: zbranch which points to the root index node and znode + * @cnext: next znode to commit + * @enext: next znode to commit to empty space + * @gap_lebs: array of LEBs used by the in-gaps commit method + * @cbuf: commit buffer + * @ileb_buf: buffer for commit in-the-gaps method + * @ileb_len: length of data in ileb_buf + * @ihead_lnum: LEB number of index head + * @ihead_offs: offset of index head + * @ilebs: pre-allocated index LEBs + * @ileb_cnt: number of pre-allocated index LEBs + * @ileb_nxt: next pre-allocated index LEBs + * @old_idx: tree of index nodes obsoleted since the last commit start + * @bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c + * @new_ihead_lnum: used by debugging to check ihead_lnum + * @new_ihead_offs: used by debugging to check ihead_offs + * + * @mst_node: master node + * @mst_offs: offset of valid master node + * @mst_mutex: protects the master node area, @mst_node, and @mst_offs + * + * @log_lebs: number of logical eraseblocks in the log + * @log_bytes: log size in bytes + * @log_last: last LEB of the log + * @lpt_lebs: number of LEBs used for lprops table + * @lpt_first: first LEB of the lprops table area + * @lpt_last: last LEB of the lprops table area + * @orph_lebs: number of LEBs used for the orphan area + * @orph_first: first LEB of the orphan area + * @orph_last: last LEB of the orphan area + * @main_lebs: count of LEBs in the main area + * @main_first: first LEB of the main area + * @main_bytes: main area size in bytes + * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) + * + * @key_hash_type: type of the key hash + * @key_hash: direntry key hash function + * @key_fmt: key format + * @key_len: key length + * @fanout: fanout of the index tree (number of links per indexing node) + * + * @min_io_size: minimal input/output unit size + * @min_io_shift: number of bits in @min_io_size minus one + * @leb_size: logical eraseblock size in bytes + * @half_leb_size: half LEB size + * @leb_cnt: count of logical eraseblocks + * @max_leb_cnt: maximum count of logical eraseblocks + * @old_leb_cnt: count of logical eraseblocks before re-size + * @ro_media: the underlying UBI volume is read-only + * + * @dirty_pg_cnt: number of dirty pages (not used) + * @dirty_zn_cnt: number of dirty znodes + * @clean_zn_cnt: number of clean znodes + * + * @budg_idx_growth: amount of bytes budgeted for index growth + * @budg_data_growth: amount of bytes budgeted for cached data + * @budg_dd_growth: amount of bytes budgeted for cached data that will make + * other data dirty + * @budg_uncommitted_idx: amount of bytes were budgeted for growth of the index, + * but which still have to be taken into account because + * the index has not been committed so far + * @space_lock: protects @budg_idx_growth, @budg_data_growth, @budg_dd_growth, + * @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, and @lst; + * @min_idx_lebs: minimum number of LEBs required for the index + * @old_idx_sz: size of index on flash + * @calc_idx_sz: temporary variable which is used to calculate new index size + * (contains accurate new index size at end of TNC commit start) + * @lst: lprops statistics + * + * @page_budget: budget for a page + * @inode_budget: budget for an inode + * @dent_budget: budget for a directory entry + * + * @ref_node_alsz: size of the LEB reference node aligned to the min. flash + * I/O unit + * @mst_node_alsz: master node aligned size + * @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary + * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary + * @max_inode_sz: maximum possible inode size in bytes + * @max_znode_sz: size of znode in bytes + * + * @leb_overhead: how many bytes are wasted in an LEB when it is filled with + * data nodes of maximum size - used in free space reporting + * @dead_wm: LEB dead space watermark + * @dark_wm: LEB dark space watermark + * @block_cnt: count of 4KiB blocks on the FS + * + * @ranges: UBIFS node length ranges + * @ubi: UBI volume descriptor + * @di: UBI device information + * @vi: UBI volume information + * + * @orph_tree: rb-tree of orphan inode numbers + * @orph_list: list of orphan inode numbers in order added + * @orph_new: list of orphan inode numbers added since last commit + * @orph_cnext: next orphan to commit + * @orph_dnext: next orphan to delete + * @orphan_lock: lock for orph_tree and orph_new + * @orph_buf: buffer for orphan nodes + * @new_orphans: number of orphans since last commit + * @cmt_orphans: number of orphans being committed + * @tot_orphans: number of orphans in the rb_tree + * @max_orphans: maximum number of orphans allowed + * @ohead_lnum: orphan head LEB number + * @ohead_offs: orphan head offset + * @no_orphs: non-zero if there are no orphans + * + * @bgt: UBIFS background thread + * @bgt_name: background thread name + * @need_bgt: if background thread should run + * @need_wbuf_sync: if write-buffers have to be synchronized + * + * @gc_lnum: LEB number used for garbage collection + * @sbuf: a buffer of LEB size used by GC and replay for scanning + * @idx_gc: list of index LEBs that have been garbage collected + * @idx_gc_cnt: number of elements on the idx_gc list + * @gc_seq: incremented for every non-index LEB garbage collected + * @gced_lnum: last non-index LEB that was garbage collected + * + * @infos_list: links all 'ubifs_info' objects + * @umount_mutex: serializes shrinker and un-mount + * @shrinker_run_no: shrinker run number + * + * @space_bits: number of bits needed to record free or dirty space + * @lpt_lnum_bits: number of bits needed to record a LEB number in the LPT + * @lpt_offs_bits: number of bits needed to record an offset in the LPT + * @lpt_spc_bits: number of bits needed to space in the LPT + * @pcnt_bits: number of bits needed to record pnode or nnode number + * @lnum_bits: number of bits needed to record LEB number + * @nnode_sz: size of on-flash nnode + * @pnode_sz: size of on-flash pnode + * @ltab_sz: size of on-flash LPT lprops table + * @lsave_sz: size of on-flash LPT save table + * @pnode_cnt: number of pnodes + * @nnode_cnt: number of nnodes + * @lpt_hght: height of the LPT + * @pnodes_have: number of pnodes in memory + * + * @lp_mutex: protects lprops table and all the other lprops-related fields + * @lpt_lnum: LEB number of the root nnode of the LPT + * @lpt_offs: offset of the root nnode of the LPT + * @nhead_lnum: LEB number of LPT head + * @nhead_offs: offset of LPT head + * @lpt_drty_flgs: dirty flags for LPT special nodes e.g. ltab + * @dirty_nn_cnt: number of dirty nnodes + * @dirty_pn_cnt: number of dirty pnodes + * @lpt_sz: LPT size + * @lpt_nod_buf: buffer for an on-flash nnode or pnode + * @lpt_buf: buffer of LEB size used by LPT + * @nroot: address in memory of the root nnode of the LPT + * @lpt_cnext: next LPT node to commit + * @lpt_heap: array of heaps of categorized lprops + * @dirty_idx: a (reverse sorted) copy of the LPROPS_DIRTY_IDX heap as at + * previous commit start + * @uncat_list: list of un-categorized LEBs + * @empty_list: list of empty LEBs + * @freeable_list: list of freeable non-index LEBs (free + dirty == leb_size) + * @frdi_idx_list: list of freeable index LEBs (free + dirty == leb_size) + * @freeable_cnt: number of freeable LEBs in @freeable_list + * + * @ltab_lnum: LEB number of LPT's own lprops table + * @ltab_offs: offset of LPT's own lprops table + * @ltab: LPT's own lprops table + * @ltab_cmt: LPT's own lprops table (commit copy) + * @lsave_cnt: number of LEB numbers in LPT's save table + * @lsave_lnum: LEB number of LPT's save table + * @lsave_offs: offset of LPT's save table + * @lsave: LPT's save table + * @lscan_lnum: LEB number of last LPT scan + * + * @rp_size: size of the reserved pool in bytes + * @report_rp_size: size of the reserved pool reported to user-space + * @rp_uid: reserved pool user ID + * @rp_gid: reserved pool group ID + * + * @empty: if the UBI device is empty + * @replay_tree: temporary tree used during journal replay + * @replay_list: temporary list used during journal replay + * @replay_buds: list of buds to replay + * @cs_sqnum: sequence number of first node in the log (commit start node) + * @replay_sqnum: sequence number of node currently being replayed + * @need_recovery: file-system needs recovery + * @replaying: set to %1 during journal replay + * @unclean_leb_list: LEBs to recover when mounting ro to rw + * @rcvrd_mst_node: recovered master node to write when mounting ro to rw + * @size_tree: inode size information for recovery + * @remounting_rw: set while remounting from ro to rw (sb flags have MS_RDONLY) + * @mount_opts: UBIFS-specific mount options + * + * @dbg_buf: a buffer of LEB size used for debugging purposes + * @old_zroot: old index root - used by 'dbg_check_old_index()' + * @old_zroot_level: old index root level - used by 'dbg_check_old_index()' + * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()' + * @failure_mode: failure mode for recovery testing + * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls + * @fail_timeout: time in jiffies when delay of failure mode expires + * @fail_cnt: current number of calls to failure mode I/O functions + * @fail_cnt_max: number of calls by which to delay failure mode + */ +struct ubifs_info { + struct super_block *vfs_sb; + struct backing_dev_info bdi; + + ino_t highest_inum; + unsigned long long max_sqnum; + unsigned long long cmt_no; + spinlock_t cnt_lock; + int fmt_version; + unsigned char uuid[16]; + + int lhead_lnum; + int lhead_offs; + int ltail_lnum; + struct mutex log_mutex; + int min_log_bytes; + long long cmt_bud_bytes; + + struct rb_root buds; + long long bud_bytes; + spinlock_t buds_lock; + int jhead_cnt; + struct ubifs_jhead *jheads; + long long max_bud_bytes; + long long bg_bud_bytes; + struct list_head old_buds; + int max_bud_cnt; + + struct rw_semaphore commit_sem; + int cmt_state; + spinlock_t cs_lock; + wait_queue_head_t cmt_wq; + unsigned int fast_unmount:1; + unsigned int big_lpt:1; + unsigned int check_lpt_free:1; + unsigned int nospace:1; + unsigned int nospace_rp:1; + + struct mutex tnc_mutex; + struct ubifs_zbranch zroot; + struct ubifs_znode *cnext; + struct ubifs_znode *enext; + int *gap_lebs; + void *cbuf; + void *ileb_buf; + int ileb_len; + int ihead_lnum; + int ihead_offs; + int *ilebs; + int ileb_cnt; + int ileb_nxt; + struct rb_root old_idx; + int *bottom_up_buf; +#ifdef CONFIG_UBIFS_FS_DEBUG + int new_ihead_lnum; + int new_ihead_offs; +#endif + + struct ubifs_mst_node *mst_node; + int mst_offs; + struct mutex mst_mutex; + + int log_lebs; + long long log_bytes; + int log_last; + int lpt_lebs; + int lpt_first; + int lpt_last; + int orph_lebs; + int orph_first; + int orph_last; + int main_lebs; + int main_first; + long long main_bytes; + int default_compr; + + uint8_t key_hash_type; + uint32_t (*key_hash)(const char *str, int len); + int key_fmt; + int key_len; + int fanout; + + int min_io_size; + int min_io_shift; + int leb_size; + int half_leb_size; + int leb_cnt; + int max_leb_cnt; + int old_leb_cnt; + int ro_media; + + atomic_long_t dirty_pg_cnt; + atomic_long_t dirty_zn_cnt; + atomic_long_t clean_zn_cnt; + + long long budg_idx_growth; + long long budg_data_growth; + long long budg_dd_growth; + long long budg_uncommitted_idx; + spinlock_t space_lock; + int min_idx_lebs; + unsigned long long old_idx_sz; + unsigned long long calc_idx_sz; + struct ubifs_lp_stats lst; + + int page_budget; + int inode_budget; + int dent_budget; + + int ref_node_alsz; + int mst_node_alsz; + int min_idx_node_sz; + int max_idx_node_sz; + long long max_inode_sz; + int max_znode_sz; + + int leb_overhead; + int dead_wm; + int dark_wm; + int block_cnt; + + struct ubifs_node_range ranges[UBIFS_NODE_TYPES_CNT]; + struct ubi_volume_desc *ubi; + struct ubi_device_info di; + struct ubi_volume_info vi; + + struct rb_root orph_tree; + struct list_head orph_list; + struct list_head orph_new; + struct ubifs_orphan *orph_cnext; + struct ubifs_orphan *orph_dnext; + spinlock_t orphan_lock; + void *orph_buf; + int new_orphans; + int cmt_orphans; + int tot_orphans; + int max_orphans; + int ohead_lnum; + int ohead_offs; + int no_orphs; + + struct task_struct *bgt; + char bgt_name[sizeof(BGT_NAME_PATTERN) + 9]; + int need_bgt; + int need_wbuf_sync; + + int gc_lnum; + void *sbuf; + struct list_head idx_gc; + int idx_gc_cnt; + volatile int gc_seq; + volatile int gced_lnum; + + struct list_head infos_list; + struct mutex umount_mutex; + unsigned int shrinker_run_no; + + int space_bits; + int lpt_lnum_bits; + int lpt_offs_bits; + int lpt_spc_bits; + int pcnt_bits; + int lnum_bits; + int nnode_sz; + int pnode_sz; + int ltab_sz; + int lsave_sz; + int pnode_cnt; + int nnode_cnt; + int lpt_hght; + int pnodes_have; + + struct mutex lp_mutex; + int lpt_lnum; + int lpt_offs; + int nhead_lnum; + int nhead_offs; + int lpt_drty_flgs; + int dirty_nn_cnt; + int dirty_pn_cnt; + long long lpt_sz; + void *lpt_nod_buf; + void *lpt_buf; + struct ubifs_nnode *nroot; + struct ubifs_cnode *lpt_cnext; + struct ubifs_lpt_heap lpt_heap[LPROPS_HEAP_CNT]; + struct ubifs_lpt_heap dirty_idx; + struct list_head uncat_list; + struct list_head empty_list; + struct list_head freeable_list; + struct list_head frdi_idx_list; + int freeable_cnt; + + int ltab_lnum; + int ltab_offs; + struct ubifs_lpt_lprops *ltab; + struct ubifs_lpt_lprops *ltab_cmt; + int lsave_cnt; + int lsave_lnum; + int lsave_offs; + int *lsave; + int lscan_lnum; + + long long rp_size; + long long report_rp_size; + uid_t rp_uid; + gid_t rp_gid; + + /* The below fields are used only during mounting and re-mounting */ + int empty; + struct rb_root replay_tree; + struct list_head replay_list; + struct list_head replay_buds; + unsigned long long cs_sqnum; + unsigned long long replay_sqnum; + int need_recovery; + int replaying; + struct list_head unclean_leb_list; + struct ubifs_mst_node *rcvrd_mst_node; + struct rb_root size_tree; + int remounting_rw; + struct ubifs_mount_opts mount_opts; + +#ifdef CONFIG_UBIFS_FS_DEBUG + void *dbg_buf; + struct ubifs_zbranch old_zroot; + int old_zroot_level; + unsigned long long old_zroot_sqnum; + int failure_mode; + int fail_delay; + unsigned long fail_timeout; + unsigned int fail_cnt; + unsigned int fail_cnt_max; +#endif +}; + +extern struct list_head ubifs_infos; +extern spinlock_t ubifs_infos_lock; +extern atomic_long_t ubifs_clean_zn_cnt; +extern struct kmem_cache *ubifs_inode_slab; +extern struct super_operations ubifs_super_operations; +extern struct address_space_operations ubifs_file_address_operations; +extern struct file_operations ubifs_file_operations; +extern struct inode_operations ubifs_file_inode_operations; +extern struct file_operations ubifs_dir_operations; +extern struct inode_operations ubifs_dir_inode_operations; +extern struct inode_operations ubifs_symlink_inode_operations; +extern struct backing_dev_info ubifs_backing_dev_info; +extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; + +/* io.c */ +void ubifs_ro_mode(struct ubifs_info *c, int err); +int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len); +int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, + int dtype); +int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf); +int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, + int lnum, int offs); +int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, + int lnum, int offs); +int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum, + int offs, int dtype); +int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, + int offs, int quiet); +void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad); +void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last); +int ubifs_io_init(struct ubifs_info *c); +void ubifs_pad(const struct ubifs_info *c, void *buf, int pad); +int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf); +int ubifs_bg_wbufs_sync(struct ubifs_info *c); +void ubifs_wbuf_add_ino_nolock(struct ubifs_wbuf *wbuf, ino_t inum); +int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode); + +/* scan.c */ +struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, + int offs, void *sbuf); +void ubifs_scan_destroy(struct ubifs_scan_leb *sleb); +int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, + int offs, int quiet); +struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, + int offs, void *sbuf); +void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, + int lnum, int offs); +int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, + void *buf, int offs); +void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, + void *buf); + +/* log.c */ +void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud); +void ubifs_create_buds_lists(struct ubifs_info *c); +int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs); +struct ubifs_bud *ubifs_search_bud(struct ubifs_info *c, int lnum); +struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum); +int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum); +int ubifs_log_end_commit(struct ubifs_info *c, int new_ltail_lnum); +int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum); +int ubifs_consolidate_log(struct ubifs_info *c); + +/* journal.c */ +int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, + const struct qstr *nm, const struct inode *inode, + int deletion, int xent); +int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, + const union ubifs_key *key, const void *buf, int len); +int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode); +int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode); +int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, + const struct dentry *old_dentry, + const struct inode *new_dir, + const struct dentry *new_dentry, int sync); +int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, + loff_t old_size, loff_t new_size); +int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, + const struct inode *inode, const struct qstr *nm); +int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode1, + const struct inode *inode2); + +/* budget.c */ +int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req); +void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req); +void ubifs_release_dirty_inode_budget(struct ubifs_info *c, + struct ubifs_inode *ui); +int ubifs_budget_inode_op(struct ubifs_info *c, struct inode *inode, + struct ubifs_budget_req *req); +void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode, + struct ubifs_budget_req *req); +void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode, + struct ubifs_budget_req *req); +long long ubifs_get_free_space(struct ubifs_info *c); +int ubifs_calc_min_idx_lebs(struct ubifs_info *c); +void ubifs_convert_page_budget(struct ubifs_info *c); +long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free); +long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); + +/* find.c */ +int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, + int squeeze); +int ubifs_find_free_leb_for_idx(struct ubifs_info *c); +int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, + int min_space, int pick_free); +int ubifs_find_dirty_idx_leb(struct ubifs_info *c); +int ubifs_save_dirty_idx_lnums(struct ubifs_info *c); + +/* tnc.c */ +int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_znode **zn, int *n); +int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, + void *node, const struct qstr *nm); +int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, + void *node, int *lnum, int *offs); +int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, + int offs, int len); +int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key, + int old_lnum, int old_offs, int lnum, int offs, int len); +int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, + int lnum, int offs, int len, const struct qstr *nm); +int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key); +int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, + const struct qstr *nm); +int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key, + union ubifs_key *to_key); +int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum); +struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, + union ubifs_key *key, + const struct qstr *nm); +void ubifs_tnc_close(struct ubifs_info *c); +int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs, int is_idx); +int ubifs_dirty_idx_node(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs); +/* Shared by tnc.c for tnc_commit.c */ +void destroy_old_idx(struct ubifs_info *c); +int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs); +int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode); + +/* tnc_misc.c */ +struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr, + struct ubifs_znode *znode); +int ubifs_search_zbranch(const struct ubifs_info *c, + const struct ubifs_znode *znode, + const union ubifs_key *key, int *n); +struct ubifs_znode *ubifs_tnc_postorder_first(struct ubifs_znode *znode); +struct ubifs_znode *ubifs_tnc_postorder_next(struct ubifs_znode *znode); +long ubifs_destroy_tnc_subtree(struct ubifs_znode *zr); +struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c, + struct ubifs_zbranch *zbr, + struct ubifs_znode *parent, int iip); +int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *node); + +/* tnc_commit.c */ +int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot); +int ubifs_tnc_end_commit(struct ubifs_info *c); + +/* shrinker.c */ +int ubifs_shrinker(int nr_to_scan, gfp_t gfp_mask); + +/* commit.c */ +int ubifs_bg_thread(void *info); +void ubifs_commit_required(struct ubifs_info *c); +void ubifs_request_bg_commit(struct ubifs_info *c); +int ubifs_run_commit(struct ubifs_info *c); +void ubifs_recovery_commit(struct ubifs_info *c); +int ubifs_gc_should_commit(struct ubifs_info *c); +void ubifs_wait_for_commit(struct ubifs_info *c); + +/* master.c */ +int ubifs_read_master(struct ubifs_info *c); +int ubifs_write_master(struct ubifs_info *c); + +/* sb.c */ +int ubifs_read_superblock(struct ubifs_info *c); +struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c); +int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup); + +/* replay.c */ +int ubifs_validate_entry(struct ubifs_info *c, + const struct ubifs_dent_node *dent); +int ubifs_replay_journal(struct ubifs_info *c); + +/* gc.c */ +int ubifs_garbage_collect(struct ubifs_info *c, int anyway); +int ubifs_gc_start_commit(struct ubifs_info *c); +int ubifs_gc_end_commit(struct ubifs_info *c); +void ubifs_destroy_idx_gc(struct ubifs_info *c); +int ubifs_get_idx_gc_leb(struct ubifs_info *c); +int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp); + +/* orphan.c */ +int ubifs_add_orphan(struct ubifs_info *c, ino_t inum); +void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum); +int ubifs_orphan_start_commit(struct ubifs_info *c); +int ubifs_orphan_end_commit(struct ubifs_info *c); +int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only); + +/* lpt.c */ +int ubifs_calc_lpt_geom(struct ubifs_info *c); +int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, + int *lpt_lebs, int *big_lpt); +int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr); +struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum); +struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum); +int ubifs_lpt_scan_nolock(struct ubifs_info *c, int start_lnum, int end_lnum, + ubifs_lpt_scan_callback scan_cb, void *data); + +/* Shared by lpt.c for lpt_commit.c */ +void ubifs_pack_lsave(struct ubifs_info *c, void *buf, int *lsave); +void ubifs_pack_ltab(struct ubifs_info *c, void *buf, + struct ubifs_lpt_lprops *ltab); +void ubifs_pack_pnode(struct ubifs_info *c, void *buf, + struct ubifs_pnode *pnode); +void ubifs_pack_nnode(struct ubifs_info *c, void *buf, + struct ubifs_nnode *nnode); +struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip); +struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip); +int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip); +void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty); +void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode); +uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits); +struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght); + +/* lpt_commit.c */ +int ubifs_lpt_start_commit(struct ubifs_info *c); +int ubifs_lpt_end_commit(struct ubifs_info *c); +int ubifs_lpt_post_commit(struct ubifs_info *c); +void ubifs_lpt_free(struct ubifs_info *c, int wr_only); + +/* lprops.c */ +void ubifs_get_lprops(struct ubifs_info *c); +const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, + const struct ubifs_lprops *lp, + int free, int dirty, int flags, + int idx_gc_cnt); +void ubifs_release_lprops(struct ubifs_info *c); +void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *stats); +void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, + int cat); +void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops, + struct ubifs_lprops *new_lprops); +void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops); +int ubifs_categorize_lprops(const struct ubifs_info *c, + const struct ubifs_lprops *lprops); +int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, + int flags_set, int flags_clean, int idx_gc_cnt); +int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, + int flags_set, int flags_clean); +int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp); +const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c); +const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c); +const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c); +const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c); + +/* file.c */ +int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync); +int ubifs_setattr(struct dentry *dentry, struct iattr *attr); + +/* dir.c */ +struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, + int mode); +int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); + +/* xattr.c */ +int ubifs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t size); +ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size); +int ubifs_removexattr(struct dentry *dentry, const char *name); + +/* super.c */ +struct inode *ubifs_iget(struct super_block *sb, unsigned long inum); + +/* recovery.c */ +int ubifs_recover_master_node(struct ubifs_info *c); +int ubifs_write_rcvrd_mst_node(struct ubifs_info *c); +struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, + int offs, void *sbuf, int grouped); +struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, + int offs, void *sbuf); +int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf); +int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf); +int ubifs_rcvry_gc_commit(struct ubifs_info *c); +int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key, + int deletion, loff_t new_size); +int ubifs_recover_size(struct ubifs_info *c); +void ubifs_destroy_size_tree(struct ubifs_info *c); + +/* ioctl.c */ +long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +void ubifs_set_inode_flags(struct inode *inode); +#ifdef CONFIG_COMPAT +long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +#endif + +/* compressor.c */ +int __init ubifs_compressors_init(void); +void __exit ubifs_compressors_exit(void); +void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, + int *compr_type); +int ubifs_decompress(const void *buf, int len, void *out, int *out_len, + int compr_type); + +#include "debug.h" +#include "misc.h" +#include "key.h" + +#endif /* !__UBIFS_H__ */ diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c new file mode 100644 index 000000000000..649bec78b645 --- /dev/null +++ b/fs/ubifs/xattr.c @@ -0,0 +1,571 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS extended attributes support. + * + * Extended attributes are implemented as regular inodes with attached data, + * which limits extended attribute size to UBIFS block size (4KiB). Names of + * extended attributes are described by extended attribute entries (xentries), + * which are almost identical to directory entries, but have different key type. + * + * In other words, the situation with extended attributes is very similar to + * directories. Indeed, any inode (but of course not xattr inodes) may have a + * number of associated xentries, just like directory inodes have associated + * directory entries. Extended attribute entries store the name of the extended + * attribute, the host inode number, and the extended attribute inode number. + * Similarly, direntries store the name, the parent and the target inode + * numbers. Thus, most of the common UBIFS mechanisms may be re-used for + * extended attributes. + * + * The number of extended attributes is not limited, but there is Linux + * limitation on the maximum possible size of the list of all extended + * attributes associated with an inode (%XATTR_LIST_MAX), so UBIFS makes sure + * the sum of all extended attribute names of the inode does not exceed that + * limit. + * + * Extended attributes are synchronous, which means they are written to the + * flash media synchronously and there is no write-back for extended attribute + * inodes. The extended attribute values are not stored in compressed form on + * the media. + * + * Since extended attributes are represented by regular inodes, they are cached + * in the VFS inode cache. The xentries are cached in the LNC cache (see + * tnc.c). + * + * ACL support is not implemented. + */ + +#include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> +#include "ubifs.h" + +/* + * Limit the number of extended attributes per inode so that the total size + * (@xattr_size) is guaranteeded to fit in an 'unsigned int'. + */ +#define MAX_XATTRS_PER_INODE 65535 + +/* + * Extended attribute type constants. + * + * USER_XATTR: user extended attribute ("user.*") + * TRUSTED_XATTR: trusted extended attribute ("trusted.*) + * SECURITY_XATTR: security extended attribute ("security.*") + */ +enum { + USER_XATTR, + TRUSTED_XATTR, + SECURITY_XATTR, +}; + +static struct inode_operations none_inode_operations; +static struct address_space_operations none_address_operations; +static struct file_operations none_file_operations; + +/** + * create_xattr - create an extended attribute. + * @c: UBIFS file-system description object + * @host: host inode + * @nm: extended attribute name + * @value: extended attribute value + * @size: size of extended attribute value + * + * This is a helper function which creates an extended attribute of name @nm + * and value @value for inode @host. The host inode is also updated on flash + * because the ctime and extended attribute accounting data changes. This + * function returns zero in case of success and a negative error code in case + * of failure. + */ +static int create_xattr(struct ubifs_info *c, struct inode *host, + const struct qstr *nm, const void *value, int size) +{ + int err; + struct inode *inode; + struct ubifs_inode *ui, *host_ui = ubifs_inode(host); + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .new_ino_d = ALIGN(size, 8), .dirtied_ino = 1, + .dirtied_ino_d = ALIGN(host_ui->data_len, 8) }; + + if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE) + return -ENOSPC; + /* + * Linux limits the maximum size of the extended attribute names list + * to %XATTR_LIST_MAX. This means we should not allow creating more + * extended attributes if the name list becomes larger. This limitation + * is artificial for UBIFS, though. + */ + if (host_ui->xattr_names + host_ui->xattr_cnt + + nm->len + 1 > XATTR_LIST_MAX) + return -ENOSPC; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_budg; + } + + /* Re-define all operations to be "nothing" */ + inode->i_mapping->a_ops = &none_address_operations; + inode->i_op = &none_inode_operations; + inode->i_fop = &none_file_operations; + + inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA; + ui = ubifs_inode(inode); + ui->xattr = 1; + ui->flags |= UBIFS_XATTR_FL; + ui->data = kmalloc(size, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_free; + } + memcpy(ui->data, value, size); + inode->i_size = ui->ui_size = size; + ui->data_len = size; + + mutex_lock(&host_ui->ui_mutex); + host->i_ctime = ubifs_current_time(host); + host_ui->xattr_cnt += 1; + host_ui->xattr_size += CALC_DENT_SIZE(nm->len); + host_ui->xattr_size += CALC_XATTR_BYTES(size); + host_ui->xattr_names += nm->len; + + err = ubifs_jnl_update(c, host, nm, inode, 0, 1); + if (err) + goto out_cancel; + mutex_unlock(&host_ui->ui_mutex); + + ubifs_release_budget(c, &req); + insert_inode_hash(inode); + iput(inode); + return 0; + +out_cancel: + host_ui->xattr_cnt -= 1; + host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); + host_ui->xattr_size -= CALC_XATTR_BYTES(size); + mutex_unlock(&host_ui->ui_mutex); +out_free: + make_bad_inode(inode); + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + return err; +} + +/** + * change_xattr - change an extended attribute. + * @c: UBIFS file-system description object + * @host: host inode + * @inode: extended attribute inode + * @value: extended attribute value + * @size: size of extended attribute value + * + * This helper function changes the value of extended attribute @inode with new + * data from @value. Returns zero in case of success and a negative error code + * in case of failure. + */ +static int change_xattr(struct ubifs_info *c, struct inode *host, + struct inode *inode, const void *value, int size) +{ + int err; + struct ubifs_inode *host_ui = ubifs_inode(host); + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_budget_req req = { .dirtied_ino = 2, + .dirtied_ino_d = ALIGN(size, 8) + ALIGN(host_ui->data_len, 8) }; + + ubifs_assert(ui->data_len == inode->i_size); + err = ubifs_budget_space(c, &req); + if (err) + return err; + + kfree(ui->data); + ui->data = kmalloc(size, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_free; + } + memcpy(ui->data, value, size); + inode->i_size = ui->ui_size = size; + ui->data_len = size; + + mutex_lock(&host_ui->ui_mutex); + host->i_ctime = ubifs_current_time(host); + host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); + host_ui->xattr_size += CALC_XATTR_BYTES(size); + + /* + * It is important to write the host inode after the xattr inode + * because if the host inode gets synchronized (via 'fsync()'), then + * the extended attribute inode gets synchronized, because it goes + * before the host inode in the write-buffer. + */ + err = ubifs_jnl_change_xattr(c, inode, host); + if (err) + goto out_cancel; + mutex_unlock(&host_ui->ui_mutex); + + ubifs_release_budget(c, &req); + return 0; + +out_cancel: + host_ui->xattr_size -= CALC_XATTR_BYTES(size); + host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); + mutex_unlock(&host_ui->ui_mutex); + make_bad_inode(inode); +out_free: + ubifs_release_budget(c, &req); + return err; +} + +/** + * check_namespace - check extended attribute name-space. + * @nm: extended attribute name + * + * This function makes sure the extended attribute name belongs to one of the + * supported extended attribute name-spaces. Returns name-space index in case + * of success and a negative error code in case of failure. + */ +static int check_namespace(const struct qstr *nm) +{ + int type; + + if (nm->len > UBIFS_MAX_NLEN) + return -ENAMETOOLONG; + + if (!strncmp(nm->name, XATTR_TRUSTED_PREFIX, + XATTR_TRUSTED_PREFIX_LEN)) { + if (nm->name[sizeof(XATTR_TRUSTED_PREFIX) - 1] == '\0') + return -EINVAL; + type = TRUSTED_XATTR; + } else if (!strncmp(nm->name, XATTR_USER_PREFIX, + XATTR_USER_PREFIX_LEN)) { + if (nm->name[XATTR_USER_PREFIX_LEN] == '\0') + return -EINVAL; + type = USER_XATTR; + } else if (!strncmp(nm->name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN)) { + if (nm->name[sizeof(XATTR_SECURITY_PREFIX) - 1] == '\0') + return -EINVAL; + type = SECURITY_XATTR; + } else + return -EOPNOTSUPP; + + return type; +} + +static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum) +{ + struct inode *inode; + + inode = ubifs_iget(c->vfs_sb, inum); + if (IS_ERR(inode)) { + ubifs_err("dead extended attribute entry, error %d", + (int)PTR_ERR(inode)); + return inode; + } + if (ubifs_inode(inode)->xattr) + return inode; + ubifs_err("corrupt extended attribute entry"); + iput(inode); + return ERR_PTR(-EINVAL); +} + +int ubifs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode, *host = dentry->d_inode; + struct ubifs_info *c = host->i_sb->s_fs_info; + struct qstr nm = { .name = name, .len = strlen(name) }; + struct ubifs_dent_node *xent; + union ubifs_key key; + int err, type; + + dbg_gen("xattr '%s', host ino %lu ('%.*s'), size %zd", name, + host->i_ino, dentry->d_name.len, dentry->d_name.name, size); + ubifs_assert(mutex_is_locked(&host->i_mutex)); + + if (size > UBIFS_MAX_INO_DATA) + return -ERANGE; + + type = check_namespace(&nm); + if (type < 0) + return type; + + xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); + if (!xent) + return -ENOMEM; + + /* + * The extended attribute entries are stored in LNC, so multiple + * look-ups do not involve reading the flash. + */ + xent_key_init(c, &key, host->i_ino, &nm); + err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); + if (err) { + if (err != -ENOENT) + goto out_free; + + if (flags & XATTR_REPLACE) + /* We are asked not to create the xattr */ + err = -ENODATA; + else + err = create_xattr(c, host, &nm, value, size); + goto out_free; + } + + if (flags & XATTR_CREATE) { + /* We are asked not to replace the xattr */ + err = -EEXIST; + goto out_free; + } + + inode = iget_xattr(c, le64_to_cpu(xent->inum)); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_free; + } + + err = change_xattr(c, host, inode, value, size); + iput(inode); + +out_free: + kfree(xent); + return err; +} + +ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t size) +{ + struct inode *inode, *host = dentry->d_inode; + struct ubifs_info *c = host->i_sb->s_fs_info; + struct qstr nm = { .name = name, .len = strlen(name) }; + struct ubifs_inode *ui; + struct ubifs_dent_node *xent; + union ubifs_key key; + int err; + + dbg_gen("xattr '%s', ino %lu ('%.*s'), buf size %zd", name, + host->i_ino, dentry->d_name.len, dentry->d_name.name, size); + + err = check_namespace(&nm); + if (err < 0) + return err; + + xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); + if (!xent) + return -ENOMEM; + + xent_key_init(c, &key, host->i_ino, &nm); + err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); + if (err) { + if (err == -ENOENT) + err = -ENODATA; + goto out_unlock; + } + + inode = iget_xattr(c, le64_to_cpu(xent->inum)); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_unlock; + } + + ui = ubifs_inode(inode); + ubifs_assert(inode->i_size == ui->data_len); + ubifs_assert(ubifs_inode(host)->xattr_size > ui->data_len); + + if (buf) { + /* If @buf is %NULL we are supposed to return the length */ + if (ui->data_len > size) { + dbg_err("buffer size %zd, xattr len %d", + size, ui->data_len); + err = -ERANGE; + goto out_iput; + } + + memcpy(buf, ui->data, ui->data_len); + } + err = ui->data_len; + +out_iput: + iput(inode); +out_unlock: + kfree(xent); + return err; +} + +ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + union ubifs_key key; + struct inode *host = dentry->d_inode; + struct ubifs_info *c = host->i_sb->s_fs_info; + struct ubifs_inode *host_ui = ubifs_inode(host); + struct ubifs_dent_node *xent, *pxent = NULL; + int err, len, written = 0; + struct qstr nm = { .name = NULL }; + + dbg_gen("ino %lu ('%.*s'), buffer size %zd", host->i_ino, + dentry->d_name.len, dentry->d_name.name, size); + + len = host_ui->xattr_names + host_ui->xattr_cnt; + if (!buffer) + /* + * We should return the minimum buffer size which will fit a + * null-terminated list of all the extended attribute names. + */ + return len; + + if (len > size) + return -ERANGE; + + lowest_xent_key(c, &key, host->i_ino); + while (1) { + int type; + + xent = ubifs_tnc_next_ent(c, &key, &nm); + if (unlikely(IS_ERR(xent))) { + err = PTR_ERR(xent); + break; + } + + nm.name = xent->name; + nm.len = le16_to_cpu(xent->nlen); + + type = check_namespace(&nm); + if (unlikely(type < 0)) { + err = type; + break; + } + + /* Show trusted namespace only for "power" users */ + if (type != TRUSTED_XATTR || capable(CAP_SYS_ADMIN)) { + memcpy(buffer + written, nm.name, nm.len + 1); + written += nm.len + 1; + } + + kfree(pxent); + pxent = xent; + key_read(c, &xent->key, &key); + } + + kfree(pxent); + if (err != -ENOENT) { + ubifs_err("cannot find next direntry, error %d", err); + return err; + } + + ubifs_assert(written <= size); + return written; +} + +static int remove_xattr(struct ubifs_info *c, struct inode *host, + struct inode *inode, const struct qstr *nm) +{ + int err; + struct ubifs_inode *host_ui = ubifs_inode(host); + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_budget_req req = { .dirtied_ino = 2, .mod_dent = 1, + .dirtied_ino_d = ALIGN(host_ui->data_len, 8) }; + + ubifs_assert(ui->data_len == inode->i_size); + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + mutex_lock(&host_ui->ui_mutex); + host->i_ctime = ubifs_current_time(host); + host_ui->xattr_cnt -= 1; + host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); + host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); + host_ui->xattr_names -= nm->len; + + err = ubifs_jnl_delete_xattr(c, host, inode, nm); + if (err) + goto out_cancel; + mutex_unlock(&host_ui->ui_mutex); + + ubifs_release_budget(c, &req); + return 0; + +out_cancel: + host_ui->xattr_cnt += 1; + host_ui->xattr_size += CALC_DENT_SIZE(nm->len); + host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); + mutex_unlock(&host_ui->ui_mutex); + ubifs_release_budget(c, &req); + make_bad_inode(inode); + return err; +} + +int ubifs_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode, *host = dentry->d_inode; + struct ubifs_info *c = host->i_sb->s_fs_info; + struct qstr nm = { .name = name, .len = strlen(name) }; + struct ubifs_dent_node *xent; + union ubifs_key key; + int err; + + dbg_gen("xattr '%s', ino %lu ('%.*s')", name, + host->i_ino, dentry->d_name.len, dentry->d_name.name); + ubifs_assert(mutex_is_locked(&host->i_mutex)); + + err = check_namespace(&nm); + if (err < 0) + return err; + + xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); + if (!xent) + return -ENOMEM; + + xent_key_init(c, &key, host->i_ino, &nm); + err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); + if (err) { + if (err == -ENOENT) + err = -ENODATA; + goto out_free; + } + + inode = iget_xattr(c, le64_to_cpu(xent->inum)); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_free; + } + + ubifs_assert(inode->i_nlink == 1); + inode->i_nlink = 0; + err = remove_xattr(c, host, inode, &nm); + if (err) + inode->i_nlink = 1; + + /* If @i_nlink is 0, 'iput()' will delete the inode */ + iput(inode); + +out_free: + kfree(xent); + return err; +} diff --git a/fs/udf/file.c b/fs/udf/file.c index 0ed6e146a0d9..eb91f3b70320 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -211,6 +211,7 @@ const struct file_operations udf_file_operations = { .release = udf_release_file, .fsync = udf_fsync_file, .splice_read = generic_file_splice_read, + .llseek = generic_file_llseek, }; const struct inode_operations udf_file_inode_operations = { diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index eb9cfa23dc3d..a4f2b3ce45b0 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -76,11 +76,24 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) *err = -ENOSPC; iinfo = UDF_I(inode); - iinfo->i_unique = 0; - iinfo->i_lenExtents = 0; - iinfo->i_next_alloc_block = 0; - iinfo->i_next_alloc_goal = 0; - iinfo->i_strat4096 = 0; + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) { + iinfo->i_efe = 1; + if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev) + sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE; + iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - + sizeof(struct extendedFileEntry), + GFP_KERNEL); + } else { + iinfo->i_efe = 0; + iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - + sizeof(struct fileEntry), + GFP_KERNEL); + } + if (!iinfo->i_ext.i_data) { + iput(inode); + *err = -ENOMEM; + return NULL; + } block = udf_new_block(dir->i_sb, NULL, dinfo->i_location.partitionReferenceNum, @@ -111,6 +124,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) lvhd->uniqueID = cpu_to_le64(uniqueID); mark_buffer_dirty(sbi->s_lvid_bh); } + mutex_unlock(&sbi->s_alloc_mutex); inode->i_mode = mode; inode->i_uid = current->fsuid; if (dir->i_mode & S_ISGID) { @@ -129,25 +143,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) iinfo->i_lenEAttr = 0; iinfo->i_lenAlloc = 0; iinfo->i_use = 0; - if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) { - iinfo->i_efe = 1; - if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev) - sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE; - iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - - sizeof(struct extendedFileEntry), - GFP_KERNEL); - } else { - iinfo->i_efe = 0; - iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - - sizeof(struct fileEntry), - GFP_KERNEL); - } - if (!iinfo->i_ext.i_data) { - iput(inode); - *err = -ENOMEM; - mutex_unlock(&sbi->s_alloc_mutex); - return NULL; - } if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) @@ -158,7 +153,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) iinfo->i_crtime = current_fs_time(inode->i_sb); insert_inode_hash(inode); mark_inode_dirty(inode); - mutex_unlock(&sbi->s_alloc_mutex); if (DQUOT_ALLOC_INODE(inode)) { DQUOT_DROP(inode); diff --git a/fs/udf/super.c b/fs/udf/super.c index 7a5f69be6ac2..e25e7010627b 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -148,7 +148,7 @@ static void udf_destroy_inode(struct inode *inode) kmem_cache_free(udf_inode_cachep, UDF_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct udf_inode_info *ei = (struct udf_inode_info *)foo; @@ -369,7 +369,7 @@ enum { Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_novrs, "novrs"}, {Opt_nostrict, "nostrict"}, {Opt_bs, "bs=%u"}, @@ -682,38 +682,26 @@ static int udf_vrs(struct super_block *sb, int silent) /* * Check whether there is an anchor block in the given block */ -static int udf_check_anchor_block(struct super_block *sb, sector_t block, - bool varconv) +static int udf_check_anchor_block(struct super_block *sb, sector_t block) { - struct buffer_head *bh = NULL; - tag *t; + struct buffer_head *bh; uint16_t ident; - uint32_t location; - if (varconv) { - if (udf_fixed_to_variable(block) >= - sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits) - return 0; - bh = sb_bread(sb, udf_fixed_to_variable(block)); - } - else - bh = sb_bread(sb, block); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) && + udf_fixed_to_variable(block) >= + sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits) + return 0; + bh = udf_read_tagged(sb, block, block, &ident); if (!bh) return 0; - - t = (tag *)bh->b_data; - ident = le16_to_cpu(t->tagIdent); - location = le32_to_cpu(t->tagLocation); brelse(bh); - if (ident != TAG_IDENT_AVDP) - return 0; - return location == block; + + return ident == TAG_IDENT_AVDP; } /* Search for an anchor volume descriptor pointer */ -static sector_t udf_scan_anchors(struct super_block *sb, bool varconv, - sector_t lastblock) +static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock) { sector_t last[6]; int i; @@ -739,7 +727,7 @@ static sector_t udf_scan_anchors(struct super_block *sb, bool varconv, sb->s_blocksize_bits) continue; - if (udf_check_anchor_block(sb, last[i], varconv)) { + if (udf_check_anchor_block(sb, last[i])) { sbi->s_anchor[0] = last[i]; sbi->s_anchor[1] = last[i] - 256; return last[i]; @@ -748,17 +736,17 @@ static sector_t udf_scan_anchors(struct super_block *sb, bool varconv, if (last[i] < 256) continue; - if (udf_check_anchor_block(sb, last[i] - 256, varconv)) { + if (udf_check_anchor_block(sb, last[i] - 256)) { sbi->s_anchor[1] = last[i] - 256; return last[i]; } } - if (udf_check_anchor_block(sb, sbi->s_session + 256, varconv)) { + if (udf_check_anchor_block(sb, sbi->s_session + 256)) { sbi->s_anchor[0] = sbi->s_session + 256; return last[0]; } - if (udf_check_anchor_block(sb, sbi->s_session + 512, varconv)) { + if (udf_check_anchor_block(sb, sbi->s_session + 512)) { sbi->s_anchor[0] = sbi->s_session + 512; return last[0]; } @@ -780,23 +768,24 @@ static void udf_find_anchor(struct super_block *sb) int i; struct udf_sb_info *sbi = UDF_SB(sb); - lastblock = udf_scan_anchors(sb, 0, sbi->s_last_block); + lastblock = udf_scan_anchors(sb, sbi->s_last_block); if (lastblock) goto check_anchor; /* No anchor found? Try VARCONV conversion of block numbers */ + UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); /* Firstly, we try to not convert number of the last block */ - lastblock = udf_scan_anchors(sb, 1, + lastblock = udf_scan_anchors(sb, udf_variable_to_fixed(sbi->s_last_block)); - if (lastblock) { - UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); + if (lastblock) goto check_anchor; - } /* Secondly, we try with converted number of the last block */ - lastblock = udf_scan_anchors(sb, 1, sbi->s_last_block); - if (lastblock) - UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); + lastblock = udf_scan_anchors(sb, sbi->s_last_block); + if (!lastblock) { + /* VARCONV didn't help. Clear it. */ + UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV); + } check_anchor: /* diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 85b22b5977fa..e65212dfb60e 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -76,6 +76,7 @@ #include <linux/errno.h> #include <linux/fs.h> +#include <linux/quotaops.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/stat.h> @@ -308,7 +309,7 @@ enum { Opt_err }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_type_old, "ufstype=old"}, {Opt_type_sunx86, "ufstype=sunx86"}, {Opt_type_sun, "ufstype=sun"}, @@ -1232,7 +1233,7 @@ static int ufs_show_options(struct seq_file *seq, struct vfsmount *vfs) { struct ufs_sb_info *sbi = UFS_SB(vfs->mnt_sb); unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; - struct match_token *tp = tokens; + const struct match_token *tp = tokens; while (tp->token != Opt_onerror_panic && tp->token != mval) ++tp; @@ -1301,7 +1302,7 @@ static void ufs_destroy_inode(struct inode *inode) kmem_cache_free(ufs_inode_cachep, UFS_I(inode)); } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct ufs_inode_info *ei = (struct ufs_inode_info *) foo; diff --git a/fs/utimes.c b/fs/utimes.c index af059d5cb485..6929e3e91d05 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -40,75 +40,30 @@ asmlinkage long sys_utime(char __user *filename, struct utimbuf __user *times) #endif -static bool nsec_special(long nsec) -{ - return nsec == UTIME_OMIT || nsec == UTIME_NOW; -} - static bool nsec_valid(long nsec) { - if (nsec_special(nsec)) + if (nsec == UTIME_OMIT || nsec == UTIME_NOW) return true; return nsec >= 0 && nsec <= 999999999; } -/* If times==NULL, set access and modification to current time, - * must be owner or have write permission. - * Else, update from *times, must be owner or super user. - */ -long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags) +static int utimes_common(struct path *path, struct timespec *times) { int error; - struct nameidata nd; - struct dentry *dentry; - struct inode *inode; struct iattr newattrs; - struct file *f = NULL; - struct vfsmount *mnt; - - error = -EINVAL; - if (times && (!nsec_valid(times[0].tv_nsec) || - !nsec_valid(times[1].tv_nsec))) { - goto out; - } + struct inode *inode = path->dentry->d_inode; - if (flags & ~AT_SYMLINK_NOFOLLOW) + error = mnt_want_write(path->mnt); + if (error) goto out; - if (filename == NULL && dfd != AT_FDCWD) { - error = -EINVAL; - if (flags & AT_SYMLINK_NOFOLLOW) - goto out; - - error = -EBADF; - f = fget(dfd); - if (!f) - goto out; - dentry = f->f_path.dentry; - mnt = f->f_path.mnt; - } else { - error = __user_walk_fd(dfd, filename, (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW, &nd); - if (error) - goto out; - - dentry = nd.path.dentry; - mnt = nd.path.mnt; - } - - inode = dentry->d_inode; + if (times && times[0].tv_nsec == UTIME_NOW && + times[1].tv_nsec == UTIME_NOW) + times = NULL; - error = mnt_want_write(mnt); - if (error) - goto dput_and_out; - - /* Don't worry, the checks are done in inode_change_ok() */ newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME; if (times) { - error = -EPERM; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - goto mnt_drop_write_and_out; - if (times[0].tv_nsec == UTIME_OMIT) newattrs.ia_valid &= ~ATTR_ATIME; else if (times[0].tv_nsec != UTIME_NOW) { @@ -124,40 +79,93 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags newattrs.ia_mtime.tv_nsec = times[1].tv_nsec; newattrs.ia_valid |= ATTR_MTIME_SET; } - } - - /* - * If times is NULL or both times are either UTIME_OMIT or - * UTIME_NOW, then need to check permissions, because - * inode_change_ok() won't do it. - */ - if (!times || (nsec_special(times[0].tv_nsec) && - nsec_special(times[1].tv_nsec))) { + /* + * Tell inode_change_ok(), that this is an explicit time + * update, even if neither ATTR_ATIME_SET nor ATTR_MTIME_SET + * were used. + */ + newattrs.ia_valid |= ATTR_TIMES_SET; + } else { + /* + * If times is NULL (or both times are UTIME_NOW), + * then we need to check permissions, because + * inode_change_ok() won't do it. + */ error = -EACCES; if (IS_IMMUTABLE(inode)) goto mnt_drop_write_and_out; if (!is_owner_or_cap(inode)) { - if (f) { - if (!(f->f_mode & FMODE_WRITE)) - goto mnt_drop_write_and_out; - } else { - error = vfs_permission(&nd, MAY_WRITE); - if (error) - goto mnt_drop_write_and_out; - } + error = inode_permission(inode, MAY_WRITE); + if (error) + goto mnt_drop_write_and_out; } } mutex_lock(&inode->i_mutex); - error = notify_change(dentry, &newattrs); + error = notify_change(path->dentry, &newattrs); mutex_unlock(&inode->i_mutex); + mnt_drop_write_and_out: - mnt_drop_write(mnt); -dput_and_out: - if (f) - fput(f); - else - path_put(&nd.path); + mnt_drop_write(path->mnt); +out: + return error; +} + +/* + * do_utimes - change times on filename or file descriptor + * @dfd: open file descriptor, -1 or AT_FDCWD + * @filename: path name or NULL + * @times: new times or NULL + * @flags: zero or more flags (only AT_SYMLINK_NOFOLLOW for the moment) + * + * If filename is NULL and dfd refers to an open file, then operate on + * the file. Otherwise look up filename, possibly using dfd as a + * starting point. + * + * If times==NULL, set access and modification to current time, + * must be owner or have write permission. + * Else, update from *times, must be owner or super user. + */ +long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags) +{ + int error = -EINVAL; + + if (times && (!nsec_valid(times[0].tv_nsec) || + !nsec_valid(times[1].tv_nsec))) { + goto out; + } + + if (flags & ~AT_SYMLINK_NOFOLLOW) + goto out; + + if (filename == NULL && dfd != AT_FDCWD) { + struct file *file; + + if (flags & AT_SYMLINK_NOFOLLOW) + goto out; + + file = fget(dfd); + error = -EBADF; + if (!file) + goto out; + + error = utimes_common(&file->f_path, times); + fput(file); + } else { + struct path path; + int lookup_flags = 0; + + if (!(flags & AT_SYMLINK_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + + error = user_path_at(dfd, filename, lookup_flags, &path); + if (error) + goto out; + + error = utimes_common(&path, times); + path_put(&path); + } + out: return error; } @@ -169,14 +177,6 @@ asmlinkage long sys_utimensat(int dfd, char __user *filename, struct timespec __ if (utimes) { if (copy_from_user(&tstimes, utimes, sizeof(tstimes))) return -EFAULT; - if ((tstimes[0].tv_nsec == UTIME_OMIT || - tstimes[0].tv_nsec == UTIME_NOW) && - tstimes[0].tv_sec != 0) - return -EINVAL; - if ((tstimes[1].tv_nsec == UTIME_OMIT || - tstimes[1].tv_nsec == UTIME_NOW) && - tstimes[1].tv_sec != 0) - return -EINVAL; /* Nothing to do, we must not even check the path. */ if (tstimes[0].tv_nsec == UTIME_OMIT && diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c index a3522727ea5b..155c10b4adbd 100644 --- a/fs/vfat/namei.c +++ b/fs/vfat/namei.c @@ -621,7 +621,7 @@ shortname: memcpy(de->name, msdos_name, MSDOS_NAME); de->attr = is_dir ? ATTR_DIR : ATTR_ARCH; de->lcase = lcase; - fat_date_unix2dos(ts->tv_sec, &time, &date); + fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc); de->time = de->ctime = time; de->date = de->cdate = de->adate = date; de->ctime_cs = 0; @@ -645,7 +645,7 @@ static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir, if (len == 0) return -ENOENT; - slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_KERNEL); + slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_NOFS); if (slots == NULL) return -ENOMEM; @@ -687,7 +687,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, struct dentry *alias; int err, table; - lock_kernel(); + lock_super(sb); table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0; dentry->d_op = &vfat_dentry_ops[table]; @@ -699,7 +699,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); brelse(sinfo.bh); if (IS_ERR(inode)) { - unlock_kernel(); + unlock_super(sb); return ERR_CAST(inode); } alias = d_find_alias(inode); @@ -708,13 +708,13 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, dput(alias); else { iput(inode); - unlock_kernel(); + unlock_super(sb); return alias; } } error: - unlock_kernel(); + unlock_super(sb); dentry->d_op = &vfat_dentry_ops[table]; dentry->d_time = dentry->d_parent->d_inode->i_version; dentry = d_splice_alias(inode, dentry); @@ -734,7 +734,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode, struct timespec ts; int err; - lock_kernel(); + lock_super(sb); ts = CURRENT_TIME_SEC; err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo); @@ -755,17 +755,18 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode, dentry->d_time = dentry->d_parent->d_inode->i_version; d_instantiate(dentry, inode); out: - unlock_kernel(); + unlock_super(sb); return err; } static int vfat_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = dentry->d_inode; + struct super_block *sb = dir->i_sb; struct fat_slot_info sinfo; int err; - lock_kernel(); + lock_super(sb); err = fat_dir_empty(inode); if (err) @@ -783,7 +784,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry) inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; fat_detach(inode); out: - unlock_kernel(); + unlock_super(sb); return err; } @@ -791,10 +792,11 @@ out: static int vfat_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = dentry->d_inode; + struct super_block *sb = dir->i_sb; struct fat_slot_info sinfo; int err; - lock_kernel(); + lock_super(sb); err = vfat_find(dir, &dentry->d_name, &sinfo); if (err) @@ -807,7 +809,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry) inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; fat_detach(inode); out: - unlock_kernel(); + unlock_super(sb); return err; } @@ -820,7 +822,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct timespec ts; int err, cluster; - lock_kernel(); + lock_super(sb); ts = CURRENT_TIME_SEC; cluster = fat_alloc_new_dir(dir, &ts); @@ -849,13 +851,13 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode) dentry->d_time = dentry->d_parent->d_inode->i_version; d_instantiate(dentry, inode); - unlock_kernel(); + unlock_super(sb); return 0; out_free: fat_free_clusters(dir, cluster); out: - unlock_kernel(); + unlock_super(sb); return err; } @@ -869,11 +871,12 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, struct timespec ts; loff_t dotdot_i_pos, new_i_pos; int err, is_dir, update_dotdot, corrupt = 0; + struct super_block *sb = old_dir->i_sb; old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; old_inode = old_dentry->d_inode; new_inode = new_dentry->d_inode; - lock_kernel(); + lock_super(sb); err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo); if (err) goto out; @@ -951,7 +954,7 @@ out: brelse(sinfo.bh); brelse(dotdot_bh); brelse(old_sinfo.bh); - unlock_kernel(); + unlock_super(sb); return err; diff --git a/fs/xattr.c b/fs/xattr.c index 4706a8b1f495..468377e66531 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -63,7 +63,7 @@ xattr_permission(struct inode *inode, const char *name, int mask) return -EPERM; } - return permission(inode, mask, NULL); + return inode_permission(inode, mask); } int @@ -252,40 +252,40 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value, } asmlinkage long -sys_setxattr(const char __user *path, const char __user *name, +sys_setxattr(const char __user *pathname, const char __user *name, const void __user *value, size_t size, int flags) { - struct nameidata nd; + struct path path; int error; - error = user_path_walk(path, &nd); + error = user_path(pathname, &path); if (error) return error; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (!error) { - error = setxattr(nd.path.dentry, name, value, size, flags); - mnt_drop_write(nd.path.mnt); + error = setxattr(path.dentry, name, value, size, flags); + mnt_drop_write(path.mnt); } - path_put(&nd.path); + path_put(&path); return error; } asmlinkage long -sys_lsetxattr(const char __user *path, const char __user *name, +sys_lsetxattr(const char __user *pathname, const char __user *name, const void __user *value, size_t size, int flags) { - struct nameidata nd; + struct path path; int error; - error = user_path_walk_link(path, &nd); + error = user_lpath(pathname, &path); if (error) return error; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (!error) { - error = setxattr(nd.path.dentry, name, value, size, flags); - mnt_drop_write(nd.path.mnt); + error = setxattr(path.dentry, name, value, size, flags); + mnt_drop_write(path.mnt); } - path_put(&nd.path); + path_put(&path); return error; } @@ -350,32 +350,32 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, } asmlinkage ssize_t -sys_getxattr(const char __user *path, const char __user *name, +sys_getxattr(const char __user *pathname, const char __user *name, void __user *value, size_t size) { - struct nameidata nd; + struct path path; ssize_t error; - error = user_path_walk(path, &nd); + error = user_path(pathname, &path); if (error) return error; - error = getxattr(nd.path.dentry, name, value, size); - path_put(&nd.path); + error = getxattr(path.dentry, name, value, size); + path_put(&path); return error; } asmlinkage ssize_t -sys_lgetxattr(const char __user *path, const char __user *name, void __user *value, +sys_lgetxattr(const char __user *pathname, const char __user *name, void __user *value, size_t size) { - struct nameidata nd; + struct path path; ssize_t error; - error = user_path_walk_link(path, &nd); + error = user_lpath(pathname, &path); if (error) return error; - error = getxattr(nd.path.dentry, name, value, size); - path_put(&nd.path); + error = getxattr(path.dentry, name, value, size); + path_put(&path); return error; } @@ -425,30 +425,30 @@ listxattr(struct dentry *d, char __user *list, size_t size) } asmlinkage ssize_t -sys_listxattr(const char __user *path, char __user *list, size_t size) +sys_listxattr(const char __user *pathname, char __user *list, size_t size) { - struct nameidata nd; + struct path path; ssize_t error; - error = user_path_walk(path, &nd); + error = user_path(pathname, &path); if (error) return error; - error = listxattr(nd.path.dentry, list, size); - path_put(&nd.path); + error = listxattr(path.dentry, list, size); + path_put(&path); return error; } asmlinkage ssize_t -sys_llistxattr(const char __user *path, char __user *list, size_t size) +sys_llistxattr(const char __user *pathname, char __user *list, size_t size) { - struct nameidata nd; + struct path path; ssize_t error; - error = user_path_walk_link(path, &nd); + error = user_lpath(pathname, &path); if (error) return error; - error = listxattr(nd.path.dentry, list, size); - path_put(&nd.path); + error = listxattr(path.dentry, list, size); + path_put(&path); return error; } @@ -486,38 +486,38 @@ removexattr(struct dentry *d, const char __user *name) } asmlinkage long -sys_removexattr(const char __user *path, const char __user *name) +sys_removexattr(const char __user *pathname, const char __user *name) { - struct nameidata nd; + struct path path; int error; - error = user_path_walk(path, &nd); + error = user_path(pathname, &path); if (error) return error; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (!error) { - error = removexattr(nd.path.dentry, name); - mnt_drop_write(nd.path.mnt); + error = removexattr(path.dentry, name); + mnt_drop_write(path.mnt); } - path_put(&nd.path); + path_put(&path); return error; } asmlinkage long -sys_lremovexattr(const char __user *path, const char __user *name) +sys_lremovexattr(const char __user *pathname, const char __user *name) { - struct nameidata nd; + struct path path; int error; - error = user_path_walk_link(path, &nd); + error = user_lpath(pathname, &path); if (error) return error; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (!error) { - error = removexattr(nd.path.dentry, name); - mnt_drop_write(nd.path.mnt); + error = removexattr(path.dentry, name); + mnt_drop_write(path.mnt); } - path_put(&nd.path); + path_put(&path); return error; } diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 36ec614e699a..737c9a425361 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -106,7 +106,8 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \ xfs_iops.o \ xfs_lrw.o \ xfs_super.o \ - xfs_vnode.o) + xfs_vnode.o \ + xfs_xattr.o) # Objects in support/ xfs-y += $(addprefix support/, \ diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c index 9b1bb17a0501..1cd3b55ee3d2 100644 --- a/fs/xfs/linux-2.6/kmem.c +++ b/fs/xfs/linux-2.6/kmem.c @@ -90,7 +90,7 @@ kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize, } void -kmem_free(void *ptr, size_t size) +kmem_free(const void *ptr) { if (!is_vmalloc_addr(ptr)) { kfree(ptr); @@ -100,7 +100,7 @@ kmem_free(void *ptr, size_t size) } void * -kmem_realloc(void *ptr, size_t newsize, size_t oldsize, +kmem_realloc(const void *ptr, size_t newsize, size_t oldsize, unsigned int __nocast flags) { void *new; @@ -110,7 +110,7 @@ kmem_realloc(void *ptr, size_t newsize, size_t oldsize, if (new) memcpy(new, ptr, ((oldsize < newsize) ? oldsize : newsize)); - kmem_free(ptr, oldsize); + kmem_free(ptr); } return new; } diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h index 5e9564902976..af6843c7ee4b 100644 --- a/fs/xfs/linux-2.6/kmem.h +++ b/fs/xfs/linux-2.6/kmem.h @@ -57,8 +57,8 @@ kmem_flags_convert(unsigned int __nocast flags) extern void *kmem_alloc(size_t, unsigned int __nocast); extern void *kmem_zalloc(size_t, unsigned int __nocast); extern void *kmem_zalloc_greedy(size_t *, size_t, size_t, unsigned int __nocast); -extern void *kmem_realloc(void *, size_t, size_t, unsigned int __nocast); -extern void kmem_free(void *, size_t); +extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast); +extern void kmem_free(const void *); /* * Zone interfaces @@ -79,7 +79,7 @@ kmem_zone_init(int size, char *zone_name) static inline kmem_zone_t * kmem_zone_init_flags(int size, char *zone_name, unsigned long flags, - void (*construct)(kmem_zone_t *, void *)) + void (*construct)(void *)) { return kmem_cache_create(zone_name, size, 0, flags, construct); } diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h deleted file mode 100644 index 3abe7e9ceb33..000000000000 --- a/fs/xfs/linux-2.6/sema.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_SUPPORT_SEMA_H__ -#define __XFS_SUPPORT_SEMA_H__ - -#include <linux/time.h> -#include <linux/wait.h> -#include <linux/semaphore.h> -#include <asm/atomic.h> - -/* - * sema_t structure just maps to struct semaphore in Linux kernel. - */ - -typedef struct semaphore sema_t; - -#define initnsema(sp, val, name) sema_init(sp, val) -#define psema(sp, b) down(sp) -#define vsema(sp) up(sp) -#define freesema(sema) do { } while (0) - -static inline int issemalocked(sema_t *sp) -{ - return down_trylock(sp) || (up(sp), 0); -} - -/* - * Map cpsema (try to get the sema) to down_trylock. We need to switch - * the return values since cpsema returns 1 (acquired) 0 (failed) and - * down_trylock returns the reverse 0 (acquired) 1 (failed). - */ -static inline int cpsema(sema_t *sp) -{ - return down_trylock(sp) ? 0 : 1; -} - -#endif /* __XFS_SUPPORT_SEMA_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index a55c3b26d840..a44d68eb50b5 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -73,7 +73,6 @@ xfs_page_trace( unsigned long pgoff) { xfs_inode_t *ip; - bhv_vnode_t *vp = vn_from_inode(inode); loff_t isize = i_size_read(inode); loff_t offset = page_offset(page); int delalloc = -1, unmapped = -1, unwritten = -1; @@ -81,7 +80,7 @@ xfs_page_trace( if (page_has_buffers(page)) xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); - ip = xfs_vtoi(vp); + ip = XFS_I(inode); if (!ip->i_rwtrace) return; @@ -409,7 +408,6 @@ xfs_start_buffer_writeback( STATIC void xfs_start_page_writeback( struct page *page, - struct writeback_control *wbc, int clear_dirty, int buffers) { @@ -676,7 +674,7 @@ xfs_probe_cluster( } else pg_offset = PAGE_CACHE_SIZE; - if (page->index == tindex && !TestSetPageLocked(page)) { + if (page->index == tindex && trylock_page(page)) { pg_len = xfs_probe_page(page, pg_offset, mapped); unlock_page(page); } @@ -760,7 +758,7 @@ xfs_convert_page( if (page->index != tindex) goto fail; - if (TestSetPageLocked(page)) + if (!trylock_page(page)) goto fail; if (PageWriteback(page)) goto fail_unlock_page; @@ -858,7 +856,7 @@ xfs_convert_page( done = 1; } } - xfs_start_page_writeback(page, wbc, !page_dirty, count); + xfs_start_page_writeback(page, !page_dirty, count); } return done; @@ -1105,7 +1103,7 @@ xfs_page_state_convert( * that we are writing into for the first time. */ type = IOMAP_NEW; - if (!test_and_set_bit(BH_Lock, &bh->b_state)) { + if (trylock_buffer(bh)) { ASSERT(buffer_mapped(bh)); if (iomap_valid) all_bh = 1; @@ -1130,7 +1128,7 @@ xfs_page_state_convert( SetPageUptodate(page); if (startio) - xfs_start_page_writeback(page, wbc, 1, count); + xfs_start_page_writeback(page, 1, count); if (ioend && iomap_valid) { offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >> @@ -1340,6 +1338,10 @@ __xfs_get_blocks( offset = (xfs_off_t)iblock << inode->i_blkbits; ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); size = bh_result->b_size; + + if (!create && direct && offset >= i_size_read(inode)) + return 0; + error = xfs_iomap(XFS_I(inode), offset, size, create ? flags : BMAPI_READ, &iomap, &niomap); if (error) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 98e0e86093b4..36d5fcd3f593 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -58,7 +58,7 @@ xfs_buf_trace( bp, id, (void *)(unsigned long)bp->b_flags, (void *)(unsigned long)bp->b_hold.counter, - (void *)(unsigned long)bp->b_sema.count.counter, + (void *)(unsigned long)bp->b_sema.count, (void *)current, data, ra, (void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff), @@ -253,7 +253,7 @@ _xfs_buf_initialize( memset(bp, 0, sizeof(xfs_buf_t)); atomic_set(&bp->b_hold, 1); - init_MUTEX_LOCKED(&bp->b_iodonesema); + init_completion(&bp->b_iowait); INIT_LIST_HEAD(&bp->b_list); INIT_LIST_HEAD(&bp->b_hash_list); init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ @@ -310,8 +310,7 @@ _xfs_buf_free_pages( xfs_buf_t *bp) { if (bp->b_pages != bp->b_page_array) { - kmem_free(bp->b_pages, - bp->b_page_count * sizeof(struct page *)); + kmem_free(bp->b_pages); } } @@ -839,6 +838,7 @@ xfs_buf_rele( return; } + ASSERT(atomic_read(&bp->b_hold) > 0); if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { if (bp->b_relse) { atomic_inc(&bp->b_hold); @@ -852,11 +852,6 @@ xfs_buf_rele( spin_unlock(&hash->bh_lock); xfs_buf_free(bp); } - } else { - /* - * Catch reference count leaks - */ - ASSERT(atomic_read(&bp->b_hold) >= 0); } } @@ -1006,12 +1001,13 @@ xfs_buf_iodone_work( * We can get an EOPNOTSUPP to ordered writes. Here we clear the * ordered flag and reissue them. Because we can't tell the higher * layers directly that they should not issue ordered I/O anymore, they - * need to check if the ordered flag was cleared during I/O completion. + * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion. */ if ((bp->b_error == EOPNOTSUPP) && (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) { XB_TRACE(bp, "ordered_retry", bp->b_iodone); bp->b_flags &= ~XBF_ORDERED; + bp->b_flags |= _XFS_BARRIER_FAILED; xfs_buf_iorequest(bp); } else if (bp->b_iodone) (*(bp->b_iodone))(bp); @@ -1038,7 +1034,7 @@ xfs_buf_ioend( xfs_buf_iodone_work(&bp->b_iodone_work); } } else { - up(&bp->b_iodonesema); + complete(&bp->b_iowait); } } @@ -1276,7 +1272,7 @@ xfs_buf_iowait( XB_TRACE(bp, "iowait", 0); if (atomic_read(&bp->b_io_remaining)) blk_run_address_space(bp->b_target->bt_mapping); - down(&bp->b_iodonesema); + wait_for_completion(&bp->b_iowait); XB_TRACE(bp, "iowaited", (long)bp->b_error); return bp->b_error; } @@ -1398,7 +1394,7 @@ STATIC void xfs_free_bufhash( xfs_buftarg_t *btp) { - kmem_free(btp->bt_hash, (1<<btp->bt_hashshift) * sizeof(xfs_bufhash_t)); + kmem_free(btp->bt_hash); btp->bt_hash = NULL; } @@ -1428,13 +1424,10 @@ xfs_unregister_buftarg( void xfs_free_buftarg( - xfs_buftarg_t *btp, - int external) + xfs_buftarg_t *btp) { xfs_flush_buftarg(btp, 1); xfs_blkdev_issue_flush(btp); - if (external) - xfs_blkdev_put(btp->bt_bdev); xfs_free_bufhash(btp); iput(btp->bt_mapping->host); @@ -1444,7 +1437,7 @@ xfs_free_buftarg( xfs_unregister_buftarg(btp); kthread_stop(btp->bt_task); - kmem_free(btp, sizeof(*btp)); + kmem_free(btp); } STATIC int @@ -1575,7 +1568,7 @@ xfs_alloc_buftarg( return btp; error: - kmem_free(btp, sizeof(*btp)); + kmem_free(btp); return NULL; } @@ -1803,7 +1796,7 @@ int __init xfs_buf_init(void) { #ifdef XFS_BUF_TRACE - xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_SLEEP); + xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_NOFS); #endif xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf", diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index f948ec7ba9a4..456519a088c7 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -85,6 +85,14 @@ typedef enum { * modifications being lost. */ _XBF_PAGE_LOCKED = (1 << 22), + + /* + * If we try a barrier write, but it fails we have to communicate + * this to the upper layers. Unfortunately b_error gets overwritten + * when the buffer is re-issued so we have to add another flag to + * keep this information. + */ + _XFS_BARRIER_FAILED = (1 << 23), } xfs_buf_flags_t; typedef enum { @@ -157,7 +165,7 @@ typedef struct xfs_buf { xfs_buf_iodone_t b_iodone; /* I/O completion function */ xfs_buf_relse_t b_relse; /* releasing function */ xfs_buf_bdstrat_t b_strat; /* pre-write function */ - struct semaphore b_iodonesema; /* Semaphore for I/O waiters */ + struct completion b_iowait; /* queue for I/O waiters */ void *b_fspriv; void *b_fspriv2; void *b_fspriv3; @@ -352,7 +360,7 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *); #define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0) #define XFS_BUF_VSEMA(bp) xfs_buf_unlock(bp) #define XFS_BUF_PSEMA(bp,x) xfs_buf_lock(bp) -#define XFS_BUF_V_IODONESEMA(bp) up(&bp->b_iodonesema); +#define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait); #define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target)) #define XFS_BUF_TARGET(bp) ((bp)->b_target) @@ -429,7 +437,7 @@ static inline void xfs_bdwrite(void *mp, xfs_buf_t *bp) * Handling of buftargs. */ extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); -extern void xfs_free_buftarg(xfs_buftarg_t *, int); +extern void xfs_free_buftarg(xfs_buftarg_t *); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); extern int xfs_flush_buftarg(xfs_buftarg_t *, int); diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c index c672b3238b14..24fd598af846 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/linux-2.6/xfs_export.c @@ -139,7 +139,7 @@ xfs_nfs_get_inode( } xfs_iunlock(ip, XFS_ILOCK_SHARED); - return ip->i_vnode; + return VFS_I(ip); } STATIC struct dentry * @@ -167,7 +167,7 @@ xfs_fs_fh_to_dentry(struct super_block *sb, struct fid *fid, if (!inode) return NULL; if (IS_ERR(inode)) - return ERR_PTR(PTR_ERR(inode)); + return ERR_CAST(inode); result = d_alloc_anon(inode); if (!result) { iput(inode); @@ -198,7 +198,7 @@ xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid, if (!inode) return NULL; if (IS_ERR(inode)) - return ERR_PTR(PTR_ERR(inode)); + return ERR_CAST(inode); result = d_alloc_anon(inode); if (!result) { iput(inode); @@ -215,13 +215,13 @@ xfs_fs_get_parent( struct xfs_inode *cip; struct dentry *parent; - error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip); + error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip, NULL); if (unlikely(error)) return ERR_PTR(-error); - parent = d_alloc_anon(cip->i_vnode); + parent = d_alloc_anon(VFS_I(cip)); if (unlikely(!parent)) { - iput(cip->i_vnode); + iput(VFS_I(cip)); return ERR_PTR(-ENOMEM); } return parent; diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 5f60363b9343..5311c1acdd40 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -475,6 +475,7 @@ const struct file_operations xfs_invis_file_operations = { const struct file_operations xfs_dir_file_operations = { .read = generic_read_dir, .readdir = xfs_file_readdir, + .llseek = generic_file_llseek, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c index 1eefe61f0e10..36caa6d957df 100644 --- a/fs/xfs/linux-2.6/xfs_fs_subr.c +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c @@ -31,7 +31,7 @@ xfs_tosspages( xfs_off_t last, int fiopt) { - struct address_space *mapping = ip->i_vnode->i_mapping; + struct address_space *mapping = VFS_I(ip)->i_mapping; if (mapping->nrpages) truncate_inode_pages(mapping, first); @@ -44,7 +44,7 @@ xfs_flushinval_pages( xfs_off_t last, int fiopt) { - struct address_space *mapping = ip->i_vnode->i_mapping; + struct address_space *mapping = VFS_I(ip)->i_mapping; int ret = 0; if (mapping->nrpages) { @@ -64,7 +64,7 @@ xfs_flush_pages( uint64_t flags, int fiopt) { - struct address_space *mapping = ip->i_vnode->i_mapping; + struct address_space *mapping = VFS_I(ip)->i_mapping; int ret = 0; int ret2; diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index a42ba9d71156..48799ba7e3e6 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -48,6 +48,8 @@ #include "xfs_dfrag.h" #include "xfs_fsops.h" #include "xfs_vnodeops.h" +#include "xfs_quota.h" +#include "xfs_inode_item.h" #include <linux/capability.h> #include <linux/dcache.h> @@ -84,17 +86,15 @@ xfs_find_handle( switch (cmd) { case XFS_IOC_PATH_TO_FSHANDLE: case XFS_IOC_PATH_TO_HANDLE: { - struct nameidata nd; - int error; - - error = user_path_walk_link((const char __user *)hreq.path, &nd); + struct path path; + int error = user_lpath((const char __user *)hreq.path, &path); if (error) return error; - ASSERT(nd.path.dentry); - ASSERT(nd.path.dentry->d_inode); - inode = igrab(nd.path.dentry->d_inode); - path_put(&nd.path); + ASSERT(path.dentry); + ASSERT(path.dentry->d_inode); + inode = igrab(path.dentry->d_inode); + path_put(&path); break; } @@ -245,7 +245,7 @@ xfs_vget_fsop_handlereq( xfs_iunlock(ip, XFS_ILOCK_SHARED); - *inode = XFS_ITOV(ip); + *inode = VFS_I(ip); return 0; } @@ -470,6 +470,12 @@ xfs_attrlist_by_handle( if (al_hreq.buflen > XATTR_LIST_MAX) return -XFS_ERROR(EINVAL); + /* + * Reject flags, only allow namespaces. + */ + if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) + return -XFS_ERROR(EINVAL); + error = xfs_vget_fsop_handlereq(mp, parinode, &al_hreq.hreq, &inode); if (error) goto out; @@ -589,7 +595,7 @@ xfs_attrmulti_by_handle( goto out; error = E2BIG; - size = am_hreq.opcount * sizeof(attr_multiop_t); + size = am_hreq.opcount * sizeof(xfs_attr_multiop_t); if (!size || size > 16 * PAGE_SIZE) goto out_vn_rele; @@ -682,9 +688,9 @@ xfs_ioc_space( return -XFS_ERROR(EFAULT); if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) - attr_flags |= ATTR_NONBLOCK; + attr_flags |= XFS_ATTR_NONBLOCK; if (ioflags & IO_INVIS) - attr_flags |= ATTR_DMI; + attr_flags |= XFS_ATTR_DMI; error = xfs_change_file_space(ip, cmd, &bf, filp->f_pos, NULL, attr_flags); @@ -875,6 +881,322 @@ xfs_ioc_fsgetxattr( return 0; } +STATIC void +xfs_set_diflags( + struct xfs_inode *ip, + unsigned int xflags) +{ + unsigned int di_flags; + + /* can't set PREALLOC this way, just preserve it */ + di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); + if (xflags & XFS_XFLAG_IMMUTABLE) + di_flags |= XFS_DIFLAG_IMMUTABLE; + if (xflags & XFS_XFLAG_APPEND) + di_flags |= XFS_DIFLAG_APPEND; + if (xflags & XFS_XFLAG_SYNC) + di_flags |= XFS_DIFLAG_SYNC; + if (xflags & XFS_XFLAG_NOATIME) + di_flags |= XFS_DIFLAG_NOATIME; + if (xflags & XFS_XFLAG_NODUMP) + di_flags |= XFS_DIFLAG_NODUMP; + if (xflags & XFS_XFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; + if (xflags & XFS_XFLAG_NODEFRAG) + di_flags |= XFS_DIFLAG_NODEFRAG; + if (xflags & XFS_XFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; + if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + if (xflags & XFS_XFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_RTINHERIT; + if (xflags & XFS_XFLAG_NOSYMLINKS) + di_flags |= XFS_DIFLAG_NOSYMLINKS; + if (xflags & XFS_XFLAG_EXTSZINHERIT) + di_flags |= XFS_DIFLAG_EXTSZINHERIT; + } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { + if (xflags & XFS_XFLAG_REALTIME) + di_flags |= XFS_DIFLAG_REALTIME; + if (xflags & XFS_XFLAG_EXTSIZE) + di_flags |= XFS_DIFLAG_EXTSIZE; + } + + ip->i_d.di_flags = di_flags; +} + +STATIC void +xfs_diflags_to_linux( + struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip); + unsigned int xflags = xfs_ip2xflags(ip); + + if (xflags & XFS_XFLAG_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + if (xflags & XFS_XFLAG_APPEND) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + if (xflags & XFS_XFLAG_SYNC) + inode->i_flags |= S_SYNC; + else + inode->i_flags &= ~S_SYNC; + if (xflags & XFS_XFLAG_NOATIME) + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; +} + +#define FSX_PROJID 1 +#define FSX_EXTSIZE 2 +#define FSX_XFLAGS 4 +#define FSX_NONBLOCK 8 + +STATIC int +xfs_ioctl_setattr( + xfs_inode_t *ip, + struct fsxattr *fa, + int mask) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + unsigned int lock_flags = 0; + struct xfs_dquot *udqp = NULL, *gdqp = NULL; + struct xfs_dquot *olddquot = NULL; + int code; + + xfs_itrace_entry(ip); + + if (mp->m_flags & XFS_MOUNT_RDONLY) + return XFS_ERROR(EROFS); + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + /* + * If disk quotas is on, we make sure that the dquots do exist on disk, + * before we start any other transactions. Trying to do this later + * is messy. We don't care to take a readlock to look at the ids + * in inode here, because we can't hold it across the trans_reserve. + * If the IDs do change before we take the ilock, we're covered + * because the i_*dquot fields will get updated anyway. + */ + if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) { + code = XFS_QM_DQVOPALLOC(mp, ip, ip->i_d.di_uid, + ip->i_d.di_gid, fa->fsx_projid, + XFS_QMOPT_PQUOTA, &udqp, &gdqp); + if (code) + return code; + } + + /* + * For the other attributes, we acquire the inode lock and + * first do an error checking pass. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); + code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); + if (code) + goto error_return; + + lock_flags = XFS_ILOCK_EXCL; + xfs_ilock(ip, lock_flags); + + /* + * CAP_FOWNER overrides the following restrictions: + * + * The user ID of the calling process must be equal + * to the file owner ID, except in cases where the + * CAP_FSETID capability is applicable. + */ + if (current->fsuid != ip->i_d.di_uid && !capable(CAP_FOWNER)) { + code = XFS_ERROR(EPERM); + goto error_return; + } + + /* + * Do a quota reservation only if projid is actually going to change. + */ + if (mask & FSX_PROJID) { + if (XFS_IS_PQUOTA_ON(mp) && + ip->i_d.di_projid != fa->fsx_projid) { + ASSERT(tp); + code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp, + capable(CAP_FOWNER) ? + XFS_QMOPT_FORCE_RES : 0); + if (code) /* out of quota */ + goto error_return; + } + } + + if (mask & FSX_EXTSIZE) { + /* + * Can't change extent size if any extents are allocated. + */ + if (ip->i_d.di_nextents && + ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != + fa->fsx_extsize)) { + code = XFS_ERROR(EINVAL); /* EFBIG? */ + goto error_return; + } + + /* + * Extent size must be a multiple of the appropriate block + * size, if set at all. + */ + if (fa->fsx_extsize != 0) { + xfs_extlen_t size; + + if (XFS_IS_REALTIME_INODE(ip) || + ((mask & FSX_XFLAGS) && + (fa->fsx_xflags & XFS_XFLAG_REALTIME))) { + size = mp->m_sb.sb_rextsize << + mp->m_sb.sb_blocklog; + } else { + size = mp->m_sb.sb_blocksize; + } + + if (fa->fsx_extsize % size) { + code = XFS_ERROR(EINVAL); + goto error_return; + } + } + } + + + if (mask & FSX_XFLAGS) { + /* + * Can't change realtime flag if any extents are allocated. + */ + if ((ip->i_d.di_nextents || ip->i_delayed_blks) && + (XFS_IS_REALTIME_INODE(ip)) != + (fa->fsx_xflags & XFS_XFLAG_REALTIME)) { + code = XFS_ERROR(EINVAL); /* EFBIG? */ + goto error_return; + } + + /* + * If realtime flag is set then must have realtime data. + */ + if ((fa->fsx_xflags & XFS_XFLAG_REALTIME)) { + if ((mp->m_sb.sb_rblocks == 0) || + (mp->m_sb.sb_rextsize == 0) || + (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) { + code = XFS_ERROR(EINVAL); + goto error_return; + } + } + + /* + * Can't modify an immutable/append-only file unless + * we have appropriate permission. + */ + if ((ip->i_d.di_flags & + (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) || + (fa->fsx_xflags & + (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && + !capable(CAP_LINUX_IMMUTABLE)) { + code = XFS_ERROR(EPERM); + goto error_return; + } + } + + xfs_trans_ijoin(tp, ip, lock_flags); + xfs_trans_ihold(tp, ip); + + /* + * Change file ownership. Must be the owner or privileged. + * If the system was configured with the "restricted_chown" + * option, the owner is not permitted to give away the file, + * and can change the group id only to a group of which he + * or she is a member. + */ + if (mask & FSX_PROJID) { + /* + * CAP_FSETID overrides the following restrictions: + * + * The set-user-ID and set-group-ID bits of a file will be + * cleared upon successful return from chown() + */ + if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && + !capable(CAP_FSETID)) + ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); + + /* + * Change the ownerships and register quota modifications + * in the transaction. + */ + if (ip->i_d.di_projid != fa->fsx_projid) { + if (XFS_IS_PQUOTA_ON(mp)) { + olddquot = XFS_QM_DQVOPCHOWN(mp, tp, ip, + &ip->i_gdquot, gdqp); + } + ip->i_d.di_projid = fa->fsx_projid; + + /* + * We may have to rev the inode as well as + * the superblock version number since projids didn't + * exist before DINODE_VERSION_2 and SB_VERSION_NLINK. + */ + if (ip->i_d.di_version == XFS_DINODE_VERSION_1) + xfs_bump_ino_vers2(tp, ip); + } + + } + + if (mask & FSX_EXTSIZE) + ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; + if (mask & FSX_XFLAGS) { + xfs_set_diflags(ip, fa->fsx_xflags); + xfs_diflags_to_linux(ip); + } + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + xfs_ichgtime(ip, XFS_ICHGTIME_CHG); + + XFS_STATS_INC(xs_ig_attrchg); + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + * This is slightly sub-optimal in that truncates require + * two sync transactions instead of one for wsync filesystems. + * One for the truncate and one for the timestamps since we + * don't want to change the timestamps unless we're sure the + * truncate worked. Truncates are less than 1% of the laddis + * mix so this probably isn't worth the trouble to optimize. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + code = xfs_trans_commit(tp, 0); + xfs_iunlock(ip, lock_flags); + + /* + * Release any dquot(s) the inode had kept before chown. + */ + XFS_QM_DQRELE(mp, olddquot); + XFS_QM_DQRELE(mp, udqp); + XFS_QM_DQRELE(mp, gdqp); + + if (code) + return code; + + if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE)) { + XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL, + NULL, DM_RIGHT_NULL, NULL, NULL, 0, 0, + (mask & FSX_NONBLOCK) ? DM_FLAGS_NDELAY : 0); + } + + return 0; + + error_return: + XFS_QM_DQRELE(mp, udqp); + XFS_QM_DQRELE(mp, gdqp); + xfs_trans_cancel(tp, 0); + if (lock_flags) + xfs_iunlock(ip, lock_flags); + return code; +} + STATIC int xfs_ioc_fssetxattr( xfs_inode_t *ip, @@ -882,31 +1204,16 @@ xfs_ioc_fssetxattr( void __user *arg) { struct fsxattr fa; - struct bhv_vattr *vattr; - int error; - int attr_flags; + unsigned int mask; if (copy_from_user(&fa, arg, sizeof(fa))) return -EFAULT; - vattr = kmalloc(sizeof(*vattr), GFP_KERNEL); - if (unlikely(!vattr)) - return -ENOMEM; - - attr_flags = 0; + mask = FSX_XFLAGS | FSX_EXTSIZE | FSX_PROJID; if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) - attr_flags |= ATTR_NONBLOCK; - - vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | XFS_AT_PROJID; - vattr->va_xflags = fa.fsx_xflags; - vattr->va_extsize = fa.fsx_extsize; - vattr->va_projid = fa.fsx_projid; + mask |= FSX_NONBLOCK; - error = -xfs_setattr(ip, vattr, attr_flags, NULL); - if (!error) - vn_revalidate(XFS_ITOV(ip)); /* update flags */ - kfree(vattr); - return 0; + return -xfs_ioctl_setattr(ip, &fa, mask); } STATIC int @@ -928,10 +1235,9 @@ xfs_ioc_setxflags( struct file *filp, void __user *arg) { - struct bhv_vattr *vattr; + struct fsxattr fa; unsigned int flags; - int attr_flags; - int error; + unsigned int mask; if (copy_from_user(&flags, arg, sizeof(flags))) return -EFAULT; @@ -941,22 +1247,12 @@ xfs_ioc_setxflags( FS_SYNC_FL)) return -EOPNOTSUPP; - vattr = kmalloc(sizeof(*vattr), GFP_KERNEL); - if (unlikely(!vattr)) - return -ENOMEM; - - attr_flags = 0; + mask = FSX_XFLAGS; if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) - attr_flags |= ATTR_NONBLOCK; - - vattr->va_mask = XFS_AT_XFLAGS; - vattr->va_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); + mask |= FSX_NONBLOCK; + fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); - error = -xfs_setattr(ip, vattr, attr_flags, NULL); - if (likely(!error)) - vn_revalidate(XFS_ITOV(ip)); /* update flags */ - kfree(vattr); - return error; + return -xfs_ioctl_setattr(ip, &fa, mask); } STATIC int diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 2bf287ef5489..095d271f3434 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -62,7 +62,7 @@ void xfs_synchronize_atime( xfs_inode_t *ip) { - struct inode *inode = ip->i_vnode; + struct inode *inode = VFS_I(ip); if (inode) { ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; @@ -79,7 +79,7 @@ void xfs_mark_inode_dirty_sync( xfs_inode_t *ip) { - struct inode *inode = ip->i_vnode; + struct inode *inode = VFS_I(ip); if (inode) mark_inode_dirty_sync(inode); @@ -89,36 +89,31 @@ xfs_mark_inode_dirty_sync( * Change the requested timestamp in the given inode. * We don't lock across timestamp updates, and we don't log them but * we do record the fact that there is dirty information in core. - * - * NOTE -- callers MUST combine XFS_ICHGTIME_MOD or XFS_ICHGTIME_CHG - * with XFS_ICHGTIME_ACC to be sure that access time - * update will take. Calling first with XFS_ICHGTIME_ACC - * and then XFS_ICHGTIME_MOD may fail to modify the access - * timestamp if the filesystem is mounted noacctm. */ void xfs_ichgtime( xfs_inode_t *ip, int flags) { - struct inode *inode = vn_to_inode(XFS_ITOV(ip)); + struct inode *inode = VFS_I(ip); timespec_t tv; + int sync_it = 0; + + tv = current_fs_time(inode->i_sb); - nanotime(&tv); - if (flags & XFS_ICHGTIME_MOD) { + if ((flags & XFS_ICHGTIME_MOD) && + !timespec_equal(&inode->i_mtime, &tv)) { inode->i_mtime = tv; ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; + sync_it = 1; } - if (flags & XFS_ICHGTIME_ACC) { - inode->i_atime = tv; - ip->i_d.di_atime.t_sec = (__int32_t)tv.tv_sec; - ip->i_d.di_atime.t_nsec = (__int32_t)tv.tv_nsec; - } - if (flags & XFS_ICHGTIME_CHG) { + if ((flags & XFS_ICHGTIME_CHG) && + !timespec_equal(&inode->i_ctime, &tv)) { inode->i_ctime = tv; ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec; ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec; + sync_it = 1; } /* @@ -130,72 +125,11 @@ xfs_ichgtime( * ensure that the compiler does not reorder the update * of i_update_core above the timestamp updates above. */ - SYNCHRONIZE(); - ip->i_update_core = 1; - if (!(inode->i_state & I_NEW)) + if (sync_it) { + SYNCHRONIZE(); + ip->i_update_core = 1; mark_inode_dirty_sync(inode); -} - -/* - * Variant on the above which avoids querying the system clock - * in situations where we know the Linux inode timestamps have - * just been updated (and so we can update our inode cheaply). - */ -void -xfs_ichgtime_fast( - xfs_inode_t *ip, - struct inode *inode, - int flags) -{ - timespec_t *tvp; - - /* - * Atime updates for read() & friends are handled lazily now, and - * explicit updates must go through xfs_ichgtime() - */ - ASSERT((flags & XFS_ICHGTIME_ACC) == 0); - - if (flags & XFS_ICHGTIME_MOD) { - tvp = &inode->i_mtime; - ip->i_d.di_mtime.t_sec = (__int32_t)tvp->tv_sec; - ip->i_d.di_mtime.t_nsec = (__int32_t)tvp->tv_nsec; - } - if (flags & XFS_ICHGTIME_CHG) { - tvp = &inode->i_ctime; - ip->i_d.di_ctime.t_sec = (__int32_t)tvp->tv_sec; - ip->i_d.di_ctime.t_nsec = (__int32_t)tvp->tv_nsec; } - - /* - * We update the i_update_core field _after_ changing - * the timestamps in order to coordinate properly with - * xfs_iflush() so that we don't lose timestamp updates. - * This keeps us from having to hold the inode lock - * while doing this. We use the SYNCHRONIZE macro to - * ensure that the compiler does not reorder the update - * of i_update_core above the timestamp updates above. - */ - SYNCHRONIZE(); - ip->i_update_core = 1; - if (!(inode->i_state & I_NEW)) - mark_inode_dirty_sync(inode); -} - - -/* - * Pull the link count and size up from the xfs inode to the linux inode - */ -STATIC void -xfs_validate_fields( - struct inode *inode) -{ - struct xfs_inode *ip = XFS_I(inode); - loff_t size; - - /* we're under i_sem so i_size can't change under us */ - size = XFS_ISIZE(ip); - if (i_size_read(inode) != size) - i_size_write(inode, size); } /* @@ -245,8 +179,7 @@ STATIC void xfs_cleanup_inode( struct inode *dir, struct inode *inode, - struct dentry *dentry, - int mode) + struct dentry *dentry) { struct xfs_name teardown; @@ -257,10 +190,7 @@ xfs_cleanup_inode( */ xfs_dentry_to_name(&teardown, dentry); - if (S_ISDIR(mode)) - xfs_rmdir(XFS_I(dir), &teardown, XFS_I(inode)); - else - xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); + xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); iput(inode); } @@ -275,7 +205,7 @@ xfs_vn_mknod( struct xfs_inode *ip = NULL; xfs_acl_t *default_acl = NULL; struct xfs_name name; - attrexists_t test_default_acl = _ACL_DEFAULT_EXISTS; + int (*test_default_acl)(struct inode *) = _ACL_DEFAULT_EXISTS; int error; /* @@ -320,7 +250,7 @@ xfs_vn_mknod( if (unlikely(error)) goto out_free_acl; - inode = ip->i_vnode; + inode = VFS_I(ip); error = xfs_init_security(inode, dir); if (unlikely(error)) @@ -335,14 +265,11 @@ xfs_vn_mknod( } - if (S_ISDIR(mode)) - xfs_validate_fields(inode); d_instantiate(dentry, inode); - xfs_validate_fields(dir); return -error; out_cleanup_inode: - xfs_cleanup_inode(dir, inode, dentry, mode); + xfs_cleanup_inode(dir, inode, dentry); out_free_acl: if (default_acl) _ACL_FREE(default_acl); @@ -382,7 +309,7 @@ xfs_vn_lookup( return ERR_PTR(-ENAMETOOLONG); xfs_dentry_to_name(&name, dentry); - error = xfs_lookup(XFS_I(dir), &name, &cip); + error = xfs_lookup(XFS_I(dir), &name, &cip, NULL); if (unlikely(error)) { if (unlikely(error != ENOENT)) return ERR_PTR(-error); @@ -390,7 +317,47 @@ xfs_vn_lookup( return NULL; } - return d_splice_alias(cip->i_vnode, dentry); + return d_splice_alias(VFS_I(cip), dentry); +} + +STATIC struct dentry * +xfs_vn_ci_lookup( + struct inode *dir, + struct dentry *dentry, + struct nameidata *nd) +{ + struct xfs_inode *ip; + struct xfs_name xname; + struct xfs_name ci_name; + struct qstr dname; + int error; + + if (dentry->d_name.len >= MAXNAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + xfs_dentry_to_name(&xname, dentry); + error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name); + if (unlikely(error)) { + if (unlikely(error != ENOENT)) + return ERR_PTR(-error); + /* + * call d_add(dentry, NULL) here when d_drop_negative_children + * is called in xfs_vn_mknod (ie. allow negative dentries + * with CI filesystems). + */ + return NULL; + } + + /* if exact match, just splice and exit */ + if (!ci_name.name) + return d_splice_alias(VFS_I(ip), dentry); + + /* else case-insensitive match... */ + dname.name = ci_name.name; + dname.len = ci_name.len; + dentry = d_add_ci(dentry, VFS_I(ip), &dname); + kmem_free(ci_name.name); + return dentry; } STATIC int @@ -414,7 +381,6 @@ xfs_vn_link( } xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED); - xfs_validate_fields(inode); d_instantiate(dentry, inode); return 0; } @@ -424,19 +390,23 @@ xfs_vn_unlink( struct inode *dir, struct dentry *dentry) { - struct inode *inode; struct xfs_name name; int error; - inode = dentry->d_inode; xfs_dentry_to_name(&name, dentry); - error = xfs_remove(XFS_I(dir), &name, XFS_I(inode)); - if (likely(!error)) { - xfs_validate_fields(dir); /* size needs update */ - xfs_validate_fields(inode); - } - return -error; + error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode)); + if (error) + return error; + + /* + * With unlink, the VFS makes the dentry "negative": no inode, + * but still hashed. This is incompatible with case-insensitive + * mode, so invalidate (unhash) the dentry in CI-mode. + */ + if (xfs_sb_version_hasasciici(&XFS_M(dir->i_sb)->m_sb)) + d_invalidate(dentry); + return 0; } STATIC int @@ -459,43 +429,22 @@ xfs_vn_symlink( if (unlikely(error)) goto out; - inode = cip->i_vnode; + inode = VFS_I(cip); error = xfs_init_security(inode, dir); if (unlikely(error)) goto out_cleanup_inode; d_instantiate(dentry, inode); - xfs_validate_fields(dir); - xfs_validate_fields(inode); return 0; out_cleanup_inode: - xfs_cleanup_inode(dir, inode, dentry, 0); + xfs_cleanup_inode(dir, inode, dentry); out: return -error; } STATIC int -xfs_vn_rmdir( - struct inode *dir, - struct dentry *dentry) -{ - struct inode *inode = dentry->d_inode; - struct xfs_name name; - int error; - - xfs_dentry_to_name(&name, dentry); - - error = xfs_rmdir(XFS_I(dir), &name, XFS_I(inode)); - if (likely(!error)) { - xfs_validate_fields(inode); - xfs_validate_fields(dir); - } - return -error; -} - -STATIC int xfs_vn_rename( struct inode *odir, struct dentry *odentry, @@ -505,22 +454,13 @@ xfs_vn_rename( struct inode *new_inode = ndentry->d_inode; struct xfs_name oname; struct xfs_name nname; - int error; xfs_dentry_to_name(&oname, odentry); xfs_dentry_to_name(&nname, ndentry); - error = xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), + return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), XFS_I(ndir), &nname, new_inode ? XFS_I(new_inode) : NULL); - if (likely(!error)) { - if (new_inode) - xfs_validate_fields(new_inode); - xfs_validate_fields(odir); - if (ndir != odir) - xfs_validate_fields(ndir); - } - return -error; } /* @@ -589,8 +529,7 @@ xfs_check_acl( STATIC int xfs_vn_permission( struct inode *inode, - int mask, - struct nameidata *nd) + int mask) { return generic_permission(inode, mask, xfs_check_acl); } @@ -660,57 +599,9 @@ xfs_vn_getattr( STATIC int xfs_vn_setattr( struct dentry *dentry, - struct iattr *attr) + struct iattr *iattr) { - struct inode *inode = dentry->d_inode; - unsigned int ia_valid = attr->ia_valid; - bhv_vattr_t vattr = { 0 }; - int flags = 0; - int error; - - if (ia_valid & ATTR_UID) { - vattr.va_mask |= XFS_AT_UID; - vattr.va_uid = attr->ia_uid; - } - if (ia_valid & ATTR_GID) { - vattr.va_mask |= XFS_AT_GID; - vattr.va_gid = attr->ia_gid; - } - if (ia_valid & ATTR_SIZE) { - vattr.va_mask |= XFS_AT_SIZE; - vattr.va_size = attr->ia_size; - } - if (ia_valid & ATTR_ATIME) { - vattr.va_mask |= XFS_AT_ATIME; - vattr.va_atime = attr->ia_atime; - inode->i_atime = attr->ia_atime; - } - if (ia_valid & ATTR_MTIME) { - vattr.va_mask |= XFS_AT_MTIME; - vattr.va_mtime = attr->ia_mtime; - } - if (ia_valid & ATTR_CTIME) { - vattr.va_mask |= XFS_AT_CTIME; - vattr.va_ctime = attr->ia_ctime; - } - if (ia_valid & ATTR_MODE) { - vattr.va_mask |= XFS_AT_MODE; - vattr.va_mode = attr->ia_mode; - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) - inode->i_mode &= ~S_ISGID; - } - - if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) - flags |= ATTR_UTIME; -#ifdef ATTR_NO_BLOCK - if ((ia_valid & ATTR_NO_BLOCK)) - flags |= ATTR_NONBLOCK; -#endif - - error = xfs_setattr(XFS_I(inode), &vattr, flags, NULL); - if (likely(!error)) - vn_revalidate(vn_from_inode(inode)); - return -error; + return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0, NULL); } /* @@ -728,109 +619,6 @@ xfs_vn_truncate( WARN_ON(error); } -STATIC int -xfs_vn_setxattr( - struct dentry *dentry, - const char *name, - const void *data, - size_t size, - int flags) -{ - bhv_vnode_t *vp = vn_from_inode(dentry->d_inode); - char *attr = (char *)name; - attrnames_t *namesp; - int xflags = 0; - int error; - - namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT); - if (!namesp) - return -EOPNOTSUPP; - attr += namesp->attr_namelen; - error = namesp->attr_capable(vp, NULL); - if (error) - return error; - - /* Convert Linux syscall to XFS internal ATTR flags */ - if (flags & XATTR_CREATE) - xflags |= ATTR_CREATE; - if (flags & XATTR_REPLACE) - xflags |= ATTR_REPLACE; - xflags |= namesp->attr_flag; - return namesp->attr_set(vp, attr, (void *)data, size, xflags); -} - -STATIC ssize_t -xfs_vn_getxattr( - struct dentry *dentry, - const char *name, - void *data, - size_t size) -{ - bhv_vnode_t *vp = vn_from_inode(dentry->d_inode); - char *attr = (char *)name; - attrnames_t *namesp; - int xflags = 0; - ssize_t error; - - namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT); - if (!namesp) - return -EOPNOTSUPP; - attr += namesp->attr_namelen; - error = namesp->attr_capable(vp, NULL); - if (error) - return error; - - /* Convert Linux syscall to XFS internal ATTR flags */ - if (!size) { - xflags |= ATTR_KERNOVAL; - data = NULL; - } - xflags |= namesp->attr_flag; - return namesp->attr_get(vp, attr, (void *)data, size, xflags); -} - -STATIC ssize_t -xfs_vn_listxattr( - struct dentry *dentry, - char *data, - size_t size) -{ - bhv_vnode_t *vp = vn_from_inode(dentry->d_inode); - int error, xflags = ATTR_KERNAMELS; - ssize_t result; - - if (!size) - xflags |= ATTR_KERNOVAL; - xflags |= capable(CAP_SYS_ADMIN) ? ATTR_KERNFULLS : ATTR_KERNORMALS; - - error = attr_generic_list(vp, data, size, xflags, &result); - if (error < 0) - return error; - return result; -} - -STATIC int -xfs_vn_removexattr( - struct dentry *dentry, - const char *name) -{ - bhv_vnode_t *vp = vn_from_inode(dentry->d_inode); - char *attr = (char *)name; - attrnames_t *namesp; - int xflags = 0; - int error; - - namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT); - if (!namesp) - return -EOPNOTSUPP; - attr += namesp->attr_namelen; - error = namesp->attr_capable(vp, NULL); - if (error) - return error; - xflags |= namesp->attr_flag; - return namesp->attr_remove(vp, attr, xflags); -} - STATIC long xfs_vn_fallocate( struct inode *inode, @@ -854,18 +642,18 @@ xfs_vn_fallocate( xfs_ilock(ip, XFS_IOLOCK_EXCL); error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, - 0, NULL, ATTR_NOLOCK); + 0, NULL, XFS_ATTR_NOLOCK); if (!error && !(mode & FALLOC_FL_KEEP_SIZE) && offset + len > i_size_read(inode)) new_size = offset + len; /* Change file size if needed */ if (new_size) { - bhv_vattr_t va; + struct iattr iattr; - va.va_mask = XFS_AT_SIZE; - va.va_size = new_size; - error = xfs_setattr(ip, &va, ATTR_NOLOCK, NULL); + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = new_size; + error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK, NULL); } xfs_iunlock(ip, XFS_IOLOCK_EXCL); @@ -873,46 +661,172 @@ out_error: return error; } -const struct inode_operations xfs_inode_operations = { +static const struct inode_operations xfs_inode_operations = { .permission = xfs_vn_permission, .truncate = xfs_vn_truncate, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, - .setxattr = xfs_vn_setxattr, - .getxattr = xfs_vn_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, - .removexattr = xfs_vn_removexattr, .fallocate = xfs_vn_fallocate, }; -const struct inode_operations xfs_dir_inode_operations = { +static const struct inode_operations xfs_dir_inode_operations = { .create = xfs_vn_create, .lookup = xfs_vn_lookup, .link = xfs_vn_link, .unlink = xfs_vn_unlink, .symlink = xfs_vn_symlink, .mkdir = xfs_vn_mkdir, - .rmdir = xfs_vn_rmdir, + /* + * Yes, XFS uses the same method for rmdir and unlink. + * + * There are some subtile differences deeper in the code, + * but we use S_ISDIR to check for those. + */ + .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, .permission = xfs_vn_permission, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, - .setxattr = xfs_vn_setxattr, - .getxattr = xfs_vn_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, - .removexattr = xfs_vn_removexattr, }; -const struct inode_operations xfs_symlink_inode_operations = { +static const struct inode_operations xfs_dir_ci_inode_operations = { + .create = xfs_vn_create, + .lookup = xfs_vn_ci_lookup, + .link = xfs_vn_link, + .unlink = xfs_vn_unlink, + .symlink = xfs_vn_symlink, + .mkdir = xfs_vn_mkdir, + /* + * Yes, XFS uses the same method for rmdir and unlink. + * + * There are some subtile differences deeper in the code, + * but we use S_ISDIR to check for those. + */ + .rmdir = xfs_vn_unlink, + .mknod = xfs_vn_mknod, + .rename = xfs_vn_rename, + .permission = xfs_vn_permission, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, +}; + +static const struct inode_operations xfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = xfs_vn_follow_link, .put_link = xfs_vn_put_link, .permission = xfs_vn_permission, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, - .setxattr = xfs_vn_setxattr, - .getxattr = xfs_vn_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, - .removexattr = xfs_vn_removexattr, }; + +STATIC void +xfs_diflags_to_iflags( + struct inode *inode, + struct xfs_inode *ip) +{ + if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + if (ip->i_d.di_flags & XFS_DIFLAG_SYNC) + inode->i_flags |= S_SYNC; + else + inode->i_flags &= ~S_SYNC; + if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME) + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; +} + +/* + * Initialize the Linux inode, set up the operation vectors and + * unlock the inode. + * + * When reading existing inodes from disk this is called directly + * from xfs_iget, when creating a new inode it is called from + * xfs_ialloc after setting up the inode. + */ +void +xfs_setup_inode( + struct xfs_inode *ip) +{ + struct inode *inode = ip->i_vnode; + + inode->i_mode = ip->i_d.di_mode; + inode->i_nlink = ip->i_d.di_nlink; + inode->i_uid = ip->i_d.di_uid; + inode->i_gid = ip->i_d.di_gid; + + switch (inode->i_mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + inode->i_rdev = + MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, + sysv_minor(ip->i_df.if_u2.if_rdev)); + break; + default: + inode->i_rdev = 0; + break; + } + + inode->i_generation = ip->i_d.di_gen; + i_size_write(inode, ip->i_d.di_size); + inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec; + inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec; + inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec; + inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; + inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; + inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; + xfs_diflags_to_iflags(inode, ip); + xfs_iflags_clear(ip, XFS_IMODIFIED); + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_op = &xfs_inode_operations; + inode->i_fop = &xfs_file_operations; + inode->i_mapping->a_ops = &xfs_address_space_operations; + break; + case S_IFDIR: + if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) + inode->i_op = &xfs_dir_ci_inode_operations; + else + inode->i_op = &xfs_dir_inode_operations; + inode->i_fop = &xfs_dir_file_operations; + break; + case S_IFLNK: + inode->i_op = &xfs_symlink_inode_operations; + if (!(ip->i_df.if_flags & XFS_IFINLINE)) + inode->i_mapping->a_ops = &xfs_address_space_operations; + break; + default: + inode->i_op = &xfs_inode_operations; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + } + + xfs_iflags_clear(ip, XFS_INEW); + barrier(); + + unlock_new_inode(inode); +} diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h index 14d0deb7afff..8b1a1e31dc21 100644 --- a/fs/xfs/linux-2.6/xfs_iops.h +++ b/fs/xfs/linux-2.6/xfs_iops.h @@ -18,23 +18,14 @@ #ifndef __XFS_IOPS_H__ #define __XFS_IOPS_H__ -extern const struct inode_operations xfs_inode_operations; -extern const struct inode_operations xfs_dir_inode_operations; -extern const struct inode_operations xfs_symlink_inode_operations; +struct xfs_inode; extern const struct file_operations xfs_file_operations; extern const struct file_operations xfs_dir_file_operations; extern const struct file_operations xfs_invis_file_operations; +extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); -struct xfs_inode; -extern void xfs_ichgtime(struct xfs_inode *, int); -extern void xfs_ichgtime_fast(struct xfs_inode *, struct inode *, int); - -#define xfs_vtoi(vp) \ - ((struct xfs_inode *)vn_to_inode(vp)->i_private) - -#define XFS_I(inode) \ - ((struct xfs_inode *)(inode)->i_private) +extern void xfs_setup_inode(struct xfs_inode *); #endif /* __XFS_IOPS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 4edc46915b57..cc0f7b3a9795 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -45,13 +45,13 @@ #include <mrlock.h> #include <sv.h> #include <mutex.h> -#include <sema.h> #include <time.h> #include <support/ktrace.h> #include <support/debug.h> #include <support/uuid.h> +#include <linux/semaphore.h> #include <linux/mm.h> #include <linux/kernel.h> #include <linux/blkdev.h> @@ -76,6 +76,7 @@ #include <linux/log2.h> #include <linux/spinlock.h> #include <linux/random.h> +#include <linux/ctype.h> #include <asm/page.h> #include <asm/div64.h> @@ -125,8 +126,6 @@ #define current_cpu() (raw_smp_processor_id()) #define current_pid() (current->pid) -#define current_fsuid(cred) (current->fsuid) -#define current_fsgid(cred) (current->fsgid) #define current_test_flags(f) (current->flags & (f)) #define current_set_flags_nested(sp, f) \ (*(sp) = current->flags, current->flags |= (f)) @@ -179,7 +178,7 @@ #define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL) #define xfs_stack_trace() dump_stack() #define xfs_itruncate_data(ip, off) \ - (-vmtruncate(vn_to_inode(XFS_ITOV(ip)), (off))) + (-vmtruncate(VFS_I(ip), (off))) /* Move the kernel do_div definition off to one side */ @@ -299,4 +298,11 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) return x; } +/* ARM old ABI has some weird alignment/padding */ +#if defined(__arm__) && !defined(__ARM_EABI__) +#define __arch_pack __attribute__((packed)) +#else +#define __arch_pack +#endif + #endif /* __XFS_LINUX__ */ diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 5e3b57516ec7..1957e5357d04 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -137,7 +137,7 @@ xfs_iozero( struct address_space *mapping; int status; - mapping = ip->i_vnode->i_mapping; + mapping = VFS_I(ip)->i_mapping; do { unsigned offset, bytes; void *fsdata; @@ -674,9 +674,7 @@ start: */ if (likely(!(ioflags & IO_INVIS) && !mnt_want_write(file->f_path.mnt))) { - file_update_time(file); - xfs_ichgtime_fast(xip, inode, - XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); mnt_drop_write(file->f_path.mnt); } @@ -711,7 +709,7 @@ start: !capable(CAP_FSETID)) { error = xfs_write_clear_setuid(xip); if (likely(!error)) - error = -remove_suid(file->f_path.dentry); + error = -file_remove_suid(file); if (unlikely(error)) { goto out_unlock_internal; } diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c index e480b6102051..3d5b67c075c7 100644 --- a/fs/xfs/linux-2.6/xfs_stats.c +++ b/fs/xfs/linux-2.6/xfs_stats.c @@ -98,12 +98,21 @@ xfs_read_xfsstats( return len; } -void +int xfs_init_procfs(void) { if (!proc_mkdir("fs/xfs", NULL)) - return; - create_proc_read_entry("fs/xfs/stat", 0, NULL, xfs_read_xfsstats, NULL); + goto out; + + if (!create_proc_read_entry("fs/xfs/stat", 0, NULL, + xfs_read_xfsstats, NULL)) + goto out_remove_entry; + return 0; + + out_remove_entry: + remove_proc_entry("fs/xfs", NULL); + out: + return -ENOMEM; } void diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h index afd0b0d5fdb2..e83820febc9f 100644 --- a/fs/xfs/linux-2.6/xfs_stats.h +++ b/fs/xfs/linux-2.6/xfs_stats.h @@ -134,7 +134,7 @@ DECLARE_PER_CPU(struct xfsstats, xfsstats); #define XFS_STATS_DEC(v) (per_cpu(xfsstats, current_cpu()).v--) #define XFS_STATS_ADD(v, inc) (per_cpu(xfsstats, current_cpu()).v += (inc)) -extern void xfs_init_procfs(void); +extern int xfs_init_procfs(void); extern void xfs_cleanup_procfs(void); @@ -144,8 +144,14 @@ extern void xfs_cleanup_procfs(void); # define XFS_STATS_DEC(count) # define XFS_STATS_ADD(count, inc) -static inline void xfs_init_procfs(void) { }; -static inline void xfs_cleanup_procfs(void) { }; +static inline int xfs_init_procfs(void) +{ + return 0; +} + +static inline void xfs_cleanup_procfs(void) +{ +} #endif /* !CONFIG_PROC_FS */ diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 742b2c7852c1..7227b2efef22 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -52,6 +52,12 @@ #include "xfs_version.h" #include "xfs_log_priv.h" #include "xfs_trans_priv.h" +#include "xfs_filestream.h" +#include "xfs_da_btree.h" +#include "xfs_dir2_trace.h" +#include "xfs_extfree_item.h" +#include "xfs_mru_cache.h" +#include "xfs_inode_item.h" #include <linux/namei.h> #include <linux/init.h> @@ -60,6 +66,7 @@ #include <linux/writeback.h> #include <linux/kthread.h> #include <linux/freezer.h> +#include <linux/parser.h> static struct quotactl_ops xfs_quotactl_operations; static struct super_operations xfs_super_operations; @@ -74,7 +81,10 @@ xfs_args_allocate( { struct xfs_mount_args *args; - args = kmem_zalloc(sizeof(struct xfs_mount_args), KM_SLEEP); + args = kzalloc(sizeof(struct xfs_mount_args), GFP_KERNEL); + if (!args) + return NULL; + args->logbufs = args->logbufsize = -1; strncpy(args->fsname, sb->s_id, MAXNAMELEN); @@ -138,6 +148,23 @@ xfs_args_allocate( #define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ #define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */ +/* + * Table driven mount option parser. + * + * Currently only used for remount, but it will be used for mount + * in the future, too. + */ +enum { + Opt_barrier, Opt_nobarrier, Opt_err +}; + +static const match_table_t tokens = { + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_err, NULL} +}; + + STATIC unsigned long suffix_strtoul(char *s, char **endp, unsigned int base) { @@ -314,6 +341,7 @@ xfs_parseargs( args->flags |= XFSMNT_ATTR2; } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { args->flags &= ~XFSMNT_ATTR2; + args->flags |= XFSMNT_NOATTR2; } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) { args->flags2 |= XFSMNT2_FILESTREAMS; } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) { @@ -553,115 +581,6 @@ xfs_max_file_offset( return (((__uint64_t)pagefactor) << bitshift) - 1; } -STATIC_INLINE void -xfs_set_inodeops( - struct inode *inode) -{ - switch (inode->i_mode & S_IFMT) { - case S_IFREG: - inode->i_op = &xfs_inode_operations; - inode->i_fop = &xfs_file_operations; - inode->i_mapping->a_ops = &xfs_address_space_operations; - break; - case S_IFDIR: - inode->i_op = &xfs_dir_inode_operations; - inode->i_fop = &xfs_dir_file_operations; - break; - case S_IFLNK: - inode->i_op = &xfs_symlink_inode_operations; - if (!(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE)) - inode->i_mapping->a_ops = &xfs_address_space_operations; - break; - default: - inode->i_op = &xfs_inode_operations; - init_special_inode(inode, inode->i_mode, inode->i_rdev); - break; - } -} - -STATIC_INLINE void -xfs_revalidate_inode( - xfs_mount_t *mp, - bhv_vnode_t *vp, - xfs_inode_t *ip) -{ - struct inode *inode = vn_to_inode(vp); - - inode->i_mode = ip->i_d.di_mode; - inode->i_nlink = ip->i_d.di_nlink; - inode->i_uid = ip->i_d.di_uid; - inode->i_gid = ip->i_d.di_gid; - - switch (inode->i_mode & S_IFMT) { - case S_IFBLK: - case S_IFCHR: - inode->i_rdev = - MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, - sysv_minor(ip->i_df.if_u2.if_rdev)); - break; - default: - inode->i_rdev = 0; - break; - } - - inode->i_generation = ip->i_d.di_gen; - i_size_write(inode, ip->i_d.di_size); - inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec; - inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec; - inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec; - inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; - inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; - inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; - if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) - inode->i_flags |= S_IMMUTABLE; - else - inode->i_flags &= ~S_IMMUTABLE; - if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) - inode->i_flags |= S_APPEND; - else - inode->i_flags &= ~S_APPEND; - if (ip->i_d.di_flags & XFS_DIFLAG_SYNC) - inode->i_flags |= S_SYNC; - else - inode->i_flags &= ~S_SYNC; - if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME) - inode->i_flags |= S_NOATIME; - else - inode->i_flags &= ~S_NOATIME; - xfs_iflags_clear(ip, XFS_IMODIFIED); -} - -void -xfs_initialize_vnode( - struct xfs_mount *mp, - bhv_vnode_t *vp, - struct xfs_inode *ip) -{ - struct inode *inode = vn_to_inode(vp); - - if (!ip->i_vnode) { - ip->i_vnode = vp; - inode->i_private = ip; - } - - /* - * We need to set the ops vectors, and unlock the inode, but if - * we have been called during the new inode create process, it is - * too early to fill in the Linux inode. We will get called a - * second time once the inode is properly set up, and then we can - * finish our work. - */ - if (ip->i_d.di_mode != 0 && (inode->i_state & I_NEW)) { - xfs_revalidate_inode(mp, vp, ip); - xfs_set_inodeops(inode); - - xfs_iflags_clear(ip, XFS_INEW); - barrier(); - - unlock_new_inode(inode); - } -} - int xfs_blkdev_get( xfs_mount_t *mp, @@ -733,14 +652,6 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp) return; } - if (mp->m_ddev_targp->bt_bdev->bd_disk->queue->ordered == - QUEUE_ORDERED_NONE) { - xfs_fs_cmn_err(CE_NOTE, mp, - "Disabling barriers, not supported by the underlying device"); - mp->m_flags &= ~XFS_MOUNT_BARRIER; - return; - } - if (xfs_readonly_buftarg(mp->m_ddev_targp)) { xfs_fs_cmn_err(CE_NOTE, mp, "Disabling barriers, underlying device is readonly"); @@ -764,6 +675,139 @@ xfs_blkdev_issue_flush( blkdev_issue_flush(buftarg->bt_bdev, NULL); } +STATIC void +xfs_close_devices( + struct xfs_mount *mp) +{ + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { + struct block_device *logdev = mp->m_logdev_targp->bt_bdev; + xfs_free_buftarg(mp->m_logdev_targp); + xfs_blkdev_put(logdev); + } + if (mp->m_rtdev_targp) { + struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; + xfs_free_buftarg(mp->m_rtdev_targp); + xfs_blkdev_put(rtdev); + } + xfs_free_buftarg(mp->m_ddev_targp); +} + +/* + * The file system configurations are: + * (1) device (partition) with data and internal log + * (2) logical volume with data and log subvolumes. + * (3) logical volume with data, log, and realtime subvolumes. + * + * We only have to handle opening the log and realtime volumes here if + * they are present. The data subvolume has already been opened by + * get_sb_bdev() and is stored in sb->s_bdev. + */ +STATIC int +xfs_open_devices( + struct xfs_mount *mp, + struct xfs_mount_args *args) +{ + struct block_device *ddev = mp->m_super->s_bdev; + struct block_device *logdev = NULL, *rtdev = NULL; + int error; + + /* + * Open real time and log devices - order is important. + */ + if (args->logname[0]) { + error = xfs_blkdev_get(mp, args->logname, &logdev); + if (error) + goto out; + } + + if (args->rtname[0]) { + error = xfs_blkdev_get(mp, args->rtname, &rtdev); + if (error) + goto out_close_logdev; + + if (rtdev == ddev || rtdev == logdev) { + cmn_err(CE_WARN, + "XFS: Cannot mount filesystem with identical rtdev and ddev/logdev."); + error = EINVAL; + goto out_close_rtdev; + } + } + + /* + * Setup xfs_mount buffer target pointers + */ + error = ENOMEM; + mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0); + if (!mp->m_ddev_targp) + goto out_close_rtdev; + + if (rtdev) { + mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1); + if (!mp->m_rtdev_targp) + goto out_free_ddev_targ; + } + + if (logdev && logdev != ddev) { + mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1); + if (!mp->m_logdev_targp) + goto out_free_rtdev_targ; + } else { + mp->m_logdev_targp = mp->m_ddev_targp; + } + + return 0; + + out_free_rtdev_targ: + if (mp->m_rtdev_targp) + xfs_free_buftarg(mp->m_rtdev_targp); + out_free_ddev_targ: + xfs_free_buftarg(mp->m_ddev_targp); + out_close_rtdev: + if (rtdev) + xfs_blkdev_put(rtdev); + out_close_logdev: + if (logdev && logdev != ddev) + xfs_blkdev_put(logdev); + out: + return error; +} + +/* + * Setup xfs_mount buffer target pointers based on superblock + */ +STATIC int +xfs_setup_devices( + struct xfs_mount *mp) +{ + int error; + + error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize, + mp->m_sb.sb_sectsize); + if (error) + return error; + + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { + unsigned int log_sector_size = BBSIZE; + + if (xfs_sb_version_hassector(&mp->m_sb)) + log_sector_size = mp->m_sb.sb_logsectsize; + error = xfs_setsize_buftarg(mp->m_logdev_targp, + mp->m_sb.sb_blocksize, + log_sector_size); + if (error) + return error; + } + if (mp->m_rtdev_targp) { + error = xfs_setsize_buftarg(mp->m_rtdev_targp, + mp->m_sb.sb_blocksize, + mp->m_sb.sb_sectsize); + if (error) + return error; + } + + return 0; +} + /* * XFS AIL push thread support */ @@ -826,63 +870,21 @@ STATIC struct inode * xfs_fs_alloc_inode( struct super_block *sb) { - bhv_vnode_t *vp; - - vp = kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP); - if (unlikely(!vp)) - return NULL; - return vn_to_inode(vp); + return kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP); } STATIC void xfs_fs_destroy_inode( struct inode *inode) { - kmem_zone_free(xfs_vnode_zone, vn_from_inode(inode)); + kmem_zone_free(xfs_vnode_zone, inode); } STATIC void xfs_fs_inode_init_once( - kmem_zone_t *zonep, void *vnode) { - inode_init_once(vn_to_inode((bhv_vnode_t *)vnode)); -} - -STATIC int __init -xfs_init_zones(void) -{ - xfs_vnode_zone = kmem_zone_init_flags(sizeof(bhv_vnode_t), "xfs_vnode", - KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | - KM_ZONE_SPREAD, - xfs_fs_inode_init_once); - if (!xfs_vnode_zone) - goto out; - - xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend"); - if (!xfs_ioend_zone) - goto out_destroy_vnode_zone; - - xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE, - xfs_ioend_zone); - if (!xfs_ioend_pool) - goto out_free_ioend_zone; - return 0; - - out_free_ioend_zone: - kmem_zone_destroy(xfs_ioend_zone); - out_destroy_vnode_zone: - kmem_zone_destroy(xfs_vnode_zone); - out: - return -ENOMEM; -} - -STATIC void -xfs_destroy_zones(void) -{ - mempool_destroy(xfs_ioend_pool); - kmem_zone_destroy(xfs_vnode_zone); - kmem_zone_destroy(xfs_ioend_zone); + inode_init_once((struct inode *)vnode); } /* @@ -987,7 +989,7 @@ void xfs_flush_inode( xfs_inode_t *ip) { - struct inode *inode = ip->i_vnode; + struct inode *inode = VFS_I(ip); igrab(inode); xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work); @@ -1012,7 +1014,7 @@ void xfs_flush_device( xfs_inode_t *ip) { - struct inode *inode = vn_to_inode(XFS_ITOV(ip)); + struct inode *inode = VFS_I(ip); igrab(inode); xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work); @@ -1074,7 +1076,7 @@ xfssyncd( list_del(&work->w_list); if (work == &mp->m_sync_work) continue; - kmem_free(work, sizeof(struct bhv_vfs_sync_work)); + kmem_free(work); } } @@ -1082,18 +1084,76 @@ xfssyncd( } STATIC void +xfs_free_fsname( + struct xfs_mount *mp) +{ + kfree(mp->m_fsname); + kfree(mp->m_rtname); + kfree(mp->m_logname); +} + +STATIC void xfs_fs_put_super( struct super_block *sb) { struct xfs_mount *mp = XFS_M(sb); + struct xfs_inode *rip = mp->m_rootip; + int unmount_event_flags = 0; int error; kthread_stop(mp->m_sync_task); xfs_sync(mp, SYNC_ATTR | SYNC_DELWRI); - error = xfs_unmount(mp, 0, NULL); - if (error) - printk("XFS: unmount got error=%d\n", error); + +#ifdef HAVE_DMAPI + if (mp->m_flags & XFS_MOUNT_DMAPI) { + unmount_event_flags = + (mp->m_dmevmask & (1 << DM_EVENT_UNMOUNT)) ? + 0 : DM_FLAGS_UNWANTED; + /* + * Ignore error from dmapi here, first unmount is not allowed + * to fail anyway, and second we wouldn't want to fail a + * unmount because of dmapi. + */ + XFS_SEND_PREUNMOUNT(mp, rip, DM_RIGHT_NULL, rip, DM_RIGHT_NULL, + NULL, NULL, 0, 0, unmount_event_flags); + } +#endif + + /* + * Blow away any referenced inode in the filestreams cache. + * This can and will cause log traffic as inodes go inactive + * here. + */ + xfs_filestream_unmount(mp); + + XFS_bflush(mp->m_ddev_targp); + error = xfs_unmount_flush(mp, 0); + WARN_ON(error); + + /* + * If we're forcing a shutdown, typically because of a media error, + * we want to make sure we invalidate dirty pages that belong to + * referenced vnodes as well. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + error = xfs_sync(mp, SYNC_WAIT | SYNC_CLOSE); + ASSERT(error != EFSCORRUPTED); + } + + if (mp->m_flags & XFS_MOUNT_DMAPI) { + XFS_SEND_UNMOUNT(mp, rip, DM_RIGHT_NULL, 0, 0, + unmount_event_flags); + } + + xfs_unmountfs(mp); + xfs_freesb(mp); + xfs_icsb_destroy_counters(mp); + xfs_close_devices(mp); + xfs_qmops_put(mp); + xfs_dmops_put(mp); + xfs_free_fsname(mp); + kfree(mp); } STATIC void @@ -1216,14 +1276,74 @@ xfs_fs_remount( char *options) { struct xfs_mount *mp = XFS_M(sb); - struct xfs_mount_args *args = xfs_args_allocate(sb, 0); - int error; + substring_t args[MAX_OPT_ARGS]; + char *p; - error = xfs_parseargs(mp, options, args, 1); - if (!error) - error = xfs_mntupdate(mp, flags, args); - kmem_free(args, sizeof(*args)); - return -error; + while ((p = strsep(&options, ",")) != NULL) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_barrier: + mp->m_flags |= XFS_MOUNT_BARRIER; + + /* + * Test if barriers are actually working if we can, + * else delay this check until the filesystem is + * marked writeable. + */ + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) + xfs_mountfs_check_barriers(mp); + break; + case Opt_nobarrier: + mp->m_flags &= ~XFS_MOUNT_BARRIER; + break; + default: + /* + * Logically we would return an error here to prevent + * users from believing they might have changed + * mount options using remount which can't be changed. + * + * But unfortunately mount(8) adds all options from + * mtab and fstab to the mount arguments in some cases + * so we can't blindly reject options, but have to + * check for each specified option if it actually + * differs from the currently set option and only + * reject it if that's the case. + * + * Until that is implemented we return success for + * every remount request, and silently ignore all + * options that we can't actually change. + */ +#if 0 + printk(KERN_INFO + "XFS: mount option \"%s\" not supported for remount\n", p); + return -EINVAL; +#else + return 0; +#endif + } + } + + /* rw/ro -> rw */ + if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { + mp->m_flags &= ~XFS_MOUNT_RDONLY; + if (mp->m_flags & XFS_MOUNT_BARRIER) + xfs_mountfs_check_barriers(mp); + } + + /* rw -> ro */ + if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { + xfs_filestream_flush(mp); + xfs_sync(mp, SYNC_DATA_QUIESCE); + xfs_attr_quiesce(mp); + mp->m_flags |= XFS_MOUNT_RDONLY; + } + + return 0; } /* @@ -1300,6 +1420,245 @@ xfs_fs_setxquota( Q_XSETPQLIM), id, (caddr_t)fdq); } +/* + * This function fills in xfs_mount_t fields based on mount args. + * Note: the superblock has _not_ yet been read in. + */ +STATIC int +xfs_start_flags( + struct xfs_mount_args *ap, + struct xfs_mount *mp) +{ + int error; + + /* Values are in BBs */ + if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) { + /* + * At this point the superblock has not been read + * in, therefore we do not know the block size. + * Before the mount call ends we will convert + * these to FSBs. + */ + mp->m_dalign = ap->sunit; + mp->m_swidth = ap->swidth; + } + + if (ap->logbufs != -1 && + ap->logbufs != 0 && + (ap->logbufs < XLOG_MIN_ICLOGS || + ap->logbufs > XLOG_MAX_ICLOGS)) { + cmn_err(CE_WARN, + "XFS: invalid logbufs value: %d [not %d-%d]", + ap->logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); + return XFS_ERROR(EINVAL); + } + mp->m_logbufs = ap->logbufs; + if (ap->logbufsize != -1 && + ap->logbufsize != 0 && + (ap->logbufsize < XLOG_MIN_RECORD_BSIZE || + ap->logbufsize > XLOG_MAX_RECORD_BSIZE || + !is_power_of_2(ap->logbufsize))) { + cmn_err(CE_WARN, + "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", + ap->logbufsize); + return XFS_ERROR(EINVAL); + } + + error = ENOMEM; + + mp->m_logbsize = ap->logbufsize; + mp->m_fsname_len = strlen(ap->fsname) + 1; + + mp->m_fsname = kstrdup(ap->fsname, GFP_KERNEL); + if (!mp->m_fsname) + goto out; + + if (ap->rtname[0]) { + mp->m_rtname = kstrdup(ap->rtname, GFP_KERNEL); + if (!mp->m_rtname) + goto out_free_fsname; + + } + + if (ap->logname[0]) { + mp->m_logname = kstrdup(ap->logname, GFP_KERNEL); + if (!mp->m_logname) + goto out_free_rtname; + } + + if (ap->flags & XFSMNT_WSYNC) + mp->m_flags |= XFS_MOUNT_WSYNC; +#if XFS_BIG_INUMS + if (ap->flags & XFSMNT_INO64) { + mp->m_flags |= XFS_MOUNT_INO64; + mp->m_inoadd = XFS_INO64_OFFSET; + } +#endif + if (ap->flags & XFSMNT_RETERR) + mp->m_flags |= XFS_MOUNT_RETERR; + if (ap->flags & XFSMNT_NOALIGN) + mp->m_flags |= XFS_MOUNT_NOALIGN; + if (ap->flags & XFSMNT_SWALLOC) + mp->m_flags |= XFS_MOUNT_SWALLOC; + if (ap->flags & XFSMNT_OSYNCISOSYNC) + mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC; + if (ap->flags & XFSMNT_32BITINODES) + mp->m_flags |= XFS_MOUNT_32BITINODES; + + if (ap->flags & XFSMNT_IOSIZE) { + if (ap->iosizelog > XFS_MAX_IO_LOG || + ap->iosizelog < XFS_MIN_IO_LOG) { + cmn_err(CE_WARN, + "XFS: invalid log iosize: %d [not %d-%d]", + ap->iosizelog, XFS_MIN_IO_LOG, + XFS_MAX_IO_LOG); + return XFS_ERROR(EINVAL); + } + + mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE; + mp->m_readio_log = mp->m_writeio_log = ap->iosizelog; + } + + if (ap->flags & XFSMNT_IKEEP) + mp->m_flags |= XFS_MOUNT_IKEEP; + if (ap->flags & XFSMNT_DIRSYNC) + mp->m_flags |= XFS_MOUNT_DIRSYNC; + if (ap->flags & XFSMNT_ATTR2) + mp->m_flags |= XFS_MOUNT_ATTR2; + if (ap->flags & XFSMNT_NOATTR2) + mp->m_flags |= XFS_MOUNT_NOATTR2; + + if (ap->flags2 & XFSMNT2_COMPAT_IOSIZE) + mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; + + /* + * no recovery flag requires a read-only mount + */ + if (ap->flags & XFSMNT_NORECOVERY) { + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + cmn_err(CE_WARN, + "XFS: tried to mount a FS read-write without recovery!"); + return XFS_ERROR(EINVAL); + } + mp->m_flags |= XFS_MOUNT_NORECOVERY; + } + + if (ap->flags & XFSMNT_NOUUID) + mp->m_flags |= XFS_MOUNT_NOUUID; + if (ap->flags & XFSMNT_BARRIER) + mp->m_flags |= XFS_MOUNT_BARRIER; + else + mp->m_flags &= ~XFS_MOUNT_BARRIER; + + if (ap->flags2 & XFSMNT2_FILESTREAMS) + mp->m_flags |= XFS_MOUNT_FILESTREAMS; + + if (ap->flags & XFSMNT_DMAPI) + mp->m_flags |= XFS_MOUNT_DMAPI; + return 0; + + + out_free_rtname: + kfree(mp->m_rtname); + out_free_fsname: + kfree(mp->m_fsname); + out: + return error; +} + +/* + * This function fills in xfs_mount_t fields based on mount args. + * Note: the superblock _has_ now been read in. + */ +STATIC int +xfs_finish_flags( + struct xfs_mount_args *ap, + struct xfs_mount *mp) +{ + int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); + + /* Fail a mount where the logbuf is smaller then the log stripe */ + if (xfs_sb_version_haslogv2(&mp->m_sb)) { + if ((ap->logbufsize <= 0) && + (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) { + mp->m_logbsize = mp->m_sb.sb_logsunit; + } else if (ap->logbufsize > 0 && + ap->logbufsize < mp->m_sb.sb_logsunit) { + cmn_err(CE_WARN, + "XFS: logbuf size must be greater than or equal to log stripe size"); + return XFS_ERROR(EINVAL); + } + } else { + /* Fail a mount if the logbuf is larger than 32K */ + if (ap->logbufsize > XLOG_BIG_RECORD_BSIZE) { + cmn_err(CE_WARN, + "XFS: logbuf size for version 1 logs must be 16K or 32K"); + return XFS_ERROR(EINVAL); + } + } + + /* + * mkfs'ed attr2 will turn on attr2 mount unless explicitly + * told by noattr2 to turn it off + */ + if (xfs_sb_version_hasattr2(&mp->m_sb) && + !(ap->flags & XFSMNT_NOATTR2)) + mp->m_flags |= XFS_MOUNT_ATTR2; + + /* + * prohibit r/w mounts of read-only filesystems + */ + if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { + cmn_err(CE_WARN, + "XFS: cannot mount a read-only filesystem as read-write"); + return XFS_ERROR(EROFS); + } + + /* + * check for shared mount. + */ + if (ap->flags & XFSMNT_SHARED) { + if (!xfs_sb_version_hasshared(&mp->m_sb)) + return XFS_ERROR(EINVAL); + + /* + * For IRIX 6.5, shared mounts must have the shared + * version bit set, have the persistent readonly + * field set, must be version 0 and can only be mounted + * read-only. + */ + if (!ronly || !(mp->m_sb.sb_flags & XFS_SBF_READONLY) || + (mp->m_sb.sb_shared_vn != 0)) + return XFS_ERROR(EINVAL); + + mp->m_flags |= XFS_MOUNT_SHARED; + + /* + * Shared XFS V0 can't deal with DMI. Return EINVAL. + */ + if (mp->m_sb.sb_shared_vn == 0 && (ap->flags & XFSMNT_DMAPI)) + return XFS_ERROR(EINVAL); + } + + if (ap->flags & XFSMNT_UQUOTA) { + mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); + if (ap->flags & XFSMNT_UQUOTAENF) + mp->m_qflags |= XFS_UQUOTA_ENFD; + } + + if (ap->flags & XFSMNT_GQUOTA) { + mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); + if (ap->flags & XFSMNT_GQUOTAENF) + mp->m_qflags |= XFS_OQUOTA_ENFD; + } else if (ap->flags & XFSMNT_PQUOTA) { + mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); + if (ap->flags & XFSMNT_PQUOTAENF) + mp->m_qflags |= XFS_OQUOTA_ENFD; + } + + return 0; +} + STATIC int xfs_fs_fill_super( struct super_block *sb, @@ -1308,11 +1667,21 @@ xfs_fs_fill_super( { struct inode *root; struct xfs_mount *mp = NULL; - struct xfs_mount_args *args = xfs_args_allocate(sb, silent); - int error; + struct xfs_mount_args *args; + int flags = 0, error = ENOMEM; - mp = xfs_mount_init(); + args = xfs_args_allocate(sb, silent); + if (!args) + return -ENOMEM; + mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); + if (!mp) + goto out_free_args; + + spin_lock_init(&mp->m_sb_lock); + mutex_init(&mp->m_ilock); + mutex_init(&mp->m_growlock); + atomic_set(&mp->m_active_trans, 0); INIT_LIST_HEAD(&mp->m_sync_list); spin_lock_init(&mp->m_sync_lock); init_waitqueue_head(&mp->m_wait_single_sync_task); @@ -1325,16 +1694,60 @@ xfs_fs_fill_super( error = xfs_parseargs(mp, (char *)data, args, 0); if (error) - goto fail_vfsop; + goto out_free_mp; sb_min_blocksize(sb, BBSIZE); + sb->s_xattr = xfs_xattr_handlers; sb->s_export_op = &xfs_export_operations; sb->s_qcop = &xfs_quotactl_operations; sb->s_op = &xfs_super_operations; - error = xfs_mount(mp, args, NULL); + error = xfs_dmops_get(mp, args); + if (error) + goto out_free_mp; + error = xfs_qmops_get(mp, args); + if (error) + goto out_put_dmops; + + if (args->flags & XFSMNT_QUIET) + flags |= XFS_MFSI_QUIET; + + error = xfs_open_devices(mp, args); + if (error) + goto out_put_qmops; + + if (xfs_icsb_init_counters(mp)) + mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; + + /* + * Setup flags based on mount(2) options and then the superblock + */ + error = xfs_start_flags(args, mp); + if (error) + goto out_free_fsname; + error = xfs_readsb(mp, flags); + if (error) + goto out_free_fsname; + error = xfs_finish_flags(args, mp); if (error) - goto fail_vfsop; + goto out_free_sb; + + error = xfs_setup_devices(mp); + if (error) + goto out_free_sb; + + if (mp->m_flags & XFS_MOUNT_BARRIER) + xfs_mountfs_check_barriers(mp); + + error = xfs_filestream_mount(mp); + if (error) + goto out_free_sb; + + error = xfs_mountfs(mp); + if (error) + goto out_filestream_unmount; + + XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, args->mtpt, args->fsname); sb->s_dirt = 1; sb->s_magic = XFS_SB_MAGIC; @@ -1344,7 +1757,7 @@ xfs_fs_fill_super( sb->s_time_gran = 1; set_posix_acl_flag(sb); - root = igrab(mp->m_rootip->i_vnode); + root = igrab(VFS_I(mp->m_rootip)); if (!root) { error = ENOENT; goto fail_unmount; @@ -1369,10 +1782,28 @@ xfs_fs_fill_super( xfs_itrace_exit(XFS_I(sb->s_root->d_inode)); - kmem_free(args, sizeof(*args)); + kfree(args); return 0; -fail_vnrele: + out_filestream_unmount: + xfs_filestream_unmount(mp); + out_free_sb: + xfs_freesb(mp); + out_free_fsname: + xfs_free_fsname(mp); + xfs_icsb_destroy_counters(mp); + xfs_close_devices(mp); + out_put_qmops: + xfs_qmops_put(mp); + out_put_dmops: + xfs_dmops_put(mp); + out_free_mp: + kfree(mp); + out_free_args: + kfree(args); + return -error; + + fail_vnrele: if (sb->s_root) { dput(sb->s_root); sb->s_root = NULL; @@ -1380,12 +1811,20 @@ fail_vnrele: iput(root); } -fail_unmount: - xfs_unmount(mp, 0, NULL); + fail_unmount: + /* + * Blow away any referenced inode in the filestreams cache. + * This can and will cause log traffic as inodes go inactive + * here. + */ + xfs_filestream_unmount(mp); -fail_vfsop: - kmem_free(args, sizeof(*args)); - return -error; + XFS_bflush(mp->m_ddev_targp); + error = xfs_unmount_flush(mp, 0); + WARN_ON(error); + + xfs_unmountfs(mp); + goto out_free_sb; } STATIC int @@ -1430,9 +1869,235 @@ static struct file_system_type xfs_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; +STATIC int __init +xfs_alloc_trace_bufs(void) +{ +#ifdef XFS_ALLOC_TRACE + xfs_alloc_trace_buf = ktrace_alloc(XFS_ALLOC_TRACE_SIZE, KM_MAYFAIL); + if (!xfs_alloc_trace_buf) + goto out; +#endif +#ifdef XFS_BMAP_TRACE + xfs_bmap_trace_buf = ktrace_alloc(XFS_BMAP_TRACE_SIZE, KM_MAYFAIL); + if (!xfs_bmap_trace_buf) + goto out_free_alloc_trace; +#endif +#ifdef XFS_BMBT_TRACE + xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_MAYFAIL); + if (!xfs_bmbt_trace_buf) + goto out_free_bmap_trace; +#endif +#ifdef XFS_ATTR_TRACE + xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_MAYFAIL); + if (!xfs_attr_trace_buf) + goto out_free_bmbt_trace; +#endif +#ifdef XFS_DIR2_TRACE + xfs_dir2_trace_buf = ktrace_alloc(XFS_DIR2_GTRACE_SIZE, KM_MAYFAIL); + if (!xfs_dir2_trace_buf) + goto out_free_attr_trace; +#endif + + return 0; + +#ifdef XFS_DIR2_TRACE + out_free_attr_trace: +#endif +#ifdef XFS_ATTR_TRACE + ktrace_free(xfs_attr_trace_buf); + out_free_bmbt_trace: +#endif +#ifdef XFS_BMBT_TRACE + ktrace_free(xfs_bmbt_trace_buf); + out_free_bmap_trace: +#endif +#ifdef XFS_BMAP_TRACE + ktrace_free(xfs_bmap_trace_buf); + out_free_alloc_trace: +#endif +#ifdef XFS_ALLOC_TRACE + ktrace_free(xfs_alloc_trace_buf); + out: +#endif + return -ENOMEM; +} + +STATIC void +xfs_free_trace_bufs(void) +{ +#ifdef XFS_DIR2_TRACE + ktrace_free(xfs_dir2_trace_buf); +#endif +#ifdef XFS_ATTR_TRACE + ktrace_free(xfs_attr_trace_buf); +#endif +#ifdef XFS_BMBT_TRACE + ktrace_free(xfs_bmbt_trace_buf); +#endif +#ifdef XFS_BMAP_TRACE + ktrace_free(xfs_bmap_trace_buf); +#endif +#ifdef XFS_ALLOC_TRACE + ktrace_free(xfs_alloc_trace_buf); +#endif +} + +STATIC int __init +xfs_init_zones(void) +{ + xfs_vnode_zone = kmem_zone_init_flags(sizeof(struct inode), "xfs_vnode", + KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | + KM_ZONE_SPREAD, + xfs_fs_inode_init_once); + if (!xfs_vnode_zone) + goto out; + + xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend"); + if (!xfs_ioend_zone) + goto out_destroy_vnode_zone; + + xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE, + xfs_ioend_zone); + if (!xfs_ioend_pool) + goto out_destroy_ioend_zone; + + xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t), + "xfs_log_ticket"); + if (!xfs_log_ticket_zone) + goto out_destroy_ioend_pool; + + xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), + "xfs_bmap_free_item"); + if (!xfs_bmap_free_item_zone) + goto out_destroy_log_ticket_zone; + xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t), + "xfs_btree_cur"); + if (!xfs_btree_cur_zone) + goto out_destroy_bmap_free_item_zone; + + xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t), + "xfs_da_state"); + if (!xfs_da_state_zone) + goto out_destroy_btree_cur_zone; + + xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf"); + if (!xfs_dabuf_zone) + goto out_destroy_da_state_zone; + + xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork"); + if (!xfs_ifork_zone) + goto out_destroy_dabuf_zone; + + xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans"); + if (!xfs_trans_zone) + goto out_destroy_ifork_zone; + + /* + * The size of the zone allocated buf log item is the maximum + * size possible under XFS. This wastes a little bit of memory, + * but it is much faster. + */ + xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) + + (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) / + NBWORD) * sizeof(int))), "xfs_buf_item"); + if (!xfs_buf_item_zone) + goto out_destroy_trans_zone; + + xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) + + ((XFS_EFD_MAX_FAST_EXTENTS - 1) * + sizeof(xfs_extent_t))), "xfs_efd_item"); + if (!xfs_efd_zone) + goto out_destroy_buf_item_zone; + + xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) + + ((XFS_EFI_MAX_FAST_EXTENTS - 1) * + sizeof(xfs_extent_t))), "xfs_efi_item"); + if (!xfs_efi_zone) + goto out_destroy_efd_zone; + + xfs_inode_zone = + kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", + KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | + KM_ZONE_SPREAD, NULL); + if (!xfs_inode_zone) + goto out_destroy_efi_zone; + + xfs_ili_zone = + kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili", + KM_ZONE_SPREAD, NULL); + if (!xfs_ili_zone) + goto out_destroy_inode_zone; + +#ifdef CONFIG_XFS_POSIX_ACL + xfs_acl_zone = kmem_zone_init(sizeof(xfs_acl_t), "xfs_acl"); + if (!xfs_acl_zone) + goto out_destroy_ili_zone; +#endif + + return 0; + +#ifdef CONFIG_XFS_POSIX_ACL + out_destroy_ili_zone: +#endif + kmem_zone_destroy(xfs_ili_zone); + out_destroy_inode_zone: + kmem_zone_destroy(xfs_inode_zone); + out_destroy_efi_zone: + kmem_zone_destroy(xfs_efi_zone); + out_destroy_efd_zone: + kmem_zone_destroy(xfs_efd_zone); + out_destroy_buf_item_zone: + kmem_zone_destroy(xfs_buf_item_zone); + out_destroy_trans_zone: + kmem_zone_destroy(xfs_trans_zone); + out_destroy_ifork_zone: + kmem_zone_destroy(xfs_ifork_zone); + out_destroy_dabuf_zone: + kmem_zone_destroy(xfs_dabuf_zone); + out_destroy_da_state_zone: + kmem_zone_destroy(xfs_da_state_zone); + out_destroy_btree_cur_zone: + kmem_zone_destroy(xfs_btree_cur_zone); + out_destroy_bmap_free_item_zone: + kmem_zone_destroy(xfs_bmap_free_item_zone); + out_destroy_log_ticket_zone: + kmem_zone_destroy(xfs_log_ticket_zone); + out_destroy_ioend_pool: + mempool_destroy(xfs_ioend_pool); + out_destroy_ioend_zone: + kmem_zone_destroy(xfs_ioend_zone); + out_destroy_vnode_zone: + kmem_zone_destroy(xfs_vnode_zone); + out: + return -ENOMEM; +} + +STATIC void +xfs_destroy_zones(void) +{ +#ifdef CONFIG_XFS_POSIX_ACL + kmem_zone_destroy(xfs_acl_zone); +#endif + kmem_zone_destroy(xfs_ili_zone); + kmem_zone_destroy(xfs_inode_zone); + kmem_zone_destroy(xfs_efi_zone); + kmem_zone_destroy(xfs_efd_zone); + kmem_zone_destroy(xfs_buf_item_zone); + kmem_zone_destroy(xfs_trans_zone); + kmem_zone_destroy(xfs_ifork_zone); + kmem_zone_destroy(xfs_dabuf_zone); + kmem_zone_destroy(xfs_da_state_zone); + kmem_zone_destroy(xfs_btree_cur_zone); + kmem_zone_destroy(xfs_bmap_free_item_zone); + kmem_zone_destroy(xfs_log_ticket_zone); + mempool_destroy(xfs_ioend_pool); + kmem_zone_destroy(xfs_ioend_zone); + kmem_zone_destroy(xfs_vnode_zone); + +} STATIC int __init -init_xfs_fs( void ) +init_xfs_fs(void) { int error; static char message[] __initdata = KERN_INFO \ @@ -1441,42 +2106,73 @@ init_xfs_fs( void ) printk(message); ktrace_init(64); + vn_init(); + xfs_dir_startup(); error = xfs_init_zones(); - if (error < 0) - goto undo_zones; + if (error) + goto out; + + error = xfs_alloc_trace_bufs(); + if (error) + goto out_destroy_zones; + + error = xfs_mru_cache_init(); + if (error) + goto out_free_trace_buffers; + + error = xfs_filestream_init(); + if (error) + goto out_mru_cache_uninit; error = xfs_buf_init(); - if (error < 0) - goto undo_buffers; + if (error) + goto out_filestream_uninit; + + error = xfs_init_procfs(); + if (error) + goto out_buf_terminate; + + error = xfs_sysctl_register(); + if (error) + goto out_cleanup_procfs; - vn_init(); - xfs_init(); - uuid_init(); vfs_initquota(); error = register_filesystem(&xfs_fs_type); if (error) - goto undo_register; + goto out_sysctl_unregister; return 0; -undo_register: + out_sysctl_unregister: + xfs_sysctl_unregister(); + out_cleanup_procfs: + xfs_cleanup_procfs(); + out_buf_terminate: xfs_buf_terminate(); - -undo_buffers: + out_filestream_uninit: + xfs_filestream_uninit(); + out_mru_cache_uninit: + xfs_mru_cache_uninit(); + out_free_trace_buffers: + xfs_free_trace_bufs(); + out_destroy_zones: xfs_destroy_zones(); - -undo_zones: + out: return error; } STATIC void __exit -exit_xfs_fs( void ) +exit_xfs_fs(void) { vfs_exitquota(); unregister_filesystem(&xfs_fs_type); - xfs_cleanup(); + xfs_sysctl_unregister(); + xfs_cleanup_procfs(); xfs_buf_terminate(); + xfs_filestream_uninit(); + xfs_mru_cache_uninit(); + xfs_free_trace_bufs(); xfs_destroy_zones(); ktrace_uninit(); } diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h index 3efb7c6d3303..fe2ef4e6a0f9 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/linux-2.6/xfs_super.h @@ -101,18 +101,13 @@ struct block_device; extern __uint64_t xfs_max_file_offset(unsigned int); -extern void xfs_initialize_vnode(struct xfs_mount *mp, bhv_vnode_t *vp, - struct xfs_inode *ip); - extern void xfs_flush_inode(struct xfs_inode *); extern void xfs_flush_device(struct xfs_inode *); -extern int xfs_blkdev_get(struct xfs_mount *, const char *, - struct block_device **); -extern void xfs_blkdev_put(struct block_device *); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern const struct export_operations xfs_export_operations; +extern struct xattr_handler *xfs_xattr_handlers[]; #define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c index bb997d75c05c..7dacb5bbde3f 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.c +++ b/fs/xfs/linux-2.6/xfs_sysctl.c @@ -259,15 +259,17 @@ static ctl_table xfs_root_table[] = { {} }; -void +int xfs_sysctl_register(void) { xfs_table_header = register_sysctl_table(xfs_root_table); + if (!xfs_table_header) + return -ENOMEM; + return 0; } void xfs_sysctl_unregister(void) { - if (xfs_table_header) - unregister_sysctl_table(xfs_table_header); + unregister_sysctl_table(xfs_table_header); } diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h index 98b97e399d6f..4aadb8056c37 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.h +++ b/fs/xfs/linux-2.6/xfs_sysctl.h @@ -93,10 +93,10 @@ enum { extern xfs_param_t xfs_params; #ifdef CONFIG_SYSCTL -extern void xfs_sysctl_register(void); +extern int xfs_sysctl_register(void); extern void xfs_sysctl_unregister(void); #else -# define xfs_sysctl_register() do { } while (0) +# define xfs_sysctl_register() (0) # define xfs_sysctl_unregister() do { } while (0) #endif /* CONFIG_SYSCTL */ diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c index bc7afe007338..b52528bbbfff 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.c +++ b/fs/xfs/linux-2.6/xfs_vnode.c @@ -33,7 +33,7 @@ /* - * Dedicated vnode inactive/reclaim sync semaphores. + * Dedicated vnode inactive/reclaim sync wait queues. * Prime number of hash buckets since address is used as the key. */ #define NVSYNC 37 @@ -82,74 +82,6 @@ vn_ioerror( xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, f, l); } -/* - * Revalidate the Linux inode from the XFS inode. - * Note: i_size _not_ updated; we must hold the inode - * semaphore when doing that - callers responsibility. - */ -int -vn_revalidate( - bhv_vnode_t *vp) -{ - struct inode *inode = vn_to_inode(vp); - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - unsigned long xflags; - - xfs_itrace_entry(ip); - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - xfs_ilock(ip, XFS_ILOCK_SHARED); - inode->i_mode = ip->i_d.di_mode; - inode->i_uid = ip->i_d.di_uid; - inode->i_gid = ip->i_d.di_gid; - inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec; - inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; - inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; - inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; - - xflags = xfs_ip2xflags(ip); - if (xflags & XFS_XFLAG_IMMUTABLE) - inode->i_flags |= S_IMMUTABLE; - else - inode->i_flags &= ~S_IMMUTABLE; - if (xflags & XFS_XFLAG_APPEND) - inode->i_flags |= S_APPEND; - else - inode->i_flags &= ~S_APPEND; - if (xflags & XFS_XFLAG_SYNC) - inode->i_flags |= S_SYNC; - else - inode->i_flags &= ~S_SYNC; - if (xflags & XFS_XFLAG_NOATIME) - inode->i_flags |= S_NOATIME; - else - inode->i_flags &= ~S_NOATIME; - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - xfs_iflags_clear(ip, XFS_IMODIFIED); - return 0; -} - -/* - * Add a reference to a referenced vnode. - */ -bhv_vnode_t * -vn_hold( - bhv_vnode_t *vp) -{ - struct inode *inode; - - XFS_STATS_INC(vn_hold); - - inode = igrab(vn_to_inode(vp)); - ASSERT(inode); - - return vp; -} - #ifdef XFS_INODE_TRACE /* @@ -158,7 +90,7 @@ vn_hold( */ static inline int xfs_icount(struct xfs_inode *ip) { - bhv_vnode_t *vp = XFS_ITOV_NULL(ip); + struct inode *vp = VFS_I(ip); if (vp) return vn_count(vp); diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index 25eb2a9e8d9b..683ce16210ff 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -19,24 +19,9 @@ #define __XFS_VNODE_H__ struct file; -struct bhv_vattr; struct xfs_iomap; struct attrlist_cursor_kern; -typedef struct inode bhv_vnode_t; - -/* - * Vnode to Linux inode mapping. - */ -static inline bhv_vnode_t *vn_from_inode(struct inode *inode) -{ - return inode; -} -static inline struct inode *vn_to_inode(bhv_vnode_t *vnode) -{ - return vnode; -} - /* * Return values for xfs_inactive. A return value of * VN_INACTIVE_NOCACHE implies that the file system behavior @@ -66,87 +51,8 @@ static inline struct inode *vn_to_inode(bhv_vnode_t *vnode) Prevent VM access to the pages until the operation completes. */ -/* - * Vnode attributes. va_mask indicates those attributes the caller - * wants to set or extract. - */ -typedef struct bhv_vattr { - int va_mask; /* bit-mask of attributes present */ - mode_t va_mode; /* file access mode and type */ - xfs_nlink_t va_nlink; /* number of references to file */ - uid_t va_uid; /* owner user id */ - gid_t va_gid; /* owner group id */ - xfs_ino_t va_nodeid; /* file id */ - xfs_off_t va_size; /* file size in bytes */ - u_long va_blocksize; /* blocksize preferred for i/o */ - struct timespec va_atime; /* time of last access */ - struct timespec va_mtime; /* time of last modification */ - struct timespec va_ctime; /* time file changed */ - u_int va_gen; /* generation number of file */ - xfs_dev_t va_rdev; /* device the special file represents */ - __int64_t va_nblocks; /* number of blocks allocated */ - u_long va_xflags; /* random extended file flags */ - u_long va_extsize; /* file extent size */ - u_long va_nextents; /* number of extents in file */ - u_long va_anextents; /* number of attr extents in file */ - prid_t va_projid; /* project id */ -} bhv_vattr_t; - -/* - * setattr or getattr attributes - */ -#define XFS_AT_TYPE 0x00000001 -#define XFS_AT_MODE 0x00000002 -#define XFS_AT_UID 0x00000004 -#define XFS_AT_GID 0x00000008 -#define XFS_AT_FSID 0x00000010 -#define XFS_AT_NODEID 0x00000020 -#define XFS_AT_NLINK 0x00000040 -#define XFS_AT_SIZE 0x00000080 -#define XFS_AT_ATIME 0x00000100 -#define XFS_AT_MTIME 0x00000200 -#define XFS_AT_CTIME 0x00000400 -#define XFS_AT_RDEV 0x00000800 -#define XFS_AT_BLKSIZE 0x00001000 -#define XFS_AT_NBLOCKS 0x00002000 -#define XFS_AT_VCODE 0x00004000 -#define XFS_AT_MAC 0x00008000 -#define XFS_AT_UPDATIME 0x00010000 -#define XFS_AT_UPDMTIME 0x00020000 -#define XFS_AT_UPDCTIME 0x00040000 -#define XFS_AT_ACL 0x00080000 -#define XFS_AT_CAP 0x00100000 -#define XFS_AT_INF 0x00200000 -#define XFS_AT_XFLAGS 0x00400000 -#define XFS_AT_EXTSIZE 0x00800000 -#define XFS_AT_NEXTENTS 0x01000000 -#define XFS_AT_ANEXTENTS 0x02000000 -#define XFS_AT_PROJID 0x04000000 -#define XFS_AT_SIZE_NOPERM 0x08000000 -#define XFS_AT_GENCOUNT 0x10000000 - -#define XFS_AT_ALL (XFS_AT_TYPE|XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID|\ - XFS_AT_FSID|XFS_AT_NODEID|XFS_AT_NLINK|XFS_AT_SIZE|\ - XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME|XFS_AT_RDEV|\ - XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|XFS_AT_MAC|\ - XFS_AT_ACL|XFS_AT_CAP|XFS_AT_INF|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|\ - XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_PROJID|XFS_AT_GENCOUNT) - -#define XFS_AT_STAT (XFS_AT_TYPE|XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID|\ - XFS_AT_FSID|XFS_AT_NODEID|XFS_AT_NLINK|XFS_AT_SIZE|\ - XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME|XFS_AT_RDEV|\ - XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_PROJID) - -#define XFS_AT_TIMES (XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME) - -#define XFS_AT_UPDTIMES (XFS_AT_UPDATIME|XFS_AT_UPDMTIME|XFS_AT_UPDCTIME) - -#define XFS_AT_NOSET (XFS_AT_NLINK|XFS_AT_RDEV|XFS_AT_FSID|XFS_AT_NODEID|\ - XFS_AT_TYPE|XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|\ - XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_GENCOUNT) extern void vn_init(void); -extern int vn_revalidate(bhv_vnode_t *); /* * Yeah, these don't take vnode anymore at all, all this should be @@ -156,57 +62,52 @@ extern void vn_iowait(struct xfs_inode *ip); extern void vn_iowake(struct xfs_inode *ip); extern void vn_ioerror(struct xfs_inode *ip, int error, char *f, int l); -static inline int vn_count(bhv_vnode_t *vp) +static inline int vn_count(struct inode *vp) { - return atomic_read(&vn_to_inode(vp)->i_count); + return atomic_read(&vp->i_count); } -/* - * Vnode reference counting functions (and macros for compatibility). - */ -extern bhv_vnode_t *vn_hold(bhv_vnode_t *); +#define IHOLD(ip) \ +do { \ + ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ + atomic_inc(&(VFS_I(ip)->i_count)); \ + xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \ +} while (0) -#if defined(XFS_INODE_TRACE) -#define VN_HOLD(vp) \ - ((void)vn_hold(vp), \ - xfs_itrace_hold(xfs_vtoi(vp), __FILE__, __LINE__, (inst_t *)__return_address)) -#define VN_RELE(vp) \ - (xfs_itrace_rele(xfs_vtoi(vp), __FILE__, __LINE__, (inst_t *)__return_address), \ - iput(vn_to_inode(vp))) -#else -#define VN_HOLD(vp) ((void)vn_hold(vp)) -#define VN_RELE(vp) (iput(vn_to_inode(vp))) -#endif +#define IRELE(ip) \ +do { \ + xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \ + iput(VFS_I(ip)); \ +} while (0) -static inline bhv_vnode_t *vn_grab(bhv_vnode_t *vp) +static inline struct inode *vn_grab(struct inode *vp) { - struct inode *inode = igrab(vn_to_inode(vp)); - return inode ? vn_from_inode(inode) : NULL; + return igrab(vp); } /* * Dealing with bad inodes */ -static inline int VN_BAD(bhv_vnode_t *vp) +static inline int VN_BAD(struct inode *vp) { - return is_bad_inode(vn_to_inode(vp)); + return is_bad_inode(vp); } /* * Extracting atime values in various formats */ -static inline void vn_atime_to_bstime(bhv_vnode_t *vp, xfs_bstime_t *bs_atime) +static inline void vn_atime_to_bstime(struct inode *vp, xfs_bstime_t *bs_atime) { bs_atime->tv_sec = vp->i_atime.tv_sec; bs_atime->tv_nsec = vp->i_atime.tv_nsec; } -static inline void vn_atime_to_timespec(bhv_vnode_t *vp, struct timespec *ts) +static inline void vn_atime_to_timespec(struct inode *vp, struct timespec *ts) { *ts = vp->i_atime; } -static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt) +static inline void vn_atime_to_time_t(struct inode *vp, time_t *tt) { *tt = vp->i_atime.tv_sec; } @@ -214,20 +115,11 @@ static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt) /* * Some useful predicates. */ -#define VN_MAPPED(vp) mapping_mapped(vn_to_inode(vp)->i_mapping) -#define VN_CACHED(vp) (vn_to_inode(vp)->i_mapping->nrpages) -#define VN_DIRTY(vp) mapping_tagged(vn_to_inode(vp)->i_mapping, \ +#define VN_MAPPED(vp) mapping_mapped(vp->i_mapping) +#define VN_CACHED(vp) (vp->i_mapping->nrpages) +#define VN_DIRTY(vp) mapping_tagged(vp->i_mapping, \ PAGECACHE_TAG_DIRTY) -/* - * Flags to vop_setattr/getattr. - */ -#define ATTR_UTIME 0x01 /* non-default utime(2) request */ -#define ATTR_DMI 0x08 /* invocation from a DMI function */ -#define ATTR_LAZY 0x80 /* set/get attributes lazily */ -#define ATTR_NONBLOCK 0x100 /* return EAGAIN if operation would block */ -#define ATTR_NOLOCK 0x200 /* Don't grab any conflicting locks */ -#define ATTR_NOSIZETOK 0x400 /* Don't get the SIZE token */ /* * Tracking vnode activity. diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c new file mode 100644 index 000000000000..964621fde6ed --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_xattr.c @@ -0,0 +1,330 @@ +/* + * Copyright (C) 2008 Christoph Hellwig. + * Portions Copyright (C) 2000-2008 Silicon Graphics, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_acl.h" +#include "xfs_vnodeops.h" + +#include <linux/posix_acl_xattr.h> +#include <linux/xattr.h> + + +/* + * ACL handling. Should eventually be moved into xfs_acl.c + */ + +static int +xfs_decode_acl(const char *name) +{ + if (strcmp(name, "posix_acl_access") == 0) + return _ACL_TYPE_ACCESS; + else if (strcmp(name, "posix_acl_default") == 0) + return _ACL_TYPE_DEFAULT; + return -EINVAL; +} + +/* + * Get system extended attributes which at the moment only + * includes Posix ACLs. + */ +static int +xfs_xattr_system_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + int acl; + + acl = xfs_decode_acl(name); + if (acl < 0) + return acl; + + return xfs_acl_vget(inode, buffer, size, acl); +} + +static int +xfs_xattr_system_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + int acl; + + acl = xfs_decode_acl(name); + if (acl < 0) + return acl; + if (flags & XATTR_CREATE) + return -EINVAL; + + if (!value) + return xfs_acl_vremove(inode, acl); + + return xfs_acl_vset(inode, (void *)value, size, acl); +} + +static struct xattr_handler xfs_xattr_system_handler = { + .prefix = XATTR_SYSTEM_PREFIX, + .get = xfs_xattr_system_get, + .set = xfs_xattr_system_set, +}; + + +/* + * Real xattr handling. The only difference between the namespaces is + * a flag passed to the low-level attr code. + */ + +static int +__xfs_xattr_get(struct inode *inode, const char *name, + void *value, size_t size, int xflags) +{ + struct xfs_inode *ip = XFS_I(inode); + int error, asize = size; + + if (strcmp(name, "") == 0) + return -EINVAL; + + /* Convert Linux syscall to XFS internal ATTR flags */ + if (!size) { + xflags |= ATTR_KERNOVAL; + value = NULL; + } + + error = -xfs_attr_get(ip, name, value, &asize, xflags); + if (error) + return error; + return asize; +} + +static int +__xfs_xattr_set(struct inode *inode, const char *name, const void *value, + size_t size, int flags, int xflags) +{ + struct xfs_inode *ip = XFS_I(inode); + + if (strcmp(name, "") == 0) + return -EINVAL; + + /* Convert Linux syscall to XFS internal ATTR flags */ + if (flags & XATTR_CREATE) + xflags |= ATTR_CREATE; + if (flags & XATTR_REPLACE) + xflags |= ATTR_REPLACE; + + if (!value) + return -xfs_attr_remove(ip, name, xflags); + return -xfs_attr_set(ip, name, (void *)value, size, xflags); +} + +static int +xfs_xattr_user_get(struct inode *inode, const char *name, + void *value, size_t size) +{ + return __xfs_xattr_get(inode, name, value, size, 0); +} + +static int +xfs_xattr_user_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return __xfs_xattr_set(inode, name, value, size, flags, 0); +} + +static struct xattr_handler xfs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .get = xfs_xattr_user_get, + .set = xfs_xattr_user_set, +}; + + +static int +xfs_xattr_trusted_get(struct inode *inode, const char *name, + void *value, size_t size) +{ + return __xfs_xattr_get(inode, name, value, size, ATTR_ROOT); +} + +static int +xfs_xattr_trusted_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return __xfs_xattr_set(inode, name, value, size, flags, ATTR_ROOT); +} + +static struct xattr_handler xfs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = xfs_xattr_trusted_get, + .set = xfs_xattr_trusted_set, +}; + + +static int +xfs_xattr_secure_get(struct inode *inode, const char *name, + void *value, size_t size) +{ + return __xfs_xattr_get(inode, name, value, size, ATTR_SECURE); +} + +static int +xfs_xattr_secure_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return __xfs_xattr_set(inode, name, value, size, flags, ATTR_SECURE); +} + +static struct xattr_handler xfs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = xfs_xattr_secure_get, + .set = xfs_xattr_secure_set, +}; + + +struct xattr_handler *xfs_xattr_handlers[] = { + &xfs_xattr_user_handler, + &xfs_xattr_trusted_handler, + &xfs_xattr_security_handler, + &xfs_xattr_system_handler, + NULL +}; + +static unsigned int xfs_xattr_prefix_len(int flags) +{ + if (flags & XFS_ATTR_SECURE) + return sizeof("security"); + else if (flags & XFS_ATTR_ROOT) + return sizeof("trusted"); + else + return sizeof("user"); +} + +static const char *xfs_xattr_prefix(int flags) +{ + if (flags & XFS_ATTR_SECURE) + return xfs_xattr_security_handler.prefix; + else if (flags & XFS_ATTR_ROOT) + return xfs_xattr_trusted_handler.prefix; + else + return xfs_xattr_user_handler.prefix; +} + +static int +xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags, + char *name, int namelen, int valuelen, char *value) +{ + unsigned int prefix_len = xfs_xattr_prefix_len(flags); + char *offset; + int arraytop; + + ASSERT(context->count >= 0); + + /* + * Only show root namespace entries if we are actually allowed to + * see them. + */ + if ((flags & XFS_ATTR_ROOT) && !capable(CAP_SYS_ADMIN)) + return 0; + + arraytop = context->count + prefix_len + namelen + 1; + if (arraytop > context->firstu) { + context->count = -1; /* insufficient space */ + return 1; + } + offset = (char *)context->alist + context->count; + strncpy(offset, xfs_xattr_prefix(flags), prefix_len); + offset += prefix_len; + strncpy(offset, name, namelen); /* real name */ + offset += namelen; + *offset = '\0'; + context->count += prefix_len + namelen + 1; + return 0; +} + +static int +xfs_xattr_put_listent_sizes(struct xfs_attr_list_context *context, int flags, + char *name, int namelen, int valuelen, char *value) +{ + context->count += xfs_xattr_prefix_len(flags) + namelen + 1; + return 0; +} + +static int +list_one_attr(const char *name, const size_t len, void *data, + size_t size, ssize_t *result) +{ + char *p = data + *result; + + *result += len; + if (!size) + return 0; + if (*result > size) + return -ERANGE; + + strcpy(p, name); + return 0; +} + +ssize_t +xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) +{ + struct xfs_attr_list_context context; + struct attrlist_cursor_kern cursor = { 0 }; + struct inode *inode = dentry->d_inode; + int error; + + /* + * First read the regular on-disk attributes. + */ + memset(&context, 0, sizeof(context)); + context.dp = XFS_I(inode); + context.cursor = &cursor; + context.resynch = 1; + context.alist = data; + context.bufsize = size; + context.firstu = context.bufsize; + + if (size) + context.put_listent = xfs_xattr_put_listent; + else + context.put_listent = xfs_xattr_put_listent_sizes; + + xfs_attr_list_int(&context); + if (context.count < 0) + return -ERANGE; + + /* + * Then add the two synthetic ACL attributes. + */ + if (xfs_acl_vhasacl_access(inode)) { + error = list_one_attr(POSIX_ACL_XATTR_ACCESS, + strlen(POSIX_ACL_XATTR_ACCESS) + 1, + data, size, &context.count); + if (error) + return error; + } + + if (xfs_acl_vhasacl_default(inode)) { + error = list_one_attr(POSIX_ACL_XATTR_DEFAULT, + strlen(POSIX_ACL_XATTR_DEFAULT) + 1, + data, size, &context.count); + if (error) + return error; + } + + return context.count; +} diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 85df3288efd5..f2705f2fd43c 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -101,11 +101,18 @@ xfs_qm_dqinit( if (brandnewdquot) { dqp->dq_flnext = dqp->dq_flprev = dqp; mutex_init(&dqp->q_qlock); - initnsema(&dqp->q_flock, 1, "fdq"); sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq"); + /* + * Because we want to use a counting completion, complete + * the flush completion once to allow a single access to + * the flush completion without blocking. + */ + init_completion(&dqp->q_flush); + complete(&dqp->q_flush); + #ifdef XFS_DQUOT_TRACE - dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_SLEEP); + dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_NOFS); xfs_dqtrace_entry(dqp, "DQINIT"); #endif } else { @@ -150,7 +157,6 @@ xfs_qm_dqdestroy( ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp)); mutex_destroy(&dqp->q_qlock); - freesema(&dqp->q_flock); sv_destroy(&dqp->q_pinwait); #ifdef XFS_DQUOT_TRACE @@ -431,7 +437,7 @@ xfs_qm_dqalloc( * when it unlocks the inode. Since we want to keep the quota * inode around, we bump the vnode ref count now. */ - VN_HOLD(XFS_ITOV(quotip)); + IHOLD(quotip); xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); nmaps = 1; @@ -1211,7 +1217,7 @@ xfs_qm_dqflush( int error; ASSERT(XFS_DQ_IS_LOCKED(dqp)); - ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp)); + ASSERT(!completion_done(&dqp->q_flush)); xfs_dqtrace_entry(dqp, "DQFLUSH"); /* @@ -1348,34 +1354,18 @@ xfs_qm_dqflush_done( xfs_dqfunlock(dqp); } - -int -xfs_qm_dqflock_nowait( - xfs_dquot_t *dqp) -{ - int locked; - - locked = cpsema(&((dqp)->q_flock)); - - /* XXX ifdef these out */ - if (locked) - (dqp)->dq_flags |= XFS_DQ_FLOCKED; - return (locked); -} - - int xfs_qm_dqlock_nowait( xfs_dquot_t *dqp) { - return (mutex_trylock(&((dqp)->q_qlock))); + return mutex_trylock(&dqp->q_qlock); } void xfs_dqlock( xfs_dquot_t *dqp) { - mutex_lock(&(dqp->q_qlock)); + mutex_lock(&dqp->q_qlock); } void @@ -1435,8 +1425,7 @@ xfs_dqlock2( /* ARGSUSED */ int xfs_qm_dqpurge( - xfs_dquot_t *dqp, - uint flags) + xfs_dquot_t *dqp) { xfs_dqhash_t *thishash; xfs_mount_t *mp = dqp->q_mount; @@ -1469,7 +1458,7 @@ xfs_qm_dqpurge( * if we're turning off quotas. Basically, we need this flush * lock, and are willing to block on it. */ - if (! xfs_qm_dqflock_nowait(dqp)) { + if (!xfs_dqflock_nowait(dqp)) { /* * Block on the flush lock after nudging dquot buffer, * if it is incore. diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h index 5c371a92e3e2..8958d0faf8d3 100644 --- a/fs/xfs/quota/xfs_dquot.h +++ b/fs/xfs/quota/xfs_dquot.h @@ -82,7 +82,7 @@ typedef struct xfs_dquot { xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */ xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ mutex_t q_qlock; /* quota lock */ - sema_t q_flock; /* flush lock */ + struct completion q_flush; /* flush completion queue */ uint q_pincount; /* pin count for this dquot */ sv_t q_pinwait; /* sync var for pinning */ #ifdef XFS_DQUOT_TRACE @@ -113,17 +113,25 @@ XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp) /* - * The following three routines simply manage the q_flock - * semaphore embedded in the dquot. This semaphore synchronizes - * processes attempting to flush the in-core dquot back to disk. + * Manage the q_flush completion queue embedded in the dquot. This completion + * queue synchronizes processes attempting to flush the in-core dquot back to + * disk. */ -#define xfs_dqflock(dqp) { psema(&((dqp)->q_flock), PINOD | PRECALC);\ - (dqp)->dq_flags |= XFS_DQ_FLOCKED; } -#define xfs_dqfunlock(dqp) { ASSERT(issemalocked(&((dqp)->q_flock))); \ - vsema(&((dqp)->q_flock)); \ - (dqp)->dq_flags &= ~(XFS_DQ_FLOCKED); } +static inline void xfs_dqflock(xfs_dquot_t *dqp) +{ + wait_for_completion(&dqp->q_flush); +} + +static inline int xfs_dqflock_nowait(xfs_dquot_t *dqp) +{ + return try_wait_for_completion(&dqp->q_flush); +} + +static inline void xfs_dqfunlock(xfs_dquot_t *dqp) +{ + complete(&dqp->q_flush); +} -#define XFS_DQ_IS_FLUSH_LOCKED(dqp) (issemalocked(&((dqp)->q_flock))) #define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp)) #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) #define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) @@ -164,10 +172,9 @@ extern void xfs_qm_dqprint(xfs_dquot_t *); extern void xfs_qm_dqdestroy(xfs_dquot_t *); extern int xfs_qm_dqflush(xfs_dquot_t *, uint); -extern int xfs_qm_dqpurge(xfs_dquot_t *, uint); +extern int xfs_qm_dqpurge(xfs_dquot_t *); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); extern int xfs_qm_dqlock_nowait(xfs_dquot_t *); -extern int xfs_qm_dqflock_nowait(xfs_dquot_t *); extern void xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp); extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, xfs_disk_dquot_t *); diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index 36e05ca78412..f028644caa5e 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -151,7 +151,7 @@ xfs_qm_dquot_logitem_push( dqp = logitem->qli_dquot; ASSERT(XFS_DQ_IS_LOCKED(dqp)); - ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp)); + ASSERT(!completion_done(&dqp->q_flush)); /* * Since we were able to lock the dquot's flush lock and @@ -245,7 +245,7 @@ xfs_qm_dquot_logitem_pushbuf( * inode flush completed and the inode was taken off the AIL. * So, just get out. */ - if (!issemalocked(&(dqp->q_flock)) || + if (completion_done(&dqp->q_flush) || ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) { qip->qli_pushbuf_flag = 0; xfs_dqunlock(dqp); @@ -258,7 +258,7 @@ xfs_qm_dquot_logitem_pushbuf( if (bp != NULL) { if (XFS_BUF_ISDELAYWRITE(bp)) { dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) && - issemalocked(&(dqp->q_flock))); + !completion_done(&dqp->q_flush)); qip->qli_pushbuf_flag = 0; xfs_dqunlock(dqp); @@ -317,7 +317,7 @@ xfs_qm_dquot_logitem_trylock( return (XFS_ITEM_LOCKED); retval = XFS_ITEM_SUCCESS; - if (! xfs_qm_dqflock_nowait(dqp)) { + if (!xfs_dqflock_nowait(dqp)) { /* * The dquot is already being flushed. It may have been * flushed delayed write, however, and we don't want to @@ -576,8 +576,8 @@ xfs_qm_qoffend_logitem_committed( * xfs_trans_delete_ail() drops the AIL lock. */ xfs_trans_delete_ail(qfs->qql_item.li_mountp, (xfs_log_item_t *)qfs); - kmem_free(qfs, sizeof(xfs_qoff_logitem_t)); - kmem_free(qfe, sizeof(xfs_qoff_logitem_t)); + kmem_free(qfs); + kmem_free(qfe); return (xfs_lsn_t)-1; } diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index d31cce1165c5..df0ffef9775a 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -192,8 +192,8 @@ xfs_qm_destroy( xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i])); xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i])); } - kmem_free(xqm->qm_usr_dqhtable, hsize * sizeof(xfs_dqhash_t)); - kmem_free(xqm->qm_grp_dqhtable, hsize * sizeof(xfs_dqhash_t)); + kmem_free(xqm->qm_usr_dqhtable); + kmem_free(xqm->qm_grp_dqhtable); xqm->qm_usr_dqhtable = NULL; xqm->qm_grp_dqhtable = NULL; xqm->qm_dqhashmask = 0; @@ -201,7 +201,7 @@ xfs_qm_destroy( #ifdef DEBUG mutex_destroy(&qcheck_lock); #endif - kmem_free(xqm, sizeof(xfs_qm_t)); + kmem_free(xqm); } /* @@ -310,8 +310,7 @@ xfs_qm_unmount_quotadestroy( */ void xfs_qm_mount_quotas( - xfs_mount_t *mp, - int mfsi_flags) + xfs_mount_t *mp) { int error = 0; uint sbf; @@ -346,8 +345,7 @@ xfs_qm_mount_quotas( /* * If any of the quotas are not consistent, do a quotacheck. */ - if (XFS_QM_NEED_QUOTACHECK(mp) && - !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) { + if (XFS_QM_NEED_QUOTACHECK(mp)) { error = xfs_qm_quotacheck(mp); if (error) { /* Quotacheck failed and disabled quotas. */ @@ -445,11 +443,11 @@ xfs_qm_unmount_quotas( } } if (uqp) { - XFS_PURGE_INODE(uqp); + IRELE(uqp); mp->m_quotainfo->qi_uquotaip = NULL; } if (gqp) { - XFS_PURGE_INODE(gqp); + IRELE(gqp); mp->m_quotainfo->qi_gquotaip = NULL; } out: @@ -484,7 +482,7 @@ again: xfs_dqtrace_entry(dqp, "FLUSHALL: DQDIRTY"); /* XXX a sentinel would be better */ recl = XFS_QI_MPLRECLAIMS(mp); - if (! xfs_qm_dqflock_nowait(dqp)) { + if (!xfs_dqflock_nowait(dqp)) { /* * If we can't grab the flush lock then check * to see if the dquot has been flushed delayed @@ -631,7 +629,7 @@ xfs_qm_dqpurge_int( * freelist in INACTIVE state. */ nextdqp = dqp->MPL_NEXT; - nmisses += xfs_qm_dqpurge(dqp, flags); + nmisses += xfs_qm_dqpurge(dqp); dqp = nextdqp; } xfs_qm_mplist_unlock(mp); @@ -1062,7 +1060,7 @@ xfs_qm_sync( /* XXX a sentinel would be better */ recl = XFS_QI_MPLRECLAIMS(mp); - if (! xfs_qm_dqflock_nowait(dqp)) { + if (!xfs_dqflock_nowait(dqp)) { if (nowait) { xfs_dqunlock(dqp); continue; @@ -1134,7 +1132,7 @@ xfs_qm_init_quotainfo( * and change the superblock accordingly. */ if ((error = xfs_qm_init_quotainos(mp))) { - kmem_free(qinf, sizeof(xfs_quotainfo_t)); + kmem_free(qinf); mp->m_quotainfo = NULL; return error; } @@ -1240,15 +1238,15 @@ xfs_qm_destroy_quotainfo( xfs_qm_list_destroy(&qi->qi_dqlist); if (qi->qi_uquotaip) { - XFS_PURGE_INODE(qi->qi_uquotaip); + IRELE(qi->qi_uquotaip); qi->qi_uquotaip = NULL; /* paranoia */ } if (qi->qi_gquotaip) { - XFS_PURGE_INODE(qi->qi_gquotaip); + IRELE(qi->qi_gquotaip); qi->qi_gquotaip = NULL; } mutex_destroy(&qi->qi_quotaofflock); - kmem_free(qi, sizeof(xfs_quotainfo_t)); + kmem_free(qi); mp->m_quotainfo = NULL; } @@ -1394,7 +1392,7 @@ xfs_qm_qino_alloc( * locked exclusively and joined to the transaction already. */ ASSERT(xfs_isilocked(*ip, XFS_ILOCK_EXCL)); - VN_HOLD(XFS_ITOV((*ip))); + IHOLD(*ip); /* * Make the changes in the superblock, and log those too. @@ -1623,7 +1621,7 @@ xfs_qm_dqiterate( break; } while (nmaps > 0); - kmem_free(map, XFS_DQITER_MAP_SIZE * sizeof(*map)); + kmem_free(map); return error; } @@ -2079,7 +2077,7 @@ xfs_qm_shake_freelist( * Try to grab the flush lock. If this dquot is in the process of * getting flushed to disk, we don't want to reclaim it. */ - if (! xfs_qm_dqflock_nowait(dqp)) { + if (!xfs_dqflock_nowait(dqp)) { xfs_dqunlock(dqp); dqp = dqp->dq_flnext; continue; @@ -2257,7 +2255,7 @@ xfs_qm_dqreclaim_one(void) * Try to grab the flush lock. If this dquot is in the process of * getting flushed to disk, we don't want to reclaim it. */ - if (! xfs_qm_dqflock_nowait(dqp)) { + if (!xfs_dqflock_nowait(dqp)) { xfs_dqunlock(dqp); continue; } diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h index cd2300e374af..44f25349e478 100644 --- a/fs/xfs/quota/xfs_qm.h +++ b/fs/xfs/quota/xfs_qm.h @@ -165,7 +165,7 @@ typedef struct xfs_dquot_acct { #define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--) extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); -extern void xfs_qm_mount_quotas(xfs_mount_t *, int); +extern void xfs_qm_mount_quotas(xfs_mount_t *); extern int xfs_qm_quotacheck(xfs_mount_t *); extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *); extern int xfs_qm_unmount_quotas(xfs_mount_t *); diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c index f4f6c4c861d7..eea2e60b456b 100644 --- a/fs/xfs/quota/xfs_qm_bhv.c +++ b/fs/xfs/quota/xfs_qm_bhv.c @@ -162,7 +162,7 @@ xfs_qm_newmount( * mounting, and get on with the boring life * without disk quotas. */ - xfs_qm_mount_quotas(mp, 0); + xfs_qm_mount_quotas(mp); } else { /* * Clear the quota flags, but remember them. This @@ -184,13 +184,12 @@ STATIC int xfs_qm_endmount( xfs_mount_t *mp, uint needquotamount, - uint quotaflags, - int mfsi_flags) + uint quotaflags) { if (needquotamount) { ASSERT(mp->m_qflags == 0); mp->m_qflags = quotaflags; - xfs_qm_mount_quotas(mp, mfsi_flags); + xfs_qm_mount_quotas(mp); } #if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 768a3b27d2b6..1a3b803dfa55 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -362,11 +362,11 @@ xfs_qm_scall_quotaoff( * if we don't need them anymore. */ if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) { - XFS_PURGE_INODE(XFS_QI_UQIP(mp)); + IRELE(XFS_QI_UQIP(mp)); XFS_QI_UQIP(mp) = NULL; } if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && XFS_QI_GQIP(mp)) { - XFS_PURGE_INODE(XFS_QI_GQIP(mp)); + IRELE(XFS_QI_GQIP(mp)); XFS_QI_GQIP(mp) = NULL; } out_error: @@ -1034,7 +1034,7 @@ xfs_qm_dqrele_all_inodes( { xfs_inode_t *ip, *topino; uint ireclaims; - bhv_vnode_t *vp; + struct inode *vp; boolean_t vnode_refd; ASSERT(mp->m_quotainfo); @@ -1059,7 +1059,7 @@ again: ip = ip->i_mnext; continue; } - vp = XFS_ITOV_NULL(ip); + vp = VFS_I(ip); if (!vp) { ASSERT(ip->i_udquot == NULL); ASSERT(ip->i_gdquot == NULL); @@ -1449,14 +1449,14 @@ xfs_qm_internalqcheck( for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { xfs_dqtest_cmp(d); e = (xfs_dqtest_t *) d->HL_NEXT; - kmem_free(d, sizeof(xfs_dqtest_t)); + kmem_free(d); d = e; } h1 = &qmtest_gdqtab[i]; for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { xfs_dqtest_cmp(d); e = (xfs_dqtest_t *) d->HL_NEXT; - kmem_free(d, sizeof(xfs_dqtest_t)); + kmem_free(d); d = e; } } @@ -1467,8 +1467,8 @@ xfs_qm_internalqcheck( } else { cmn_err(CE_DEBUG, "******** quotacheck successful! ********"); } - kmem_free(qmtest_udqtab, qmtest_hashmask * sizeof(xfs_dqhash_t)); - kmem_free(qmtest_gdqtab, qmtest_hashmask * sizeof(xfs_dqhash_t)); + kmem_free(qmtest_udqtab); + kmem_free(qmtest_gdqtab); mutex_unlock(&qcheck_lock); return (qmtest_nfails); } diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h index 5e4a40b1c565..c4fcea600bc2 100644 --- a/fs/xfs/quota/xfs_quota_priv.h +++ b/fs/xfs/quota/xfs_quota_priv.h @@ -158,9 +158,6 @@ for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \ #define XFS_IS_SUSER_DQUOT(dqp) \ (!((dqp)->q_core.d_id)) -#define XFS_PURGE_INODE(ip) \ - IRELE(ip); - #define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???"))) diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c index 0b75d302508f..a34ef05489b1 100644 --- a/fs/xfs/support/ktrace.c +++ b/fs/xfs/support/ktrace.c @@ -89,7 +89,7 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep) if (sleep & KM_SLEEP) panic("ktrace_alloc: NULL memory on KM_SLEEP request!"); - kmem_free(ktp, sizeof(*ktp)); + kmem_free(ktp); return NULL; } @@ -126,7 +126,7 @@ ktrace_free(ktrace_t *ktp) } else { entries_size = (int)(ktp->kt_nentries * sizeof(ktrace_entry_t)); - kmem_free(ktp->kt_entries, entries_size); + kmem_free(ktp->kt_entries); } kmem_zone_free(ktrace_hdr_zone, ktp); diff --git a/fs/xfs/support/uuid.c b/fs/xfs/support/uuid.c index 493a6ecf8590..5830c040ea7e 100644 --- a/fs/xfs/support/uuid.c +++ b/fs/xfs/support/uuid.c @@ -17,7 +17,7 @@ */ #include <xfs.h> -static mutex_t uuid_monitor; +static DEFINE_MUTEX(uuid_monitor); static int uuid_table_size; static uuid_t *uuid_table; @@ -132,9 +132,3 @@ uuid_table_remove(uuid_t *uuid) ASSERT(i < uuid_table_size); mutex_unlock(&uuid_monitor); } - -void __init -uuid_init(void) -{ - mutex_init(&uuid_monitor); -} diff --git a/fs/xfs/support/uuid.h b/fs/xfs/support/uuid.h index b6f5922199ba..cff5b607d445 100644 --- a/fs/xfs/support/uuid.h +++ b/fs/xfs/support/uuid.h @@ -22,7 +22,6 @@ typedef struct { unsigned char __u_bits[16]; } uuid_t; -extern void uuid_init(void); extern void uuid_create_nil(uuid_t *uuid); extern int uuid_is_nil(uuid_t *uuid); extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index ebee3a4f703a..b2f639a1416f 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -37,15 +37,15 @@ #include <linux/capability.h> #include <linux/posix_acl_xattr.h> -STATIC int xfs_acl_setmode(bhv_vnode_t *, xfs_acl_t *, int *); +STATIC int xfs_acl_setmode(struct inode *, xfs_acl_t *, int *); STATIC void xfs_acl_filter_mode(mode_t, xfs_acl_t *); STATIC void xfs_acl_get_endian(xfs_acl_t *); STATIC int xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *); STATIC int xfs_acl_invalid(xfs_acl_t *); STATIC void xfs_acl_sync_mode(mode_t, xfs_acl_t *); -STATIC void xfs_acl_get_attr(bhv_vnode_t *, xfs_acl_t *, int, int, int *); -STATIC void xfs_acl_set_attr(bhv_vnode_t *, xfs_acl_t *, int, int *); -STATIC int xfs_acl_allow_set(bhv_vnode_t *, int); +STATIC void xfs_acl_get_attr(struct inode *, xfs_acl_t *, int, int, int *); +STATIC void xfs_acl_set_attr(struct inode *, xfs_acl_t *, int, int *); +STATIC int xfs_acl_allow_set(struct inode *, int); kmem_zone_t *xfs_acl_zone; @@ -55,7 +55,7 @@ kmem_zone_t *xfs_acl_zone; */ int xfs_acl_vhasacl_access( - bhv_vnode_t *vp) + struct inode *vp) { int error; @@ -68,7 +68,7 @@ xfs_acl_vhasacl_access( */ int xfs_acl_vhasacl_default( - bhv_vnode_t *vp) + struct inode *vp) { int error; @@ -207,7 +207,7 @@ posix_acl_xfs_to_xattr( int xfs_acl_vget( - bhv_vnode_t *vp, + struct inode *vp, void *acl, size_t size, int kind) @@ -217,7 +217,6 @@ xfs_acl_vget( posix_acl_xattr_header *ext_acl = acl; int flags = 0; - VN_HOLD(vp); if(size) { if (!(_ACL_ALLOC(xfs_acl))) { error = ENOMEM; @@ -239,11 +238,10 @@ xfs_acl_vget( goto out; } if (kind == _ACL_TYPE_ACCESS) - xfs_acl_sync_mode(xfs_vtoi(vp)->i_d.di_mode, xfs_acl); + xfs_acl_sync_mode(XFS_I(vp)->i_d.di_mode, xfs_acl); error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size); } out: - VN_RELE(vp); if(xfs_acl) _ACL_FREE(xfs_acl); return -error; @@ -251,28 +249,26 @@ out: int xfs_acl_vremove( - bhv_vnode_t *vp, + struct inode *vp, int kind) { int error; - VN_HOLD(vp); error = xfs_acl_allow_set(vp, kind); if (!error) { - error = xfs_attr_remove(xfs_vtoi(vp), + error = xfs_attr_remove(XFS_I(vp), kind == _ACL_TYPE_DEFAULT? SGI_ACL_DEFAULT: SGI_ACL_FILE, ATTR_ROOT); if (error == ENOATTR) error = 0; /* 'scool */ } - VN_RELE(vp); return -error; } int xfs_acl_vset( - bhv_vnode_t *vp, + struct inode *vp, void *acl, size_t size, int kind) @@ -298,7 +294,6 @@ xfs_acl_vset( return 0; } - VN_HOLD(vp); error = xfs_acl_allow_set(vp, kind); /* Incoming ACL exists, set file mode based on its value */ @@ -321,7 +316,6 @@ xfs_acl_vset( } out: - VN_RELE(vp); _ACL_FREE(xfs_acl); return -error; } @@ -341,8 +335,7 @@ xfs_acl_iaccess( /* If the file has no ACL return -1. */ rval = sizeof(xfs_acl_t); - if (xfs_attr_fetch(ip, &acl_name, (char *)acl, &rval, - ATTR_ROOT | ATTR_KERNACCESS)) { + if (xfs_attr_fetch(ip, &acl_name, (char *)acl, &rval, ATTR_ROOT)) { _ACL_FREE(acl); return -1; } @@ -364,7 +357,7 @@ xfs_acl_iaccess( STATIC int xfs_acl_allow_set( - bhv_vnode_t *vp, + struct inode *vp, int kind) { if (vp->i_flags & (S_IMMUTABLE|S_APPEND)) @@ -373,7 +366,7 @@ xfs_acl_allow_set( return ENOTDIR; if (vp->i_sb->s_flags & MS_RDONLY) return EROFS; - if (xfs_vtoi(vp)->i_d.di_uid != current->fsuid && !capable(CAP_FOWNER)) + if (XFS_I(vp)->i_d.di_uid != current->fsuid && !capable(CAP_FOWNER)) return EPERM; return 0; } @@ -567,7 +560,7 @@ xfs_acl_get_endian( */ STATIC void xfs_acl_get_attr( - bhv_vnode_t *vp, + struct inode *vp, xfs_acl_t *aclp, int kind, int flags, @@ -577,7 +570,7 @@ xfs_acl_get_attr( ASSERT((flags & ATTR_KERNOVAL) ? (aclp == NULL) : 1); flags |= ATTR_ROOT; - *error = xfs_attr_get(xfs_vtoi(vp), + *error = xfs_attr_get(XFS_I(vp), kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE : SGI_ACL_DEFAULT, (char *)aclp, &len, flags); @@ -591,7 +584,7 @@ xfs_acl_get_attr( */ STATIC void xfs_acl_set_attr( - bhv_vnode_t *vp, + struct inode *vp, xfs_acl_t *aclp, int kind, int *error) @@ -616,7 +609,7 @@ xfs_acl_set_attr( INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm); } INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt); - *error = xfs_attr_set(xfs_vtoi(vp), + *error = xfs_attr_set(XFS_I(vp), kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE: SGI_ACL_DEFAULT, (char *)newacl, len, ATTR_ROOT); @@ -625,7 +618,7 @@ xfs_acl_set_attr( int xfs_acl_vtoacl( - bhv_vnode_t *vp, + struct inode *vp, xfs_acl_t *access_acl, xfs_acl_t *default_acl) { @@ -640,7 +633,7 @@ xfs_acl_vtoacl( if (error) access_acl->acl_cnt = XFS_ACL_NOT_PRESENT; else /* We have a good ACL and the file mode, synchronize. */ - xfs_acl_sync_mode(xfs_vtoi(vp)->i_d.di_mode, access_acl); + xfs_acl_sync_mode(XFS_I(vp)->i_d.di_mode, access_acl); } if (default_acl) { @@ -657,7 +650,7 @@ xfs_acl_vtoacl( */ int xfs_acl_inherit( - bhv_vnode_t *vp, + struct inode *vp, mode_t mode, xfs_acl_t *pdaclp) { @@ -716,11 +709,11 @@ out_error: */ STATIC int xfs_acl_setmode( - bhv_vnode_t *vp, + struct inode *vp, xfs_acl_t *acl, int *basicperms) { - bhv_vattr_t va; + struct iattr iattr; xfs_acl_entry_t *ap; xfs_acl_entry_t *gap = NULL; int i, nomask = 1; @@ -734,25 +727,25 @@ xfs_acl_setmode( * Copy the u::, g::, o::, and m:: bits from the ACL into the * mode. The m:: bits take precedence over the g:: bits. */ - va.va_mask = XFS_AT_MODE; - va.va_mode = xfs_vtoi(vp)->i_d.di_mode; - va.va_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO); + iattr.ia_valid = ATTR_MODE; + iattr.ia_mode = XFS_I(vp)->i_d.di_mode; + iattr.ia_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO); ap = acl->acl_entry; for (i = 0; i < acl->acl_cnt; ++i) { switch (ap->ae_tag) { case ACL_USER_OBJ: - va.va_mode |= ap->ae_perm << 6; + iattr.ia_mode |= ap->ae_perm << 6; break; case ACL_GROUP_OBJ: gap = ap; break; case ACL_MASK: /* more than just standard modes */ nomask = 0; - va.va_mode |= ap->ae_perm << 3; + iattr.ia_mode |= ap->ae_perm << 3; *basicperms = 0; break; case ACL_OTHER: - va.va_mode |= ap->ae_perm; + iattr.ia_mode |= ap->ae_perm; break; default: /* more than just standard modes */ *basicperms = 0; @@ -763,9 +756,9 @@ xfs_acl_setmode( /* Set the group bits from ACL_GROUP_OBJ if there's no ACL_MASK */ if (gap && nomask) - va.va_mode |= gap->ae_perm << 3; + iattr.ia_mode |= gap->ae_perm << 3; - return xfs_setattr(xfs_vtoi(vp), &va, 0, sys_cred); + return xfs_setattr(XFS_I(vp), &iattr, 0, sys_cred); } /* diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 332a772461c4..a4e293b93efa 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -46,6 +46,8 @@ typedef struct xfs_acl { #define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) #define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) +#define _ACL_TYPE_ACCESS 1 +#define _ACL_TYPE_DEFAULT 2 #ifdef CONFIG_XFS_POSIX_ACL @@ -57,17 +59,15 @@ extern struct kmem_zone *xfs_acl_zone; (zone) = kmem_zone_init(sizeof(xfs_acl_t), (name)) #define xfs_acl_zone_destroy(zone) kmem_zone_destroy(zone) -extern int xfs_acl_inherit(bhv_vnode_t *, mode_t mode, xfs_acl_t *); +extern int xfs_acl_inherit(struct inode *, mode_t mode, xfs_acl_t *); extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *); -extern int xfs_acl_vtoacl(bhv_vnode_t *, xfs_acl_t *, xfs_acl_t *); -extern int xfs_acl_vhasacl_access(bhv_vnode_t *); -extern int xfs_acl_vhasacl_default(bhv_vnode_t *); -extern int xfs_acl_vset(bhv_vnode_t *, void *, size_t, int); -extern int xfs_acl_vget(bhv_vnode_t *, void *, size_t, int); -extern int xfs_acl_vremove(bhv_vnode_t *, int); +extern int xfs_acl_vtoacl(struct inode *, xfs_acl_t *, xfs_acl_t *); +extern int xfs_acl_vhasacl_access(struct inode *); +extern int xfs_acl_vhasacl_default(struct inode *); +extern int xfs_acl_vset(struct inode *, void *, size_t, int); +extern int xfs_acl_vget(struct inode *, void *, size_t, int); +extern int xfs_acl_vremove(struct inode *, int); -#define _ACL_TYPE_ACCESS 1 -#define _ACL_TYPE_DEFAULT 2 #define _ACL_PERM_INVALID(perm) ((perm) & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE)) #define _ACL_INHERIT(c,m,d) (xfs_acl_inherit(c,m,d)) diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h index f9472a2076d4..0b3b5efe848c 100644 --- a/fs/xfs/xfs_arch.h +++ b/fs/xfs/xfs_arch.h @@ -92,16 +92,6 @@ ((__u8*)(pointer))[1] = (((value) ) & 0xff); \ } -/* define generic INT_ macros */ - -#define INT_GET(reference,arch) \ - (((arch) == ARCH_NOCONVERT) \ - ? \ - (reference) \ - : \ - INT_SWAP((reference),(reference)) \ - ) - /* does not return a value */ #define INT_SET(reference,arch,valueref) \ (__builtin_constant_p(valueref) ? \ @@ -112,64 +102,6 @@ ) \ ) -/* does not return a value */ -#define INT_MOD_EXPR(reference,arch,code) \ - (((arch) == ARCH_NOCONVERT) \ - ? \ - (void)((reference) code) \ - : \ - (void)( \ - (reference) = INT_GET((reference),arch) , \ - ((reference) code), \ - INT_SET(reference, arch, reference) \ - ) \ - ) - -/* does not return a value */ -#define INT_MOD(reference,arch,delta) \ - (void)( \ - INT_MOD_EXPR(reference,arch,+=(delta)) \ - ) - -/* - * INT_COPY - copy a value between two locations with the - * _same architecture_ but _potentially different sizes_ - * - * if the types of the two parameters are equal or they are - * in native architecture, a simple copy is done - * - * otherwise, architecture conversions are done - * - */ - -/* does not return a value */ -#define INT_COPY(dst,src,arch) \ - ( \ - ((sizeof(dst) == sizeof(src)) || ((arch) == ARCH_NOCONVERT)) \ - ? \ - (void)((dst) = (src)) \ - : \ - INT_SET(dst, arch, INT_GET(src, arch)) \ - ) - -/* - * INT_XLATE - copy a value in either direction between two locations - * with different architectures - * - * dir < 0 - copy from memory to buffer (native to arch) - * dir > 0 - copy from buffer to memory (arch to native) - */ - -/* does not return a value */ -#define INT_XLATE(buf,mem,dir,arch) {\ - ASSERT(dir); \ - if (dir>0) { \ - (mem)=INT_GET(buf, arch); \ - } else { \ - INT_SET(buf, arch, mem); \ - } \ -} - /* * In directories inode numbers are stored as unaligned arrays of unsigned * 8bit integers on disk. diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index df151a859186..f7cdc28aff41 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -16,8 +16,6 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ -#include <linux/capability.h> - #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" @@ -57,11 +55,6 @@ * Provide the external interfaces to manage attribute lists. */ -#define ATTR_SYSCOUNT 2 -static struct attrnames posix_acl_access; -static struct attrnames posix_acl_default; -static struct attrnames *attr_system_names[ATTR_SYSCOUNT]; - /*======================================================================== * Function prototypes for the kernel. *========================================================================*/ @@ -116,6 +109,17 @@ xfs_attr_name_to_xname( return 0; } +STATIC int +xfs_inode_hasattr( + struct xfs_inode *ip) +{ + if (!XFS_IFORK_Q(ip) || + (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_anextents == 0)) + return 0; + return 1; +} + /*======================================================================== * Overall external interface routines. *========================================================================*/ @@ -127,10 +131,8 @@ xfs_attr_fetch(xfs_inode_t *ip, struct xfs_name *name, xfs_da_args_t args; int error; - if ((XFS_IFORK_Q(ip) == 0) || - (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_anextents == 0)) - return(ENOATTR); + if (!xfs_inode_hasattr(ip)) + return ENOATTR; /* * Fill in the arg structure for this request. @@ -148,11 +150,7 @@ xfs_attr_fetch(xfs_inode_t *ip, struct xfs_name *name, /* * Decide on what work routines to call based on the inode size. */ - if (XFS_IFORK_Q(ip) == 0 || - (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_anextents == 0)) { - error = XFS_ERROR(ENOATTR); - } else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { error = xfs_attr_shortform_getvalue(&args); } else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) { error = xfs_attr_leaf_get(&args); @@ -196,6 +194,46 @@ xfs_attr_get( return(error); } +/* + * Calculate how many blocks we need for the new attribute, + */ +int +xfs_attr_calc_size( + struct xfs_inode *ip, + int namelen, + int valuelen, + int *local) +{ + struct xfs_mount *mp = ip->i_mount; + int size; + int nblks; + + /* + * Determine space new attribute will use, and if it would be + * "local" or "remote" (note: local != inline). + */ + size = xfs_attr_leaf_newentsize(namelen, valuelen, + mp->m_sb.sb_blocksize, local); + + nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); + if (*local) { + if (size > (mp->m_sb.sb_blocksize >> 1)) { + /* Double split possible */ + nblks *= 2; + } + } else { + /* + * Out of line attribute, cannot double split, but + * make room for the attribute value itself. + */ + uint dblocks = XFS_B_TO_FSB(mp, valuelen); + nblks += dblocks; + nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK); + } + + return nblks; +} + STATIC int xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name, char *value, int valuelen, int flags) @@ -204,10 +242,9 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name, xfs_fsblock_t firstblock; xfs_bmap_free_t flist; int error, err2, committed; - int local, size; - uint nblks; xfs_mount_t *mp = dp->i_mount; int rsvd = (flags & ATTR_ROOT) != 0; + int local; /* * Attach the dquots to the inode. @@ -241,33 +278,10 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name, args.firstblock = &firstblock; args.flist = &flist; args.whichfork = XFS_ATTR_FORK; - args.addname = 1; - args.oknoent = 1; - - /* - * Determine space new attribute will use, and if it would be - * "local" or "remote" (note: local != inline). - */ - size = xfs_attr_leaf_newentsize(name->len, valuelen, - mp->m_sb.sb_blocksize, &local); - - nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); - if (local) { - if (size > (mp->m_sb.sb_blocksize >> 1)) { - /* Double split possible */ - nblks <<= 1; - } - } else { - uint dblocks = XFS_B_TO_FSB(mp, valuelen); - /* Out of line attribute, cannot double split, but make - * room for the attribute value itself. - */ - nblks += dblocks; - nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK); - } + args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; /* Size is now blocks for attribute data */ - args.total = nblks; + args.total = xfs_attr_calc_size(dp, name->len, valuelen, &local); /* * Start our first transaction of the day. @@ -289,18 +303,17 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name, if (rsvd) args.trans->t_flags |= XFS_TRANS_RESERVE; - if ((error = xfs_trans_reserve(args.trans, (uint) nblks, - XFS_ATTRSET_LOG_RES(mp, nblks), - 0, XFS_TRANS_PERM_LOG_RES, - XFS_ATTRSET_LOG_COUNT))) { + if ((error = xfs_trans_reserve(args.trans, args.total, + XFS_ATTRSET_LOG_RES(mp, args.total), 0, + XFS_TRANS_PERM_LOG_RES, XFS_ATTRSET_LOG_COUNT))) { xfs_trans_cancel(args.trans, 0); return(error); } xfs_ilock(dp, XFS_ILOCK_EXCL); - error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, args.trans, dp, nblks, 0, - rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : - XFS_QMOPT_RES_REGBLKS); + error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, args.trans, dp, args.total, 0, + rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : + XFS_QMOPT_RES_REGBLKS); if (error) { xfs_iunlock(dp, XFS_ILOCK_EXCL); xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES); @@ -387,7 +400,9 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name, * Commit the leaf transformation. We'll need another (linked) * transaction to add the new attribute to the leaf. */ - if ((error = xfs_attr_rolltrans(&args.trans, dp))) + + error = xfs_trans_roll(&args.trans, dp); + if (error) goto out; } @@ -529,9 +544,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) /* * Decide on what work routines to call based on the inode size. */ - if (XFS_IFORK_Q(dp) == 0 || - (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - dp->i_d.di_anextents == 0)) { + if (!xfs_inode_hasattr(dp)) { error = XFS_ERROR(ENOATTR); goto out; } @@ -601,29 +614,33 @@ xfs_attr_remove( return error; xfs_ilock(dp, XFS_ILOCK_SHARED); - if (XFS_IFORK_Q(dp) == 0 || - (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - dp->i_d.di_anextents == 0)) { + if (!xfs_inode_hasattr(dp)) { xfs_iunlock(dp, XFS_ILOCK_SHARED); - return(XFS_ERROR(ENOATTR)); + return XFS_ERROR(ENOATTR); } xfs_iunlock(dp, XFS_ILOCK_SHARED); return xfs_attr_remove_int(dp, &xname, flags); } -STATIC int +int xfs_attr_list_int(xfs_attr_list_context_t *context) { int error; xfs_inode_t *dp = context->dp; + XFS_STATS_INC(xs_attr_list); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return EIO; + + xfs_ilock(dp, XFS_ILOCK_SHARED); + xfs_attr_trace_l_c("syscall start", context); + /* * Decide on what work routines to call based on the inode size. */ - if (XFS_IFORK_Q(dp) == 0 || - (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - dp->i_d.di_anextents == 0)) { + if (!xfs_inode_hasattr(dp)) { error = 0; } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { error = xfs_attr_shortform_list(context); @@ -632,6 +649,10 @@ xfs_attr_list_int(xfs_attr_list_context_t *context) } else { error = xfs_attr_node_list(context); } + + xfs_iunlock(dp, XFS_ILOCK_SHARED); + xfs_attr_trace_l_c("syscall end", context); + return error; } @@ -648,74 +669,50 @@ xfs_attr_list_int(xfs_attr_list_context_t *context) */ /*ARGSUSED*/ STATIC int -xfs_attr_put_listent(xfs_attr_list_context_t *context, attrnames_t *namesp, +xfs_attr_put_listent(xfs_attr_list_context_t *context, int flags, char *name, int namelen, int valuelen, char *value) { + struct attrlist *alist = (struct attrlist *)context->alist; attrlist_ent_t *aep; int arraytop; ASSERT(!(context->flags & ATTR_KERNOVAL)); ASSERT(context->count >= 0); ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); - ASSERT(context->firstu >= sizeof(*context->alist)); + ASSERT(context->firstu >= sizeof(*alist)); ASSERT(context->firstu <= context->bufsize); - arraytop = sizeof(*context->alist) + - context->count * sizeof(context->alist->al_offset[0]); + /* + * Only list entries in the right namespace. + */ + if (((context->flags & ATTR_SECURE) == 0) != + ((flags & XFS_ATTR_SECURE) == 0)) + return 0; + if (((context->flags & ATTR_ROOT) == 0) != + ((flags & XFS_ATTR_ROOT) == 0)) + return 0; + + arraytop = sizeof(*alist) + + context->count * sizeof(alist->al_offset[0]); context->firstu -= ATTR_ENTSIZE(namelen); if (context->firstu < arraytop) { xfs_attr_trace_l_c("buffer full", context); - context->alist->al_more = 1; + alist->al_more = 1; context->seen_enough = 1; return 1; } - aep = (attrlist_ent_t *)&(((char *)context->alist)[ context->firstu ]); + aep = (attrlist_ent_t *)&context->alist[context->firstu]; aep->a_valuelen = valuelen; memcpy(aep->a_name, name, namelen); - aep->a_name[ namelen ] = 0; - context->alist->al_offset[ context->count++ ] = context->firstu; - context->alist->al_count = context->count; + aep->a_name[namelen] = 0; + alist->al_offset[context->count++] = context->firstu; + alist->al_count = context->count; xfs_attr_trace_l_c("add", context); return 0; } -STATIC int -xfs_attr_kern_list(xfs_attr_list_context_t *context, attrnames_t *namesp, - char *name, int namelen, - int valuelen, char *value) -{ - char *offset; - int arraytop; - - ASSERT(context->count >= 0); - - arraytop = context->count + namesp->attr_namelen + namelen + 1; - if (arraytop > context->firstu) { - context->count = -1; /* insufficient space */ - return 1; - } - offset = (char *)context->alist + context->count; - strncpy(offset, namesp->attr_name, namesp->attr_namelen); - offset += namesp->attr_namelen; - strncpy(offset, name, namelen); /* real name */ - offset += namelen; - *offset = '\0'; - context->count += namesp->attr_namelen + namelen + 1; - return 0; -} - -/*ARGSUSED*/ -STATIC int -xfs_attr_kern_list_sizes(xfs_attr_list_context_t *context, attrnames_t *namesp, - char *name, int namelen, - int valuelen, char *value) -{ - context->count += namesp->attr_namelen + namelen + 1; - return 0; -} - /* * Generate a list of extended attribute names and optionally * also value lengths. Positive return value follows the XFS @@ -732,10 +729,9 @@ xfs_attr_list( attrlist_cursor_kern_t *cursor) { xfs_attr_list_context_t context; + struct attrlist *alist; int error; - XFS_STATS_INC(xs_attr_list); - /* * Validate the cursor. */ @@ -756,52 +752,23 @@ xfs_attr_list( /* * Initialize the output buffer. */ + memset(&context, 0, sizeof(context)); context.dp = dp; context.cursor = cursor; - context.count = 0; - context.dupcnt = 0; context.resynch = 1; context.flags = flags; - context.seen_enough = 0; - context.alist = (attrlist_t *)buffer; - context.put_value = 0; - - if (flags & ATTR_KERNAMELS) { - context.bufsize = bufsize; - context.firstu = context.bufsize; - if (flags & ATTR_KERNOVAL) - context.put_listent = xfs_attr_kern_list_sizes; - else - context.put_listent = xfs_attr_kern_list; - } else { - context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */ - context.firstu = context.bufsize; - context.alist->al_count = 0; - context.alist->al_more = 0; - context.alist->al_offset[0] = context.bufsize; - context.put_listent = xfs_attr_put_listent; - } - - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return EIO; + context.alist = buffer; + context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */ + context.firstu = context.bufsize; + context.put_listent = xfs_attr_put_listent; - xfs_ilock(dp, XFS_ILOCK_SHARED); - xfs_attr_trace_l_c("syscall start", &context); + alist = (struct attrlist *)context.alist; + alist->al_count = 0; + alist->al_more = 0; + alist->al_offset[0] = context.bufsize; error = xfs_attr_list_int(&context); - - xfs_iunlock(dp, XFS_ILOCK_SHARED); - xfs_attr_trace_l_c("syscall end", &context); - - if (context.flags & (ATTR_KERNOVAL|ATTR_KERNAMELS)) { - /* must return negated buffer size or the error */ - if (context.count < 0) - error = XFS_ERROR(ERANGE); - else - error = -context.count; - } else - ASSERT(error >= 0); - + ASSERT(error >= 0); return error; } @@ -816,12 +783,10 @@ xfs_attr_inactive(xfs_inode_t *dp) ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); xfs_ilock(dp, XFS_ILOCK_SHARED); - if ((XFS_IFORK_Q(dp) == 0) || - (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) || - (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - dp->i_d.di_anextents == 0)) { + if (!xfs_inode_hasattr(dp) || + dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { xfs_iunlock(dp, XFS_ILOCK_SHARED); - return(0); + return 0; } xfs_iunlock(dp, XFS_ILOCK_SHARED); @@ -854,10 +819,8 @@ xfs_attr_inactive(xfs_inode_t *dp) /* * Decide on what work routines to call based on the inode size. */ - if ((XFS_IFORK_Q(dp) == 0) || - (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) || - (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - dp->i_d.di_anextents == 0)) { + if (!xfs_inode_hasattr(dp) || + dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { error = 0; goto out; } @@ -974,7 +937,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) xfs_da_brelse(args->trans, bp); return(retval); } - args->rename = 1; /* an atomic rename */ + args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; args->rmtblkno2 = args->rmtblkno; @@ -1019,7 +982,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Commit the current trans (including the inode) and start * a new one. */ - if ((error = xfs_attr_rolltrans(&args->trans, dp))) + error = xfs_trans_roll(&args->trans, dp); + if (error) return (error); /* @@ -1033,7 +997,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Commit the transaction that added the attr name so that * later routines can manage their own transactions. */ - if ((error = xfs_attr_rolltrans(&args->trans, dp))) + error = xfs_trans_roll(&args->trans, dp); + if (error) return (error); /* @@ -1054,7 +1019,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * so that one disappears and one appears atomically. Then we * must remove the "old" attribute/value pair. */ - if (args->rename) { + if (args->op_flags & XFS_DA_OP_RENAME) { /* * In a separate transaction, set the incomplete flag on the * "old" attr and clear the incomplete flag on the "new" attr. @@ -1122,7 +1087,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) /* * Commit the remove and start the next trans in series. */ - error = xfs_attr_rolltrans(&args->trans, dp); + error = xfs_trans_roll(&args->trans, dp); } else if (args->rmtblkno > 0) { /* @@ -1307,7 +1272,7 @@ restart: } else if (retval == EEXIST) { if (args->flags & ATTR_CREATE) goto out; - args->rename = 1; /* atomic rename op */ + args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; args->rmtblkno2 = args->rmtblkno; @@ -1353,7 +1318,8 @@ restart: * Commit the node conversion and start the next * trans in the chain. */ - if ((error = xfs_attr_rolltrans(&args->trans, dp))) + error = xfs_trans_roll(&args->trans, dp); + if (error) goto out; goto restart; @@ -1404,7 +1370,8 @@ restart: * Commit the leaf addition or btree split and start the next * trans in the chain. */ - if ((error = xfs_attr_rolltrans(&args->trans, dp))) + error = xfs_trans_roll(&args->trans, dp); + if (error) goto out; /* @@ -1425,7 +1392,7 @@ restart: * so that one disappears and one appears atomically. Then we * must remove the "old" attribute/value pair. */ - if (args->rename) { + if (args->op_flags & XFS_DA_OP_RENAME) { /* * In a separate transaction, set the incomplete flag on the * "old" attr and clear the incomplete flag on the "new" attr. @@ -1504,7 +1471,8 @@ restart: /* * Commit and start the next trans in the chain. */ - if ((error = xfs_attr_rolltrans(&args->trans, dp))) + error = xfs_trans_roll(&args->trans, dp); + if (error) goto out; } else if (args->rmtblkno > 0) { @@ -1636,7 +1604,8 @@ xfs_attr_node_removename(xfs_da_args_t *args) /* * Commit the Btree join operation and start a new trans. */ - if ((error = xfs_attr_rolltrans(&args->trans, dp))) + error = xfs_trans_roll(&args->trans, dp); + if (error) goto out; } @@ -2137,7 +2106,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) /* * Start the next trans in the chain. */ - if ((error = xfs_attr_rolltrans(&args->trans, dp))) + error = xfs_trans_roll(&args->trans, dp); + if (error) return (error); } @@ -2287,7 +2257,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) /* * Close out trans and start the next one in the chain. */ - if ((error = xfs_attr_rolltrans(&args->trans, args->dp))) + error = xfs_trans_roll(&args->trans, args->dp); + if (error) return (error); } return(0); @@ -2300,23 +2271,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) void xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context) { - xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_C, where, - (__psunsigned_t)context->dp, - (__psunsigned_t)context->cursor->hashval, - (__psunsigned_t)context->cursor->blkno, - (__psunsigned_t)context->cursor->offset, - (__psunsigned_t)context->alist, - (__psunsigned_t)context->bufsize, - (__psunsigned_t)context->count, - (__psunsigned_t)context->firstu, - (__psunsigned_t) - ((context->count > 0) && - !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL))) - ? (ATTR_ENTRY(context->alist, - context->count-1)->a_valuelen) - : 0, - (__psunsigned_t)context->dupcnt, - (__psunsigned_t)context->flags, + xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_C, where, context, (__psunsigned_t)NULL, (__psunsigned_t)NULL, (__psunsigned_t)NULL); @@ -2329,23 +2284,7 @@ void xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context, struct xfs_da_intnode *node) { - xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CN, where, - (__psunsigned_t)context->dp, - (__psunsigned_t)context->cursor->hashval, - (__psunsigned_t)context->cursor->blkno, - (__psunsigned_t)context->cursor->offset, - (__psunsigned_t)context->alist, - (__psunsigned_t)context->bufsize, - (__psunsigned_t)context->count, - (__psunsigned_t)context->firstu, - (__psunsigned_t) - ((context->count > 0) && - !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL))) - ? (ATTR_ENTRY(context->alist, - context->count-1)->a_valuelen) - : 0, - (__psunsigned_t)context->dupcnt, - (__psunsigned_t)context->flags, + xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CN, where, context, (__psunsigned_t)be16_to_cpu(node->hdr.count), (__psunsigned_t)be32_to_cpu(node->btree[0].hashval), (__psunsigned_t)be32_to_cpu(node->btree[ @@ -2359,23 +2298,7 @@ void xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context, struct xfs_da_node_entry *btree) { - xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CB, where, - (__psunsigned_t)context->dp, - (__psunsigned_t)context->cursor->hashval, - (__psunsigned_t)context->cursor->blkno, - (__psunsigned_t)context->cursor->offset, - (__psunsigned_t)context->alist, - (__psunsigned_t)context->bufsize, - (__psunsigned_t)context->count, - (__psunsigned_t)context->firstu, - (__psunsigned_t) - ((context->count > 0) && - !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL))) - ? (ATTR_ENTRY(context->alist, - context->count-1)->a_valuelen) - : 0, - (__psunsigned_t)context->dupcnt, - (__psunsigned_t)context->flags, + xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CB, where, context, (__psunsigned_t)be32_to_cpu(btree->hashval), (__psunsigned_t)be32_to_cpu(btree->before), (__psunsigned_t)NULL); @@ -2388,23 +2311,7 @@ void xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context, struct xfs_attr_leafblock *leaf) { - xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CL, where, - (__psunsigned_t)context->dp, - (__psunsigned_t)context->cursor->hashval, - (__psunsigned_t)context->cursor->blkno, - (__psunsigned_t)context->cursor->offset, - (__psunsigned_t)context->alist, - (__psunsigned_t)context->bufsize, - (__psunsigned_t)context->count, - (__psunsigned_t)context->firstu, - (__psunsigned_t) - ((context->count > 0) && - !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL))) - ? (ATTR_ENTRY(context->alist, - context->count-1)->a_valuelen) - : 0, - (__psunsigned_t)context->dupcnt, - (__psunsigned_t)context->flags, + xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CL, where, context, (__psunsigned_t)be16_to_cpu(leaf->hdr.count), (__psunsigned_t)be32_to_cpu(leaf->entries[0].hashval), (__psunsigned_t)be32_to_cpu(leaf->entries[ @@ -2417,329 +2324,24 @@ xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context, */ void xfs_attr_trace_enter(int type, char *where, - __psunsigned_t a2, __psunsigned_t a3, - __psunsigned_t a4, __psunsigned_t a5, - __psunsigned_t a6, __psunsigned_t a7, - __psunsigned_t a8, __psunsigned_t a9, - __psunsigned_t a10, __psunsigned_t a11, - __psunsigned_t a12, __psunsigned_t a13, - __psunsigned_t a14, __psunsigned_t a15) + struct xfs_attr_list_context *context, + __psunsigned_t a13, __psunsigned_t a14, + __psunsigned_t a15) { ASSERT(xfs_attr_trace_buf); ktrace_enter(xfs_attr_trace_buf, (void *)((__psunsigned_t)type), - (void *)where, - (void *)a2, (void *)a3, (void *)a4, - (void *)a5, (void *)a6, (void *)a7, - (void *)a8, (void *)a9, (void *)a10, - (void *)a11, (void *)a12, (void *)a13, - (void *)a14, (void *)a15); + (void *)((__psunsigned_t)where), + (void *)((__psunsigned_t)context->dp), + (void *)((__psunsigned_t)context->cursor->hashval), + (void *)((__psunsigned_t)context->cursor->blkno), + (void *)((__psunsigned_t)context->cursor->offset), + (void *)((__psunsigned_t)context->alist), + (void *)((__psunsigned_t)context->bufsize), + (void *)((__psunsigned_t)context->count), + (void *)((__psunsigned_t)context->firstu), + NULL, + (void *)((__psunsigned_t)context->dupcnt), + (void *)((__psunsigned_t)context->flags), + (void *)a13, (void *)a14, (void *)a15); } #endif /* XFS_ATTR_TRACE */ - - -/*======================================================================== - * System (pseudo) namespace attribute interface routines. - *========================================================================*/ - -STATIC int -posix_acl_access_set( - bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) -{ - return xfs_acl_vset(vp, data, size, _ACL_TYPE_ACCESS); -} - -STATIC int -posix_acl_access_remove( - bhv_vnode_t *vp, char *name, int xflags) -{ - return xfs_acl_vremove(vp, _ACL_TYPE_ACCESS); -} - -STATIC int -posix_acl_access_get( - bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) -{ - return xfs_acl_vget(vp, data, size, _ACL_TYPE_ACCESS); -} - -STATIC int -posix_acl_access_exists( - bhv_vnode_t *vp) -{ - return xfs_acl_vhasacl_access(vp); -} - -STATIC int -posix_acl_default_set( - bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) -{ - return xfs_acl_vset(vp, data, size, _ACL_TYPE_DEFAULT); -} - -STATIC int -posix_acl_default_get( - bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) -{ - return xfs_acl_vget(vp, data, size, _ACL_TYPE_DEFAULT); -} - -STATIC int -posix_acl_default_remove( - bhv_vnode_t *vp, char *name, int xflags) -{ - return xfs_acl_vremove(vp, _ACL_TYPE_DEFAULT); -} - -STATIC int -posix_acl_default_exists( - bhv_vnode_t *vp) -{ - return xfs_acl_vhasacl_default(vp); -} - -static struct attrnames posix_acl_access = { - .attr_name = "posix_acl_access", - .attr_namelen = sizeof("posix_acl_access") - 1, - .attr_get = posix_acl_access_get, - .attr_set = posix_acl_access_set, - .attr_remove = posix_acl_access_remove, - .attr_exists = posix_acl_access_exists, -}; - -static struct attrnames posix_acl_default = { - .attr_name = "posix_acl_default", - .attr_namelen = sizeof("posix_acl_default") - 1, - .attr_get = posix_acl_default_get, - .attr_set = posix_acl_default_set, - .attr_remove = posix_acl_default_remove, - .attr_exists = posix_acl_default_exists, -}; - -static struct attrnames *attr_system_names[] = - { &posix_acl_access, &posix_acl_default }; - - -/*======================================================================== - * Namespace-prefix-style attribute name interface routines. - *========================================================================*/ - -STATIC int -attr_generic_set( - bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) -{ - return -xfs_attr_set(xfs_vtoi(vp), name, data, size, xflags); -} - -STATIC int -attr_generic_get( - bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) -{ - int error, asize = size; - - error = xfs_attr_get(xfs_vtoi(vp), name, data, &asize, xflags); - if (!error) - return asize; - return -error; -} - -STATIC int -attr_generic_remove( - bhv_vnode_t *vp, char *name, int xflags) -{ - return -xfs_attr_remove(xfs_vtoi(vp), name, xflags); -} - -STATIC int -attr_generic_listadd( - attrnames_t *prefix, - attrnames_t *namesp, - void *data, - size_t size, - ssize_t *result) -{ - char *p = data + *result; - - *result += prefix->attr_namelen; - *result += namesp->attr_namelen + 1; - if (!size) - return 0; - if (*result > size) - return -ERANGE; - strcpy(p, prefix->attr_name); - p += prefix->attr_namelen; - strcpy(p, namesp->attr_name); - p += namesp->attr_namelen + 1; - return 0; -} - -STATIC int -attr_system_list( - bhv_vnode_t *vp, - void *data, - size_t size, - ssize_t *result) -{ - attrnames_t *namesp; - int i, error = 0; - - for (i = 0; i < ATTR_SYSCOUNT; i++) { - namesp = attr_system_names[i]; - if (!namesp->attr_exists || !namesp->attr_exists(vp)) - continue; - error = attr_generic_listadd(&attr_system, namesp, - data, size, result); - if (error) - break; - } - return error; -} - -int -attr_generic_list( - bhv_vnode_t *vp, void *data, size_t size, int xflags, ssize_t *result) -{ - attrlist_cursor_kern_t cursor = { 0 }; - int error; - - error = xfs_attr_list(xfs_vtoi(vp), data, size, xflags, &cursor); - if (error > 0) - return -error; - *result = -error; - return attr_system_list(vp, data, size, result); -} - -attrnames_t * -attr_lookup_namespace( - char *name, - struct attrnames **names, - int nnames) -{ - int i; - - for (i = 0; i < nnames; i++) - if (!strncmp(name, names[i]->attr_name, names[i]->attr_namelen)) - return names[i]; - return NULL; -} - -/* - * Some checks to prevent people abusing EAs to get over quota: - * - Don't allow modifying user EAs on devices/symlinks; - * - Don't allow modifying user EAs if sticky bit set; - */ -STATIC int -attr_user_capable( - bhv_vnode_t *vp, - cred_t *cred) -{ - struct inode *inode = vn_to_inode(vp); - - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; - if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && - (current_fsuid(cred) != inode->i_uid) && !capable(CAP_FOWNER)) - return -EPERM; - return 0; -} - -STATIC int -attr_trusted_capable( - bhv_vnode_t *vp, - cred_t *cred) -{ - struct inode *inode = vn_to_inode(vp); - - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - return 0; -} - -STATIC int -attr_system_set( - bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) -{ - attrnames_t *namesp; - int error; - - if (xflags & ATTR_CREATE) - return -EINVAL; - - namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT); - if (!namesp) - return -EOPNOTSUPP; - error = namesp->attr_set(vp, name, data, size, xflags); - if (!error) - error = vn_revalidate(vp); - return error; -} - -STATIC int -attr_system_get( - bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) -{ - attrnames_t *namesp; - - namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT); - if (!namesp) - return -EOPNOTSUPP; - return namesp->attr_get(vp, name, data, size, xflags); -} - -STATIC int -attr_system_remove( - bhv_vnode_t *vp, char *name, int xflags) -{ - attrnames_t *namesp; - - namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT); - if (!namesp) - return -EOPNOTSUPP; - return namesp->attr_remove(vp, name, xflags); -} - -struct attrnames attr_system = { - .attr_name = "system.", - .attr_namelen = sizeof("system.") - 1, - .attr_flag = ATTR_SYSTEM, - .attr_get = attr_system_get, - .attr_set = attr_system_set, - .attr_remove = attr_system_remove, - .attr_capable = (attrcapable_t)fs_noerr, -}; - -struct attrnames attr_trusted = { - .attr_name = "trusted.", - .attr_namelen = sizeof("trusted.") - 1, - .attr_flag = ATTR_ROOT, - .attr_get = attr_generic_get, - .attr_set = attr_generic_set, - .attr_remove = attr_generic_remove, - .attr_capable = attr_trusted_capable, -}; - -struct attrnames attr_secure = { - .attr_name = "security.", - .attr_namelen = sizeof("security.") - 1, - .attr_flag = ATTR_SECURE, - .attr_get = attr_generic_get, - .attr_set = attr_generic_set, - .attr_remove = attr_generic_remove, - .attr_capable = (attrcapable_t)fs_noerr, -}; - -struct attrnames attr_user = { - .attr_name = "user.", - .attr_namelen = sizeof("user.") - 1, - .attr_get = attr_generic_get, - .attr_set = attr_generic_set, - .attr_remove = attr_generic_remove, - .attr_capable = attr_user_capable, -}; - -struct attrnames *attr_namespaces[] = - { &attr_system, &attr_trusted, &attr_secure, &attr_user }; diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index 6cfc9384fe35..fb3b2a68b9b9 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -18,9 +18,11 @@ #ifndef __XFS_ATTR_H__ #define __XFS_ATTR_H__ +struct xfs_inode; +struct xfs_da_args; +struct xfs_attr_list_context; + /* - * xfs_attr.h - * * Large attribute lists are structured around Btrees where all the data * elements are in the leaf nodes. Attribute names are hashed into an int, * then that int is used as the index into the Btree. Since the hashval @@ -35,35 +37,6 @@ * External interfaces *========================================================================*/ -struct cred; -struct xfs_attr_list_context; - -typedef int (*attrset_t)(bhv_vnode_t *, char *, void *, size_t, int); -typedef int (*attrget_t)(bhv_vnode_t *, char *, void *, size_t, int); -typedef int (*attrremove_t)(bhv_vnode_t *, char *, int); -typedef int (*attrexists_t)(bhv_vnode_t *); -typedef int (*attrcapable_t)(bhv_vnode_t *, struct cred *); - -typedef struct attrnames { - char * attr_name; - unsigned int attr_namelen; - unsigned int attr_flag; - attrget_t attr_get; - attrset_t attr_set; - attrremove_t attr_remove; - attrexists_t attr_exists; - attrcapable_t attr_capable; -} attrnames_t; - -#define ATTR_NAMECOUNT 4 -extern struct attrnames attr_user; -extern struct attrnames attr_secure; -extern struct attrnames attr_system; -extern struct attrnames attr_trusted; -extern struct attrnames *attr_namespaces[ATTR_NAMECOUNT]; - -extern attrnames_t *attr_lookup_namespace(char *, attrnames_t **, int); -extern int attr_generic_list(bhv_vnode_t *, void *, size_t, int, ssize_t *); #define ATTR_DONTFOLLOW 0x0001 /* -- unused, from IRIX -- */ #define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */ @@ -71,16 +44,9 @@ extern int attr_generic_list(bhv_vnode_t *, void *, size_t, int, ssize_t *); #define ATTR_SECURE 0x0008 /* use attrs in security namespace */ #define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */ #define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */ -#define ATTR_SYSTEM 0x0100 /* use attrs in system (pseudo) namespace */ -#define ATTR_KERNACCESS 0x0400 /* [kernel] iaccess, inode held io-locked */ #define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */ #define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ -#define ATTR_KERNAMELS 0x4000 /* [kernel] list attr names (simple list) */ - -#define ATTR_KERNORMALS 0x0800 /* [kernel] normal attr list: user+secure */ -#define ATTR_KERNROOTLS 0x8000 /* [kernel] include root in the attr list */ -#define ATTR_KERNFULLS (ATTR_KERNORMALS|ATTR_KERNROOTLS) /* * The maximum size (into the kernel or returned from the kernel) of an @@ -119,22 +85,6 @@ typedef struct attrlist_ent { /* data from attr_list() */ &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ]) /* - * Multi-attribute operation vector. - */ -typedef struct attr_multiop { - int am_opcode; /* operation to perform (ATTR_OP_GET, etc.) */ - int am_error; /* [out arg] result of this sub-op (an errno) */ - char *am_attrname; /* attribute name to work with */ - char *am_attrvalue; /* [in/out arg] attribute value (raw bytes) */ - int am_length; /* [in/out arg] length of value */ - int am_flags; /* bitwise OR of attr API flags defined above */ -} attr_multiop_t; - -#define ATTR_OP_GET 1 /* return the indicated attr's value */ -#define ATTR_OP_SET 2 /* set/create the indicated attr/value pair */ -#define ATTR_OP_REMOVE 3 /* remove the indicated attr */ - -/* * Kernel-internal version of the attrlist cursor. */ typedef struct attrlist_cursor_kern { @@ -148,20 +98,41 @@ typedef struct attrlist_cursor_kern { /*======================================================================== - * Function prototypes for the kernel. + * Structure used to pass context around among the routines. *========================================================================*/ -struct xfs_inode; -struct attrlist_cursor_kern; -struct xfs_da_args; + +typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int, + char *, int, int, char *); + +typedef struct xfs_attr_list_context { + struct xfs_inode *dp; /* inode */ + struct attrlist_cursor_kern *cursor; /* position in list */ + char *alist; /* output buffer */ + int seen_enough; /* T/F: seen enough of list? */ + ssize_t count; /* num used entries */ + int dupcnt; /* count dup hashvals seen */ + int bufsize; /* total buffer size */ + int firstu; /* first used byte in buffer */ + int flags; /* from VOP call */ + int resynch; /* T/F: resynch with cursor */ + int put_value; /* T/F: need value for listent */ + put_listent_func_t put_listent; /* list output fmt function */ + int index; /* index into output buffer */ +} xfs_attr_list_context_t; + + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ /* * Overall external interface routines. */ +int xfs_attr_calc_size(struct xfs_inode *, int, int, int *); int xfs_attr_inactive(struct xfs_inode *dp); - -int xfs_attr_shortform_getvalue(struct xfs_da_args *); int xfs_attr_fetch(struct xfs_inode *, struct xfs_name *, char *, int *, int); int xfs_attr_rmtval_get(struct xfs_da_args *args); +int xfs_attr_list_int(struct xfs_attr_list_context *); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 303d41e4217b..79da6b2ea99e 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -94,13 +94,6 @@ STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); * Namespace helper routines *========================================================================*/ -STATIC_INLINE attrnames_t * -xfs_attr_flags_namesp(int flags) -{ - return ((flags & XFS_ATTR_SECURE) ? &attr_secure: - ((flags & XFS_ATTR_ROOT) ? &attr_trusted : &attr_user)); -} - /* * If namespace bits don't match return 0. * If all match then return 1. @@ -111,25 +104,6 @@ xfs_attr_namesp_match(int arg_flags, int ondisk_flags) return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags); } -/* - * If namespace bits don't match and we don't have an override for it - * then return 0. - * If all match or are overridable then return 1. - */ -STATIC_INLINE int -xfs_attr_namesp_match_overrides(int arg_flags, int ondisk_flags) -{ - if (((arg_flags & ATTR_SECURE) == 0) != - ((ondisk_flags & XFS_ATTR_SECURE) == 0) && - !(arg_flags & ATTR_KERNORMALS)) - return 0; - if (((arg_flags & ATTR_ROOT) == 0) != - ((ondisk_flags & XFS_ATTR_ROOT) == 0) && - !(arg_flags & ATTR_KERNROOTLS)) - return 0; - return 1; -} - /*======================================================================== * External routines when attribute fork size < XFS_LITINO(mp). @@ -369,9 +343,10 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) * Fix up the start offset of the attribute fork */ totsize -= size; - if (totsize == sizeof(xfs_attr_sf_hdr_t) && !args->addname && - (mp->m_flags & XFS_MOUNT_ATTR2) && - (dp->i_d.di_format != XFS_DINODE_FMT_BTREE)) { + if (totsize == sizeof(xfs_attr_sf_hdr_t) && + !(args->op_flags & XFS_DA_OP_ADDNAME) && + (mp->m_flags & XFS_MOUNT_ATTR2) && + (dp->i_d.di_format != XFS_DINODE_FMT_BTREE)) { /* * Last attribute now removed, revert to original * inode format making all literal area available @@ -389,9 +364,10 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); ASSERT(dp->i_d.di_forkoff); - ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || args->addname || - !(mp->m_flags & XFS_MOUNT_ATTR2) || - dp->i_d.di_format == XFS_DINODE_FMT_BTREE); + ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || + (args->op_flags & XFS_DA_OP_ADDNAME) || + !(mp->m_flags & XFS_MOUNT_ATTR2) || + dp->i_d.di_format == XFS_DINODE_FMT_BTREE); dp->i_afp->if_ext_max = XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); dp->i_df.if_ext_max = @@ -531,7 +507,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) nargs.total = args->total; nargs.whichfork = XFS_ATTR_FORK; nargs.trans = args->trans; - nargs.oknoent = 1; + nargs.op_flags = XFS_DA_OP_OKNOENT; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; i++) { @@ -555,7 +531,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) out: if(bp) xfs_da_buf_done(bp); - kmem_free(tmpbuffer, size); + kmem_free(tmpbuffer); return(error); } @@ -624,15 +600,8 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) (XFS_ISRESET_CURSOR(cursor) && (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) { for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { - attrnames_t *namesp; - - if (!xfs_attr_namesp_match_overrides(context->flags, sfe->flags)) { - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); - continue; - } - namesp = xfs_attr_flags_namesp(sfe->flags); error = context->put_listent(context, - namesp, + sfe->flags, (char *)sfe->nameval, (int)sfe->namelen, (int)sfe->valuelen, @@ -676,13 +645,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) XFS_ERRLEVEL_LOW, context->dp->i_mount, sfe); xfs_attr_trace_l_c("sf corrupted", context); - kmem_free(sbuf, sbsize); + kmem_free(sbuf); return XFS_ERROR(EFSCORRUPTED); } - if (!xfs_attr_namesp_match_overrides(context->flags, sfe->flags)) { - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); - continue; - } + sbp->entno = i; sbp->hash = xfs_da_hashname((char *)sfe->nameval, sfe->namelen); sbp->name = (char *)sfe->nameval; @@ -717,7 +683,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) } } if (i == nsbuf) { - kmem_free(sbuf, sbsize); + kmem_free(sbuf); xfs_attr_trace_l_c("blk end", context); return(0); } @@ -726,16 +692,12 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) * Loop putting entries into the user buffer. */ for ( ; i < nsbuf; i++, sbp++) { - attrnames_t *namesp; - - namesp = xfs_attr_flags_namesp(sbp->flags); - if (cursor->hashval != sbp->hash) { cursor->hashval = sbp->hash; cursor->offset = 0; } error = context->put_listent(context, - namesp, + sbp->flags, sbp->name, sbp->namelen, sbp->valuelen, @@ -747,7 +709,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) cursor->offset++; } - kmem_free(sbuf, sbsize); + kmem_free(sbuf); xfs_attr_trace_l_c("sf E-O-F", context); return(0); } @@ -853,7 +815,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) nargs.total = args->total; nargs.whichfork = XFS_ATTR_FORK; nargs.trans = args->trans; - nargs.oknoent = 1; + nargs.op_flags = XFS_DA_OP_OKNOENT; entry = &leaf->entries[0]; for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { if (entry->flags & XFS_ATTR_INCOMPLETE) @@ -873,7 +835,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) error = 0; out: - kmem_free(tmpbuffer, XFS_LBSIZE(dp->i_mount)); + kmem_free(tmpbuffer); return(error); } @@ -1155,7 +1117,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) entry->hashval = cpu_to_be32(args->hashval); entry->flags = tmp ? XFS_ATTR_LOCAL : 0; entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); - if (args->rename) { + if (args->op_flags & XFS_DA_OP_RENAME) { entry->flags |= XFS_ATTR_INCOMPLETE; if ((args->blkno2 == args->blkno) && (args->index2 <= args->index)) { @@ -1271,7 +1233,7 @@ xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp) be16_to_cpu(hdr_s->count), mp); xfs_da_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1); - kmem_free(tmpbuffer, XFS_LBSIZE(mp)); + kmem_free(tmpbuffer); } /* @@ -1921,7 +1883,7 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, be16_to_cpu(drop_hdr->count), mp); } memcpy((char *)save_leaf, (char *)tmp_leaf, state->blocksize); - kmem_free(tmpbuffer, state->blocksize); + kmem_free(tmpbuffer); } xfs_da_log_buf(state->args->trans, save_blk->bp, 0, @@ -2400,8 +2362,6 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) */ retval = 0; for ( ; (i < be16_to_cpu(leaf->hdr.count)); entry++, i++) { - attrnames_t *namesp; - if (be32_to_cpu(entry->hashval) != cursor->hashval) { cursor->hashval = be32_to_cpu(entry->hashval); cursor->offset = 0; @@ -2409,17 +2369,13 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) if (entry->flags & XFS_ATTR_INCOMPLETE) continue; /* skip incomplete entries */ - if (!xfs_attr_namesp_match_overrides(context->flags, entry->flags)) - continue; - - namesp = xfs_attr_flags_namesp(entry->flags); if (entry->flags & XFS_ATTR_LOCAL) { xfs_attr_leaf_name_local_t *name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i); retval = context->put_listent(context, - namesp, + entry->flags, (char *)name_loc->nameval, (int)name_loc->namelen, be16_to_cpu(name_loc->valuelen), @@ -2446,16 +2402,15 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) if (retval) return retval; retval = context->put_listent(context, - namesp, + entry->flags, (char *)name_rmt->name, (int)name_rmt->namelen, valuelen, (char*)args.value); - kmem_free(args.value, valuelen); - } - else { + kmem_free(args.value); + } else { retval = context->put_listent(context, - namesp, + entry->flags, (char *)name_rmt->name, (int)name_rmt->namelen, valuelen, @@ -2543,9 +2498,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) /* * Commit the flag value change and start the next trans in series. */ - error = xfs_attr_rolltrans(&args->trans, args->dp); - - return(error); + return xfs_trans_roll(&args->trans, args->dp); } /* @@ -2592,9 +2545,7 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) /* * Commit the flag value change and start the next trans in series. */ - error = xfs_attr_rolltrans(&args->trans, args->dp); - - return(error); + return xfs_trans_roll(&args->trans, args->dp); } /* @@ -2710,7 +2661,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) /* * Commit the flag value change and start the next trans in series. */ - error = xfs_attr_rolltrans(&args->trans, args->dp); + error = xfs_trans_roll(&args->trans, args->dp); return(error); } @@ -2768,7 +2719,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) /* * Commit the invalidate and start the next transaction. */ - error = xfs_attr_rolltrans(trans, dp); + error = xfs_trans_roll(trans, dp); return (error); } @@ -2870,7 +2821,8 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, /* * Atomically commit the whole invalidate stuff. */ - if ((error = xfs_attr_rolltrans(trans, dp))) + error = xfs_trans_roll(trans, dp); + if (error) return (error); } @@ -2954,7 +2906,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp) error = tmp; /* save only the 1st errno */ } - kmem_free((xfs_caddr_t)list, size); + kmem_free((xfs_caddr_t)list); return(error); } @@ -3009,7 +2961,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, /* * Roll to next transaction. */ - if ((error = xfs_attr_rolltrans(trans, dp))) + error = xfs_trans_roll(trans, dp); + if (error) return (error); } @@ -3019,60 +2972,3 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, return(0); } - - -/* - * Roll from one trans in the sequence of PERMANENT transactions to the next. - */ -int -xfs_attr_rolltrans(xfs_trans_t **transp, xfs_inode_t *dp) -{ - xfs_trans_t *trans; - unsigned int logres, count; - int error; - - /* - * Ensure that the inode is always logged. - */ - trans = *transp; - xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); - - /* - * Copy the critical parameters from one trans to the next. - */ - logres = trans->t_log_res; - count = trans->t_log_count; - *transp = xfs_trans_dup(trans); - - /* - * Commit the current transaction. - * If this commit failed, then it'd just unlock those items that - * are not marked ihold. That also means that a filesystem shutdown - * is in progress. The caller takes the responsibility to cancel - * the duplicate transaction that gets returned. - */ - if ((error = xfs_trans_commit(trans, 0))) - return (error); - - trans = *transp; - - /* - * Reserve space in the log for th next transaction. - * This also pushes items in the "AIL", the list of logged items, - * out to disk if they are taking up space at the tail of the log - * that we want to use. This requires that either nothing be locked - * across this call, or that anything that is locked be logged in - * the prior and the next transactions. - */ - error = xfs_trans_reserve(trans, 0, logres, 0, - XFS_TRANS_PERM_LOG_RES, count); - /* - * Ensure that the inode is in the new transaction and locked. - */ - if (!error) { - xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(trans, dp); - } - return (error); - -} diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index 040f732ce1e2..83e9af417ca2 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -30,7 +30,7 @@ struct attrlist; struct attrlist_cursor_kern; -struct attrnames; +struct xfs_attr_list_context; struct xfs_dabuf; struct xfs_da_args; struct xfs_da_state; @@ -204,33 +204,6 @@ static inline int xfs_attr_leaf_entsize_local_max(int bsize) return (((bsize) >> 1) + ((bsize) >> 2)); } - -/*======================================================================== - * Structure used to pass context around among the routines. - *========================================================================*/ - - -struct xfs_attr_list_context; - -typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, struct attrnames *, - char *, int, int, char *); - -typedef struct xfs_attr_list_context { - struct xfs_inode *dp; /* inode */ - struct attrlist_cursor_kern *cursor; /* position in list */ - struct attrlist *alist; /* output buffer */ - int seen_enough; /* T/F: seen enough of list? */ - int count; /* num used entries */ - int dupcnt; /* count dup hashvals seen */ - int bufsize; /* total buffer size */ - int firstu; /* first used byte in buffer */ - int flags; /* from VOP call */ - int resynch; /* T/F: resynch with cursor */ - int put_value; /* T/F: need value for listent */ - put_listent_func_t put_listent; /* list output fmt function */ - int index; /* index into output buffer */ -} xfs_attr_list_context_t; - /* * Used to keep a list of "remote value" extents when unlinking an inode. */ @@ -301,6 +274,4 @@ int xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp, struct xfs_dabuf *leaf2_bp); int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local); -int xfs_attr_rolltrans(struct xfs_trans **transp, struct xfs_inode *dp); - #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h index f67f917803b1..ea22839caed2 100644 --- a/fs/xfs/xfs_attr_sf.h +++ b/fs/xfs/xfs_attr_sf.h @@ -97,13 +97,9 @@ void xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context, void xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context, struct xfs_attr_leafblock *leaf); void xfs_attr_trace_enter(int type, char *where, - __psunsigned_t a2, __psunsigned_t a3, - __psunsigned_t a4, __psunsigned_t a5, - __psunsigned_t a6, __psunsigned_t a7, - __psunsigned_t a8, __psunsigned_t a9, - __psunsigned_t a10, __psunsigned_t a11, - __psunsigned_t a12, __psunsigned_t a13, - __psunsigned_t a14, __psunsigned_t a15); + struct xfs_attr_list_context *context, + __psunsigned_t a13, __psunsigned_t a14, + __psunsigned_t a15); #else #define xfs_attr_trace_l_c(w,c) #define xfs_attr_trace_l_cn(w,c,n) diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c index fab0b6d5a41b..48228848f5ae 100644 --- a/fs/xfs/xfs_bit.c +++ b/fs/xfs/xfs_bit.c @@ -25,109 +25,6 @@ * XFS bit manipulation routines, used in non-realtime code. */ -#ifndef HAVE_ARCH_HIGHBIT -/* - * Index of high bit number in byte, -1 for none set, 0..7 otherwise. - */ -static const char xfs_highbit[256] = { - -1, 0, 1, 1, 2, 2, 2, 2, /* 00 .. 07 */ - 3, 3, 3, 3, 3, 3, 3, 3, /* 08 .. 0f */ - 4, 4, 4, 4, 4, 4, 4, 4, /* 10 .. 17 */ - 4, 4, 4, 4, 4, 4, 4, 4, /* 18 .. 1f */ - 5, 5, 5, 5, 5, 5, 5, 5, /* 20 .. 27 */ - 5, 5, 5, 5, 5, 5, 5, 5, /* 28 .. 2f */ - 5, 5, 5, 5, 5, 5, 5, 5, /* 30 .. 37 */ - 5, 5, 5, 5, 5, 5, 5, 5, /* 38 .. 3f */ - 6, 6, 6, 6, 6, 6, 6, 6, /* 40 .. 47 */ - 6, 6, 6, 6, 6, 6, 6, 6, /* 48 .. 4f */ - 6, 6, 6, 6, 6, 6, 6, 6, /* 50 .. 57 */ - 6, 6, 6, 6, 6, 6, 6, 6, /* 58 .. 5f */ - 6, 6, 6, 6, 6, 6, 6, 6, /* 60 .. 67 */ - 6, 6, 6, 6, 6, 6, 6, 6, /* 68 .. 6f */ - 6, 6, 6, 6, 6, 6, 6, 6, /* 70 .. 77 */ - 6, 6, 6, 6, 6, 6, 6, 6, /* 78 .. 7f */ - 7, 7, 7, 7, 7, 7, 7, 7, /* 80 .. 87 */ - 7, 7, 7, 7, 7, 7, 7, 7, /* 88 .. 8f */ - 7, 7, 7, 7, 7, 7, 7, 7, /* 90 .. 97 */ - 7, 7, 7, 7, 7, 7, 7, 7, /* 98 .. 9f */ - 7, 7, 7, 7, 7, 7, 7, 7, /* a0 .. a7 */ - 7, 7, 7, 7, 7, 7, 7, 7, /* a8 .. af */ - 7, 7, 7, 7, 7, 7, 7, 7, /* b0 .. b7 */ - 7, 7, 7, 7, 7, 7, 7, 7, /* b8 .. bf */ - 7, 7, 7, 7, 7, 7, 7, 7, /* c0 .. c7 */ - 7, 7, 7, 7, 7, 7, 7, 7, /* c8 .. cf */ - 7, 7, 7, 7, 7, 7, 7, 7, /* d0 .. d7 */ - 7, 7, 7, 7, 7, 7, 7, 7, /* d8 .. df */ - 7, 7, 7, 7, 7, 7, 7, 7, /* e0 .. e7 */ - 7, 7, 7, 7, 7, 7, 7, 7, /* e8 .. ef */ - 7, 7, 7, 7, 7, 7, 7, 7, /* f0 .. f7 */ - 7, 7, 7, 7, 7, 7, 7, 7, /* f8 .. ff */ -}; -#endif - -/* - * xfs_highbit32: get high bit set out of 32-bit argument, -1 if none set. - */ -inline int -xfs_highbit32( - __uint32_t v) -{ -#ifdef HAVE_ARCH_HIGHBIT - return highbit32(v); -#else - int i; - - if (v & 0xffff0000) - if (v & 0xff000000) - i = 24; - else - i = 16; - else if (v & 0x0000ffff) - if (v & 0x0000ff00) - i = 8; - else - i = 0; - else - return -1; - return i + xfs_highbit[(v >> i) & 0xff]; -#endif -} - -/* - * xfs_lowbit64: get low bit set out of 64-bit argument, -1 if none set. - */ -int -xfs_lowbit64( - __uint64_t v) -{ - __uint32_t w = (__uint32_t)v; - int n = 0; - - if (w) { /* lower bits */ - n = ffs(w); - } else { /* upper bits */ - w = (__uint32_t)(v >> 32); - if (w && (n = ffs(w))) - n += 32; - } - return n - 1; -} - -/* - * xfs_highbit64: get high bit set out of 64-bit argument, -1 if none set. - */ -int -xfs_highbit64( - __uint64_t v) -{ - __uint32_t h = (__uint32_t)(v >> 32); - - if (h) - return xfs_highbit32(h) + 32; - return xfs_highbit32((__uint32_t)v); -} - - /* * Return whether bitmap is empty. * Size is number of words in the bitmap, which is padded to word boundary diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h index 082641a9782c..8e0e463dae2d 100644 --- a/fs/xfs/xfs_bit.h +++ b/fs/xfs/xfs_bit.h @@ -47,13 +47,39 @@ static inline __uint64_t xfs_mask64lo(int n) } /* Get high bit set out of 32-bit argument, -1 if none set */ -extern int xfs_highbit32(__uint32_t v); +static inline int xfs_highbit32(__uint32_t v) +{ + return fls(v) - 1; +} + +/* Get high bit set out of 64-bit argument, -1 if none set */ +static inline int xfs_highbit64(__uint64_t v) +{ + return fls64(v) - 1; +} + +/* Get low bit set out of 32-bit argument, -1 if none set */ +static inline int xfs_lowbit32(__uint32_t v) +{ + unsigned long t = v; + return (v) ? find_first_bit(&t, 32) : -1; +} /* Get low bit set out of 64-bit argument, -1 if none set */ -extern int xfs_lowbit64(__uint64_t v); +static inline int xfs_lowbit64(__uint64_t v) +{ + __uint32_t w = (__uint32_t)v; + int n = 0; -/* Get high bit set out of 64-bit argument, -1 if none set */ -extern int xfs_highbit64(__uint64_t); + if (w) { /* lower bits */ + n = ffs(w); + } else { /* upper bits */ + w = (__uint32_t)(v >> 32); + if (w && (n = ffs(w))) + n += 32; + } + return n - 1; +} /* Return whether bitmap is empty (1 == empty) */ extern int xfs_bitmap_empty(uint *map, uint size); diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 53c259f5a5af..a1aab9275d5a 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -384,14 +384,14 @@ xfs_bmap_count_tree( int levelin, int *count); -STATIC int +STATIC void xfs_bmap_count_leaves( xfs_ifork_t *ifp, xfs_extnum_t idx, int numrecs, int *count); -STATIC int +STATIC void xfs_bmap_disk_count_leaves( xfs_extnum_t idx, xfs_bmbt_block_t *block, @@ -428,7 +428,8 @@ xfs_bmap_add_attrfork_btree( cur->bc_private.b.firstblock = *firstblock; if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat))) goto error0; - ASSERT(stat == 1); /* must be at least one entry */ + /* must be at least one entry */ + XFS_WANT_CORRUPTED_GOTO(stat == 1, error0); if ((error = xfs_bmbt_newroot(cur, flags, &stat))) goto error0; if (stat == 0) { @@ -816,13 +817,13 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_startblock, RIGHT.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_delete(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_decrement(cur, 0, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + @@ -860,7 +861,7 @@ xfs_bmap_add_extent_delay_real( LEFT.br_startblock, LEFT.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + @@ -895,7 +896,7 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_startblock, RIGHT.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff, new->br_startblock, PREV.br_blockcount + @@ -928,11 +929,11 @@ xfs_bmap_add_extent_delay_real( new->br_startblock, new->br_blockcount, &i))) goto done; - ASSERT(i == 0); + XFS_WANT_CORRUPTED_GOTO(i == 0, done); cur->bc_rec.b.br_state = XFS_EXT_NORM; if ((error = xfs_bmbt_insert(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } *dnew = 0; /* DELTA: The in-core extent described by new changed type. */ @@ -963,7 +964,7 @@ xfs_bmap_add_extent_delay_real( LEFT.br_startblock, LEFT.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + @@ -1004,11 +1005,11 @@ xfs_bmap_add_extent_delay_real( new->br_startblock, new->br_blockcount, &i))) goto done; - ASSERT(i == 0); + XFS_WANT_CORRUPTED_GOTO(i == 0, done); cur->bc_rec.b.br_state = XFS_EXT_NORM; if ((error = xfs_bmbt_insert(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && ip->i_d.di_nextents > ip->i_df.if_ext_max) { @@ -1054,7 +1055,7 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_startblock, RIGHT.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, new->br_startoff, new->br_startblock, new->br_blockcount + @@ -1094,11 +1095,11 @@ xfs_bmap_add_extent_delay_real( new->br_startblock, new->br_blockcount, &i))) goto done; - ASSERT(i == 0); + XFS_WANT_CORRUPTED_GOTO(i == 0, done); cur->bc_rec.b.br_state = XFS_EXT_NORM; if ((error = xfs_bmbt_insert(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && ip->i_d.di_nextents > ip->i_df.if_ext_max) { @@ -1149,11 +1150,11 @@ xfs_bmap_add_extent_delay_real( new->br_startblock, new->br_blockcount, &i))) goto done; - ASSERT(i == 0); + XFS_WANT_CORRUPTED_GOTO(i == 0, done); cur->bc_rec.b.br_state = XFS_EXT_NORM; if ((error = xfs_bmbt_insert(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && ip->i_d.di_nextents > ip->i_df.if_ext_max) { @@ -1377,19 +1378,19 @@ xfs_bmap_add_extent_unwritten_real( RIGHT.br_startblock, RIGHT.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_delete(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_decrement(cur, 0, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_delete(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_decrement(cur, 0, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + PREV.br_blockcount + @@ -1426,13 +1427,13 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_delete(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_decrement(cur, 0, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + PREV.br_blockcount, @@ -1469,13 +1470,13 @@ xfs_bmap_add_extent_unwritten_real( RIGHT.br_startblock, RIGHT.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_delete(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_decrement(cur, 0, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, @@ -1508,7 +1509,7 @@ xfs_bmap_add_extent_unwritten_real( new->br_startblock, new->br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, new->br_startoff, new->br_startblock, new->br_blockcount, newext))) @@ -1549,7 +1550,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff + new->br_blockcount, PREV.br_startblock + new->br_blockcount, @@ -1596,7 +1597,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff + new->br_blockcount, PREV.br_startblock + new->br_blockcount, @@ -1606,7 +1607,7 @@ xfs_bmap_add_extent_unwritten_real( cur->bc_rec.b = *new; if ((error = xfs_bmbt_insert(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } /* DELTA: One in-core extent is split in two. */ temp = PREV.br_startoff; @@ -1640,7 +1641,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff, PREV.br_startblock, PREV.br_blockcount - new->br_blockcount, @@ -1682,7 +1683,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff, PREV.br_startblock, PREV.br_blockcount - new->br_blockcount, @@ -1692,11 +1693,11 @@ xfs_bmap_add_extent_unwritten_real( new->br_startblock, new->br_blockcount, &i))) goto done; - ASSERT(i == 0); + XFS_WANT_CORRUPTED_GOTO(i == 0, done); cur->bc_rec.b.br_state = XFS_EXT_NORM; if ((error = xfs_bmbt_insert(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } /* DELTA: One in-core extent is split in two. */ temp = PREV.br_startoff; @@ -1732,27 +1733,34 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); /* new right extent - oldext */ if ((error = xfs_bmbt_update(cur, r[1].br_startoff, r[1].br_startblock, r[1].br_blockcount, r[1].br_state))) goto done; /* new left extent - oldext */ - PREV.br_blockcount = - new->br_startoff - PREV.br_startoff; cur->bc_rec.b = PREV; + cur->bc_rec.b.br_blockcount = + new->br_startoff - PREV.br_startoff; if ((error = xfs_bmbt_insert(cur, &i))) goto done; - ASSERT(i == 1); - if ((error = xfs_bmbt_increment(cur, 0, &i))) + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + /* + * Reset the cursor to the position of the new extent + * we are about to insert as we can't trust it after + * the previous insert. + */ + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 0, done); /* new middle extent - newext */ - cur->bc_rec.b = *new; + cur->bc_rec.b.br_state = new->br_state; if ((error = xfs_bmbt_insert(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } /* DELTA: One in-core extent is split in three. */ temp = PREV.br_startoff; @@ -2097,13 +2105,13 @@ xfs_bmap_add_extent_hole_real( right.br_startblock, right.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_delete(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_decrement(cur, 0, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, left.br_startoff, left.br_startblock, left.br_blockcount + @@ -2139,7 +2147,7 @@ xfs_bmap_add_extent_hole_real( left.br_startblock, left.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, left.br_startoff, left.br_startblock, left.br_blockcount + @@ -2174,7 +2182,7 @@ xfs_bmap_add_extent_hole_real( right.br_startblock, right.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, new->br_startoff, new->br_startblock, new->br_blockcount + @@ -2208,11 +2216,11 @@ xfs_bmap_add_extent_hole_real( new->br_startblock, new->br_blockcount, &i))) goto done; - ASSERT(i == 0); + XFS_WANT_CORRUPTED_GOTO(i == 0, done); cur->bc_rec.b.br_state = new->br_state; if ((error = xfs_bmbt_insert(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } /* DELTA: A new extent was added in a hole. */ temp = new->br_startoff; @@ -3131,7 +3139,7 @@ xfs_bmap_del_extent( got.br_startblock, got.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } da_old = da_new = 0; } else { @@ -3164,7 +3172,7 @@ xfs_bmap_del_extent( } if ((error = xfs_bmbt_delete(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); break; case 2: @@ -3268,7 +3276,7 @@ xfs_bmap_del_extent( got.br_startblock, temp, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); /* * Update the btree record back * to the original value. @@ -3289,7 +3297,7 @@ xfs_bmap_del_extent( error = XFS_ERROR(ENOSPC); goto done; } - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } else flags |= XFS_ILOG_FEXT(whichfork); XFS_IFORK_NEXT_SET(ip, whichfork, @@ -3992,7 +4000,7 @@ xfs_bmap_add_attrfork( ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; } ASSERT(ip->i_d.di_anextents == 0); - VN_HOLD(XFS_ITOV(ip)); + IHOLD(ip); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); switch (ip->i_d.di_format) { @@ -5970,7 +5978,7 @@ unlock_and_return: xfs_iunlock_map_shared(ip, lock); xfs_iunlock(ip, XFS_IOLOCK_SHARED); - kmem_free(map, subnex * sizeof(*map)); + kmem_free(map); return error; } @@ -6088,7 +6096,7 @@ xfs_bmap_get_bp( tp = cur->bc_tp; licp = &tp->t_items; while (!bp && licp != NULL) { - if (XFS_LIC_ARE_ALL_FREE(licp)) { + if (xfs_lic_are_all_free(licp)) { licp = licp->lic_next; continue; } @@ -6098,11 +6106,11 @@ xfs_bmap_get_bp( xfs_buf_log_item_t *bip; xfs_buf_t *lbp; - if (XFS_LIC_ISFREE(licp, i)) { + if (xfs_lic_isfree(licp, i)) { continue; } - lidp = XFS_LIC_SLOT(licp, i); + lidp = xfs_lic_slot(licp, i); lip = lidp->lid_item; if (lip->li_type != XFS_LI_BUF) continue; @@ -6359,13 +6367,9 @@ xfs_bmap_count_blocks( mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { - if (unlikely(xfs_bmap_count_leaves(ifp, 0, + xfs_bmap_count_leaves(ifp, 0, ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), - count) < 0)) { - XFS_ERROR_REPORT("xfs_bmap_count_blocks(1)", - XFS_ERRLEVEL_LOW, mp); - return XFS_ERROR(EFSCORRUPTED); - } + count); return 0; } @@ -6446,13 +6450,7 @@ xfs_bmap_count_tree( for (;;) { nextbno = be64_to_cpu(block->bb_rightsib); numrecs = be16_to_cpu(block->bb_numrecs); - if (unlikely(xfs_bmap_disk_count_leaves(0, - block, numrecs, count) < 0)) { - xfs_trans_brelse(tp, bp); - XFS_ERROR_REPORT("xfs_bmap_count_tree(2)", - XFS_ERRLEVEL_LOW, mp); - return XFS_ERROR(EFSCORRUPTED); - } + xfs_bmap_disk_count_leaves(0, block, numrecs, count); xfs_trans_brelse(tp, bp); if (nextbno == NULLFSBLOCK) break; @@ -6470,7 +6468,7 @@ xfs_bmap_count_tree( /* * Count leaf blocks given a range of extent records. */ -STATIC int +STATIC void xfs_bmap_count_leaves( xfs_ifork_t *ifp, xfs_extnum_t idx, @@ -6483,14 +6481,13 @@ xfs_bmap_count_leaves( xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b); *count += xfs_bmbt_get_blockcount(frp); } - return 0; } /* * Count leaf blocks given a range of extent records originally * in btree format. */ -STATIC int +STATIC void xfs_bmap_disk_count_leaves( xfs_extnum_t idx, xfs_bmbt_block_t *block, @@ -6504,5 +6501,4 @@ xfs_bmap_disk_count_leaves( frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, idx + b); *count += xfs_bmbt_disk_get_blockcount(frp); } - return 0; } diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 6ff70cda451c..9f3e3a836d15 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -54,12 +54,23 @@ typedef struct xfs_bmap_free_item /* * Header for free extent list. + * + * xbf_low is used by the allocator to activate the lowspace algorithm - + * when free space is running low the extent allocator may choose to + * allocate an extent from an AG without leaving sufficient space for + * a btree split when inserting the new extent. In this case the allocator + * will enable the lowspace algorithm which is supposed to allow further + * allocations (such as btree splits and newroots) to allocate from + * sequential AGs. In order to avoid locking AGs out of order the lowspace + * algorithm will start searching for free space from AG 0. If the correct + * transaction reservations have been made then this algorithm will eventually + * find all the space it needs. */ typedef struct xfs_bmap_free { xfs_bmap_free_item_t *xbf_first; /* list of to-be-free extents */ int xbf_count; /* count of items on list */ - int xbf_low; /* kludge: alloc in low mode */ + int xbf_low; /* alloc in low mode */ } xfs_bmap_free_t; #define XFS_BMAP_MAX_NMAP 4 diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 4f0e849d973e..23efad29a5cd 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -1493,12 +1493,27 @@ xfs_bmbt_split( left = XFS_BUF_TO_BMBT_BLOCK(lbp); args.fsbno = cur->bc_private.b.firstblock; args.firstblock = args.fsbno; + args.minleft = 0; if (args.fsbno == NULLFSBLOCK) { args.fsbno = lbno; args.type = XFS_ALLOCTYPE_START_BNO; - } else + /* + * Make sure there is sufficient room left in the AG to + * complete a full tree split for an extent insert. If + * we are converting the middle part of an extent then + * we may need space for two tree splits. + * + * We are relying on the caller to make the correct block + * reservation for this operation to succeed. If the + * reservation amount is insufficient then we may fail a + * block allocation here and corrupt the filesystem. + */ + args.minleft = xfs_trans_get_block_res(args.tp); + } else if (cur->bc_private.b.flist->xbf_low) + args.type = XFS_ALLOCTYPE_START_BNO; + else args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.mod = args.minleft = args.alignment = args.total = args.isfl = + args.mod = args.alignment = args.total = args.isfl = args.userdata = args.minalignslop = 0; args.minlen = args.maxlen = args.prod = 1; args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; @@ -1510,6 +1525,21 @@ xfs_bmbt_split( XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; } + if (args.fsbno == NULLFSBLOCK && args.minleft) { + /* + * Could not find an AG with enough free space to satisfy + * a full btree split. Try again without minleft and if + * successful activate the lowspace algorithm. + */ + args.fsbno = 0; + args.type = XFS_ALLOCTYPE_FIRST_AG; + args.minleft = 0; + if ((error = xfs_alloc_vextent(&args))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + cur->bc_private.b.flist->xbf_low = 1; + } if (args.fsbno == NULLFSBLOCK) { XFS_BMBT_TRACE_CURSOR(cur, EXIT); *stat = 0; @@ -2029,22 +2059,8 @@ xfs_bmbt_increment( * Insert the current record at the point referenced by cur. * * A multi-level split of the tree on insert will invalidate the original - * cursor. It appears, however, that some callers assume that the cursor is - * always valid. Hence if we do a multi-level split we need to revalidate the - * cursor. - * - * When a split occurs, we will see a new cursor returned. Use that as a - * trigger to determine if we need to revalidate the original cursor. If we get - * a split, then use the original irec to lookup up the path of the record we - * just inserted. - * - * Note that the fact that the btree root is in the inode means that we can - * have the level of the tree change without a "split" occurring at the root - * level. What happens is that the root is migrated to an allocated block and - * the inode root is pointed to it. This means a single split can change the - * level of the tree (level 2 -> level 3) and invalidate the old cursor. Hence - * the level change should be accounted as a split so as to correctly trigger a - * revalidation of the old cursor. + * cursor. All callers of this function should assume that the cursor is + * no longer valid and revalidate it. */ int /* error */ xfs_bmbt_insert( @@ -2057,14 +2073,11 @@ xfs_bmbt_insert( xfs_fsblock_t nbno; xfs_btree_cur_t *ncur; xfs_bmbt_rec_t nrec; - xfs_bmbt_irec_t oirec; /* original irec */ xfs_btree_cur_t *pcur; - int splits = 0; XFS_BMBT_TRACE_CURSOR(cur, ENTRY); level = 0; nbno = NULLFSBLOCK; - oirec = cur->bc_rec.b; xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b); ncur = NULL; pcur = cur; @@ -2073,13 +2086,11 @@ xfs_bmbt_insert( &i))) { if (pcur != cur) xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); - goto error0; + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; } XFS_WANT_CORRUPTED_GOTO(i == 1, error0); if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) { - /* allocating a new root is effectively a split */ - if (cur->bc_nlevels != pcur->bc_nlevels) - splits++; cur->bc_nlevels = pcur->bc_nlevels; cur->bc_private.b.allocated += pcur->bc_private.b.allocated; @@ -2093,21 +2104,10 @@ xfs_bmbt_insert( xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); } if (ncur) { - splits++; pcur = ncur; ncur = NULL; } } while (nbno != NULLFSBLOCK); - - if (splits > 1) { - /* revalidate the old cursor as we had a multi-level split */ - error = xfs_bmbt_lookup_eq(cur, oirec.br_startoff, - oirec.br_startblock, oirec.br_blockcount, &i); - if (error) - goto error0; - ASSERT(i == 1); - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); *stat = i; return 0; @@ -2254,7 +2254,9 @@ xfs_bmbt_newroot( #endif args.fsbno = be64_to_cpu(*pp); args.type = XFS_ALLOCTYPE_START_BNO; - } else + } else if (cur->bc_private.b.flist->xbf_low) + args.type = XFS_ALLOCTYPE_START_BNO; + else args.type = XFS_ALLOCTYPE_NEAR_BNO; if ((error = xfs_alloc_vextent(&args))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index aeb87ca69fcc..cc593a84c345 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -46,38 +46,11 @@ kmem_zone_t *xfs_btree_cur_zone; /* * Btree magic numbers. */ -const __uint32_t xfs_magics[XFS_BTNUM_MAX] = -{ +const __uint32_t xfs_magics[XFS_BTNUM_MAX] = { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC }; /* - * Prototypes for internal routines. - */ - -/* - * Checking routine: return maxrecs for the block. - */ -STATIC int /* number of records fitting in block */ -xfs_btree_maxrecs( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_btree_block_t *block);/* generic btree block pointer */ - -/* - * Internal routines. - */ - -/* - * Retrieve the block pointer from the cursor at the given level. - * This may be a bmap btree root or from a buffer. - */ -STATIC xfs_btree_block_t * /* generic btree block pointer */ -xfs_btree_get_block( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level in btree */ - struct xfs_buf **bpp); /* buffer containing the block */ - -/* * Checking routine: return maxrecs for the block. */ STATIC int /* number of records fitting in block */ @@ -457,35 +430,6 @@ xfs_btree_dup_cursor( } /* - * Change the cursor to point to the first record at the given level. - * Other levels are unaffected. - */ -int /* success=1, failure=0 */ -xfs_btree_firstrec( - xfs_btree_cur_t *cur, /* btree cursor */ - int level) /* level to change */ -{ - xfs_btree_block_t *block; /* generic btree block pointer */ - xfs_buf_t *bp; /* buffer containing block */ - - /* - * Get the block pointer for this level. - */ - block = xfs_btree_get_block(cur, level, &bp); - xfs_btree_check_block(cur, block, level, bp); - /* - * It's empty, there is no such record. - */ - if (!block->bb_h.bb_numrecs) - return 0; - /* - * Set the ptr value to 1, that's the first record/key. - */ - cur->bc_ptrs[level] = 1; - return 1; -} - -/* * Retrieve the block pointer from the cursor at the given level. * This may be a bmap btree root or from a buffer. */ @@ -626,6 +570,13 @@ xfs_btree_init_cursor( cur->bc_private.a.agbp = agbp; cur->bc_private.a.agno = agno; break; + case XFS_BTNUM_INO: + /* + * Inode allocation btree fields. + */ + cur->bc_private.a.agbp = agbp; + cur->bc_private.a.agno = agno; + break; case XFS_BTNUM_BMAP: /* * Bmap btree fields. @@ -638,13 +589,6 @@ xfs_btree_init_cursor( cur->bc_private.b.flags = 0; cur->bc_private.b.whichfork = whichfork; break; - case XFS_BTNUM_INO: - /* - * Inode allocation btree fields. - */ - cur->bc_private.i.agbp = agbp; - cur->bc_private.i.agno = agno; - break; default: ASSERT(0); } @@ -671,6 +615,35 @@ xfs_btree_islastblock( } /* + * Change the cursor to point to the first record at the given level. + * Other levels are unaffected. + */ +int /* success=1, failure=0 */ +xfs_btree_firstrec( + xfs_btree_cur_t *cur, /* btree cursor */ + int level) /* level to change */ +{ + xfs_btree_block_t *block; /* generic btree block pointer */ + xfs_buf_t *bp; /* buffer containing block */ + + /* + * Get the block pointer for this level. + */ + block = xfs_btree_get_block(cur, level, &bp); + xfs_btree_check_block(cur, block, level, bp); + /* + * It's empty, there is no such record. + */ + if (!block->bb_h.bb_numrecs) + return 0; + /* + * Set the ptr value to 1, that's the first record/key. + */ + cur->bc_ptrs[level] = 1; + return 1; +} + +/* * Change the cursor to point to the last record in the current block * at the given level. Other levels are unaffected. */ @@ -890,12 +863,12 @@ xfs_btree_readahead_core( case XFS_BTNUM_INO: i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]); if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(i->bb_leftsib) != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno, + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, be32_to_cpu(i->bb_leftsib), 1); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(i->bb_rightsib) != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno, + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, be32_to_cpu(i->bb_rightsib), 1); rval++; } diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 7440b78f9cec..1f528a2a3754 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -158,8 +158,8 @@ typedef struct xfs_btree_cur __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ xfs_btnum_t bc_btnum; /* identifies which btree type */ union { - struct { /* needed for BNO, CNT */ - struct xfs_buf *agbp; /* agf buffer pointer */ + struct { /* needed for BNO, CNT, INO */ + struct xfs_buf *agbp; /* agf/agi buffer pointer */ xfs_agnumber_t agno; /* ag number */ } a; struct { /* needed for BMAP */ @@ -172,10 +172,6 @@ typedef struct xfs_btree_cur char flags; /* flags */ #define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */ } b; - struct { /* needed for INO */ - struct xfs_buf *agbp; /* agi buffer pointer */ - xfs_agnumber_t agno; /* ag number */ - } i; } bc_private; /* per-btree type data */ } xfs_btree_cur_t; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 53a71c62025d..002fc2617c8e 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -732,12 +732,13 @@ xfs_buf_item_init( bip->bli_item.li_ops = &xfs_buf_item_ops; bip->bli_item.li_mountp = mp; bip->bli_buf = bp; + xfs_buf_hold(bp); bip->bli_format.blf_type = XFS_LI_BUF; bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); bip->bli_format.blf_map_size = map_size; #ifdef XFS_BLI_TRACE - bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_SLEEP); + bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_NOFS); #endif #ifdef XFS_TRANS_DEBUG @@ -867,6 +868,21 @@ xfs_buf_item_dirty( return (bip->bli_flags & XFS_BLI_DIRTY); } +STATIC void +xfs_buf_item_free( + xfs_buf_log_item_t *bip) +{ +#ifdef XFS_TRANS_DEBUG + kmem_free(bip->bli_orig); + kmem_free(bip->bli_logged); +#endif /* XFS_TRANS_DEBUG */ + +#ifdef XFS_BLI_TRACE + ktrace_free(bip->bli_trace); +#endif + kmem_zone_free(xfs_buf_item_zone, bip); +} + /* * This is called when the buf log item is no longer needed. It should * free the buf log item associated with the given buffer and clear @@ -887,18 +903,8 @@ xfs_buf_item_relse( (XFS_BUF_IODONE_FUNC(bp) != NULL)) { XFS_BUF_CLR_IODONE_FUNC(bp); } - -#ifdef XFS_TRANS_DEBUG - kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp)); - bip->bli_orig = NULL; - kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY); - bip->bli_logged = NULL; -#endif /* XFS_TRANS_DEBUG */ - -#ifdef XFS_BLI_TRACE - ktrace_free(bip->bli_trace); -#endif - kmem_zone_free(xfs_buf_item_zone, bip); + xfs_buf_rele(bp); + xfs_buf_item_free(bip); } @@ -1056,7 +1062,7 @@ xfs_buf_iodone_callbacks( anyway. */ XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse); XFS_BUF_DONE(bp); - XFS_BUF_V_IODONESEMA(bp); + XFS_BUF_FINISH_IOWAIT(bp); } return; } @@ -1120,6 +1126,7 @@ xfs_buf_iodone( ASSERT(bip->bli_buf == bp); + xfs_buf_rele(bp); mp = bip->bli_item.li_mountp; /* @@ -1136,18 +1143,7 @@ xfs_buf_iodone( * xfs_trans_delete_ail() drops the AIL lock. */ xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip); - -#ifdef XFS_TRANS_DEBUG - kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp)); - bip->bli_orig = NULL; - kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY); - bip->bli_logged = NULL; -#endif /* XFS_TRANS_DEBUG */ - -#ifdef XFS_BLI_TRACE - ktrace_free(bip->bli_trace); -#endif - kmem_zone_free(xfs_buf_item_zone, bip); + xfs_buf_item_free(bip); } #if defined(XFS_BLI_TRACE) diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h index d5d1e60ee224..d2ce5dd70d87 100644 --- a/fs/xfs/xfs_clnt.h +++ b/fs/xfs/xfs_clnt.h @@ -78,6 +78,7 @@ struct xfs_mount_args { #define XFSMNT_IOSIZE 0x00002000 /* optimize for I/O size */ #define XFSMNT_OSYNCISOSYNC 0x00004000 /* o_sync is REALLY o_sync */ /* (osyncisdsync is default) */ +#define XFSMNT_NOATTR2 0x00008000 /* turn off ATTR2 EA format */ #define XFSMNT_32BITINODES 0x00200000 /* restrict inodes to 32 * bits of address space */ #define XFSMNT_GQUOTA 0x00400000 /* group quota accounting */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 021a8f7e563f..9e561a9cefca 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -1431,7 +1431,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, } if (level < 0) { *result = XFS_ERROR(ENOENT); /* we're out of our tree */ - ASSERT(args->oknoent); + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); return(0); } @@ -1530,6 +1530,28 @@ xfs_da_hashname(const uchar_t *name, int namelen) } } +enum xfs_dacmp +xfs_da_compname( + struct xfs_da_args *args, + const char *name, + int len) +{ + return (args->namelen == len && memcmp(args->name, name, len) == 0) ? + XFS_CMP_EXACT : XFS_CMP_DIFFERENT; +} + +static xfs_dahash_t +xfs_default_hashname( + struct xfs_name *name) +{ + return xfs_da_hashname(name->name, name->len); +} + +const struct xfs_nameops xfs_default_nameops = { + .hashname = xfs_default_hashname, + .compname = xfs_da_compname +}; + /* * Add a block to the btree ahead of the file. * Return the new block number to the caller. @@ -1598,7 +1620,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno) args->firstblock, args->total, &mapp[mapi], &nmap, args->flist, NULL))) { - kmem_free(mapp, sizeof(*mapp) * count); + kmem_free(mapp); return error; } if (nmap < 1) @@ -1620,11 +1642,11 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno) mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount != bno + count) { if (mapp != &map) - kmem_free(mapp, sizeof(*mapp) * count); + kmem_free(mapp); return XFS_ERROR(ENOSPC); } if (mapp != &map) - kmem_free(mapp, sizeof(*mapp) * count); + kmem_free(mapp); *new_blkno = (xfs_dablk_t)bno; return 0; } @@ -2090,10 +2112,10 @@ xfs_da_do_buf( } } if (bplist) { - kmem_free(bplist, sizeof(*bplist) * nmap); + kmem_free(bplist); } if (mapp != &map) { - kmem_free(mapp, sizeof(*mapp) * nfsb); + kmem_free(mapp); } if (bpp) *bpp = rbp; @@ -2102,11 +2124,11 @@ exit1: if (bplist) { for (i = 0; i < nbplist; i++) xfs_trans_brelse(trans, bplist[i]); - kmem_free(bplist, sizeof(*bplist) * nmap); + kmem_free(bplist); } exit0: if (mapp != &map) - kmem_free(mapp, sizeof(*mapp) * nfsb); + kmem_free(mapp); if (bpp) *bpp = NULL; return error; @@ -2218,7 +2240,7 @@ xfs_da_state_free(xfs_da_state_t *state) #ifdef XFS_DABUF_DEBUG xfs_dabuf_t *xfs_dabuf_global_list; -spinlock_t xfs_dabuf_global_lock; +static DEFINE_SPINLOCK(xfs_dabuf_global_lock); #endif /* @@ -2315,7 +2337,7 @@ xfs_da_buf_done(xfs_dabuf_t *dabuf) if (dabuf->dirty) xfs_da_buf_clean(dabuf); if (dabuf->nbuf > 1) - kmem_free(dabuf->data, BBTOB(dabuf->bbcount)); + kmem_free(dabuf->data); #ifdef XFS_DABUF_DEBUG { spin_lock(&xfs_dabuf_global_lock); @@ -2332,7 +2354,7 @@ xfs_da_buf_done(xfs_dabuf_t *dabuf) if (dabuf->nbuf == 1) kmem_zone_free(xfs_dabuf_zone, dabuf); else - kmem_free(dabuf, XFS_DA_BUF_SIZE(dabuf->nbuf)); + kmem_free(dabuf); } /* @@ -2403,7 +2425,7 @@ xfs_da_brelse(xfs_trans_t *tp, xfs_dabuf_t *dabuf) for (i = 0; i < nbuf; i++) xfs_trans_brelse(tp, bplist[i]); if (bplist != &bp) - kmem_free(bplist, nbuf * sizeof(*bplist)); + kmem_free(bplist); } /* @@ -2429,7 +2451,7 @@ xfs_da_binval(xfs_trans_t *tp, xfs_dabuf_t *dabuf) for (i = 0; i < nbuf; i++) xfs_trans_binval(tp, bplist[i]); if (bplist != &bp) - kmem_free(bplist, nbuf * sizeof(*bplist)); + kmem_free(bplist); } /* diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 7facf86f74f9..8be0b00ede9a 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -99,6 +99,15 @@ typedef struct xfs_da_node_entry xfs_da_node_entry_t; *========================================================================*/ /* + * Search comparison results + */ +enum xfs_dacmp { + XFS_CMP_DIFFERENT, /* names are completely different */ + XFS_CMP_EXACT, /* names are exactly the same */ + XFS_CMP_CASE /* names are same but differ in case */ +}; + +/* * Structure to ease passing around component names. */ typedef struct xfs_da_args { @@ -123,13 +132,20 @@ typedef struct xfs_da_args { int index2; /* index of 2nd attr in blk */ xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ int rmtblkcnt2; /* remote attr value block count */ - unsigned char justcheck; /* T/F: check for ok with no space */ - unsigned char rename; /* T/F: this is an atomic rename op */ - unsigned char addname; /* T/F: this is an add operation */ - unsigned char oknoent; /* T/F: ok to return ENOENT, else die */ + int op_flags; /* operation flags */ + enum xfs_dacmp cmpresult; /* name compare result for lookups */ } xfs_da_args_t; /* + * Operation flags: + */ +#define XFS_DA_OP_JUSTCHECK 0x0001 /* check for ok with no space */ +#define XFS_DA_OP_RENAME 0x0002 /* this is an atomic rename op */ +#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */ +#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ +#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ + +/* * Structure to describe buffer(s) for a block. * This is needed in the directory version 2 format case, when * multiple non-contiguous fsblocks might be needed to cover one @@ -201,6 +217,14 @@ typedef struct xfs_da_state { (uint)(XFS_DA_LOGOFF(BASE, ADDR)), \ (uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1) +/* + * Name ops for directory and/or attr name operations + */ +struct xfs_nameops { + xfs_dahash_t (*hashname)(struct xfs_name *); + enum xfs_dacmp (*compname)(struct xfs_da_args *, const char *, int); +}; + #ifdef __KERNEL__ /*======================================================================== @@ -249,6 +273,10 @@ int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, xfs_dabuf_t *dead_buf); uint xfs_da_hashname(const uchar_t *name_string, int name_length); +enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, + const char *name, int len); + + xfs_da_state_t *xfs_da_state_alloc(void); void xfs_da_state_free(xfs_da_state_t *state); diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 5f3647cb9885..75b0cd4da0ea 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -116,7 +116,7 @@ xfs_swapext( out_put_file: fput(file); out_free_sxp: - kmem_free(sxp, sizeof(xfs_swapext_t)); + kmem_free(sxp); out: return error; } @@ -128,10 +128,8 @@ xfs_swap_extents( xfs_swapext_t *sxp) { xfs_mount_t *mp; - xfs_inode_t *ips[2]; xfs_trans_t *tp; xfs_bstat_t *sbp = &sxp->sx_stat; - bhv_vnode_t *vp, *tvp; xfs_ifork_t *tempifp, *ifp, *tifp; int ilf_fields, tilf_fields; static uint lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL; @@ -150,19 +148,15 @@ xfs_swap_extents( } sbp = &sxp->sx_stat; - vp = XFS_ITOV(ip); - tvp = XFS_ITOV(tip); - - /* Lock in i_ino order */ - if (ip->i_ino < tip->i_ino) { - ips[0] = ip; - ips[1] = tip; - } else { - ips[0] = tip; - ips[1] = ip; - } - xfs_lock_inodes(ips, 2, lock_flags); + /* + * we have to do two separate lock calls here to keep lockdep + * happy. If we try to get all the locks in one call, lock will + * report false positives when we drop the ILOCK and regain them + * below. + */ + xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); locked = 1; /* Verify that both files have the same format */ @@ -184,7 +178,7 @@ xfs_swap_extents( goto error0; } - if (VN_CACHED(tvp) != 0) { + if (VN_CACHED(VFS_I(tip)) != 0) { xfs_inval_cached_trace(tip, 0, -1, 0, -1); error = xfs_flushinval_pages(tip, 0, -1, FI_REMAPF_LOCKED); @@ -193,7 +187,7 @@ xfs_swap_extents( } /* Verify O_DIRECT for ftmp */ - if (VN_CACHED(tvp) != 0) { + if (VN_CACHED(VFS_I(tip)) != 0) { error = XFS_ERROR(EINVAL); goto error0; } @@ -237,7 +231,7 @@ xfs_swap_extents( * vop_read (or write in the case of autogrow) they block on the iolock * until we have switched the extents. */ - if (VN_MAPPED(vp)) { + if (VN_MAPPED(VFS_I(ip))) { error = XFS_ERROR(EBUSY); goto error0; } @@ -265,7 +259,7 @@ xfs_swap_extents( locked = 0; goto error0; } - xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL); + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); /* * Count the number of extended attribute blocks @@ -350,15 +344,11 @@ xfs_swap_extents( break; } - /* - * Increment vnode ref counts since xfs_trans_commit & - * xfs_trans_cancel will both unlock the inodes and - * decrement the associated ref counts. - */ - VN_HOLD(vp); - VN_HOLD(tvp); + IHOLD(ip); xfs_trans_ijoin(tp, ip, lock_flags); + + IHOLD(tip); xfs_trans_ijoin(tp, tip, lock_flags); xfs_trans_log_inode(tp, ip, ilf_fields); @@ -381,6 +371,6 @@ xfs_swap_extents( xfs_iunlock(tip, lock_flags); } if (tempifp != NULL) - kmem_free(tempifp, sizeof(xfs_ifork_t)); + kmem_free(tempifp); return error; } diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index 7cb26529766b..80e0dc51361c 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -46,6 +46,54 @@ struct xfs_name xfs_name_dotdot = {"..", 2}; +extern const struct xfs_nameops xfs_default_nameops; + +/* + * ASCII case-insensitive (ie. A-Z) support for directories that was + * used in IRIX. + */ +STATIC xfs_dahash_t +xfs_ascii_ci_hashname( + struct xfs_name *name) +{ + xfs_dahash_t hash; + int i; + + for (i = 0, hash = 0; i < name->len; i++) + hash = tolower(name->name[i]) ^ rol32(hash, 7); + + return hash; +} + +STATIC enum xfs_dacmp +xfs_ascii_ci_compname( + struct xfs_da_args *args, + const char *name, + int len) +{ + enum xfs_dacmp result; + int i; + + if (args->namelen != len) + return XFS_CMP_DIFFERENT; + + result = XFS_CMP_EXACT; + for (i = 0; i < len; i++) { + if (args->name[i] == name[i]) + continue; + if (tolower(args->name[i]) != tolower(name[i])) + return XFS_CMP_DIFFERENT; + result = XFS_CMP_CASE; + } + + return result; +} + +static struct xfs_nameops xfs_ascii_ci_nameops = { + .hashname = xfs_ascii_ci_hashname, + .compname = xfs_ascii_ci_compname, +}; + void xfs_dir_mount( xfs_mount_t *mp) @@ -65,6 +113,10 @@ xfs_dir_mount( (mp->m_dirblksize - (uint)sizeof(xfs_da_node_hdr_t)) / (uint)sizeof(xfs_da_node_entry_t); mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100; + if (xfs_sb_version_hasasciici(&mp->m_sb)) + mp->m_dirnameops = &xfs_ascii_ci_nameops; + else + mp->m_dirnameops = &xfs_default_nameops; } /* @@ -162,9 +214,10 @@ xfs_dir_createname( return rval; XFS_STATS_INC(xs_dir_create); + memset(&args, 0, sizeof(xfs_da_args_t)); args.name = name->name; args.namelen = name->len; - args.hashval = xfs_da_hashname(name->name, name->len); + args.hashval = dp->i_mount->m_dirnameops->hashname(name); args.inumber = inum; args.dp = dp; args.firstblock = first; @@ -172,8 +225,7 @@ xfs_dir_createname( args.total = total; args.whichfork = XFS_DATA_FORK; args.trans = tp; - args.justcheck = 0; - args.addname = args.oknoent = 1; + args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_addname(&args); @@ -191,14 +243,43 @@ xfs_dir_createname( } /* + * If doing a CI lookup and case-insensitive match, dup actual name into + * args.value. Return EEXIST for success (ie. name found) or an error. + */ +int +xfs_dir_cilookup_result( + struct xfs_da_args *args, + const char *name, + int len) +{ + if (args->cmpresult == XFS_CMP_DIFFERENT) + return ENOENT; + if (args->cmpresult != XFS_CMP_CASE || + !(args->op_flags & XFS_DA_OP_CILOOKUP)) + return EEXIST; + + args->value = kmem_alloc(len, KM_MAYFAIL); + if (!args->value) + return ENOMEM; + + memcpy(args->value, name, len); + args->valuelen = len; + return EEXIST; +} + +/* * Lookup a name in a directory, give back the inode number. + * If ci_name is not NULL, returns the actual name in ci_name if it differs + * to name, or ci_name->name is set to NULL for an exact match. */ + int xfs_dir_lookup( xfs_trans_t *tp, xfs_inode_t *dp, struct xfs_name *name, - xfs_ino_t *inum) /* out: inode number */ + xfs_ino_t *inum, /* out: inode number */ + struct xfs_name *ci_name) /* out: actual name if CI match */ { xfs_da_args_t args; int rval; @@ -206,15 +287,17 @@ xfs_dir_lookup( ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); XFS_STATS_INC(xs_dir_lookup); - memset(&args, 0, sizeof(xfs_da_args_t)); + memset(&args, 0, sizeof(xfs_da_args_t)); args.name = name->name; args.namelen = name->len; - args.hashval = xfs_da_hashname(name->name, name->len); + args.hashval = dp->i_mount->m_dirnameops->hashname(name); args.dp = dp; args.whichfork = XFS_DATA_FORK; args.trans = tp; - args.oknoent = 1; + args.op_flags = XFS_DA_OP_OKNOENT; + if (ci_name) + args.op_flags |= XFS_DA_OP_CILOOKUP; if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_lookup(&args); @@ -230,8 +313,13 @@ xfs_dir_lookup( rval = xfs_dir2_node_lookup(&args); if (rval == EEXIST) rval = 0; - if (rval == 0) + if (!rval) { *inum = args.inumber; + if (ci_name) { + ci_name->name = args.value; + ci_name->len = args.valuelen; + } + } return rval; } @@ -255,9 +343,10 @@ xfs_dir_removename( ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); XFS_STATS_INC(xs_dir_remove); + memset(&args, 0, sizeof(xfs_da_args_t)); args.name = name->name; args.namelen = name->len; - args.hashval = xfs_da_hashname(name->name, name->len); + args.hashval = dp->i_mount->m_dirnameops->hashname(name); args.inumber = ino; args.dp = dp; args.firstblock = first; @@ -265,7 +354,6 @@ xfs_dir_removename( args.total = total; args.whichfork = XFS_DATA_FORK; args.trans = tp; - args.justcheck = args.addname = args.oknoent = 0; if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_removename(&args); @@ -338,9 +426,10 @@ xfs_dir_replace( if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) return rval; + memset(&args, 0, sizeof(xfs_da_args_t)); args.name = name->name; args.namelen = name->len; - args.hashval = xfs_da_hashname(name->name, name->len); + args.hashval = dp->i_mount->m_dirnameops->hashname(name); args.inumber = inum; args.dp = dp; args.firstblock = first; @@ -348,7 +437,6 @@ xfs_dir_replace( args.total = total; args.whichfork = XFS_DATA_FORK; args.trans = tp; - args.justcheck = args.addname = args.oknoent = 0; if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_replace(&args); @@ -384,15 +472,16 @@ xfs_dir_canenter( return 0; ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); - memset(&args, 0, sizeof(xfs_da_args_t)); + memset(&args, 0, sizeof(xfs_da_args_t)); args.name = name->name; args.namelen = name->len; - args.hashval = xfs_da_hashname(name->name, name->len); + args.hashval = dp->i_mount->m_dirnameops->hashname(name); args.dp = dp; args.whichfork = XFS_DATA_FORK; args.trans = tp; - args.justcheck = args.addname = args.oknoent = 1; + args.op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME | + XFS_DA_OP_OKNOENT; if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_addname(&args); @@ -493,7 +582,7 @@ xfs_dir2_grow_inode( args->firstblock, args->total, &mapp[mapi], &nmap, args->flist, NULL))) { - kmem_free(mapp, sizeof(*mapp) * count); + kmem_free(mapp); return error; } if (nmap < 1) @@ -525,14 +614,14 @@ xfs_dir2_grow_inode( mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount != bno + count) { if (mapp != &map) - kmem_free(mapp, sizeof(*mapp) * count); + kmem_free(mapp); return XFS_ERROR(ENOSPC); } /* * Done with the temporary mapping table. */ if (mapp != &map) - kmem_free(mapp, sizeof(*mapp) * count); + kmem_free(mapp); *dbp = xfs_dir2_da_to_db(mp, (xfs_dablk_t)bno); /* * Update file's size if this is the data space and it grew. diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h index 6392f939029f..1d9ef96f33aa 100644 --- a/fs/xfs/xfs_dir2.h +++ b/fs/xfs/xfs_dir2.h @@ -74,7 +74,8 @@ extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp, xfs_fsblock_t *first, struct xfs_bmap_free *flist, xfs_extlen_t tot); extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, xfs_ino_t *inum); + struct xfs_name *name, xfs_ino_t *inum, + struct xfs_name *ci_name); extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name, xfs_ino_t ino, xfs_fsblock_t *first, @@ -99,4 +100,7 @@ extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_dabuf *bp); +extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const char *name, + int len); + #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index fb5a556725b3..e2fa0a1d8e96 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -215,7 +215,7 @@ xfs_dir2_block_addname( /* * If this isn't a real add, we're done with the buffer. */ - if (args->justcheck) + if (args->op_flags & XFS_DA_OP_JUSTCHECK) xfs_da_brelse(tp, bp); /* * If we don't have space for the new entry & leaf ... @@ -225,7 +225,7 @@ xfs_dir2_block_addname( * Not trying to actually do anything, or don't have * a space reservation: return no-space. */ - if (args->justcheck || args->total == 0) + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) return XFS_ERROR(ENOSPC); /* * Convert to the next larger format. @@ -240,7 +240,7 @@ xfs_dir2_block_addname( /* * Just checking, and it would work, so say so. */ - if (args->justcheck) + if (args->op_flags & XFS_DA_OP_JUSTCHECK) return 0; needlog = needscan = 0; /* @@ -610,14 +610,15 @@ xfs_dir2_block_lookup( /* * Get the offset from the leaf entry, to point to the data. */ - dep = (xfs_dir2_data_entry_t *) - ((char *)block + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); + dep = (xfs_dir2_data_entry_t *)((char *)block + + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); /* - * Fill in inode number, release the block. + * Fill in inode number, CI name if appropriate, release the block. */ args->inumber = be64_to_cpu(dep->inumber); + error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); xfs_da_brelse(args->trans, bp); - return XFS_ERROR(EEXIST); + return XFS_ERROR(error); } /* @@ -643,6 +644,7 @@ xfs_dir2_block_lookup_int( int mid; /* binary search current idx */ xfs_mount_t *mp; /* filesystem mount point */ xfs_trans_t *tp; /* transaction pointer */ + enum xfs_dacmp cmp; /* comparison result */ dp = args->dp; tp = args->trans; @@ -673,7 +675,7 @@ xfs_dir2_block_lookup_int( else high = mid - 1; if (low > high) { - ASSERT(args->oknoent); + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); xfs_da_brelse(tp, bp); return XFS_ERROR(ENOENT); } @@ -697,20 +699,31 @@ xfs_dir2_block_lookup_int( dep = (xfs_dir2_data_entry_t *) ((char *)block + xfs_dir2_dataptr_to_off(mp, addr)); /* - * Compare, if it's right give back buffer & entry number. + * Compare name and if it's an exact match, return the index + * and buffer. If it's the first case-insensitive match, store + * the index and buffer and continue looking for an exact match. */ - if (dep->namelen == args->namelen && - dep->name[0] == args->name[0] && - memcmp(dep->name, args->name, args->namelen) == 0) { + cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + args->cmpresult = cmp; *bpp = bp; *entno = mid; - return 0; + if (cmp == XFS_CMP_EXACT) + return 0; } - } while (++mid < be32_to_cpu(btp->count) && be32_to_cpu(blp[mid].hashval) == hash); + } while (++mid < be32_to_cpu(btp->count) && + be32_to_cpu(blp[mid].hashval) == hash); + + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + /* + * Here, we can only be doing a lookup (not a rename or replace). + * If a case-insensitive match was found earlier, return success. + */ + if (args->cmpresult == XFS_CMP_CASE) + return 0; /* * No match, release the buffer and return ENOENT. */ - ASSERT(args->oknoent); xfs_da_brelse(tp, bp); return XFS_ERROR(ENOENT); } @@ -1033,6 +1046,7 @@ xfs_dir2_sf_to_block( xfs_dir2_sf_t *sfp; /* shortform structure */ __be16 *tagp; /* end of data entry */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_name name; xfs_dir2_trace_args("sf_to_block", args); dp = args->dp; @@ -1071,7 +1085,7 @@ xfs_dir2_sf_to_block( */ error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno); if (error) { - kmem_free(buf, buf_len); + kmem_free(buf); return error; } /* @@ -1079,7 +1093,7 @@ xfs_dir2_sf_to_block( */ error = xfs_dir2_data_init(args, blkno, &bp); if (error) { - kmem_free(buf, buf_len); + kmem_free(buf); return error; } block = bp->data; @@ -1187,8 +1201,10 @@ xfs_dir2_sf_to_block( tagp = xfs_dir2_data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)block); xfs_dir2_data_log_entry(tp, bp, dep); - blp[2 + i].hashval = cpu_to_be32(xfs_da_hashname( - (char *)sfep->name, sfep->namelen)); + name.name = sfep->name; + name.len = sfep->namelen; + blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops-> + hashname(&name)); blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, (char *)dep - (char *)block)); offset = (int)((char *)(tagp + 1) - (char *)block); @@ -1198,7 +1214,7 @@ xfs_dir2_sf_to_block( sfep = xfs_dir2_sf_nextentry(sfp, sfep); } /* Done with the temporary buffer */ - kmem_free(buf, buf_len); + kmem_free(buf); /* * Sort the leaf entries by hash value. */ diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index fb8c9e08b23d..498f8d694330 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -65,6 +65,7 @@ xfs_dir2_data_check( xfs_mount_t *mp; /* filesystem mount point */ char *p; /* current data position */ int stale; /* count of stale leaves */ + struct xfs_name name; mp = dp->i_mount; d = bp->data; @@ -140,7 +141,9 @@ xfs_dir2_data_check( addr = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, (xfs_dir2_data_aoff_t) ((char *)dep - (char *)d)); - hash = xfs_da_hashname((char *)dep->name, dep->namelen); + name.name = dep->name; + name.len = dep->namelen; + hash = mp->m_dirnameops->hashname(&name); for (i = 0; i < be32_to_cpu(btp->count); i++) { if (be32_to_cpu(lep[i].address) == addr && be32_to_cpu(lep[i].hashval) == hash) diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index bc52b803d79b..93535992cb60 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -263,20 +263,21 @@ xfs_dir2_leaf_addname( * If we don't have enough free bytes but we can make enough * by compacting out stale entries, we'll do that. */ - if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] < needbytes && - be16_to_cpu(leaf->hdr.stale) > 1) { + if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] < + needbytes && be16_to_cpu(leaf->hdr.stale) > 1) { compact = 1; } /* * Otherwise if we don't have enough free bytes we need to * convert to node form. */ - else if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] < - needbytes) { + else if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu( + leaf->hdr.count)] < needbytes) { /* * Just checking or no space reservation, give up. */ - if (args->justcheck || args->total == 0) { + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || + args->total == 0) { xfs_da_brelse(tp, lbp); return XFS_ERROR(ENOSPC); } @@ -301,7 +302,7 @@ xfs_dir2_leaf_addname( * If just checking, then it will fit unless we needed to allocate * a new data block. */ - if (args->justcheck) { + if (args->op_flags & XFS_DA_OP_JUSTCHECK) { xfs_da_brelse(tp, lbp); return use_block == -1 ? XFS_ERROR(ENOSPC) : 0; } @@ -1110,7 +1111,7 @@ xfs_dir2_leaf_getdents( *offset = XFS_DIR2_MAX_DATAPTR; else *offset = xfs_dir2_byte_to_dataptr(mp, curoff); - kmem_free(map, map_size * sizeof(*map)); + kmem_free(map); if (bp) xfs_da_brelse(NULL, bp); return error; @@ -1298,12 +1299,13 @@ xfs_dir2_leaf_lookup( ((char *)dbp->data + xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address))); /* - * Return the found inode number. + * Return the found inode number & CI name if appropriate */ args->inumber = be64_to_cpu(dep->inumber); + error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); xfs_da_brelse(tp, dbp); xfs_da_brelse(tp, lbp); - return XFS_ERROR(EEXIST); + return XFS_ERROR(error); } /* @@ -1319,8 +1321,8 @@ xfs_dir2_leaf_lookup_int( int *indexp, /* out: index in leaf block */ xfs_dabuf_t **dbpp) /* out: data buffer */ { - xfs_dir2_db_t curdb; /* current data block number */ - xfs_dabuf_t *dbp; /* data buffer */ + xfs_dir2_db_t curdb = -1; /* current data block number */ + xfs_dabuf_t *dbp = NULL; /* data buffer */ xfs_dir2_data_entry_t *dep; /* data entry */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ @@ -1331,6 +1333,8 @@ xfs_dir2_leaf_lookup_int( xfs_mount_t *mp; /* filesystem mount point */ xfs_dir2_db_t newdb; /* new data block number */ xfs_trans_t *tp; /* transaction pointer */ + xfs_dir2_db_t cidb = -1; /* case match data block no. */ + enum xfs_dacmp cmp; /* name compare result */ dp = args->dp; tp = args->trans; @@ -1338,11 +1342,10 @@ xfs_dir2_leaf_lookup_int( /* * Read the leaf block into the buffer. */ - if ((error = - xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, - XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, + XFS_DATA_FORK); + if (error) return error; - } *lbpp = lbp; leaf = lbp->data; xfs_dir2_leaf_check(dp, lbp); @@ -1354,9 +1357,9 @@ xfs_dir2_leaf_lookup_int( * Loop over all the entries with the right hash value * looking to match the name. */ - for (lep = &leaf->ents[index], dbp = NULL, curdb = -1; - index < be16_to_cpu(leaf->hdr.count) && be32_to_cpu(lep->hashval) == args->hashval; - lep++, index++) { + for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && + be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { /* * Skip over stale leaf entries. */ @@ -1373,10 +1376,10 @@ xfs_dir2_leaf_lookup_int( if (newdb != curdb) { if (dbp) xfs_da_brelse(tp, dbp); - if ((error = - xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, newdb), -1, &dbp, - XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, + xfs_dir2_db_to_da(mp, newdb), + -1, &dbp, XFS_DATA_FORK); + if (error) { xfs_da_brelse(tp, lbp); return error; } @@ -1386,24 +1389,50 @@ xfs_dir2_leaf_lookup_int( /* * Point to the data entry. */ - dep = (xfs_dir2_data_entry_t *) - ((char *)dbp->data + - xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); + dep = (xfs_dir2_data_entry_t *)((char *)dbp->data + + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); /* - * If it matches then return it. + * Compare name and if it's an exact match, return the index + * and buffer. If it's the first case-insensitive match, store + * the index and buffer and continue looking for an exact match. */ - if (dep->namelen == args->namelen && - dep->name[0] == args->name[0] && - memcmp(dep->name, args->name, args->namelen) == 0) { - *dbpp = dbp; + cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + args->cmpresult = cmp; *indexp = index; - return 0; + /* case exact match: return the current buffer. */ + if (cmp == XFS_CMP_EXACT) { + *dbpp = dbp; + return 0; + } + cidb = curdb; } } + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + /* + * Here, we can only be doing a lookup (not a rename or remove). + * If a case-insensitive match was found earlier, re-read the + * appropriate data block if required and return it. + */ + if (args->cmpresult == XFS_CMP_CASE) { + ASSERT(cidb != -1); + if (cidb != curdb) { + xfs_da_brelse(tp, dbp); + error = xfs_da_read_buf(tp, dp, + xfs_dir2_db_to_da(mp, cidb), + -1, &dbp, XFS_DATA_FORK); + if (error) { + xfs_da_brelse(tp, lbp); + return error; + } + } + *dbpp = dbp; + return 0; + } /* * No match found, return ENOENT. */ - ASSERT(args->oknoent); + ASSERT(cidb == -1); if (dbp) xfs_da_brelse(tp, dbp); xfs_da_brelse(tp, lbp); diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 8dade711f099..fa6c3a5ddbc6 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -226,7 +226,7 @@ xfs_dir2_leafn_add( ASSERT(index == be16_to_cpu(leaf->hdr.count) || be32_to_cpu(leaf->ents[index].hashval) >= args->hashval); - if (args->justcheck) + if (args->op_flags & XFS_DA_OP_JUSTCHECK) return 0; /* @@ -387,28 +387,26 @@ xfs_dir2_leafn_lasthash( } /* - * Look up a leaf entry in a node-format leaf block. - * If this is an addname then the extrablk in state is a freespace block, - * otherwise it's a data block. + * Look up a leaf entry for space to add a name in a node-format leaf block. + * The extrablk in state is a freespace block. */ -int -xfs_dir2_leafn_lookup_int( +STATIC int +xfs_dir2_leafn_lookup_for_addname( xfs_dabuf_t *bp, /* leaf buffer */ xfs_da_args_t *args, /* operation arguments */ int *indexp, /* out: leaf entry index */ xfs_da_state_t *state) /* state to fill in */ { - xfs_dabuf_t *curbp; /* current data/free buffer */ - xfs_dir2_db_t curdb; /* current data block number */ - xfs_dir2_db_t curfdb; /* current free block number */ - xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_dabuf_t *curbp = NULL; /* current data/free buffer */ + xfs_dir2_db_t curdb = -1; /* current data block number */ + xfs_dir2_db_t curfdb = -1; /* current free block number */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return value */ int fi; /* free entry index */ - xfs_dir2_free_t *free=NULL; /* free block structure */ + xfs_dir2_free_t *free = NULL; /* free block structure */ int index; /* leaf entry index */ xfs_dir2_leaf_t *leaf; /* leaf structure */ - int length=0; /* length of new data entry */ + int length; /* length of new data entry */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_mount_t *mp; /* filesystem mount point */ xfs_dir2_db_t newdb; /* new data block number */ @@ -431,33 +429,20 @@ xfs_dir2_leafn_lookup_int( /* * Do we have a buffer coming in? */ - if (state->extravalid) + if (state->extravalid) { + /* If so, it's a free block buffer, get the block number. */ curbp = state->extrablk.bp; - else - curbp = NULL; - /* - * For addname, it's a free block buffer, get the block number. - */ - if (args->addname) { - curfdb = curbp ? state->extrablk.blkno : -1; - curdb = -1; - length = xfs_dir2_data_entsize(args->namelen); - if ((free = (curbp ? curbp->data : NULL))) - ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); - } - /* - * For others, it's a data block buffer, get the block number. - */ - else { - curfdb = -1; - curdb = curbp ? state->extrablk.blkno : -1; + curfdb = state->extrablk.blkno; + free = curbp->data; + ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); } + length = xfs_dir2_data_entsize(args->namelen); /* * Loop over leaf entries with the right hash value. */ - for (lep = &leaf->ents[index]; - index < be16_to_cpu(leaf->hdr.count) && be32_to_cpu(lep->hashval) == args->hashval; - lep++, index++) { + for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && + be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { /* * Skip stale leaf entries. */ @@ -471,161 +456,244 @@ xfs_dir2_leafn_lookup_int( * For addname, we're looking for a place to put the new entry. * We want to use a data block with an entry of equal * hash value to ours if there is one with room. + * + * If this block isn't the data block we already have + * in hand, take a look at it. */ - if (args->addname) { + if (newdb != curdb) { + curdb = newdb; /* - * If this block isn't the data block we already have - * in hand, take a look at it. + * Convert the data block to the free block + * holding its freespace information. */ - if (newdb != curdb) { - curdb = newdb; - /* - * Convert the data block to the free block - * holding its freespace information. - */ - newfdb = xfs_dir2_db_to_fdb(mp, newdb); - /* - * If it's not the one we have in hand, - * read it in. - */ - if (newfdb != curfdb) { - /* - * If we had one before, drop it. - */ - if (curbp) - xfs_da_brelse(tp, curbp); - /* - * Read the free block. - */ - if ((error = xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, - newfdb), - -1, &curbp, - XFS_DATA_FORK))) { - return error; - } - free = curbp->data; - ASSERT(be32_to_cpu(free->hdr.magic) == - XFS_DIR2_FREE_MAGIC); - ASSERT((be32_to_cpu(free->hdr.firstdb) % - XFS_DIR2_MAX_FREE_BESTS(mp)) == - 0); - ASSERT(be32_to_cpu(free->hdr.firstdb) <= curdb); - ASSERT(curdb < - be32_to_cpu(free->hdr.firstdb) + - be32_to_cpu(free->hdr.nvalid)); - } - /* - * Get the index for our entry. - */ - fi = xfs_dir2_db_to_fdindex(mp, curdb); - /* - * If it has room, return it. - */ - if (unlikely(be16_to_cpu(free->bests[fi]) == NULLDATAOFF)) { - XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int", - XFS_ERRLEVEL_LOW, mp); - if (curfdb != newfdb) - xfs_da_brelse(tp, curbp); - return XFS_ERROR(EFSCORRUPTED); - } - curfdb = newfdb; - if (be16_to_cpu(free->bests[fi]) >= length) { - *indexp = index; - state->extravalid = 1; - state->extrablk.bp = curbp; - state->extrablk.blkno = curfdb; - state->extrablk.index = fi; - state->extrablk.magic = - XFS_DIR2_FREE_MAGIC; - ASSERT(args->oknoent); - return XFS_ERROR(ENOENT); - } - } - } - /* - * Not adding a new entry, so we really want to find - * the name given to us. - */ - else { + newfdb = xfs_dir2_db_to_fdb(mp, newdb); /* - * If it's a different data block, go get it. + * If it's not the one we have in hand, read it in. */ - if (newdb != curdb) { + if (newfdb != curfdb) { /* - * If we had a block before, drop it. + * If we had one before, drop it. */ if (curbp) xfs_da_brelse(tp, curbp); /* - * Read the data block. + * Read the free block. */ - if ((error = - xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, newdb), -1, - &curbp, XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, + xfs_dir2_db_to_da(mp, newfdb), + -1, &curbp, XFS_DATA_FORK); + if (error) return error; - } - xfs_dir2_data_check(dp, curbp); - curdb = newdb; + free = curbp->data; + ASSERT(be32_to_cpu(free->hdr.magic) == + XFS_DIR2_FREE_MAGIC); + ASSERT((be32_to_cpu(free->hdr.firstdb) % + XFS_DIR2_MAX_FREE_BESTS(mp)) == 0); + ASSERT(be32_to_cpu(free->hdr.firstdb) <= curdb); + ASSERT(curdb < be32_to_cpu(free->hdr.firstdb) + + be32_to_cpu(free->hdr.nvalid)); } /* - * Point to the data entry. + * Get the index for our entry. */ - dep = (xfs_dir2_data_entry_t *) - ((char *)curbp->data + - xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); + fi = xfs_dir2_db_to_fdindex(mp, curdb); /* - * Compare the entry, return it if it matches. + * If it has room, return it. */ - if (dep->namelen == args->namelen && - dep->name[0] == args->name[0] && - memcmp(dep->name, args->name, args->namelen) == 0) { - args->inumber = be64_to_cpu(dep->inumber); - *indexp = index; - state->extravalid = 1; - state->extrablk.bp = curbp; - state->extrablk.blkno = curdb; - state->extrablk.index = - (int)((char *)dep - - (char *)curbp->data); - state->extrablk.magic = XFS_DIR2_DATA_MAGIC; - return XFS_ERROR(EEXIST); + if (unlikely(be16_to_cpu(free->bests[fi]) == NULLDATAOFF)) { + XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int", + XFS_ERRLEVEL_LOW, mp); + if (curfdb != newfdb) + xfs_da_brelse(tp, curbp); + return XFS_ERROR(EFSCORRUPTED); } + curfdb = newfdb; + if (be16_to_cpu(free->bests[fi]) >= length) + goto out; } } + /* Didn't find any space */ + fi = -1; +out: + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + if (curbp) { + /* Giving back a free block. */ + state->extravalid = 1; + state->extrablk.bp = curbp; + state->extrablk.index = fi; + state->extrablk.blkno = curfdb; + state->extrablk.magic = XFS_DIR2_FREE_MAGIC; + } else { + state->extravalid = 0; + } /* - * Didn't find a match. - * If we are holding a buffer, give it back in case our caller - * finds it useful. + * Return the index, that will be the insertion point. */ - if ((state->extravalid = (curbp != NULL))) { - state->extrablk.bp = curbp; - state->extrablk.index = -1; + *indexp = index; + return XFS_ERROR(ENOENT); +} + +/* + * Look up a leaf entry in a node-format leaf block. + * The extrablk in state a data block. + */ +STATIC int +xfs_dir2_leafn_lookup_for_entry( + xfs_dabuf_t *bp, /* leaf buffer */ + xfs_da_args_t *args, /* operation arguments */ + int *indexp, /* out: leaf entry index */ + xfs_da_state_t *state) /* state to fill in */ +{ + xfs_dabuf_t *curbp = NULL; /* current data/free buffer */ + xfs_dir2_db_t curdb = -1; /* current data block number */ + xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + int index; /* leaf entry index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_db_t newdb; /* new data block number */ + xfs_trans_t *tp; /* transaction pointer */ + enum xfs_dacmp cmp; /* comparison result */ + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); +#ifdef __KERNEL__ + ASSERT(be16_to_cpu(leaf->hdr.count) > 0); +#endif + xfs_dir2_leafn_check(dp, bp); + /* + * Look up the hash value in the leaf entries. + */ + index = xfs_dir2_leaf_search_hash(args, bp); + /* + * Do we have a buffer coming in? + */ + if (state->extravalid) { + curbp = state->extrablk.bp; + curdb = state->extrablk.blkno; + } + /* + * Loop over leaf entries with the right hash value. + */ + for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && + be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { /* - * For addname, giving back a free block. + * Skip stale leaf entries. */ - if (args->addname) { - state->extrablk.blkno = curfdb; - state->extrablk.magic = XFS_DIR2_FREE_MAGIC; + if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) + continue; + /* + * Pull the data block number from the entry. + */ + newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + /* + * Not adding a new entry, so we really want to find + * the name given to us. + * + * If it's a different data block, go get it. + */ + if (newdb != curdb) { + /* + * If we had a block before that we aren't saving + * for a CI name, drop it + */ + if (curbp && (args->cmpresult == XFS_CMP_DIFFERENT || + curdb != state->extrablk.blkno)) + xfs_da_brelse(tp, curbp); + /* + * If needing the block that is saved with a CI match, + * use it otherwise read in the new data block. + */ + if (args->cmpresult != XFS_CMP_DIFFERENT && + newdb == state->extrablk.blkno) { + ASSERT(state->extravalid); + curbp = state->extrablk.bp; + } else { + error = xfs_da_read_buf(tp, dp, + xfs_dir2_db_to_da(mp, newdb), + -1, &curbp, XFS_DATA_FORK); + if (error) + return error; + } + xfs_dir2_data_check(dp, curbp); + curdb = newdb; } /* - * For other callers, giving back a data block. + * Point to the data entry. */ - else { + dep = (xfs_dir2_data_entry_t *)((char *)curbp->data + + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); + /* + * Compare the entry and if it's an exact match, return + * EEXIST immediately. If it's the first case-insensitive + * match, store the block & inode number and continue looking. + */ + cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + /* If there is a CI match block, drop it */ + if (args->cmpresult != XFS_CMP_DIFFERENT && + curdb != state->extrablk.blkno) + xfs_da_brelse(tp, state->extrablk.bp); + args->cmpresult = cmp; + args->inumber = be64_to_cpu(dep->inumber); + *indexp = index; + state->extravalid = 1; + state->extrablk.bp = curbp; state->extrablk.blkno = curdb; + state->extrablk.index = (int)((char *)dep - + (char *)curbp->data); state->extrablk.magic = XFS_DIR2_DATA_MAGIC; + if (cmp == XFS_CMP_EXACT) + return XFS_ERROR(EEXIST); } } - /* - * Return the final index, that will be the insertion point. - */ + ASSERT(index == be16_to_cpu(leaf->hdr.count) || + (args->op_flags & XFS_DA_OP_OKNOENT)); + if (curbp) { + if (args->cmpresult == XFS_CMP_DIFFERENT) { + /* Giving back last used data block. */ + state->extravalid = 1; + state->extrablk.bp = curbp; + state->extrablk.index = -1; + state->extrablk.blkno = curdb; + state->extrablk.magic = XFS_DIR2_DATA_MAGIC; + } else { + /* If the curbp is not the CI match block, drop it */ + if (state->extrablk.bp != curbp) + xfs_da_brelse(tp, curbp); + } + } else { + state->extravalid = 0; + } *indexp = index; - ASSERT(index == be16_to_cpu(leaf->hdr.count) || args->oknoent); return XFS_ERROR(ENOENT); } /* + * Look up a leaf entry in a node-format leaf block. + * If this is an addname then the extrablk in state is a freespace block, + * otherwise it's a data block. + */ +int +xfs_dir2_leafn_lookup_int( + xfs_dabuf_t *bp, /* leaf buffer */ + xfs_da_args_t *args, /* operation arguments */ + int *indexp, /* out: leaf entry index */ + xfs_da_state_t *state) /* state to fill in */ +{ + if (args->op_flags & XFS_DA_OP_ADDNAME) + return xfs_dir2_leafn_lookup_for_addname(bp, args, indexp, + state); + return xfs_dir2_leafn_lookup_for_entry(bp, args, indexp, state); +} + +/* * Move count leaf entries from source to destination leaf. * Log entries and headers. Stale entries are preserved. */ @@ -823,9 +891,10 @@ xfs_dir2_leafn_rebalance( */ if (!state->inleaf) blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count); - - /* - * Finally sanity check just to make sure we are not returning a negative index + + /* + * Finally sanity check just to make sure we are not returning a + * negative index */ if(blk2->index < 0) { state->inleaf = 1; @@ -1332,7 +1401,7 @@ xfs_dir2_node_addname( /* * It worked, fix the hash values up the btree. */ - if (!args->justcheck) + if (!(args->op_flags & XFS_DA_OP_JUSTCHECK)) xfs_da_fixhashpath(state, &state->path); } else { /* @@ -1515,7 +1584,8 @@ xfs_dir2_node_addname_int( /* * Not allowed to allocate, return failure. */ - if (args->justcheck || args->total == 0) { + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || + args->total == 0) { /* * Drop the freespace buffer unless it came from our * caller. @@ -1661,7 +1731,7 @@ xfs_dir2_node_addname_int( /* * If just checking, we succeeded. */ - if (args->justcheck) { + if (args->op_flags & XFS_DA_OP_JUSTCHECK) { if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) xfs_da_buf_done(fbp); return 0; @@ -1767,6 +1837,14 @@ xfs_dir2_node_lookup( error = xfs_da_node_lookup_int(state, &rval); if (error) rval = error; + else if (rval == ENOENT && args->cmpresult == XFS_CMP_CASE) { + /* If a CI match, dup the actual name and return EEXIST */ + xfs_dir2_data_entry_t *dep; + + dep = (xfs_dir2_data_entry_t *)((char *)state->extrablk.bp-> + data + state->extrablk.index); + rval = xfs_dir_cilookup_result(args, dep->name, dep->namelen); + } /* * Release the btree blocks and leaf block. */ @@ -1810,9 +1888,8 @@ xfs_dir2_node_removename( * Look up the entry we're deleting, set up the cursor. */ error = xfs_da_node_lookup_int(state, &rval); - if (error) { + if (error) rval = error; - } /* * Didn't find it, upper layer screwed up. */ @@ -1829,9 +1906,8 @@ xfs_dir2_node_removename( */ error = xfs_dir2_leafn_remove(args, blk->bp, blk->index, &state->extrablk, &rval); - if (error) { + if (error) return error; - } /* * Fix the hash values up the btree. */ diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index 919d275a1cef..b46af0013ec9 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -255,7 +255,7 @@ xfs_dir2_block_to_sf( xfs_dir2_sf_check(args); out: xfs_trans_log_inode(args->trans, dp, logflags); - kmem_free(block, mp->m_dirblksize); + kmem_free(block); return error; } @@ -332,7 +332,7 @@ xfs_dir2_sf_addname( /* * Just checking or no space reservation, it doesn't fit. */ - if (args->justcheck || args->total == 0) + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) return XFS_ERROR(ENOSPC); /* * Convert to block form then add the name. @@ -345,7 +345,7 @@ xfs_dir2_sf_addname( /* * Just checking, it fits. */ - if (args->justcheck) + if (args->op_flags & XFS_DA_OP_JUSTCHECK) return 0; /* * Do it the easy way - just add it at the end. @@ -512,7 +512,7 @@ xfs_dir2_sf_addname_hard( sfep = xfs_dir2_sf_nextentry(sfp, sfep); memcpy(sfep, oldsfep, old_isize - nbytes); } - kmem_free(buf, old_isize); + kmem_free(buf); dp->i_d.di_size = new_isize; xfs_dir2_sf_check(args); } @@ -812,8 +812,11 @@ xfs_dir2_sf_lookup( { xfs_inode_t *dp; /* incore directory inode */ int i; /* entry index */ + int error; xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ xfs_dir2_sf_t *sfp; /* shortform structure */ + enum xfs_dacmp cmp; /* comparison result */ + xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */ xfs_dir2_trace_args("sf_lookup", args); xfs_dir2_sf_check(args); @@ -836,6 +839,7 @@ xfs_dir2_sf_lookup( */ if (args->namelen == 1 && args->name[0] == '.') { args->inumber = dp->i_ino; + args->cmpresult = XFS_CMP_EXACT; return XFS_ERROR(EEXIST); } /* @@ -844,28 +848,41 @@ xfs_dir2_sf_lookup( if (args->namelen == 2 && args->name[0] == '.' && args->name[1] == '.') { args->inumber = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); + args->cmpresult = XFS_CMP_EXACT; return XFS_ERROR(EEXIST); } /* * Loop over all the entries trying to match ours. */ - for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); - i < sfp->hdr.count; - i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { - if (sfep->namelen == args->namelen && - sfep->name[0] == args->name[0] && - memcmp(args->name, sfep->name, args->namelen) == 0) { - args->inumber = - xfs_dir2_sf_get_inumber(sfp, - xfs_dir2_sf_inumberp(sfep)); - return XFS_ERROR(EEXIST); + ci_sfep = NULL; + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->hdr.count; + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { + /* + * Compare name and if it's an exact match, return the inode + * number. If it's the first case-insensitive match, store the + * inode number and continue looking for an exact match. + */ + cmp = dp->i_mount->m_dirnameops->compname(args, sfep->name, + sfep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + args->cmpresult = cmp; + args->inumber = xfs_dir2_sf_get_inumber(sfp, + xfs_dir2_sf_inumberp(sfep)); + if (cmp == XFS_CMP_EXACT) + return XFS_ERROR(EEXIST); + ci_sfep = sfep; } } + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); /* - * Didn't find it. + * Here, we can only be doing a lookup (not a rename or replace). + * If a case-insensitive match was not found, return ENOENT. */ - ASSERT(args->oknoent); - return XFS_ERROR(ENOENT); + if (!ci_sfep) + return XFS_ERROR(ENOENT); + /* otherwise process the CI match as required by the caller */ + error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen); + return XFS_ERROR(error); } /* @@ -904,24 +921,21 @@ xfs_dir2_sf_removename( * Loop over the old directory entries. * Find the one we're deleting. */ - for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); - i < sfp->hdr.count; - i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { - if (sfep->namelen == args->namelen && - sfep->name[0] == args->name[0] && - memcmp(sfep->name, args->name, args->namelen) == 0) { + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->hdr.count; + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { + if (xfs_da_compname(args, sfep->name, sfep->namelen) == + XFS_CMP_EXACT) { ASSERT(xfs_dir2_sf_get_inumber(sfp, - xfs_dir2_sf_inumberp(sfep)) == - args->inumber); + xfs_dir2_sf_inumberp(sfep)) == + args->inumber); break; } } /* * Didn't find it. */ - if (i == sfp->hdr.count) { + if (i == sfp->hdr.count) return XFS_ERROR(ENOENT); - } /* * Calculate sizes. */ @@ -1042,11 +1056,10 @@ xfs_dir2_sf_replace( */ else { for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); - i < sfp->hdr.count; - i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { - if (sfep->namelen == args->namelen && - sfep->name[0] == args->name[0] && - memcmp(args->name, sfep->name, args->namelen) == 0) { + i < sfp->hdr.count; + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { + if (xfs_da_compname(args, sfep->name, sfep->namelen) == + XFS_CMP_EXACT) { #if XFS_BIG_INUMS || defined(DEBUG) ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep)); @@ -1061,7 +1074,7 @@ xfs_dir2_sf_replace( * Didn't find it. */ if (i == sfp->hdr.count) { - ASSERT(args->oknoent); + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); #if XFS_BIG_INUMS if (i8elevated) xfs_dir2_sf_toino4(args); @@ -1174,7 +1187,7 @@ xfs_dir2_sf_toino4( /* * Clean up the inode. */ - kmem_free(buf, oldsize); + kmem_free(buf); dp->i_d.di_size = newsize; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); } @@ -1251,7 +1264,7 @@ xfs_dir2_sf_toino8( /* * Clean up the inode. */ - kmem_free(buf, oldsize); + kmem_free(buf); dp->i_d.di_size = newsize; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); } diff --git a/fs/xfs/xfs_dir2_sf.h b/fs/xfs/xfs_dir2_sf.h index 005629d702d2..deecc9d238f8 100644 --- a/fs/xfs/xfs_dir2_sf.h +++ b/fs/xfs/xfs_dir2_sf.h @@ -62,7 +62,7 @@ typedef union { * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t. * Only need 16 bits, this is the byte offset into the single block form. */ -typedef struct { __uint8_t i[2]; } xfs_dir2_sf_off_t; +typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t; /* * The parent directory has a dedicated field, and the self-pointer must @@ -76,14 +76,14 @@ typedef struct xfs_dir2_sf_hdr { __uint8_t count; /* count of entries */ __uint8_t i8count; /* count of 8-byte inode #s */ xfs_dir2_inou_t parent; /* parent dir inode number */ -} xfs_dir2_sf_hdr_t; +} __arch_pack xfs_dir2_sf_hdr_t; typedef struct xfs_dir2_sf_entry { __uint8_t namelen; /* actual name length */ xfs_dir2_sf_off_t offset; /* saved offset */ __uint8_t name[1]; /* name, variable size */ xfs_dir2_inou_t inumber; /* inode number, var. offset */ -} xfs_dir2_sf_entry_t; +} __arch_pack xfs_dir2_sf_entry_t; typedef struct xfs_dir2_sf { xfs_dir2_sf_hdr_t hdr; /* shortform header */ diff --git a/fs/xfs/xfs_dir2_trace.c b/fs/xfs/xfs_dir2_trace.c index f3fb2ffd6f5c..6cc7c0c681ac 100644 --- a/fs/xfs/xfs_dir2_trace.c +++ b/fs/xfs/xfs_dir2_trace.c @@ -85,7 +85,8 @@ xfs_dir2_trace_args( (void *)((unsigned long)(args->inumber >> 32)), (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)args->justcheck, NULL, NULL); + (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), + NULL, NULL); } void @@ -100,7 +101,7 @@ xfs_dir2_trace_args_b( (void *)((unsigned long)(args->inumber >> 32)), (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)args->justcheck, + (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), (void *)(bp ? bp->bps[0] : NULL), NULL); } @@ -117,7 +118,7 @@ xfs_dir2_trace_args_bb( (void *)((unsigned long)(args->inumber >> 32)), (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)args->justcheck, + (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), (void *)(lbp ? lbp->bps[0] : NULL), (void *)(dbp ? dbp->bps[0] : NULL)); } @@ -157,8 +158,8 @@ xfs_dir2_trace_args_db( (void *)((unsigned long)(args->inumber >> 32)), (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)args->justcheck, (void *)(long)db, - (void *)dbp); + (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), + (void *)(long)db, (void *)dbp); } void @@ -173,7 +174,7 @@ xfs_dir2_trace_args_i( (void *)((unsigned long)(args->inumber >> 32)), (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)args->justcheck, + (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), (void *)((unsigned long)(i >> 32)), (void *)((unsigned long)(i & 0xFFFFFFFF))); } @@ -190,7 +191,8 @@ xfs_dir2_trace_args_s( (void *)((unsigned long)(args->inumber >> 32)), (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)args->justcheck, (void *)(long)s, NULL); + (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), + (void *)(long)s, NULL); } void @@ -208,7 +210,7 @@ xfs_dir2_trace_args_sb( (void *)((unsigned long)(args->inumber >> 32)), (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)args->justcheck, (void *)(long)s, - (void *)dbp); + (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), + (void *)(long)s, (void *)dbp); } #endif /* XFS_DIR2_TRACE */ diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h index f71784ab6a60..2813cdd72375 100644 --- a/fs/xfs/xfs_dmapi.h +++ b/fs/xfs/xfs_dmapi.h @@ -18,7 +18,6 @@ #ifndef __XFS_DMAPI_H__ #define __XFS_DMAPI_H__ -#include <linux/version.h> /* Values used to define the on-disk version of dm_attrname_t. All * on-disk attribute names start with the 8-byte string "SGI_DMI_". * @@ -166,6 +165,6 @@ typedef enum { #define FILP_DELAY_FLAG(filp) ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) ? \ DM_FLAGS_NDELAY : 0) -#define AT_DELAY_FLAG(f) ((f&ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0) +#define AT_DELAY_FLAG(f) ((f & XFS_ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0) #endif /* __XFS_DMAPI_H__ */ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 05e5365d3c31..f227ecd1a294 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -58,22 +58,11 @@ xfs_error_trap(int e) } return e; } -#endif - -#if (defined(DEBUG) || defined(INDUCE_IO_ERROR)) int xfs_etest[XFS_NUM_INJECT_ERROR]; int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; -void -xfs_error_test_init(void) -{ - memset(xfs_etest, 0, sizeof(xfs_etest)); - memset(xfs_etest_fsid, 0, sizeof(xfs_etest_fsid)); - memset(xfs_etest_fsname, 0, sizeof(xfs_etest_fsname)); -} - int xfs_error_test(int error_tag, int *fsidp, char *expression, int line, char *file, unsigned long randfactor) @@ -150,8 +139,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud) xfs_etest[i]); xfs_etest[i] = 0; xfs_etest_fsid[i] = 0LL; - kmem_free(xfs_etest_fsname[i], - strlen(xfs_etest_fsname[i]) + 1); + kmem_free(xfs_etest_fsname[i]); xfs_etest_fsname[i] = NULL; } } @@ -163,7 +151,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud) return 0; } -#endif /* DEBUG || INDUCE_IO_ERROR */ +#endif /* DEBUG */ static void xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap) @@ -175,7 +163,7 @@ xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap) newfmt = kmem_alloc(len, KM_SLEEP); sprintf(newfmt, "Filesystem \"%s\": %s", mp->m_fsname, fmt); icmn_err(level, newfmt, ap); - kmem_free(newfmt, len); + kmem_free(newfmt); } else { icmn_err(level, fmt, ap); } diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 6490d2a9f8e1..11543f10b0c6 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -125,23 +125,14 @@ extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp, #define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10) #define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT -#if (defined(DEBUG) || defined(INDUCE_IO_ERROR)) +#ifdef DEBUG extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); -extern void xfs_error_test_init(void); #define XFS_NUM_INJECT_ERROR 10 - -#ifdef __ANSI_CPP__ -#define XFS_TEST_ERROR(expr, mp, tag, rf) \ - ((expr) || \ - xfs_error_test((tag), (mp)->m_fixedfsid, #expr, __LINE__, __FILE__, \ - (rf))) -#else #define XFS_TEST_ERROR(expr, mp, tag, rf) \ ((expr) || \ xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ (rf))) -#endif /* __ANSI_CPP__ */ extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); @@ -149,7 +140,7 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); #define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) #define xfs_errortag_add(tag, mp) (ENOSYS) #define xfs_errortag_clearall(mp, loud) (ENOSYS) -#endif /* (DEBUG || INDUCE_IO_ERROR) */ +#endif /* DEBUG */ /* * XFS panic tags -- allow a call to xfs_cmn_err() be turned into diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 132bd07b9bb8..8aa28f751b2a 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -41,8 +41,7 @@ xfs_efi_item_free(xfs_efi_log_item_t *efip) int nexts = efip->efi_format.efi_nextents; if (nexts > XFS_EFI_MAX_FAST_EXTENTS) { - kmem_free(efip, sizeof(xfs_efi_log_item_t) + - (nexts - 1) * sizeof(xfs_extent_t)); + kmem_free(efip); } else { kmem_zone_free(xfs_efi_zone, efip); } @@ -374,8 +373,7 @@ xfs_efd_item_free(xfs_efd_log_item_t *efdp) int nexts = efdp->efd_format.efd_nextents; if (nexts > XFS_EFD_MAX_FAST_EXTENTS) { - kmem_free(efdp, sizeof(xfs_efd_log_item_t) + - (nexts - 1) * sizeof(xfs_extent_t)); + kmem_free(efdp); } else { kmem_zone_free(xfs_efd_zone, efdp); } diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 3f3785b10804..f3bb75da384e 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -397,10 +397,12 @@ int xfs_filestream_init(void) { item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item"); + if (!item_zone) + return -ENOMEM; #ifdef XFS_FILESTREAMS_TRACE - xfs_filestreams_trace_buf = ktrace_alloc(XFS_FSTRM_KTRACE_SIZE, KM_SLEEP); + xfs_filestreams_trace_buf = ktrace_alloc(XFS_FSTRM_KTRACE_SIZE, KM_NOFS); #endif - return item_zone ? 0 : -ENOMEM; + return 0; } /* diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 3bed6433d050..01c0cc88d3f3 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -239,6 +239,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */ #define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */ #define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */ +#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ #define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ @@ -371,6 +372,9 @@ typedef struct xfs_fsop_attrlist_handlereq { typedef struct xfs_attr_multiop { __u32 am_opcode; +#define ATTR_OP_GET 1 /* return the indicated attr's value */ +#define ATTR_OP_SET 2 /* set/create the indicated attr/value pair */ +#define ATTR_OP_REMOVE 3 /* remove the indicated attr */ __s32 am_error; void __user *am_attrname; void __user *am_attrvalue; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 381ebda4f7bc..84583cf73db3 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -95,6 +95,8 @@ xfs_fs_geometry( XFS_FSOP_GEOM_FLAGS_DIRV2 : 0) | (xfs_sb_version_hassector(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_SECTOR : 0) | + (xfs_sb_version_hasasciici(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_DIRV2CI : 0) | (xfs_sb_version_haslazysbcount(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) | (xfs_sb_version_hasattr2(&mp->m_sb) ? @@ -625,7 +627,7 @@ xfs_fs_goingdown( xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); thaw_bdev(sb->s_bdev, sb); } - + break; } case XFS_FSOP_GOING_FLAGS_LOGFLUSH: diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index e5310c90e50f..83502f3edef0 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -181,7 +181,7 @@ xfs_inobt_delrec( * then we can get rid of this level. */ if (numrecs == 1 && level > 0) { - agbp = cur->bc_private.i.agbp; + agbp = cur->bc_private.a.agbp; agi = XFS_BUF_TO_AGI(agbp); /* * pp is still set to the first pointer in the block. @@ -194,7 +194,7 @@ xfs_inobt_delrec( * Free the block. */ if ((error = xfs_free_extent(cur->bc_tp, - XFS_AGB_TO_FSB(mp, cur->bc_private.i.agno, bno), 1))) + XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, bno), 1))) return error; xfs_trans_binval(cur->bc_tp, bp); xfs_ialloc_log_agi(cur->bc_tp, agbp, @@ -379,7 +379,7 @@ xfs_inobt_delrec( rrecs = be16_to_cpu(right->bb_numrecs); rbp = bp; if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, - cur->bc_private.i.agno, lbno, 0, &lbp, + cur->bc_private.a.agno, lbno, 0, &lbp, XFS_INO_BTREE_REF))) return error; left = XFS_BUF_TO_INOBT_BLOCK(lbp); @@ -401,7 +401,7 @@ xfs_inobt_delrec( lrecs = be16_to_cpu(left->bb_numrecs); lbp = bp; if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, - cur->bc_private.i.agno, rbno, 0, &rbp, + cur->bc_private.a.agno, rbno, 0, &rbp, XFS_INO_BTREE_REF))) return error; right = XFS_BUF_TO_INOBT_BLOCK(rbp); @@ -484,7 +484,7 @@ xfs_inobt_delrec( xfs_buf_t *rrbp; if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, - cur->bc_private.i.agno, be32_to_cpu(left->bb_rightsib), 0, + cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0, &rrbp, XFS_INO_BTREE_REF))) return error; rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp); @@ -497,7 +497,7 @@ xfs_inobt_delrec( * Free the deleting block. */ if ((error = xfs_free_extent(cur->bc_tp, XFS_AGB_TO_FSB(mp, - cur->bc_private.i.agno, rbno), 1))) + cur->bc_private.a.agno, rbno), 1))) return error; xfs_trans_binval(cur->bc_tp, rbp); /* @@ -854,7 +854,7 @@ xfs_inobt_lookup( { xfs_agi_t *agi; /* a.g. inode header */ - agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp); + agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); agno = be32_to_cpu(agi->agi_seqno); agbno = be32_to_cpu(agi->agi_root); } @@ -1089,7 +1089,7 @@ xfs_inobt_lshift( * Set up the left neighbor as "left". */ if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.i.agno, be32_to_cpu(right->bb_leftsib), + cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib), 0, &lbp, XFS_INO_BTREE_REF))) return error; left = XFS_BUF_TO_INOBT_BLOCK(lbp); @@ -1207,10 +1207,10 @@ xfs_inobt_newroot( /* * Get a block & a buffer. */ - agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp); + agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); args.tp = cur->bc_tp; args.mp = cur->bc_mp; - args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno, + args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, be32_to_cpu(agi->agi_root)); args.mod = args.minleft = args.alignment = args.total = args.wasdel = args.isfl = args.userdata = args.minalignslop = 0; @@ -1233,7 +1233,7 @@ xfs_inobt_newroot( */ agi->agi_root = cpu_to_be32(args.agbno); be32_add_cpu(&agi->agi_level, 1); - xfs_ialloc_log_agi(args.tp, cur->bc_private.i.agbp, + xfs_ialloc_log_agi(args.tp, cur->bc_private.a.agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL); /* * At the previous root level there are now two blocks: the old @@ -1376,7 +1376,7 @@ xfs_inobt_rshift( * Set up the right neighbor as "right". */ if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.i.agno, be32_to_cpu(left->bb_rightsib), + cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0, &rbp, XFS_INO_BTREE_REF))) return error; right = XFS_BUF_TO_INOBT_BLOCK(rbp); @@ -1492,7 +1492,7 @@ xfs_inobt_split( * Allocate the new block. * If we can't do it, we're toast. Give up. */ - args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno, lbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, lbno); args.mod = args.minleft = args.alignment = args.total = args.wasdel = args.isfl = args.userdata = args.minalignslop = 0; args.minlen = args.maxlen = args.prod = 1; @@ -1725,7 +1725,7 @@ xfs_inobt_decrement( agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.i.agno, agbno, 0, &bp, + cur->bc_private.a.agno, agbno, 0, &bp, XFS_INO_BTREE_REF))) return error; lev--; @@ -1897,7 +1897,7 @@ xfs_inobt_increment( agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.i.agno, agbno, 0, &bp, + cur->bc_private.a.agno, agbno, 0, &bp, XFS_INO_BTREE_REF))) return error; lev--; diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index b07604b94d9f..e229e9e001c2 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -216,7 +216,14 @@ finish_inode: mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); init_waitqueue_head(&ip->i_ipin_wait); atomic_set(&ip->i_pincount, 0); - initnsema(&ip->i_flock, 1, "xfsfino"); + + /* + * Because we want to use a counting completion, complete + * the flush completion once to allow a single access to + * the flush completion without blocking. + */ + init_completion(&ip->i_flush); + complete(&ip->i_flush); if (lock_flags) xfs_ilock(ip, lock_flags); @@ -288,10 +295,17 @@ finish_inode: *ipp = ip; /* + * Set up the Linux with the Linux inode. + */ + ip->i_vnode = inode; + inode->i_private = ip; + + /* * If we have a real type for an on-disk inode, we can set ops(&unlock) * now. If it's a new inode being created, xfs_ialloc will handle it. */ - xfs_initialize_vnode(mp, inode, ip); + if (ip->i_d.di_mode != 0) + xfs_setup_inode(ip); return 0; } @@ -411,10 +425,11 @@ xfs_iput(xfs_inode_t *ip, * Special iput for brand-new inodes that are still locked */ void -xfs_iput_new(xfs_inode_t *ip, - uint lock_flags) +xfs_iput_new( + xfs_inode_t *ip, + uint lock_flags) { - struct inode *inode = ip->i_vnode; + struct inode *inode = VFS_I(ip); xfs_itrace_entry(ip); @@ -775,26 +790,3 @@ xfs_isilocked( } #endif -/* - * The following three routines simply manage the i_flock - * semaphore embedded in the inode. This semaphore synchronizes - * processes attempting to flush the in-core inode back to disk. - */ -void -xfs_iflock(xfs_inode_t *ip) -{ - psema(&(ip->i_flock), PINOD|PLTWAIT); -} - -int -xfs_iflock_nowait(xfs_inode_t *ip) -{ - return (cpsema(&(ip->i_flock))); -} - -void -xfs_ifunlock(xfs_inode_t *ip) -{ - ASSERT(issemalocked(&(ip->i_flock))); - vsema(&(ip->i_flock)); -} diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index e569bf5d6cf0..dbd9cef852ec 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -580,8 +580,8 @@ xfs_iformat_extents( xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip)); for (i = 0; i < nex; i++, dp++) { xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); - ep->l0 = be64_to_cpu(get_unaligned(&dp->l0)); - ep->l1 = be64_to_cpu(get_unaligned(&dp->l1)); + ep->l0 = get_unaligned_be64(&dp->l0); + ep->l1 = get_unaligned_be64(&dp->l1); } XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork); if (whichfork != XFS_DATA_FORK || @@ -835,22 +835,22 @@ xfs_iread( * Do this before xfs_iformat in case it adds entries. */ #ifdef XFS_INODE_TRACE - ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_SLEEP); + ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS); #endif #ifdef XFS_BMAP_TRACE - ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_SLEEP); + ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS); #endif #ifdef XFS_BMBT_TRACE - ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_SLEEP); + ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS); #endif #ifdef XFS_RW_TRACE - ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_SLEEP); + ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS); #endif #ifdef XFS_ILOCK_TRACE - ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_SLEEP); + ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS); #endif #ifdef XFS_DIR2_TRACE - ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_SLEEP); + ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS); #endif /* @@ -1046,9 +1046,9 @@ xfs_ialloc( { xfs_ino_t ino; xfs_inode_t *ip; - bhv_vnode_t *vp; uint flags; int error; + timespec_t tv; /* * Call the space management code to pick @@ -1077,13 +1077,12 @@ xfs_ialloc( } ASSERT(ip != NULL); - vp = XFS_ITOV(ip); ip->i_d.di_mode = (__uint16_t)mode; ip->i_d.di_onlink = 0; ip->i_d.di_nlink = nlink; ASSERT(ip->i_d.di_nlink == nlink); - ip->i_d.di_uid = current_fsuid(cr); - ip->i_d.di_gid = current_fsgid(cr); + ip->i_d.di_uid = current_fsuid(); + ip->i_d.di_gid = current_fsgid(); ip->i_d.di_projid = prid; memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); @@ -1130,7 +1129,13 @@ xfs_ialloc( ip->i_size = 0; ip->i_d.di_nextents = 0; ASSERT(ip->i_d.di_nblocks == 0); - xfs_ichgtime(ip, XFS_ICHGTIME_CHG|XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD); + + nanotime(&tv); + ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; + ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; + ip->i_d.di_atime = ip->i_d.di_mtime; + ip->i_d.di_ctime = ip->i_d.di_mtime; + /* * di_gen will have been taken care of in xfs_iread. */ @@ -1220,7 +1225,7 @@ xfs_ialloc( xfs_trans_log_inode(tp, ip, flags); /* now that we have an i_mode we can setup inode ops and unlock */ - xfs_initialize_vnode(tp->t_mountp, vp, ip); + xfs_setup_inode(ip); *ipp = ip; return 0; @@ -1399,7 +1404,6 @@ xfs_itruncate_start( xfs_fsize_t last_byte; xfs_off_t toss_start; xfs_mount_t *mp; - bhv_vnode_t *vp; int error = 0; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); @@ -1408,7 +1412,6 @@ xfs_itruncate_start( (flags == XFS_ITRUNC_MAYBE)); mp = ip->i_mount; - vp = XFS_ITOV(ip); /* wait for the completion of any pending DIOs */ if (new_size < ip->i_size) @@ -1457,7 +1460,7 @@ xfs_itruncate_start( #ifdef DEBUG if (new_size == 0) { - ASSERT(VN_CACHED(vp) == 0); + ASSERT(VN_CACHED(VFS_I(ip)) == 0); } #endif return error; @@ -1763,67 +1766,6 @@ xfs_itruncate_finish( return 0; } - -/* - * xfs_igrow_start - * - * Do the first part of growing a file: zero any data in the last - * block that is beyond the old EOF. We need to do this before - * the inode is joined to the transaction to modify the i_size. - * That way we can drop the inode lock and call into the buffer - * cache to get the buffer mapping the EOF. - */ -int -xfs_igrow_start( - xfs_inode_t *ip, - xfs_fsize_t new_size, - cred_t *credp) -{ - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); - ASSERT(new_size > ip->i_size); - - /* - * Zero any pages that may have been created by - * xfs_write_file() beyond the end of the file - * and any blocks between the old and new file sizes. - */ - return xfs_zero_eof(ip, new_size, ip->i_size); -} - -/* - * xfs_igrow_finish - * - * This routine is called to extend the size of a file. - * The inode must have both the iolock and the ilock locked - * for update and it must be a part of the current transaction. - * The xfs_igrow_start() function must have been called previously. - * If the change_flag is not zero, the inode change timestamp will - * be updated. - */ -void -xfs_igrow_finish( - xfs_trans_t *tp, - xfs_inode_t *ip, - xfs_fsize_t new_size, - int change_flag) -{ - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); - ASSERT(ip->i_transp == tp); - ASSERT(new_size > ip->i_size); - - /* - * Update the file size. Update the inode change timestamp - * if change_flag set. - */ - ip->i_d.di_size = new_size; - ip->i_size = new_size; - if (change_flag) - xfs_ichgtime(ip, XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - -} - - /* * This is called when the inode's link count goes to 0. * We place the on-disk inode on a list in the AGI. It @@ -2258,7 +2200,7 @@ xfs_ifree_cluster( xfs_trans_binval(tp, bp); } - kmem_free(ip_found, ninodes * sizeof(xfs_inode_t *)); + kmem_free(ip_found); xfs_put_perag(mp, pag); } @@ -2470,7 +2412,7 @@ xfs_iroot_realloc( (int)new_size); memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); } - kmem_free(ifp->if_broot, ifp->if_broot_bytes); + kmem_free(ifp->if_broot); ifp->if_broot = new_broot; ifp->if_broot_bytes = (int)new_size; ASSERT(ifp->if_broot_bytes <= @@ -2514,7 +2456,7 @@ xfs_idata_realloc( if (new_size == 0) { if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { - kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes); + kmem_free(ifp->if_u1.if_data); } ifp->if_u1.if_data = NULL; real_size = 0; @@ -2529,7 +2471,7 @@ xfs_idata_realloc( ASSERT(ifp->if_real_bytes != 0); memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data, new_size); - kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes); + kmem_free(ifp->if_u1.if_data); ifp->if_u1.if_data = ifp->if_u2.if_inline_data; } real_size = 0; @@ -2636,7 +2578,7 @@ xfs_idestroy_fork( ifp = XFS_IFORK_PTR(ip, whichfork); if (ifp->if_broot != NULL) { - kmem_free(ifp->if_broot, ifp->if_broot_bytes); + kmem_free(ifp->if_broot); ifp->if_broot = NULL; } @@ -2650,7 +2592,7 @@ xfs_idestroy_fork( if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) && (ifp->if_u1.if_data != NULL)) { ASSERT(ifp->if_real_bytes != 0); - kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes); + kmem_free(ifp->if_u1.if_data); ifp->if_u1.if_data = NULL; ifp->if_real_bytes = 0; } @@ -2691,7 +2633,6 @@ xfs_idestroy( xfs_idestroy_fork(ip, XFS_ATTR_FORK); mrfree(&ip->i_lock); mrfree(&ip->i_iolock); - freesema(&ip->i_flock); #ifdef XFS_INODE_TRACE ktrace_free(ip->i_trace); @@ -3058,7 +2999,7 @@ xfs_iflush_cluster( out_free: read_unlock(&pag->pag_ici_lock); - kmem_free(ilist, ilist_size); + kmem_free(ilist); return 0; @@ -3102,17 +3043,17 @@ cluster_corrupt_out: * Unlocks the flush lock */ xfs_iflush_abort(iq); - kmem_free(ilist, ilist_size); + kmem_free(ilist); return XFS_ERROR(EFSCORRUPTED); } /* * xfs_iflush() will write a modified inode's changes out to the * inode's on disk home. The caller must have the inode lock held - * in at least shared mode and the inode flush semaphore must be - * held as well. The inode lock will still be held upon return from + * in at least shared mode and the inode flush completion must be + * active as well. The inode lock will still be held upon return from * the call and the caller is free to unlock it. - * The inode flush lock will be unlocked when the inode reaches the disk. + * The inode flush will be completed when the inode reaches the disk. * The flags indicate how the inode's buffer should be written out. */ int @@ -3131,7 +3072,7 @@ xfs_iflush( XFS_STATS_INC(xs_iflush_count); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(issemalocked(&(ip->i_flock))); + ASSERT(!completion_done(&ip->i_flush)); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || ip->i_d.di_nextents > ip->i_df.if_ext_max); @@ -3143,8 +3084,6 @@ xfs_iflush( * flush lock and do nothing. */ if (xfs_inode_clean(ip)) { - ASSERT((iip != NULL) ? - !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1); xfs_ifunlock(ip); return 0; } @@ -3296,7 +3235,7 @@ xfs_iflush_int( #endif ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(issemalocked(&(ip->i_flock))); + ASSERT(!completion_done(&ip->i_flush)); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || ip->i_d.di_nextents > ip->i_df.if_ext_max); @@ -3528,7 +3467,6 @@ xfs_iflush_all( xfs_mount_t *mp) { xfs_inode_t *ip; - bhv_vnode_t *vp; again: XFS_MOUNT_ILOCK(mp); @@ -3543,14 +3481,13 @@ xfs_iflush_all( continue; } - vp = XFS_ITOV_NULL(ip); - if (!vp) { + if (!VFS_I(ip)) { XFS_MOUNT_IUNLOCK(mp); xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC); goto again; } - ASSERT(vn_count(vp) == 0); + ASSERT(vn_count(VFS_I(ip)) == 0); ip = ip->i_mnext; } while (ip != mp->m_inodes); @@ -3770,7 +3707,7 @@ xfs_iext_add_indirect_multi( * (all extents past */ if (nex2) { byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); - nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_SLEEP); + nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS); memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff); erp->er_extcount -= nex2; xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2); @@ -3836,7 +3773,7 @@ xfs_iext_add_indirect_multi( erp = xfs_iext_irec_new(ifp, erp_idx); } memmove(&erp->er_extbuf[i], nex2_ep, byte_diff); - kmem_free(nex2_ep, byte_diff); + kmem_free(nex2_ep); erp->er_extcount += nex2; xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2); } @@ -4070,8 +4007,7 @@ xfs_iext_realloc_direct( ifp->if_u1.if_extents = kmem_realloc(ifp->if_u1.if_extents, rnew_size, - ifp->if_real_bytes, - KM_SLEEP); + ifp->if_real_bytes, KM_NOFS); } if (rnew_size > ifp->if_real_bytes) { memset(&ifp->if_u1.if_extents[ifp->if_bytes / @@ -4112,7 +4048,7 @@ xfs_iext_direct_to_inline( */ memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents, nextents * sizeof(xfs_bmbt_rec_t)); - kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes); + kmem_free(ifp->if_u1.if_extents); ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; ifp->if_real_bytes = 0; } @@ -4130,7 +4066,7 @@ xfs_iext_inline_to_direct( xfs_ifork_t *ifp, /* inode fork pointer */ int new_size) /* number of extents in file */ { - ifp->if_u1.if_extents = kmem_alloc(new_size, KM_SLEEP); + ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS); memset(ifp->if_u1.if_extents, 0, new_size); if (ifp->if_bytes) { memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, @@ -4162,7 +4098,7 @@ xfs_iext_realloc_indirect( } else { ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) kmem_realloc(ifp->if_u1.if_ext_irec, - new_size, size, KM_SLEEP); + new_size, size, KM_NOFS); } } @@ -4182,11 +4118,11 @@ xfs_iext_indirect_to_direct( ASSERT(nextents <= XFS_LINEAR_EXTS); size = nextents * sizeof(xfs_bmbt_rec_t); - xfs_iext_irec_compact_full(ifp); + xfs_iext_irec_compact_pages(ifp); ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); ep = ifp->if_u1.if_ext_irec->er_extbuf; - kmem_free(ifp->if_u1.if_ext_irec, sizeof(xfs_ext_irec_t)); + kmem_free(ifp->if_u1.if_ext_irec); ifp->if_flags &= ~XFS_IFEXTIREC; ifp->if_u1.if_extents = ep; ifp->if_bytes = size; @@ -4212,7 +4148,7 @@ xfs_iext_destroy( } ifp->if_flags &= ~XFS_IFEXTIREC; } else if (ifp->if_real_bytes) { - kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes); + kmem_free(ifp->if_u1.if_extents); } else if (ifp->if_bytes) { memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)); @@ -4404,11 +4340,10 @@ xfs_iext_irec_init( nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); ASSERT(nextents <= XFS_LINEAR_EXTS); - erp = (xfs_ext_irec_t *) - kmem_alloc(sizeof(xfs_ext_irec_t), KM_SLEEP); + erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS); if (nextents == 0) { - ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_SLEEP); + ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); } else if (!ifp->if_real_bytes) { xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ); } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) { @@ -4456,7 +4391,7 @@ xfs_iext_irec_new( /* Initialize new extent record */ erp = ifp->if_u1.if_ext_irec; - erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_SLEEP); + erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ); erp[erp_idx].er_extcount = 0; @@ -4483,7 +4418,7 @@ xfs_iext_irec_remove( if (erp->er_extbuf) { xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -erp->er_extcount); - kmem_free(erp->er_extbuf, XFS_IEXT_BUFSZ); + kmem_free(erp->er_extbuf); } /* Compact extent records */ erp = ifp->if_u1.if_ext_irec; @@ -4501,8 +4436,7 @@ xfs_iext_irec_remove( xfs_iext_realloc_indirect(ifp, nlists * sizeof(xfs_ext_irec_t)); } else { - kmem_free(ifp->if_u1.if_ext_irec, - sizeof(xfs_ext_irec_t)); + kmem_free(ifp->if_u1.if_ext_irec); } ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; } @@ -4515,8 +4449,7 @@ xfs_iext_irec_remove( * compaction policy is as follows: * * Full Compaction: Extents fit into a single page (or inline buffer) - * Full Compaction: Extents occupy less than 10% of allocated space - * Partial Compaction: Extents occupy > 10% and < 50% of allocated space + * Partial Compaction: Extents occupy less than 50% of allocated space * No Compaction: Extents occupy at least 50% of allocated space */ void @@ -4537,8 +4470,6 @@ xfs_iext_irec_compact( xfs_iext_direct_to_inline(ifp, nextents); } else if (nextents <= XFS_LINEAR_EXTS) { xfs_iext_indirect_to_direct(ifp); - } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 3) { - xfs_iext_irec_compact_full(ifp); } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { xfs_iext_irec_compact_pages(ifp); } @@ -4562,7 +4493,7 @@ xfs_iext_irec_compact_pages( erp_next = erp + 1; if (erp_next->er_extcount <= (XFS_LINEAR_EXTS - erp->er_extcount)) { - memmove(&erp->er_extbuf[erp->er_extcount], + memcpy(&erp->er_extbuf[erp->er_extcount], erp_next->er_extbuf, erp_next->er_extcount * sizeof(xfs_bmbt_rec_t)); erp->er_extcount += erp_next->er_extcount; @@ -4571,75 +4502,13 @@ xfs_iext_irec_compact_pages( * so er_extoffs don't get modified in * xfs_iext_irec_remove. */ - kmem_free(erp_next->er_extbuf, XFS_IEXT_BUFSZ); - erp_next->er_extbuf = NULL; - xfs_iext_irec_remove(ifp, erp_idx + 1); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - } else { - erp_idx++; - } - } -} - -/* - * Fully compact the extent records managed by the indirection array. - */ -void -xfs_iext_irec_compact_full( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_bmbt_rec_host_t *ep, *ep_next; /* extent record pointers */ - xfs_ext_irec_t *erp, *erp_next; /* extent irec pointers */ - int erp_idx = 0; /* extent irec index */ - int ext_avail; /* empty entries in ex list */ - int ext_diff; /* number of exts to add */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - erp = ifp->if_u1.if_ext_irec; - ep = &erp->er_extbuf[erp->er_extcount]; - erp_next = erp + 1; - ep_next = erp_next->er_extbuf; - while (erp_idx < nlists - 1) { - ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; - ext_diff = MIN(ext_avail, erp_next->er_extcount); - memcpy(ep, ep_next, ext_diff * sizeof(xfs_bmbt_rec_t)); - erp->er_extcount += ext_diff; - erp_next->er_extcount -= ext_diff; - /* Remove next page */ - if (erp_next->er_extcount == 0) { - /* - * Free page before removing extent record - * so er_extoffs don't get modified in - * xfs_iext_irec_remove. - */ - kmem_free(erp_next->er_extbuf, - erp_next->er_extcount * sizeof(xfs_bmbt_rec_t)); + kmem_free(erp_next->er_extbuf); erp_next->er_extbuf = NULL; xfs_iext_irec_remove(ifp, erp_idx + 1); - erp = &ifp->if_u1.if_ext_irec[erp_idx]; nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - /* Update next page */ } else { - /* Move rest of page up to become next new page */ - memmove(erp_next->er_extbuf, ep_next, - erp_next->er_extcount * sizeof(xfs_bmbt_rec_t)); - ep_next = erp_next->er_extbuf; - memset(&ep_next[erp_next->er_extcount], 0, - (XFS_LINEAR_EXTS - erp_next->er_extcount) * - sizeof(xfs_bmbt_rec_t)); - } - if (erp->er_extcount == XFS_LINEAR_EXTS) { erp_idx++; - if (erp_idx < nlists) - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - else - break; } - ep = &erp->er_extbuf[erp->er_extcount]; - erp_next = erp + 1; - ep_next = erp_next->er_extbuf; } } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 0a999fee4f03..1420c49674d7 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -87,8 +87,7 @@ typedef struct xfs_ifork { * Flags for xfs_ichgtime(). */ #define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ -#define XFS_ICHGTIME_ACC 0x2 /* data fork access timestamp */ -#define XFS_ICHGTIME_CHG 0x4 /* inode field change timestamp */ +#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ /* * Per-fork incore inode flags. @@ -204,7 +203,7 @@ typedef struct xfs_inode { struct xfs_inode *i_mprev; /* ptr to prev inode */ struct xfs_mount *i_mount; /* fs mount struct ptr */ struct list_head i_reclaim; /* reclaim list */ - bhv_vnode_t *i_vnode; /* vnode backpointer */ + struct inode *i_vnode; /* vnode backpointer */ struct xfs_dquot *i_udquot; /* user dquot */ struct xfs_dquot *i_gdquot; /* group dquot */ @@ -223,7 +222,7 @@ typedef struct xfs_inode { struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ mrlock_t i_iolock; /* inode IO lock */ - sema_t i_flock; /* inode flush lock */ + struct completion i_flush; /* inode flush completion q */ atomic_t i_pincount; /* inode pin count */ wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */ spinlock_t i_flags_lock; /* inode i_flags lock */ @@ -263,6 +262,18 @@ typedef struct xfs_inode { #define XFS_ISIZE(ip) (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \ (ip)->i_size : (ip)->i_d.di_size; +/* Convert from vfs inode to xfs inode */ +static inline struct xfs_inode *XFS_I(struct inode *inode) +{ + return (struct xfs_inode *)inode->i_private; +} + +/* convert from xfs inode to vfs inode */ +static inline struct inode *VFS_I(struct xfs_inode *ip) +{ + return (struct inode *)ip->i_vnode; +} + /* * i_flags helper functions */ @@ -439,9 +450,6 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) #define XFS_ITRUNC_DEFINITE 0x1 #define XFS_ITRUNC_MAYBE 0x2 -#define XFS_ITOV(ip) ((ip)->i_vnode) -#define XFS_ITOV_NULL(ip) ((ip)->i_vnode) - /* * For multiple groups support: if S_ISGID bit is set in the parent * directory, group of new file is set to that of the parent, and @@ -473,11 +481,8 @@ int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); void xfs_ilock_demote(xfs_inode_t *, uint); int xfs_isilocked(xfs_inode_t *, uint); -void xfs_iflock(xfs_inode_t *); -int xfs_iflock_nowait(xfs_inode_t *); uint xfs_ilock_map_shared(xfs_inode_t *); void xfs_iunlock_map_shared(xfs_inode_t *, uint); -void xfs_ifunlock(xfs_inode_t *); void xfs_ireclaim(xfs_inode_t *); int xfs_finish_reclaim(xfs_inode_t *, int, int); int xfs_finish_reclaim_all(struct xfs_mount *, int); @@ -507,9 +512,6 @@ int xfs_itruncate_start(xfs_inode_t *, uint, xfs_fsize_t); int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *, xfs_fsize_t, int, int); int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); -int xfs_igrow_start(xfs_inode_t *, xfs_fsize_t, struct cred *); -void xfs_igrow_finish(struct xfs_trans *, xfs_inode_t *, - xfs_fsize_t, int); void xfs_idestroy_fork(xfs_inode_t *, int); void xfs_idestroy(xfs_inode_t *); @@ -525,6 +527,7 @@ void xfs_iflush_all(struct xfs_mount *); void xfs_ichgtime(xfs_inode_t *, int); xfs_fsize_t xfs_file_last_byte(xfs_inode_t *); void xfs_lock_inodes(xfs_inode_t **, int, uint); +void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); void xfs_synchronize_atime(xfs_inode_t *); void xfs_mark_inode_dirty_sync(xfs_inode_t *); @@ -573,6 +576,26 @@ extern struct kmem_zone *xfs_ifork_zone; extern struct kmem_zone *xfs_inode_zone; extern struct kmem_zone *xfs_ili_zone; +/* + * Manage the i_flush queue embedded in the inode. This completion + * queue synchronizes processes attempting to flush the in-core + * inode back to disk. + */ +static inline void xfs_iflock(xfs_inode_t *ip) +{ + wait_for_completion(&ip->i_flush); +} + +static inline int xfs_iflock_nowait(xfs_inode_t *ip) +{ + return try_wait_for_completion(&ip->i_flush); +} + +static inline void xfs_ifunlock(xfs_inode_t *ip) +{ + complete(&ip->i_flush); +} + #endif /* __KERNEL__ */ #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 167b33f15772..97c7452e2620 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -686,7 +686,7 @@ xfs_inode_item_unlock( ASSERT(ip->i_d.di_nextents > 0); ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT); ASSERT(ip->i_df.if_bytes > 0); - kmem_free(iip->ili_extents_buf, ip->i_df.if_bytes); + kmem_free(iip->ili_extents_buf); iip->ili_extents_buf = NULL; } if (iip->ili_aextents_buf != NULL) { @@ -694,7 +694,7 @@ xfs_inode_item_unlock( ASSERT(ip->i_d.di_anextents > 0); ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_AEXT); ASSERT(ip->i_afp->if_bytes > 0); - kmem_free(iip->ili_aextents_buf, ip->i_afp->if_bytes); + kmem_free(iip->ili_aextents_buf); iip->ili_aextents_buf = NULL; } @@ -779,11 +779,10 @@ xfs_inode_item_pushbuf( ASSERT(iip->ili_push_owner == current_pid()); /* - * If flushlock isn't locked anymore, chances are that the - * inode flush completed and the inode was taken off the AIL. - * So, just get out. + * If a flush is not in progress anymore, chances are that the + * inode was taken off the AIL. So, just get out. */ - if (!issemalocked(&(ip->i_flock)) || + if (completion_done(&ip->i_flush) || ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) { iip->ili_pushbuf_flag = 0; xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -805,7 +804,7 @@ xfs_inode_item_pushbuf( * If not, we can flush it async. */ dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) && - issemalocked(&(ip->i_flock))); + !completion_done(&ip->i_flush)); iip->ili_pushbuf_flag = 0; xfs_iunlock(ip, XFS_ILOCK_SHARED); xfs_buftrace("INODE ITEM PUSH", bp); @@ -858,7 +857,7 @@ xfs_inode_item_push( ip = iip->ili_inode; ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); - ASSERT(issemalocked(&(ip->i_flock))); + ASSERT(!completion_done(&ip->i_flush)); /* * Since we were able to lock the inode's flush lock and * we found it on the AIL, the inode must be dirty. This @@ -957,8 +956,7 @@ xfs_inode_item_destroy( { #ifdef XFS_TRANS_DEBUG if (ip->i_itemp->ili_root_size != 0) { - kmem_free(ip->i_itemp->ili_orig_root, - ip->i_itemp->ili_root_size); + kmem_free(ip->i_itemp->ili_orig_root); } #endif kmem_zone_free(xfs_ili_zone, ip->i_itemp); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 7edcde691d1a..67f22b2b44b3 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -889,6 +889,16 @@ xfs_iomap_write_unwritten( count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb); + /* + * Reserve enough blocks in this transaction for two complete extent + * btree splits. We may be converting the middle part of an unwritten + * extent and in this case we will insert two new extents in the btree + * each of which could cause a full split. + * + * This reservation amount will be used in the first call to + * xfs_bmbt_split() to select an AG with enough space to satisfy the + * rest of the operation. + */ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; do { diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 419de15aeb43..cf6754a3c5b3 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -59,7 +59,6 @@ xfs_bulkstat_one_iget( { xfs_icdinode_t *dic; /* dinode core info pointer */ xfs_inode_t *ip; /* incore inode pointer */ - bhv_vnode_t *vp; int error; error = xfs_iget(mp, NULL, ino, @@ -72,7 +71,6 @@ xfs_bulkstat_one_iget( ASSERT(ip != NULL); ASSERT(ip->i_blkno != (xfs_daddr_t)0); - vp = XFS_ITOV(ip); dic = &ip->i_d; /* xfs_iget returns the following without needing @@ -85,7 +83,7 @@ xfs_bulkstat_one_iget( buf->bs_uid = dic->di_uid; buf->bs_gid = dic->di_gid; buf->bs_size = dic->di_size; - vn_atime_to_bstime(vp, &buf->bs_atime); + vn_atime_to_bstime(VFS_I(ip), &buf->bs_atime); buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; @@ -257,7 +255,7 @@ xfs_bulkstat_one( *ubused = error; out_free: - kmem_free(buf, sizeof(*buf)); + kmem_free(buf); return error; } @@ -708,7 +706,7 @@ xfs_bulkstat( /* * Done, we're either out of filesystem or space to put the data. */ - kmem_free(irbuf, irbsize); + kmem_free(irbuf); *ubcountp = ubelem; /* * Found some inodes, return them now and return the error next time. @@ -914,7 +912,7 @@ xfs_inumbers( } *lastino = XFS_AGINO_TO_INO(mp, agno, agino); } - kmem_free(buffer, bcount * sizeof(*buffer)); + kmem_free(buffer); if (cur) xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR)); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index afaee301b0ee..0b02c6443551 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -124,16 +124,27 @@ STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, STATIC int xlog_iclogs_empty(xlog_t *log); #if defined(XFS_LOG_TRACE) + +#define XLOG_TRACE_LOGGRANT_SIZE 2048 +#define XLOG_TRACE_ICLOG_SIZE 256 + +void +xlog_trace_loggrant_alloc(xlog_t *log) +{ + log->l_grant_trace = ktrace_alloc(XLOG_TRACE_LOGGRANT_SIZE, KM_NOFS); +} + +void +xlog_trace_loggrant_dealloc(xlog_t *log) +{ + ktrace_free(log->l_grant_trace); +} + void xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) { unsigned long cnts; - if (!log->l_grant_trace) { - log->l_grant_trace = ktrace_alloc(2048, KM_NOSLEEP); - if (!log->l_grant_trace) - return; - } /* ticket counts are 1 byte each */ cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8; @@ -157,10 +168,20 @@ xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) } void +xlog_trace_iclog_alloc(xlog_in_core_t *iclog) +{ + iclog->ic_trace = ktrace_alloc(XLOG_TRACE_ICLOG_SIZE, KM_NOFS); +} + +void +xlog_trace_iclog_dealloc(xlog_in_core_t *iclog) +{ + ktrace_free(iclog->ic_trace); +} + +void xlog_trace_iclog(xlog_in_core_t *iclog, uint state) { - if (!iclog->ic_trace) - iclog->ic_trace = ktrace_alloc(256, KM_SLEEP); ktrace_enter(iclog->ic_trace, (void *)((unsigned long)state), (void *)((unsigned long)current_pid()), @@ -170,8 +191,15 @@ xlog_trace_iclog(xlog_in_core_t *iclog, uint state) (void *)NULL, (void *)NULL); } #else + +#define xlog_trace_loggrant_alloc(log) +#define xlog_trace_loggrant_dealloc(log) #define xlog_trace_loggrant(log,tic,string) + +#define xlog_trace_iclog_alloc(iclog) +#define xlog_trace_iclog_dealloc(iclog) #define xlog_trace_iclog(iclog,state) + #endif /* XFS_LOG_TRACE */ @@ -226,20 +254,24 @@ xlog_grant_sub_space(struct log *log, int bytes) static void xlog_grant_add_space_write(struct log *log, int bytes) { - log->l_grant_write_bytes += bytes; - if (log->l_grant_write_bytes > log->l_logsize) { - log->l_grant_write_bytes -= log->l_logsize; + int tmp = log->l_logsize - log->l_grant_write_bytes; + if (tmp > bytes) + log->l_grant_write_bytes += bytes; + else { log->l_grant_write_cycle++; + log->l_grant_write_bytes = bytes - tmp; } } static void xlog_grant_add_space_reserve(struct log *log, int bytes) { - log->l_grant_reserve_bytes += bytes; - if (log->l_grant_reserve_bytes > log->l_logsize) { - log->l_grant_reserve_bytes -= log->l_logsize; + int tmp = log->l_logsize - log->l_grant_reserve_bytes; + if (tmp > bytes) + log->l_grant_reserve_bytes += bytes; + else { log->l_grant_reserve_cycle++; + log->l_grant_reserve_bytes = bytes - tmp; } } @@ -332,15 +364,12 @@ xfs_log_done(xfs_mount_t *mp, } else { xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)"); xlog_regrant_reserve_log_space(log, ticket); - } - - /* If this ticket was a permanent reservation and we aren't - * trying to release it, reset the inited flags; so next time - * we write, a start record will be written out. - */ - if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) && - (flags & XFS_LOG_REL_PERM_RESERV) == 0) + /* If this ticket was a permanent reservation and we aren't + * trying to release it, reset the inited flags; so next time + * we write, a start record will be written out. + */ ticket->t_flags |= XLOG_TIC_INITED; + } return lsn; } /* xfs_log_done */ @@ -353,11 +382,11 @@ xfs_log_done(xfs_mount_t *mp, * Asynchronous forces are implemented by setting the WANT_SYNC * bit in the appropriate in-core log and then returning. * - * Synchronous forces are implemented with a semaphore. All callers - * to force a given lsn to disk will wait on a semaphore attached to the + * Synchronous forces are implemented with a signal variable. All callers + * to force a given lsn to disk will wait on a the sv attached to the * specific in-core log. When given in-core log finally completes its * write to disk, that thread will wake up all threads waiting on the - * semaphore. + * sv. */ int _xfs_log_force( @@ -584,12 +613,12 @@ error: * mp - ubiquitous xfs mount point structure */ int -xfs_log_mount_finish(xfs_mount_t *mp, int mfsi_flags) +xfs_log_mount_finish(xfs_mount_t *mp) { int error; if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) - error = xlog_recover_finish(mp->m_log, mfsi_flags); + error = xlog_recover_finish(mp->m_log); else { error = 0; ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); @@ -703,7 +732,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) if (!(iclog->ic_state == XLOG_STATE_ACTIVE || iclog->ic_state == XLOG_STATE_DIRTY)) { if (!XLOG_FORCED_SHUTDOWN(log)) { - sv_wait(&iclog->ic_forcesema, PMEM, + sv_wait(&iclog->ic_force_wait, PMEM, &log->l_icloglock, s); } else { spin_unlock(&log->l_icloglock); @@ -744,7 +773,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) || iclog->ic_state == XLOG_STATE_DIRTY || iclog->ic_state == XLOG_STATE_IOERROR) ) { - sv_wait(&iclog->ic_forcesema, PMEM, + sv_wait(&iclog->ic_force_wait, PMEM, &log->l_icloglock, s); } else { spin_unlock(&log->l_icloglock); @@ -834,7 +863,7 @@ xfs_log_move_tail(xfs_mount_t *mp, break; tail_lsn = 0; free_bytes -= tic->t_unit_res; - sv_signal(&tic->t_sema); + sv_signal(&tic->t_wait); tic = tic->t_next; } while (tic != log->l_write_headq); } @@ -855,7 +884,7 @@ xfs_log_move_tail(xfs_mount_t *mp, break; tail_lsn = 0; free_bytes -= need_bytes; - sv_signal(&tic->t_sema); + sv_signal(&tic->t_wait); tic = tic->t_next; } while (tic != log->l_reserve_headq); } @@ -1004,11 +1033,12 @@ xlog_iodone(xfs_buf_t *bp) l = iclog->ic_log; /* - * If the ordered flag has been removed by a lower - * layer, it means the underlyin device no longer supports + * If the _XFS_BARRIER_FAILED flag was set by a lower + * layer, it means the underlying device no longer supports * barrier I/O. Warn loudly and turn off barriers. */ - if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ORDERED(bp)) { + if (bp->b_flags & _XFS_BARRIER_FAILED) { + bp->b_flags &= ~_XFS_BARRIER_FAILED; l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER; xfs_fs_cmn_err(CE_WARN, l->l_mp, "xlog_iodone: Barriers are no longer supported" @@ -1228,8 +1258,9 @@ xlog_alloc_log(xfs_mount_t *mp, spin_lock_init(&log->l_icloglock); spin_lock_init(&log->l_grant_lock); - initnsema(&log->l_flushsema, 0, "ic-flush"); + sv_init(&log->l_flush_wait, 0, "flush_wait"); + xlog_trace_loggrant_alloc(log); /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); @@ -1281,8 +1312,10 @@ xlog_alloc_log(xfs_mount_t *mp, ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); - sv_init(&iclog->ic_forcesema, SV_DEFAULT, "iclog-force"); - sv_init(&iclog->ic_writesema, SV_DEFAULT, "iclog-write"); + sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); + sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); + + xlog_trace_iclog_alloc(iclog); iclogp = &iclog->ic_next; } @@ -1561,33 +1594,21 @@ xlog_dealloc_log(xlog_t *log) iclog = log->l_iclog; for (i=0; i<log->l_iclog_bufs; i++) { - sv_destroy(&iclog->ic_forcesema); - sv_destroy(&iclog->ic_writesema); + sv_destroy(&iclog->ic_force_wait); + sv_destroy(&iclog->ic_write_wait); xfs_buf_free(iclog->ic_bp); -#ifdef XFS_LOG_TRACE - if (iclog->ic_trace != NULL) { - ktrace_free(iclog->ic_trace); - } -#endif + xlog_trace_iclog_dealloc(iclog); next_iclog = iclog->ic_next; - kmem_free(iclog, sizeof(xlog_in_core_t)); + kmem_free(iclog); iclog = next_iclog; } - freesema(&log->l_flushsema); spinlock_destroy(&log->l_icloglock); spinlock_destroy(&log->l_grant_lock); xfs_buf_free(log->l_xbuf); -#ifdef XFS_LOG_TRACE - if (log->l_trace != NULL) { - ktrace_free(log->l_trace); - } - if (log->l_grant_trace != NULL) { - ktrace_free(log->l_grant_trace); - } -#endif + xlog_trace_loggrant_dealloc(log); log->l_mp->m_log = NULL; - kmem_free(log, sizeof(xlog_t)); + kmem_free(log); } /* xlog_dealloc_log */ /* @@ -1973,7 +1994,7 @@ xlog_write(xfs_mount_t * mp, /* Clean iclogs starting from the head. This ordering must be * maintained, so an iclog doesn't become ACTIVE beyond one that * is SYNCING. This is also required to maintain the notion that we use - * a counting semaphore to hold off would be writers to the log when every + * a ordered wait queue to hold off would be writers to the log when every * iclog is trying to sync to disk. * * State Change: DIRTY -> ACTIVE @@ -2097,6 +2118,7 @@ xlog_state_do_callback( int funcdidcallbacks; /* flag: function did callbacks */ int repeats; /* for issuing console warnings if * looping too many times */ + int wake = 0; spin_lock(&log->l_icloglock); first_iclog = iclog = log->l_iclog; @@ -2236,7 +2258,7 @@ xlog_state_do_callback( xlog_state_clean_log(log); /* wake up threads waiting in xfs_log_force() */ - sv_broadcast(&iclog->ic_forcesema); + sv_broadcast(&iclog->ic_force_wait); iclog = iclog->ic_next; } while (first_iclog != iclog); @@ -2278,15 +2300,13 @@ xlog_state_do_callback( } #endif - flushcnt = 0; - if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) { - flushcnt = log->l_flushcnt; - log->l_flushcnt = 0; - } + if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) + wake = 1; spin_unlock(&log->l_icloglock); - while (flushcnt--) - vsema(&log->l_flushsema); -} /* xlog_state_do_callback */ + + if (wake) + sv_broadcast(&log->l_flush_wait); +} /* @@ -2300,8 +2320,7 @@ xlog_state_do_callback( * the second completion goes through. * * Callbacks could take time, so they are done outside the scope of the - * global state machine log lock. Assume that the calls to cvsema won't - * take a long time. At least we know it won't sleep. + * global state machine log lock. */ STATIC void xlog_state_done_syncing( @@ -2337,7 +2356,7 @@ xlog_state_done_syncing( * iclog buffer, we wake them all, one will get to do the * I/O, the others get to wait for the result. */ - sv_broadcast(&iclog->ic_writesema); + sv_broadcast(&iclog->ic_write_wait); spin_unlock(&log->l_icloglock); xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ } /* xlog_state_done_syncing */ @@ -2345,11 +2364,9 @@ xlog_state_done_syncing( /* * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must - * sleep. The flush semaphore is set to the number of in-core buffers and - * decremented around disk syncing. Therefore, if all buffers are syncing, - * this semaphore will cause new writes to sleep until a sync completes. - * Otherwise, this code just does p() followed by v(). This approximates - * a sleep/wakeup except we can't race. + * sleep. We wait on the flush queue on the head iclog as that should be + * the first iclog to complete flushing. Hence if all iclogs are syncing, + * we will wait here and all new writes will sleep until a sync completes. * * The in-core logs are used in a circular fashion. They are not used * out-of-order even when an iclog past the head is free. @@ -2384,16 +2401,15 @@ restart: } iclog = log->l_iclog; - if (! (iclog->ic_state == XLOG_STATE_ACTIVE)) { - log->l_flushcnt++; - spin_unlock(&log->l_icloglock); + if (iclog->ic_state != XLOG_STATE_ACTIVE) { xlog_trace_iclog(iclog, XLOG_TRACE_SLEEP_FLUSH); XFS_STATS_INC(xs_log_noiclogs); - /* Ensure that log writes happen */ - psema(&log->l_flushsema, PINOD); + + /* Wait for log writes to have flushed */ + sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0); goto restart; } - ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + head = &iclog->ic_header; atomic_inc(&iclog->ic_refcnt); /* prevents sync */ @@ -2427,13 +2443,20 @@ restart: if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { xlog_state_switch_iclogs(log, iclog, iclog->ic_size); - /* If I'm the only one writing to this iclog, sync it to disk */ - if (atomic_read(&iclog->ic_refcnt) == 1) { + /* + * If I'm the only one writing to this iclog, sync it to disk. + * We need to do an atomic compare and decrement here to avoid + * racing with concurrent atomic_dec_and_lock() calls in + * xlog_state_release_iclog() when there is more than one + * reference to the iclog. + */ + if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) { + /* we are the only one */ spin_unlock(&log->l_icloglock); - if ((error = xlog_state_release_iclog(log, iclog))) + error = xlog_state_release_iclog(log, iclog); + if (error) return error; } else { - atomic_dec(&iclog->ic_refcnt); spin_unlock(&log->l_icloglock); } goto restart; @@ -2500,7 +2523,7 @@ xlog_grant_log_space(xlog_t *log, goto error_return; XFS_STATS_INC(xs_sleep_logspace); - sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); + sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); /* * If we got an error, and the filesystem is shutting down, * we'll catch it down below. So just continue... @@ -2526,7 +2549,7 @@ redo: xlog_trace_loggrant(log, tic, "xlog_grant_log_space: sleep 2"); XFS_STATS_INC(xs_sleep_logspace); - sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); + sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); if (XLOG_FORCED_SHUTDOWN(log)) { spin_lock(&log->l_grant_lock); @@ -2625,7 +2648,7 @@ xlog_regrant_write_log_space(xlog_t *log, if (free_bytes < ntic->t_unit_res) break; free_bytes -= ntic->t_unit_res; - sv_signal(&ntic->t_sema); + sv_signal(&ntic->t_wait); ntic = ntic->t_next; } while (ntic != log->l_write_headq); @@ -2636,7 +2659,7 @@ xlog_regrant_write_log_space(xlog_t *log, xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: sleep 1"); XFS_STATS_INC(xs_sleep_logspace); - sv_wait(&tic->t_sema, PINOD|PLTWAIT, + sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); /* If we're shutting down, this tic is already @@ -2665,7 +2688,7 @@ redo: if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) xlog_ins_ticketq(&log->l_write_headq, tic); XFS_STATS_INC(xs_sleep_logspace); - sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); + sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); /* If we're shutting down, this tic is already off the queue */ if (XLOG_FORCED_SHUTDOWN(log)) { @@ -2908,7 +2931,7 @@ xlog_state_switch_iclogs(xlog_t *log, * 2. the current iclog is drity, and the previous iclog is in the * active or dirty state. * - * We may sleep (call psema) if: + * We may sleep if: * * 1. the current iclog is not in the active nor dirty state. * 2. the current iclog dirty, and the previous iclog is not in the @@ -3005,7 +3028,7 @@ maybe_sleep: return XFS_ERROR(EIO); } XFS_STATS_INC(xs_log_force_sleep); - sv_wait(&iclog->ic_forcesema, PINOD, &log->l_icloglock, s); + sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s); /* * No need to grab the log lock here since we're * only deciding whether or not to return EIO @@ -3088,7 +3111,7 @@ try_again: XLOG_STATE_SYNCING))) { ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); XFS_STATS_INC(xs_log_force_sleep); - sv_wait(&iclog->ic_prev->ic_writesema, PSWP, + sv_wait(&iclog->ic_prev->ic_write_wait, PSWP, &log->l_icloglock, s); *log_flushed = 1; already_slept = 1; @@ -3108,7 +3131,7 @@ try_again: !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { /* - * Don't wait on the forcesema if we know that we've + * Don't wait on completion if we know that we've * gotten a log write error. */ if (iclog->ic_state & XLOG_STATE_IOERROR) { @@ -3116,7 +3139,7 @@ try_again: return XFS_ERROR(EIO); } XFS_STATS_INC(xs_log_force_sleep); - sv_wait(&iclog->ic_forcesema, PSWP, &log->l_icloglock, s); + sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s); /* * No need to grab the log lock here since we're * only deciding whether or not to return EIO @@ -3172,7 +3195,7 @@ STATIC void xlog_ticket_put(xlog_t *log, xlog_ticket_t *ticket) { - sv_destroy(&ticket->t_sema); + sv_destroy(&ticket->t_wait); kmem_zone_free(xfs_log_ticket_zone, ticket); } /* xlog_ticket_put */ @@ -3262,7 +3285,7 @@ xlog_ticket_get(xlog_t *log, tic->t_trans_type = 0; if (xflags & XFS_LOG_PERM_RESERV) tic->t_flags |= XLOG_TIC_PERM_RESERV; - sv_init(&(tic->t_sema), SV_DEFAULT, "logtick"); + sv_init(&(tic->t_wait), SV_DEFAULT, "logtick"); xlog_tic_reset_res(tic); @@ -3549,14 +3572,14 @@ xfs_log_force_umount( */ if ((tic = log->l_reserve_headq)) { do { - sv_signal(&tic->t_sema); + sv_signal(&tic->t_wait); tic = tic->t_next; } while (tic != log->l_reserve_headq); } if ((tic = log->l_write_headq)) { do { - sv_signal(&tic->t_sema); + sv_signal(&tic->t_wait); tic = tic->t_next; } while (tic != log->l_write_headq); } diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index d1d678ecb63e..d47b91f10822 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -149,7 +149,7 @@ int xfs_log_mount(struct xfs_mount *mp, struct xfs_buftarg *log_target, xfs_daddr_t start_block, int num_bblocks); -int xfs_log_mount_finish(struct xfs_mount *mp, int); +int xfs_log_mount_finish(struct xfs_mount *mp); void xfs_log_move_tail(struct xfs_mount *mp, xfs_lsn_t tail_lsn); int xfs_log_notify(struct xfs_mount *mp, diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 8952a392b5f3..e7d8f84443fa 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -241,7 +241,7 @@ typedef struct xlog_res { } xlog_res_t; typedef struct xlog_ticket { - sv_t t_sema; /* sleep on this semaphore : 20 */ + sv_t t_wait; /* ticket wait queue : 20 */ struct xlog_ticket *t_next; /* :4|8 */ struct xlog_ticket *t_prev; /* :4|8 */ xlog_tid_t t_tid; /* transaction identifier : 4 */ @@ -314,7 +314,7 @@ typedef struct xlog_rec_ext_header { * xlog_rec_header_t into the reserved space. * - ic_data follows, so a write to disk can start at the beginning of * the iclog. - * - ic_forcesema is used to implement synchronous forcing of the iclog to disk. + * - ic_forcewait is used to implement synchronous forcing of the iclog to disk. * - ic_next is the pointer to the next iclog in the ring. * - ic_bp is a pointer to the buffer used to write this incore log to disk. * - ic_log is a pointer back to the global log structure. @@ -339,8 +339,8 @@ typedef struct xlog_rec_ext_header { * and move everything else out to subsequent cachelines. */ typedef struct xlog_iclog_fields { - sv_t ic_forcesema; - sv_t ic_writesema; + sv_t ic_force_wait; + sv_t ic_write_wait; struct xlog_in_core *ic_next; struct xlog_in_core *ic_prev; struct xfs_buf *ic_bp; @@ -377,8 +377,8 @@ typedef struct xlog_in_core { /* * Defines to save our code from this glop. */ -#define ic_forcesema hic_fields.ic_forcesema -#define ic_writesema hic_fields.ic_writesema +#define ic_force_wait hic_fields.ic_force_wait +#define ic_write_wait hic_fields.ic_write_wait #define ic_next hic_fields.ic_next #define ic_prev hic_fields.ic_prev #define ic_bp hic_fields.ic_bp @@ -423,10 +423,8 @@ typedef struct log { int l_logBBsize; /* size of log in BB chunks */ /* The following block of fields are changed while holding icloglock */ - sema_t l_flushsema ____cacheline_aligned_in_smp; - /* iclog flushing semaphore */ - int l_flushcnt; /* # of procs waiting on this - * sema */ + sv_t l_flush_wait ____cacheline_aligned_in_smp; + /* waiting for iclog flush */ int l_covered_state;/* state of "covering disk * log entries" */ xlog_in_core_t *l_iclog; /* head log queue */ @@ -450,7 +448,6 @@ typedef struct log { int l_grant_write_bytes; #ifdef XFS_LOG_TRACE - struct ktrace *l_trace; struct ktrace *l_grant_trace; #endif @@ -470,7 +467,7 @@ extern int xlog_find_tail(xlog_t *log, xfs_daddr_t *head_blk, xfs_daddr_t *tail_blk); extern int xlog_recover(xlog_t *log); -extern int xlog_recover_finish(xlog_t *log, int mfsi_flags); +extern int xlog_recover_finish(xlog_t *log); extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); extern void xlog_recover_process_iunlinks(xlog_t *log); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index e65ab4af0955..82d46ce69d5f 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1715,8 +1715,7 @@ xlog_check_buffer_cancelled( } else { prevp->bc_next = bcp->bc_next; } - kmem_free(bcp, - sizeof(xfs_buf_cancel_t)); + kmem_free(bcp); } } return 1; @@ -2519,7 +2518,7 @@ write_inode_buffer: error: if (need_free) - kmem_free(in_f, sizeof(*in_f)); + kmem_free(in_f); return XFS_ERROR(error); } @@ -2830,16 +2829,14 @@ xlog_recover_free_trans( item = item->ri_next; /* Free the regions in the item. */ for (i = 0; i < free_item->ri_cnt; i++) { - kmem_free(free_item->ri_buf[i].i_addr, - free_item->ri_buf[i].i_len); + kmem_free(free_item->ri_buf[i].i_addr); } /* Free the item itself */ - kmem_free(free_item->ri_buf, - (free_item->ri_total * sizeof(xfs_log_iovec_t))); - kmem_free(free_item, sizeof(xlog_recover_item_t)); + kmem_free(free_item->ri_buf); + kmem_free(free_item); } while (first_item != item); /* Free the transaction recover structure */ - kmem_free(trans, sizeof(xlog_recover_t)); + kmem_free(trans); } STATIC int @@ -3786,8 +3783,7 @@ xlog_do_log_recovery( error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS1); if (error != 0) { - kmem_free(log->l_buf_cancel_table, - XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*)); + kmem_free(log->l_buf_cancel_table); log->l_buf_cancel_table = NULL; return error; } @@ -3806,8 +3802,7 @@ xlog_do_log_recovery( } #endif /* DEBUG */ - kmem_free(log->l_buf_cancel_table, - XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*)); + kmem_free(log->l_buf_cancel_table); log->l_buf_cancel_table = NULL; return error; @@ -3945,8 +3940,7 @@ xlog_recover( */ int xlog_recover_finish( - xlog_t *log, - int mfsi_flags) + xlog_t *log) { /* * Now we're ready to do the transactions needed for the @@ -3974,9 +3968,7 @@ xlog_recover_finish( xfs_log_force(log->l_mp, (xfs_lsn_t)0, (XFS_LOG_FORCE | XFS_LOG_SYNC)); - if ( (mfsi_flags & XFS_MFSI_NOUNLINK) == 0 ) { - xlog_recover_process_iunlinks(log); - } + xlog_recover_process_iunlinks(log); xlog_recover_check_summary(log); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index da3988453b71..a4503f5e9497 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -47,12 +47,10 @@ STATIC int xfs_mount_log_sb(xfs_mount_t *, __int64_t); STATIC int xfs_uuid_mount(xfs_mount_t *); -STATIC void xfs_uuid_unmount(xfs_mount_t *mp); STATIC void xfs_unmountfs_wait(xfs_mount_t *); #ifdef HAVE_PERCPU_SB -STATIC void xfs_icsb_destroy_counters(xfs_mount_t *); STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, int); STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t, @@ -63,7 +61,6 @@ STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); #else -#define xfs_icsb_destroy_counters(mp) do { } while (0) #define xfs_icsb_balance_counter(mp, a, b) do { } while (0) #define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) #define xfs_icsb_modify_counters(mp, a, b, c) do { } while (0) @@ -126,34 +123,12 @@ static const struct { }; /* - * Return a pointer to an initialized xfs_mount structure. - */ -xfs_mount_t * -xfs_mount_init(void) -{ - xfs_mount_t *mp; - - mp = kmem_zalloc(sizeof(xfs_mount_t), KM_SLEEP); - - if (xfs_icsb_init_counters(mp)) { - mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; - } - - spin_lock_init(&mp->m_sb_lock); - mutex_init(&mp->m_ilock); - mutex_init(&mp->m_growlock); - atomic_set(&mp->m_active_trans, 0); - - return mp; -} - -/* * Free up the resources associated with a mount structure. Assume that * the structure was initially zeroed, so we can tell which fields got * initialized. */ -void -xfs_mount_free( +STATIC void +xfs_free_perag( xfs_mount_t *mp) { if (mp->m_perag) { @@ -161,28 +136,9 @@ xfs_mount_free( for (agno = 0; agno < mp->m_maxagi; agno++) if (mp->m_perag[agno].pagb_list) - kmem_free(mp->m_perag[agno].pagb_list, - sizeof(xfs_perag_busy_t) * - XFS_PAGB_NUM_SLOTS); - kmem_free(mp->m_perag, - sizeof(xfs_perag_t) * mp->m_sb.sb_agcount); + kmem_free(mp->m_perag[agno].pagb_list); + kmem_free(mp->m_perag); } - - spinlock_destroy(&mp->m_ail_lock); - spinlock_destroy(&mp->m_sb_lock); - mutex_destroy(&mp->m_ilock); - mutex_destroy(&mp->m_growlock); - if (mp->m_quotainfo) - XFS_QM_DONE(mp); - - if (mp->m_fsname != NULL) - kmem_free(mp->m_fsname, mp->m_fsname_len); - if (mp->m_rtname != NULL) - kmem_free(mp->m_rtname, strlen(mp->m_rtname) + 1); - if (mp->m_logname != NULL) - kmem_free(mp->m_logname, strlen(mp->m_logname) + 1); - - xfs_icsb_destroy_counters(mp); } /* @@ -288,6 +244,19 @@ xfs_mount_validate_sb( return XFS_ERROR(EFSCORRUPTED); } + /* + * Until this is fixed only page-sized or smaller data blocks work. + */ + if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { + xfs_fs_mount_cmn_err(flags, + "file system with blocksize %d bytes", + sbp->sb_blocksize); + xfs_fs_mount_cmn_err(flags, + "only pagesize (%ld) or less will currently work.", + PAGE_SIZE); + return XFS_ERROR(ENOSYS); + } + if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { xfs_fs_mount_cmn_err(flags, @@ -309,19 +278,6 @@ xfs_mount_validate_sb( return XFS_ERROR(ENOSYS); } - /* - * Until this is fixed only page-sized or smaller data blocks work. - */ - if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { - xfs_fs_mount_cmn_err(flags, - "file system with blocksize %d bytes", - sbp->sb_blocksize); - xfs_fs_mount_cmn_err(flags, - "only pagesize (%ld) or less will currently work.", - PAGE_SIZE); - return XFS_ERROR(ENOSYS); - } - return 0; } @@ -734,11 +690,11 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount) * Update alignment values based on mount options and sb values */ STATIC int -xfs_update_alignment(xfs_mount_t *mp, int mfsi_flags, __uint64_t *update_flags) +xfs_update_alignment(xfs_mount_t *mp, __uint64_t *update_flags) { xfs_sb_t *sbp = &(mp->m_sb); - if (mp->m_dalign && !(mfsi_flags & XFS_MFSI_SECOND)) { + if (mp->m_dalign) { /* * If stripe unit and stripe width are not multiples * of the fs blocksize turn off alignment. @@ -894,7 +850,7 @@ xfs_set_inoalignment(xfs_mount_t *mp) * Check that the data (and log if separate) are an ok size. */ STATIC int -xfs_check_sizes(xfs_mount_t *mp, int mfsi_flags) +xfs_check_sizes(xfs_mount_t *mp) { xfs_buf_t *bp; xfs_daddr_t d; @@ -917,8 +873,7 @@ xfs_check_sizes(xfs_mount_t *mp, int mfsi_flags) return error; } - if (((mfsi_flags & XFS_MFSI_CLIENT) == 0) && - mp->m_logdev_targp != mp->m_ddev_targp) { + if (mp->m_logdev_targp != mp->m_ddev_targp) { d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { cmn_err(CE_WARN, "XFS: size check 3 failed"); @@ -953,15 +908,13 @@ xfs_check_sizes(xfs_mount_t *mp, int mfsi_flags) */ int xfs_mountfs( - xfs_mount_t *mp, - int mfsi_flags) + xfs_mount_t *mp) { xfs_sb_t *sbp = &(mp->m_sb); xfs_inode_t *rip; __uint64_t resblks; __int64_t update_flags = 0LL; uint quotamount, quotaflags; - int agno; int uuid_mounted = 0; int error = 0; @@ -994,9 +947,19 @@ xfs_mountfs( * Re-check for ATTR2 in case it was found in bad_features2 * slot. */ - if (xfs_sb_version_hasattr2(&mp->m_sb)) + if (xfs_sb_version_hasattr2(&mp->m_sb) && + !(mp->m_flags & XFS_MOUNT_NOATTR2)) mp->m_flags |= XFS_MOUNT_ATTR2; + } + + if (xfs_sb_version_hasattr2(&mp->m_sb) && + (mp->m_flags & XFS_MOUNT_NOATTR2)) { + xfs_sb_version_removeattr2(&mp->m_sb); + update_flags |= XFS_SB_FEATURES2; + /* update sb_versionnum for the clearing of the morebits */ + if (!sbp->sb_features2) + update_flags |= XFS_SB_VERSIONNUM; } /* @@ -1005,7 +968,7 @@ xfs_mountfs( * allocator alignment is within an ag, therefore ag has * to be aligned at stripe boundary. */ - error = xfs_update_alignment(mp, mfsi_flags, &update_flags); + error = xfs_update_alignment(mp, &update_flags); if (error) goto error1; @@ -1024,8 +987,7 @@ xfs_mountfs( * since a single partition filesystem is identical to a single * partition volume/filesystem. */ - if ((mfsi_flags & XFS_MFSI_SECOND) == 0 && - (mp->m_flags & XFS_MOUNT_NOUUID) == 0) { + if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) { if (xfs_uuid_mount(mp)) { error = XFS_ERROR(EINVAL); goto error1; @@ -1053,7 +1015,7 @@ xfs_mountfs( /* * Check that the data (and log if separate) are an ok size. */ - error = xfs_check_sizes(mp, mfsi_flags); + error = xfs_check_sizes(mp); if (error) goto error1; @@ -1067,13 +1029,6 @@ xfs_mountfs( } /* - * For client case we are done now - */ - if (mfsi_flags & XFS_MFSI_CLIENT) { - return 0; - } - - /* * Copies the low order bits of the timestamp and the randomly * set "sequence" number out of a UUID. */ @@ -1097,8 +1052,10 @@ xfs_mountfs( * Allocate and initialize the per-ag data. */ init_rwsem(&mp->m_peraglock); - mp->m_perag = - kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), KM_SLEEP); + mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), + KM_MAYFAIL); + if (!mp->m_perag) + goto error1; mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount); @@ -1210,7 +1167,7 @@ xfs_mountfs( * delayed until after the root and real-time bitmap inodes * were consistently read in. */ - error = xfs_log_mount_finish(mp, mfsi_flags); + error = xfs_log_mount_finish(mp); if (error) { cmn_err(CE_WARN, "XFS: log mount finish failed"); goto error4; @@ -1219,7 +1176,7 @@ xfs_mountfs( /* * Complete the quota initialisation, post-log-replay component. */ - error = XFS_QM_MOUNT(mp, quotamount, quotaflags, mfsi_flags); + error = XFS_QM_MOUNT(mp, quotamount, quotaflags); if (error) goto error4; @@ -1253,31 +1210,25 @@ xfs_mountfs( error3: xfs_log_unmount_dealloc(mp); error2: - for (agno = 0; agno < sbp->sb_agcount; agno++) - if (mp->m_perag[agno].pagb_list) - kmem_free(mp->m_perag[agno].pagb_list, - sizeof(xfs_perag_busy_t) * XFS_PAGB_NUM_SLOTS); - kmem_free(mp->m_perag, sbp->sb_agcount * sizeof(xfs_perag_t)); - mp->m_perag = NULL; - /* FALLTHROUGH */ + xfs_free_perag(mp); error1: if (uuid_mounted) - xfs_uuid_unmount(mp); - xfs_freesb(mp); + uuid_table_remove(&mp->m_sb.sb_uuid); return error; } /* - * xfs_unmountfs - * * This flushes out the inodes,dquots and the superblock, unmounts the * log and makes sure that incore structures are freed. */ -int -xfs_unmountfs(xfs_mount_t *mp, struct cred *cr) +void +xfs_unmountfs( + struct xfs_mount *mp) { - __uint64_t resblks; - int error = 0; + __uint64_t resblks; + int error; + + IRELE(mp->m_rootip); /* * We can potentially deadlock here if we have an inode cluster @@ -1334,32 +1285,20 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr) xfs_unmountfs_wait(mp); /* wait for async bufs */ xfs_log_unmount(mp); /* Done! No more fs ops. */ - xfs_freesb(mp); - /* * All inodes from this mount point should be freed. */ ASSERT(mp->m_inodes == NULL); - xfs_unmountfs_close(mp, cr); if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) - xfs_uuid_unmount(mp); + uuid_table_remove(&mp->m_sb.sb_uuid); -#if defined(DEBUG) || defined(INDUCE_IO_ERROR) +#if defined(DEBUG) xfs_errortag_clearall(mp, 0); #endif - xfs_mount_free(mp); - return 0; -} - -void -xfs_unmountfs_close(xfs_mount_t *mp, struct cred *cr) -{ - if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) - xfs_free_buftarg(mp->m_logdev_targp, 1); - if (mp->m_rtdev_targp) - xfs_free_buftarg(mp->m_rtdev_targp, 1); - xfs_free_buftarg(mp->m_ddev_targp, 0); + xfs_free_perag(mp); + if (mp->m_quotainfo) + XFS_QM_DONE(mp); } STATIC void @@ -1905,16 +1844,6 @@ xfs_uuid_mount( } /* - * Remove filesystem from the UUID table. - */ -STATIC void -xfs_uuid_unmount( - xfs_mount_t *mp) -{ - uuid_table_remove(&mp->m_sb.sb_uuid); -} - -/* * Used to log changes to the superblock unit and width fields which could * be altered by the mount options, as well as any potential sb_features2 * fixup. Only the first superblock is updated. @@ -1928,7 +1857,8 @@ xfs_mount_log_sb( int error; ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID | - XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2)); + XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2 | + XFS_SB_VERSIONNUM)); tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT); error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, @@ -2109,7 +2039,7 @@ xfs_icsb_reinit_counters( xfs_icsb_unlock(mp); } -STATIC void +void xfs_icsb_destroy_counters( xfs_mount_t *mp) { diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 63e0693a358a..f3c1024b1241 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -61,6 +61,7 @@ struct xfs_bmap_free; struct xfs_extdelta; struct xfs_swapext; struct xfs_mru_cache; +struct xfs_nameops; /* * Prototypes and functions for the Data Migration subsystem. @@ -113,7 +114,7 @@ struct xfs_dqtrxops; struct xfs_quotainfo; typedef int (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *); -typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint, int); +typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint); typedef int (*xfs_qmunmount_t)(struct xfs_mount *); typedef void (*xfs_qmdone_t)(struct xfs_mount *); typedef void (*xfs_dqrele_t)(struct xfs_dquot *); @@ -157,8 +158,8 @@ typedef struct xfs_qmops { #define XFS_QM_INIT(mp, mnt, fl) \ (*(mp)->m_qm_ops->xfs_qminit)(mp, mnt, fl) -#define XFS_QM_MOUNT(mp, mnt, fl, mfsi_flags) \ - (*(mp)->m_qm_ops->xfs_qmmount)(mp, mnt, fl, mfsi_flags) +#define XFS_QM_MOUNT(mp, mnt, fl) \ + (*(mp)->m_qm_ops->xfs_qmmount)(mp, mnt, fl) #define XFS_QM_UNMOUNT(mp) \ (*(mp)->m_qm_ops->xfs_qmunmount)(mp) #define XFS_QM_DONE(mp) \ @@ -210,12 +211,14 @@ typedef struct xfs_icsb_cnts { extern int xfs_icsb_init_counters(struct xfs_mount *); extern void xfs_icsb_reinit_counters(struct xfs_mount *); +extern void xfs_icsb_destroy_counters(struct xfs_mount *); extern void xfs_icsb_sync_counters(struct xfs_mount *, int); extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); #else -#define xfs_icsb_init_counters(mp) (0) -#define xfs_icsb_reinit_counters(mp) do { } while (0) +#define xfs_icsb_init_counters(mp) (0) +#define xfs_icsb_destroy_counters(mp) do { } while (0) +#define xfs_icsb_reinit_counters(mp) do { } while (0) #define xfs_icsb_sync_counters(mp, flags) do { } while (0) #define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) #endif @@ -313,6 +316,7 @@ typedef struct xfs_mount { __uint8_t m_inode_quiesce;/* call quiesce on new inodes. field governed by m_ilock */ __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ + const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */ int m_dirblksize; /* directory block sz--bytes */ int m_dirblkfsbs; /* directory block sz--fsbs */ xfs_dablk_t m_dirdatablk; /* blockno of dir data v2 */ @@ -378,6 +382,7 @@ typedef struct xfs_mount { counters */ #define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams allocator */ +#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ /* @@ -437,13 +442,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, /* * Flags for xfs_mountfs */ -#define XFS_MFSI_SECOND 0x01 /* Secondary mount -- skip stuff */ -#define XFS_MFSI_CLIENT 0x02 /* Is a client -- skip lots of stuff */ -/* XFS_MFSI_RRINODES */ -#define XFS_MFSI_NOUNLINK 0x08 /* Skip unlinked inode processing in */ - /* log recovery */ -#define XFS_MFSI_NO_QUOTACHECK 0x10 /* Skip quotacheck processing */ -/* XFS_MFSI_CONVERT_SUNIT */ #define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */ #define XFS_DADDR_TO_AGNO(mp,d) xfs_daddr_to_agno(mp,d) @@ -510,15 +508,12 @@ typedef struct xfs_mod_sb { #define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock)) #define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock)) -extern xfs_mount_t *xfs_mount_init(void); extern void xfs_mod_sb(xfs_trans_t *, __int64_t); extern int xfs_log_sbcount(xfs_mount_t *, uint); -extern void xfs_mount_free(xfs_mount_t *mp); -extern int xfs_mountfs(xfs_mount_t *mp, int); +extern int xfs_mountfs(xfs_mount_t *mp); extern void xfs_mountfs_check_barriers(xfs_mount_t *mp); -extern int xfs_unmountfs(xfs_mount_t *, struct cred *); -extern void xfs_unmountfs_close(xfs_mount_t *, struct cred *); +extern void xfs_unmountfs(xfs_mount_t *); extern int xfs_unmountfs_writesb(xfs_mount_t *); extern int xfs_unmount_flush(xfs_mount_t *, int); extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); @@ -544,9 +539,6 @@ extern void xfs_qmops_put(struct xfs_mount *); extern struct xfs_dmops xfs_dmcore_xfs; -extern int xfs_init(void); -extern void xfs_cleanup(void); - #endif /* __KERNEL__ */ #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index a0b2c0a2589a..afee7eb24323 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -307,15 +307,18 @@ xfs_mru_cache_init(void) xfs_mru_elem_zone = kmem_zone_init(sizeof(xfs_mru_cache_elem_t), "xfs_mru_cache_elem"); if (!xfs_mru_elem_zone) - return ENOMEM; + goto out; xfs_mru_reap_wq = create_singlethread_workqueue("xfs_mru_cache"); - if (!xfs_mru_reap_wq) { - kmem_zone_destroy(xfs_mru_elem_zone); - return ENOMEM; - } + if (!xfs_mru_reap_wq) + goto out_destroy_mru_elem_zone; return 0; + + out_destroy_mru_elem_zone: + kmem_zone_destroy(xfs_mru_elem_zone); + out: + return -ENOMEM; } void @@ -382,9 +385,9 @@ xfs_mru_cache_create( exit: if (err && mru && mru->lists) - kmem_free(mru->lists, mru->grp_count * sizeof(*mru->lists)); + kmem_free(mru->lists); if (err && mru) - kmem_free(mru, sizeof(*mru)); + kmem_free(mru); return err; } @@ -424,8 +427,8 @@ xfs_mru_cache_destroy( xfs_mru_cache_flush(mru); - kmem_free(mru->lists, mru->grp_count * sizeof(*mru->lists)); - kmem_free(mru, sizeof(*mru)); + kmem_free(mru->lists); + kmem_free(mru); } /* diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index d8063e1ad298..d700dacdb10e 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -336,22 +336,18 @@ xfs_rename( ASSERT(error != EEXIST); if (error) goto abort_return; - xfs_ichgtime(src_ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - - } else { - /* - * We always want to hit the ctime on the source inode. - * We do it in the if clause above for the 'new_parent && - * src_is_directory' case, and here we get all the other - * cases. This isn't strictly required by the standards - * since the source inode isn't really being changed, - * but old unix file systems did it and some incremental - * backup programs won't work without it. - */ - xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG); } /* + * We always want to hit the ctime on the source inode. + * + * This isn't strictly required by the standards since the source + * inode isn't really being changed, but old unix file systems did + * it and some incremental backup programs won't work without it. + */ + xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG); + + /* * Adjust the link count on src_dp. This is necessary when * renaming a directory, either within one parent when * the target existed, or across two parent directories. diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index a0dc6e5bc5b9..e2f68de16159 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -74,18 +74,6 @@ STATIC int xfs_rtmodify_summary(xfs_mount_t *, xfs_trans_t *, int, */ /* - * xfs_lowbit32: get low bit set out of 32-bit argument, -1 if none set. - */ -STATIC int -xfs_lowbit32( - __uint32_t v) -{ - if (v) - return ffs(v) - 1; - return -1; -} - -/* * Allocate space to the bitmap or summary file, and zero it, for growfs. */ STATIC int /* error */ @@ -450,6 +438,7 @@ xfs_rtallocate_extent_near( } bbno = XFS_BITTOBLOCK(mp, bno); i = 0; + ASSERT(minlen != 0); log2len = xfs_highbit32(minlen); /* * Loop over all bitmap blocks (bbno + i is current block). @@ -618,6 +607,8 @@ xfs_rtallocate_extent_size( xfs_suminfo_t sum; /* summary information for extents */ ASSERT(minlen % prod == 0 && maxlen % prod == 0); + ASSERT(maxlen != 0); + /* * Loop over all the levels starting with maxlen. * At each level, look at all the bitmap blocks, to see if there @@ -675,6 +666,9 @@ xfs_rtallocate_extent_size( *rtblock = NULLRTBLOCK; return 0; } + ASSERT(minlen != 0); + ASSERT(maxlen != 0); + /* * Loop over sizes, from maxlen down to minlen. * This time, when we do the allocations, allow smaller ones @@ -1961,6 +1955,7 @@ xfs_growfs_rt( nsbp->sb_blocksize * nsbp->sb_rextsize); nsbp->sb_rextents = nsbp->sb_rblocks; do_div(nsbp->sb_rextents, nsbp->sb_rextsize); + ASSERT(nsbp->sb_rextents != 0); nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents); nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1; nrsumsize = @@ -2062,7 +2057,7 @@ xfs_growfs_rt( /* * Free the fake mp structure. */ - kmem_free(nmp, sizeof(*nmp)); + kmem_free(nmp); return error; } diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index b0f31c09a76d..3a82576dde9a 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -314,7 +314,7 @@ xfs_bioerror_relse( * ASYNC buffers. */ XFS_BUF_ERROR(bp, EIO); - XFS_BUF_V_IODONESEMA(bp); + XFS_BUF_FINISH_IOWAIT(bp); } else { xfs_buf_relse(bp); } diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index d904efe7f871..3f8cf1587f4c 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -46,10 +46,12 @@ struct xfs_mount; #define XFS_SB_VERSION_SECTORBIT 0x0800 #define XFS_SB_VERSION_EXTFLGBIT 0x1000 #define XFS_SB_VERSION_DIRV2BIT 0x2000 +#define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */ #define XFS_SB_VERSION_MOREBITSBIT 0x8000 #define XFS_SB_VERSION_OKSASHFBITS \ (XFS_SB_VERSION_EXTFLGBIT | \ - XFS_SB_VERSION_DIRV2BIT) + XFS_SB_VERSION_DIRV2BIT | \ + XFS_SB_VERSION_BORGBIT) #define XFS_SB_VERSION_OKREALFBITS \ (XFS_SB_VERSION_ATTRBIT | \ XFS_SB_VERSION_NLINKBIT | \ @@ -437,6 +439,12 @@ static inline int xfs_sb_version_hassector(xfs_sb_t *sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_SECTORBIT); } +static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ + (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); +} + static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp) { return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ @@ -473,6 +481,13 @@ static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp) ((sbp)->sb_features2 | XFS_SB_VERSION2_ATTR2BIT))); } +static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp) +{ + sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; + if (!sbp->sb_features2) + sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; +} + /* * end of superblock version macros */ diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 140386434aa3..4e1c22a23be5 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -43,6 +43,7 @@ #include "xfs_quota.h" #include "xfs_trans_priv.h" #include "xfs_trans_space.h" +#include "xfs_inode_item.h" STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *); @@ -253,7 +254,7 @@ _xfs_trans_alloc( tp->t_mountp = mp; tp->t_items_free = XFS_LIC_NUM_SLOTS; tp->t_busy_free = XFS_LBC_NUM_SLOTS; - XFS_LIC_INIT(&(tp->t_items)); + xfs_lic_init(&(tp->t_items)); XFS_LBC_INIT(&(tp->t_busy)); return tp; } @@ -282,7 +283,7 @@ xfs_trans_dup( ntp->t_mountp = tp->t_mountp; ntp->t_items_free = XFS_LIC_NUM_SLOTS; ntp->t_busy_free = XFS_LBC_NUM_SLOTS; - XFS_LIC_INIT(&(ntp->t_items)); + xfs_lic_init(&(ntp->t_items)); XFS_LBC_INIT(&(ntp->t_busy)); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); @@ -889,7 +890,7 @@ shut_us_down: tp->t_commit_lsn = commit_lsn; if (nvec > XFS_TRANS_LOGVEC_COUNT) { - kmem_free(log_vector, nvec * sizeof(xfs_log_iovec_t)); + kmem_free(log_vector); } /* @@ -1169,7 +1170,7 @@ xfs_trans_cancel( while (licp != NULL) { lidp = licp->lic_descs; for (i = 0; i < licp->lic_unused; i++, lidp++) { - if (XFS_LIC_ISFREE(licp, i)) { + if (xfs_lic_isfree(licp, i)) { continue; } @@ -1216,6 +1217,68 @@ xfs_trans_free( kmem_zone_free(xfs_trans_zone, tp); } +/* + * Roll from one trans in the sequence of PERMANENT transactions to + * the next: permanent transactions are only flushed out when + * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want as soon + * as possible to let chunks of it go to the log. So we commit the + * chunk we've been working on and get a new transaction to continue. + */ +int +xfs_trans_roll( + struct xfs_trans **tpp, + struct xfs_inode *dp) +{ + struct xfs_trans *trans; + unsigned int logres, count; + int error; + + /* + * Ensure that the inode is always logged. + */ + trans = *tpp; + xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); + + /* + * Copy the critical parameters from one trans to the next. + */ + logres = trans->t_log_res; + count = trans->t_log_count; + *tpp = xfs_trans_dup(trans); + + /* + * Commit the current transaction. + * If this commit failed, then it'd just unlock those items that + * are not marked ihold. That also means that a filesystem shutdown + * is in progress. The caller takes the responsibility to cancel + * the duplicate transaction that gets returned. + */ + error = xfs_trans_commit(trans, 0); + if (error) + return (error); + + trans = *tpp; + + /* + * Reserve space in the log for th next transaction. + * This also pushes items in the "AIL", the list of logged items, + * out to disk if they are taking up space at the tail of the log + * that we want to use. This requires that either nothing be locked + * across this call, or that anything that is locked be logged in + * the prior and the next transactions. + */ + error = xfs_trans_reserve(trans, 0, logres, 0, + XFS_TRANS_PERM_LOG_RES, count); + /* + * Ensure that the inode is in the new transaction and locked. + */ + if (error) + return error; + + xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL); + xfs_trans_ihold(trans, dp); + return 0; +} /* * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item(). @@ -1253,7 +1316,7 @@ xfs_trans_committed( * Special case the chunk embedded in the transaction. */ licp = &(tp->t_items); - if (!(XFS_LIC_ARE_ALL_FREE(licp))) { + if (!(xfs_lic_are_all_free(licp))) { xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag); } @@ -1262,10 +1325,10 @@ xfs_trans_committed( */ licp = licp->lic_next; while (licp != NULL) { - ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); + ASSERT(!xfs_lic_are_all_free(licp)); xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag); next_licp = licp->lic_next; - kmem_free(licp, sizeof(xfs_log_item_chunk_t)); + kmem_free(licp); licp = next_licp; } @@ -1325,7 +1388,7 @@ xfs_trans_chunk_committed( lidp = licp->lic_descs; for (i = 0; i < licp->lic_unused; i++, lidp++) { - if (XFS_LIC_ISFREE(licp, i)) { + if (xfs_lic_isfree(licp, i)) { continue; } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 0804207c7391..74c80bd2b0ec 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -210,62 +210,52 @@ typedef struct xfs_log_item_chunk { * lic_unused to the right value (0 matches all free). The * lic_descs.lid_index values are set up as each desc is allocated. */ -#define XFS_LIC_INIT(cp) xfs_lic_init(cp) static inline void xfs_lic_init(xfs_log_item_chunk_t *cp) { cp->lic_free = XFS_LIC_FREEMASK; } -#define XFS_LIC_INIT_SLOT(cp,slot) xfs_lic_init_slot(cp, slot) static inline void xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot) { cp->lic_descs[slot].lid_index = (unsigned char)(slot); } -#define XFS_LIC_VACANCY(cp) xfs_lic_vacancy(cp) static inline int xfs_lic_vacancy(xfs_log_item_chunk_t *cp) { return cp->lic_free & XFS_LIC_FREEMASK; } -#define XFS_LIC_ALL_FREE(cp) xfs_lic_all_free(cp) static inline void xfs_lic_all_free(xfs_log_item_chunk_t *cp) { cp->lic_free = XFS_LIC_FREEMASK; } -#define XFS_LIC_ARE_ALL_FREE(cp) xfs_lic_are_all_free(cp) static inline int xfs_lic_are_all_free(xfs_log_item_chunk_t *cp) { return ((cp->lic_free & XFS_LIC_FREEMASK) == XFS_LIC_FREEMASK); } -#define XFS_LIC_ISFREE(cp,slot) xfs_lic_isfree(cp,slot) static inline int xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot) { return (cp->lic_free & (1 << slot)); } -#define XFS_LIC_CLAIM(cp,slot) xfs_lic_claim(cp,slot) static inline void xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot) { cp->lic_free &= ~(1 << slot); } -#define XFS_LIC_RELSE(cp,slot) xfs_lic_relse(cp,slot) static inline void xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot) { cp->lic_free |= 1 << slot; } -#define XFS_LIC_SLOT(cp,slot) xfs_lic_slot(cp,slot) static inline xfs_log_item_desc_t * xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot) { return &(cp->lic_descs[slot]); } -#define XFS_LIC_DESC_TO_SLOT(dp) xfs_lic_desc_to_slot(dp) static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp) { return (uint)dp->lid_index; @@ -278,7 +268,6 @@ static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp) * All of this yields the address of the chunk, which is * cast to a chunk pointer. */ -#define XFS_LIC_DESC_TO_CHUNK(dp) xfs_lic_desc_to_chunk(dp) static inline xfs_log_item_chunk_t * xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) { @@ -986,6 +975,7 @@ int _xfs_trans_commit(xfs_trans_t *, int *); #define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL) void xfs_trans_cancel(xfs_trans_t *, int); +int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); int xfs_trans_ail_init(struct xfs_mount *); void xfs_trans_ail_destroy(struct xfs_mount *); void xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index cb0c5839154b..4e855b5ced66 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -1021,16 +1021,16 @@ xfs_trans_buf_item_match( bp = NULL; len = BBTOB(len); licp = &tp->t_items; - if (!XFS_LIC_ARE_ALL_FREE(licp)) { + if (!xfs_lic_are_all_free(licp)) { for (i = 0; i < licp->lic_unused; i++) { /* * Skip unoccupied slots. */ - if (XFS_LIC_ISFREE(licp, i)) { + if (xfs_lic_isfree(licp, i)) { continue; } - lidp = XFS_LIC_SLOT(licp, i); + lidp = xfs_lic_slot(licp, i); blip = (xfs_buf_log_item_t *)lidp->lid_item; if (blip->bli_item.li_type != XFS_LI_BUF) { continue; @@ -1074,7 +1074,7 @@ xfs_trans_buf_item_match_all( bp = NULL; len = BBTOB(len); for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) { - if (XFS_LIC_ARE_ALL_FREE(licp)) { + if (xfs_lic_are_all_free(licp)) { ASSERT(licp == &tp->t_items); ASSERT(licp->lic_next == NULL); return NULL; @@ -1083,11 +1083,11 @@ xfs_trans_buf_item_match_all( /* * Skip unoccupied slots. */ - if (XFS_LIC_ISFREE(licp, i)) { + if (xfs_lic_isfree(licp, i)) { continue; } - lidp = XFS_LIC_SLOT(licp, i); + lidp = xfs_lic_slot(licp, i); blip = (xfs_buf_log_item_t *)lidp->lid_item; if (blip->bli_item.li_type != XFS_LI_BUF) { continue; diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 4c70bf5e9985..2a1c0f071f91 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -291,7 +291,7 @@ xfs_trans_inode_broot_debug( iip = ip->i_itemp; if (iip->ili_root_size != 0) { ASSERT(iip->ili_orig_root != NULL); - kmem_free(iip->ili_orig_root, iip->ili_root_size); + kmem_free(iip->ili_orig_root); iip->ili_root_size = 0; iip->ili_orig_root = NULL; } diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c index 66a09f0d894b..3c666e8317f8 100644 --- a/fs/xfs/xfs_trans_item.c +++ b/fs/xfs/xfs_trans_item.c @@ -53,11 +53,11 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip) * Initialize the chunk, and then * claim the first slot in the newly allocated chunk. */ - XFS_LIC_INIT(licp); - XFS_LIC_CLAIM(licp, 0); + xfs_lic_init(licp); + xfs_lic_claim(licp, 0); licp->lic_unused = 1; - XFS_LIC_INIT_SLOT(licp, 0); - lidp = XFS_LIC_SLOT(licp, 0); + xfs_lic_init_slot(licp, 0); + lidp = xfs_lic_slot(licp, 0); /* * Link in the new chunk and update the free count. @@ -88,14 +88,14 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip) */ licp = &tp->t_items; while (licp != NULL) { - if (XFS_LIC_VACANCY(licp)) { + if (xfs_lic_vacancy(licp)) { if (licp->lic_unused <= XFS_LIC_MAX_SLOT) { i = licp->lic_unused; - ASSERT(XFS_LIC_ISFREE(licp, i)); + ASSERT(xfs_lic_isfree(licp, i)); break; } for (i = 0; i <= XFS_LIC_MAX_SLOT; i++) { - if (XFS_LIC_ISFREE(licp, i)) + if (xfs_lic_isfree(licp, i)) break; } ASSERT(i <= XFS_LIC_MAX_SLOT); @@ -108,12 +108,12 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip) * If we find a free descriptor, claim it, * initialize it, and return it. */ - XFS_LIC_CLAIM(licp, i); + xfs_lic_claim(licp, i); if (licp->lic_unused <= i) { licp->lic_unused = i + 1; - XFS_LIC_INIT_SLOT(licp, i); + xfs_lic_init_slot(licp, i); } - lidp = XFS_LIC_SLOT(licp, i); + lidp = xfs_lic_slot(licp, i); tp->t_items_free--; lidp->lid_item = lip; lidp->lid_flags = 0; @@ -136,9 +136,9 @@ xfs_trans_free_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) xfs_log_item_chunk_t *licp; xfs_log_item_chunk_t **licpp; - slot = XFS_LIC_DESC_TO_SLOT(lidp); - licp = XFS_LIC_DESC_TO_CHUNK(lidp); - XFS_LIC_RELSE(licp, slot); + slot = xfs_lic_desc_to_slot(lidp); + licp = xfs_lic_desc_to_chunk(lidp); + xfs_lic_relse(licp, slot); lidp->lid_item->li_desc = NULL; tp->t_items_free++; @@ -154,14 +154,14 @@ xfs_trans_free_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) * Also decrement the transaction structure's count of free items * by the number in a chunk since we are freeing an empty chunk. */ - if (XFS_LIC_ARE_ALL_FREE(licp) && (licp != &(tp->t_items))) { + if (xfs_lic_are_all_free(licp) && (licp != &(tp->t_items))) { licpp = &(tp->t_items.lic_next); while (*licpp != licp) { ASSERT(*licpp != NULL); licpp = &((*licpp)->lic_next); } *licpp = licp->lic_next; - kmem_free(licp, sizeof(xfs_log_item_chunk_t)); + kmem_free(licp); tp->t_items_free -= XFS_LIC_NUM_SLOTS; } } @@ -207,20 +207,20 @@ xfs_trans_first_item(xfs_trans_t *tp) /* * If it's not in the first chunk, skip to the second. */ - if (XFS_LIC_ARE_ALL_FREE(licp)) { + if (xfs_lic_are_all_free(licp)) { licp = licp->lic_next; } /* * Return the first non-free descriptor in the chunk. */ - ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); + ASSERT(!xfs_lic_are_all_free(licp)); for (i = 0; i < licp->lic_unused; i++) { - if (XFS_LIC_ISFREE(licp, i)) { + if (xfs_lic_isfree(licp, i)) { continue; } - return XFS_LIC_SLOT(licp, i); + return xfs_lic_slot(licp, i); } cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item"); return NULL; @@ -242,18 +242,18 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) xfs_log_item_chunk_t *licp; int i; - licp = XFS_LIC_DESC_TO_CHUNK(lidp); + licp = xfs_lic_desc_to_chunk(lidp); /* * First search the rest of the chunk. The for loop keeps us * from referencing things beyond the end of the chunk. */ - for (i = (int)XFS_LIC_DESC_TO_SLOT(lidp) + 1; i < licp->lic_unused; i++) { - if (XFS_LIC_ISFREE(licp, i)) { + for (i = (int)xfs_lic_desc_to_slot(lidp) + 1; i < licp->lic_unused; i++) { + if (xfs_lic_isfree(licp, i)) { continue; } - return XFS_LIC_SLOT(licp, i); + return xfs_lic_slot(licp, i); } /* @@ -266,13 +266,13 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) } licp = licp->lic_next; - ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); + ASSERT(!xfs_lic_are_all_free(licp)); for (i = 0; i < licp->lic_unused; i++) { - if (XFS_LIC_ISFREE(licp, i)) { + if (xfs_lic_isfree(licp, i)) { continue; } - return XFS_LIC_SLOT(licp, i); + return xfs_lic_slot(licp, i); } ASSERT(0); /* NOTREACHED */ @@ -300,9 +300,9 @@ xfs_trans_free_items( /* * Special case the embedded chunk so we don't free it below. */ - if (!XFS_LIC_ARE_ALL_FREE(licp)) { + if (!xfs_lic_are_all_free(licp)) { (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); - XFS_LIC_ALL_FREE(licp); + xfs_lic_all_free(licp); licp->lic_unused = 0; } licp = licp->lic_next; @@ -311,10 +311,10 @@ xfs_trans_free_items( * Unlock each item in each chunk and free the chunks. */ while (licp != NULL) { - ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); + ASSERT(!xfs_lic_are_all_free(licp)); (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); next_licp = licp->lic_next; - kmem_free(licp, sizeof(xfs_log_item_chunk_t)); + kmem_free(licp); licp = next_licp; } @@ -347,7 +347,7 @@ xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn) /* * Special case the embedded chunk so we don't free. */ - if (!XFS_LIC_ARE_ALL_FREE(licp)) { + if (!xfs_lic_are_all_free(licp)) { freed = xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn); } licpp = &(tp->t_items.lic_next); @@ -358,12 +358,12 @@ xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn) * and free empty chunks. */ while (licp != NULL) { - ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); + ASSERT(!xfs_lic_are_all_free(licp)); freed += xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn); next_licp = licp->lic_next; - if (XFS_LIC_ARE_ALL_FREE(licp)) { + if (xfs_lic_are_all_free(licp)) { *licpp = next_licp; - kmem_free(licp, sizeof(xfs_log_item_chunk_t)); + kmem_free(licp); freed -= XFS_LIC_NUM_SLOTS; } else { licpp = &(licp->lic_next); @@ -402,7 +402,7 @@ xfs_trans_unlock_chunk( freed = 0; lidp = licp->lic_descs; for (i = 0; i < licp->lic_unused; i++, lidp++) { - if (XFS_LIC_ISFREE(licp, i)) { + if (xfs_lic_isfree(licp, i)) { continue; } lip = lidp->lid_item; @@ -421,7 +421,7 @@ xfs_trans_unlock_chunk( */ if (!(freeing_chunk) && (!(lidp->lid_flags & XFS_LID_DIRTY) || abort)) { - XFS_LIC_RELSE(licp, i); + xfs_lic_relse(licp, i); freed++; } } @@ -530,7 +530,7 @@ xfs_trans_free_busy(xfs_trans_t *tp) lbcp = tp->t_busy.lbc_next; while (lbcp != NULL) { lbcq = lbcp->lbc_next; - kmem_free(lbcp, sizeof(xfs_log_busy_chunk_t)); + kmem_free(lbcp); lbcp = lbcq; } diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index 98e5f110ba5f..35d4d414bcc2 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c @@ -237,7 +237,7 @@ xfs_droplink( ASSERT (ip->i_d.di_nlink > 0); ip->i_d.di_nlink--; - drop_nlink(ip->i_vnode); + drop_nlink(VFS_I(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = 0; @@ -301,7 +301,7 @@ xfs_bumplink( ASSERT(ip->i_d.di_nlink > 0); ip->i_d.di_nlink++; - inc_nlink(ip->i_vnode); + inc_nlink(VFS_I(ip)); if ((ip->i_d.di_version == XFS_DINODE_VERSION_1) && (ip->i_d.di_nlink > XFS_MAXLINK_1)) { /* diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h index f316cb85d8e2..ef321225d269 100644 --- a/fs/xfs/xfs_utils.h +++ b/fs/xfs/xfs_utils.h @@ -18,9 +18,6 @@ #ifndef __XFS_UTILS_H__ #define __XFS_UTILS_H__ -#define IRELE(ip) VN_RELE(XFS_ITOV(ip)) -#define IHOLD(ip) VN_HOLD(XFS_ITOV(ip)) - extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *); extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, xfs_dev_t, cred_t *, prid_t, int, diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index 30bacd8bb0e5..439dd3939dda 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -58,586 +58,6 @@ #include "xfs_utils.h" -int __init -xfs_init(void) -{ -#ifdef XFS_DABUF_DEBUG - extern spinlock_t xfs_dabuf_global_lock; - spin_lock_init(&xfs_dabuf_global_lock); -#endif - - /* - * Initialize all of the zone allocators we use. - */ - xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t), - "xfs_log_ticket"); - xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), - "xfs_bmap_free_item"); - xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t), - "xfs_btree_cur"); - xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t), - "xfs_da_state"); - xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf"); - xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork"); - xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans"); - xfs_acl_zone_init(xfs_acl_zone, "xfs_acl"); - xfs_mru_cache_init(); - xfs_filestream_init(); - - /* - * The size of the zone allocated buf log item is the maximum - * size possible under XFS. This wastes a little bit of memory, - * but it is much faster. - */ - xfs_buf_item_zone = - kmem_zone_init((sizeof(xfs_buf_log_item_t) + - (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) / - NBWORD) * sizeof(int))), - "xfs_buf_item"); - xfs_efd_zone = - kmem_zone_init((sizeof(xfs_efd_log_item_t) + - ((XFS_EFD_MAX_FAST_EXTENTS - 1) * - sizeof(xfs_extent_t))), - "xfs_efd_item"); - xfs_efi_zone = - kmem_zone_init((sizeof(xfs_efi_log_item_t) + - ((XFS_EFI_MAX_FAST_EXTENTS - 1) * - sizeof(xfs_extent_t))), - "xfs_efi_item"); - - /* - * These zones warrant special memory allocator hints - */ - xfs_inode_zone = - kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", - KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | - KM_ZONE_SPREAD, NULL); - xfs_ili_zone = - kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili", - KM_ZONE_SPREAD, NULL); - - /* - * Allocate global trace buffers. - */ -#ifdef XFS_ALLOC_TRACE - xfs_alloc_trace_buf = ktrace_alloc(XFS_ALLOC_TRACE_SIZE, KM_SLEEP); -#endif -#ifdef XFS_BMAP_TRACE - xfs_bmap_trace_buf = ktrace_alloc(XFS_BMAP_TRACE_SIZE, KM_SLEEP); -#endif -#ifdef XFS_BMBT_TRACE - xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_SLEEP); -#endif -#ifdef XFS_ATTR_TRACE - xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_SLEEP); -#endif -#ifdef XFS_DIR2_TRACE - xfs_dir2_trace_buf = ktrace_alloc(XFS_DIR2_GTRACE_SIZE, KM_SLEEP); -#endif - - xfs_dir_startup(); - -#if (defined(DEBUG) || defined(INDUCE_IO_ERROR)) - xfs_error_test_init(); -#endif /* DEBUG || INDUCE_IO_ERROR */ - - xfs_init_procfs(); - xfs_sysctl_register(); - return 0; -} - -void __exit -xfs_cleanup(void) -{ - extern kmem_zone_t *xfs_inode_zone; - extern kmem_zone_t *xfs_efd_zone; - extern kmem_zone_t *xfs_efi_zone; - - xfs_cleanup_procfs(); - xfs_sysctl_unregister(); - xfs_filestream_uninit(); - xfs_mru_cache_uninit(); - xfs_acl_zone_destroy(xfs_acl_zone); - -#ifdef XFS_DIR2_TRACE - ktrace_free(xfs_dir2_trace_buf); -#endif -#ifdef XFS_ATTR_TRACE - ktrace_free(xfs_attr_trace_buf); -#endif -#ifdef XFS_BMBT_TRACE - ktrace_free(xfs_bmbt_trace_buf); -#endif -#ifdef XFS_BMAP_TRACE - ktrace_free(xfs_bmap_trace_buf); -#endif -#ifdef XFS_ALLOC_TRACE - ktrace_free(xfs_alloc_trace_buf); -#endif - - kmem_zone_destroy(xfs_bmap_free_item_zone); - kmem_zone_destroy(xfs_btree_cur_zone); - kmem_zone_destroy(xfs_inode_zone); - kmem_zone_destroy(xfs_trans_zone); - kmem_zone_destroy(xfs_da_state_zone); - kmem_zone_destroy(xfs_dabuf_zone); - kmem_zone_destroy(xfs_buf_item_zone); - kmem_zone_destroy(xfs_efd_zone); - kmem_zone_destroy(xfs_efi_zone); - kmem_zone_destroy(xfs_ifork_zone); - kmem_zone_destroy(xfs_ili_zone); - kmem_zone_destroy(xfs_log_ticket_zone); -} - -/* - * xfs_start_flags - * - * This function fills in xfs_mount_t fields based on mount args. - * Note: the superblock has _not_ yet been read in. - */ -STATIC int -xfs_start_flags( - struct xfs_mount_args *ap, - struct xfs_mount *mp) -{ - /* Values are in BBs */ - if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) { - /* - * At this point the superblock has not been read - * in, therefore we do not know the block size. - * Before the mount call ends we will convert - * these to FSBs. - */ - mp->m_dalign = ap->sunit; - mp->m_swidth = ap->swidth; - } - - if (ap->logbufs != -1 && - ap->logbufs != 0 && - (ap->logbufs < XLOG_MIN_ICLOGS || - ap->logbufs > XLOG_MAX_ICLOGS)) { - cmn_err(CE_WARN, - "XFS: invalid logbufs value: %d [not %d-%d]", - ap->logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); - return XFS_ERROR(EINVAL); - } - mp->m_logbufs = ap->logbufs; - if (ap->logbufsize != -1 && - ap->logbufsize != 0 && - (ap->logbufsize < XLOG_MIN_RECORD_BSIZE || - ap->logbufsize > XLOG_MAX_RECORD_BSIZE || - !is_power_of_2(ap->logbufsize))) { - cmn_err(CE_WARN, - "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", - ap->logbufsize); - return XFS_ERROR(EINVAL); - } - mp->m_logbsize = ap->logbufsize; - mp->m_fsname_len = strlen(ap->fsname) + 1; - mp->m_fsname = kmem_alloc(mp->m_fsname_len, KM_SLEEP); - strcpy(mp->m_fsname, ap->fsname); - if (ap->rtname[0]) { - mp->m_rtname = kmem_alloc(strlen(ap->rtname) + 1, KM_SLEEP); - strcpy(mp->m_rtname, ap->rtname); - } - if (ap->logname[0]) { - mp->m_logname = kmem_alloc(strlen(ap->logname) + 1, KM_SLEEP); - strcpy(mp->m_logname, ap->logname); - } - - if (ap->flags & XFSMNT_WSYNC) - mp->m_flags |= XFS_MOUNT_WSYNC; -#if XFS_BIG_INUMS - if (ap->flags & XFSMNT_INO64) { - mp->m_flags |= XFS_MOUNT_INO64; - mp->m_inoadd = XFS_INO64_OFFSET; - } -#endif - if (ap->flags & XFSMNT_RETERR) - mp->m_flags |= XFS_MOUNT_RETERR; - if (ap->flags & XFSMNT_NOALIGN) - mp->m_flags |= XFS_MOUNT_NOALIGN; - if (ap->flags & XFSMNT_SWALLOC) - mp->m_flags |= XFS_MOUNT_SWALLOC; - if (ap->flags & XFSMNT_OSYNCISOSYNC) - mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC; - if (ap->flags & XFSMNT_32BITINODES) - mp->m_flags |= XFS_MOUNT_32BITINODES; - - if (ap->flags & XFSMNT_IOSIZE) { - if (ap->iosizelog > XFS_MAX_IO_LOG || - ap->iosizelog < XFS_MIN_IO_LOG) { - cmn_err(CE_WARN, - "XFS: invalid log iosize: %d [not %d-%d]", - ap->iosizelog, XFS_MIN_IO_LOG, - XFS_MAX_IO_LOG); - return XFS_ERROR(EINVAL); - } - - mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE; - mp->m_readio_log = mp->m_writeio_log = ap->iosizelog; - } - - if (ap->flags & XFSMNT_IKEEP) - mp->m_flags |= XFS_MOUNT_IKEEP; - if (ap->flags & XFSMNT_DIRSYNC) - mp->m_flags |= XFS_MOUNT_DIRSYNC; - if (ap->flags & XFSMNT_ATTR2) - mp->m_flags |= XFS_MOUNT_ATTR2; - - if (ap->flags2 & XFSMNT2_COMPAT_IOSIZE) - mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; - - /* - * no recovery flag requires a read-only mount - */ - if (ap->flags & XFSMNT_NORECOVERY) { - if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { - cmn_err(CE_WARN, - "XFS: tried to mount a FS read-write without recovery!"); - return XFS_ERROR(EINVAL); - } - mp->m_flags |= XFS_MOUNT_NORECOVERY; - } - - if (ap->flags & XFSMNT_NOUUID) - mp->m_flags |= XFS_MOUNT_NOUUID; - if (ap->flags & XFSMNT_BARRIER) - mp->m_flags |= XFS_MOUNT_BARRIER; - else - mp->m_flags &= ~XFS_MOUNT_BARRIER; - - if (ap->flags2 & XFSMNT2_FILESTREAMS) - mp->m_flags |= XFS_MOUNT_FILESTREAMS; - - if (ap->flags & XFSMNT_DMAPI) - mp->m_flags |= XFS_MOUNT_DMAPI; - return 0; -} - -/* - * This function fills in xfs_mount_t fields based on mount args. - * Note: the superblock _has_ now been read in. - */ -STATIC int -xfs_finish_flags( - struct xfs_mount_args *ap, - struct xfs_mount *mp) -{ - int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); - - /* Fail a mount where the logbuf is smaller then the log stripe */ - if (xfs_sb_version_haslogv2(&mp->m_sb)) { - if ((ap->logbufsize <= 0) && - (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) { - mp->m_logbsize = mp->m_sb.sb_logsunit; - } else if (ap->logbufsize > 0 && - ap->logbufsize < mp->m_sb.sb_logsunit) { - cmn_err(CE_WARN, - "XFS: logbuf size must be greater than or equal to log stripe size"); - return XFS_ERROR(EINVAL); - } - } else { - /* Fail a mount if the logbuf is larger than 32K */ - if (ap->logbufsize > XLOG_BIG_RECORD_BSIZE) { - cmn_err(CE_WARN, - "XFS: logbuf size for version 1 logs must be 16K or 32K"); - return XFS_ERROR(EINVAL); - } - } - - if (xfs_sb_version_hasattr2(&mp->m_sb)) - mp->m_flags |= XFS_MOUNT_ATTR2; - - /* - * prohibit r/w mounts of read-only filesystems - */ - if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { - cmn_err(CE_WARN, - "XFS: cannot mount a read-only filesystem as read-write"); - return XFS_ERROR(EROFS); - } - - /* - * check for shared mount. - */ - if (ap->flags & XFSMNT_SHARED) { - if (!xfs_sb_version_hasshared(&mp->m_sb)) - return XFS_ERROR(EINVAL); - - /* - * For IRIX 6.5, shared mounts must have the shared - * version bit set, have the persistent readonly - * field set, must be version 0 and can only be mounted - * read-only. - */ - if (!ronly || !(mp->m_sb.sb_flags & XFS_SBF_READONLY) || - (mp->m_sb.sb_shared_vn != 0)) - return XFS_ERROR(EINVAL); - - mp->m_flags |= XFS_MOUNT_SHARED; - - /* - * Shared XFS V0 can't deal with DMI. Return EINVAL. - */ - if (mp->m_sb.sb_shared_vn == 0 && (ap->flags & XFSMNT_DMAPI)) - return XFS_ERROR(EINVAL); - } - - if (ap->flags & XFSMNT_UQUOTA) { - mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); - if (ap->flags & XFSMNT_UQUOTAENF) - mp->m_qflags |= XFS_UQUOTA_ENFD; - } - - if (ap->flags & XFSMNT_GQUOTA) { - mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); - if (ap->flags & XFSMNT_GQUOTAENF) - mp->m_qflags |= XFS_OQUOTA_ENFD; - } else if (ap->flags & XFSMNT_PQUOTA) { - mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); - if (ap->flags & XFSMNT_PQUOTAENF) - mp->m_qflags |= XFS_OQUOTA_ENFD; - } - - return 0; -} - -/* - * xfs_mount - * - * The file system configurations are: - * (1) device (partition) with data and internal log - * (2) logical volume with data and log subvolumes. - * (3) logical volume with data, log, and realtime subvolumes. - * - * We only have to handle opening the log and realtime volumes here if - * they are present. The data subvolume has already been opened by - * get_sb_bdev() and is stored in vfsp->vfs_super->s_bdev. - */ -int -xfs_mount( - struct xfs_mount *mp, - struct xfs_mount_args *args, - cred_t *credp) -{ - struct block_device *ddev, *logdev, *rtdev; - int flags = 0, error; - - ddev = mp->m_super->s_bdev; - logdev = rtdev = NULL; - - error = xfs_dmops_get(mp, args); - if (error) - return error; - error = xfs_qmops_get(mp, args); - if (error) - return error; - - if (args->flags & XFSMNT_QUIET) - flags |= XFS_MFSI_QUIET; - - /* - * Open real time and log devices - order is important. - */ - if (args->logname[0]) { - error = xfs_blkdev_get(mp, args->logname, &logdev); - if (error) - return error; - } - if (args->rtname[0]) { - error = xfs_blkdev_get(mp, args->rtname, &rtdev); - if (error) { - xfs_blkdev_put(logdev); - return error; - } - - if (rtdev == ddev || rtdev == logdev) { - cmn_err(CE_WARN, - "XFS: Cannot mount filesystem with identical rtdev and ddev/logdev."); - xfs_blkdev_put(logdev); - xfs_blkdev_put(rtdev); - return EINVAL; - } - } - - /* - * Setup xfs_mount buffer target pointers - */ - error = ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0); - if (!mp->m_ddev_targp) { - xfs_blkdev_put(logdev); - xfs_blkdev_put(rtdev); - return error; - } - if (rtdev) { - mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1); - if (!mp->m_rtdev_targp) { - xfs_blkdev_put(logdev); - xfs_blkdev_put(rtdev); - goto error0; - } - } - mp->m_logdev_targp = (logdev && logdev != ddev) ? - xfs_alloc_buftarg(logdev, 1) : mp->m_ddev_targp; - if (!mp->m_logdev_targp) { - xfs_blkdev_put(logdev); - xfs_blkdev_put(rtdev); - goto error0; - } - - /* - * Setup flags based on mount(2) options and then the superblock - */ - error = xfs_start_flags(args, mp); - if (error) - goto error1; - error = xfs_readsb(mp, flags); - if (error) - goto error1; - error = xfs_finish_flags(args, mp); - if (error) - goto error2; - - /* - * Setup xfs_mount buffer target pointers based on superblock - */ - error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize, - mp->m_sb.sb_sectsize); - if (!error && logdev && logdev != ddev) { - unsigned int log_sector_size = BBSIZE; - - if (xfs_sb_version_hassector(&mp->m_sb)) - log_sector_size = mp->m_sb.sb_logsectsize; - error = xfs_setsize_buftarg(mp->m_logdev_targp, - mp->m_sb.sb_blocksize, - log_sector_size); - } - if (!error && rtdev) - error = xfs_setsize_buftarg(mp->m_rtdev_targp, - mp->m_sb.sb_blocksize, - mp->m_sb.sb_sectsize); - if (error) - goto error2; - - if (mp->m_flags & XFS_MOUNT_BARRIER) - xfs_mountfs_check_barriers(mp); - - if ((error = xfs_filestream_mount(mp))) - goto error2; - - error = xfs_mountfs(mp, flags); - if (error) - goto error2; - - XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, args->mtpt, args->fsname); - - return 0; - -error2: - if (mp->m_sb_bp) - xfs_freesb(mp); -error1: - xfs_binval(mp->m_ddev_targp); - if (logdev && logdev != ddev) - xfs_binval(mp->m_logdev_targp); - if (rtdev) - xfs_binval(mp->m_rtdev_targp); -error0: - xfs_unmountfs_close(mp, credp); - xfs_qmops_put(mp); - xfs_dmops_put(mp); - return error; -} - -int -xfs_unmount( - xfs_mount_t *mp, - int flags, - cred_t *credp) -{ - xfs_inode_t *rip; - bhv_vnode_t *rvp; - int unmount_event_wanted = 0; - int unmount_event_flags = 0; - int xfs_unmountfs_needed = 0; - int error; - - rip = mp->m_rootip; - rvp = XFS_ITOV(rip); - -#ifdef HAVE_DMAPI - if (mp->m_flags & XFS_MOUNT_DMAPI) { - error = XFS_SEND_PREUNMOUNT(mp, - rip, DM_RIGHT_NULL, rip, DM_RIGHT_NULL, - NULL, NULL, 0, 0, - (mp->m_dmevmask & (1<<DM_EVENT_PREUNMOUNT))? - 0:DM_FLAGS_UNWANTED); - if (error) - return XFS_ERROR(error); - unmount_event_wanted = 1; - unmount_event_flags = (mp->m_dmevmask & (1<<DM_EVENT_UNMOUNT))? - 0 : DM_FLAGS_UNWANTED; - } -#endif - - /* - * Blow away any referenced inode in the filestreams cache. - * This can and will cause log traffic as inodes go inactive - * here. - */ - xfs_filestream_unmount(mp); - - XFS_bflush(mp->m_ddev_targp); - error = xfs_unmount_flush(mp, 0); - if (error) - goto out; - - ASSERT(vn_count(rvp) == 1); - - /* - * Drop the reference count - */ - IRELE(rip); - - /* - * If we're forcing a shutdown, typically because of a media error, - * we want to make sure we invalidate dirty pages that belong to - * referenced vnodes as well. - */ - if (XFS_FORCED_SHUTDOWN(mp)) { - error = xfs_sync(mp, SYNC_WAIT | SYNC_CLOSE); - ASSERT(error != EFSCORRUPTED); - } - xfs_unmountfs_needed = 1; - -out: - /* Send DMAPI event, if required. - * Then do xfs_unmountfs() if needed. - * Then return error (or zero). - */ - if (unmount_event_wanted) { - /* Note: mp structure must still exist for - * XFS_SEND_UNMOUNT() call. - */ - XFS_SEND_UNMOUNT(mp, error == 0 ? rip : NULL, - DM_RIGHT_NULL, 0, error, unmount_event_flags); - } - if (xfs_unmountfs_needed) { - /* - * Call common unmount function to flush to disk - * and free the super block buffer & mount structures. - */ - xfs_unmountfs(mp, credp); - xfs_qmops_put(mp); - xfs_dmops_put(mp); - kmem_free(mp, sizeof(xfs_mount_t)); - } - - return XFS_ERROR(error); -} - STATIC void xfs_quiesce_fs( xfs_mount_t *mp) @@ -694,30 +114,6 @@ xfs_attr_quiesce( xfs_unmountfs_writesb(mp); } -int -xfs_mntupdate( - struct xfs_mount *mp, - int *flags, - struct xfs_mount_args *args) -{ - if (!(*flags & MS_RDONLY)) { /* rw/ro -> rw */ - if (mp->m_flags & XFS_MOUNT_RDONLY) - mp->m_flags &= ~XFS_MOUNT_RDONLY; - if (args->flags & XFSMNT_BARRIER) { - mp->m_flags |= XFS_MOUNT_BARRIER; - xfs_mountfs_check_barriers(mp); - } else { - mp->m_flags &= ~XFS_MOUNT_BARRIER; - } - } else if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { /* rw -> ro */ - xfs_filestream_flush(mp); - xfs_sync(mp, SYNC_DATA_QUIESCE); - xfs_attr_quiesce(mp); - mp->m_flags |= XFS_MOUNT_RDONLY; - } - return 0; -} - /* * xfs_unmount_flush implements a set of flush operation on special * inodes, which are needed as a separate set of operations so that @@ -732,7 +128,6 @@ xfs_unmount_flush( xfs_inode_t *rip = mp->m_rootip; xfs_inode_t *rbmip; xfs_inode_t *rsumip = NULL; - bhv_vnode_t *rvp = XFS_ITOV(rip); int error; xfs_ilock(rip, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); @@ -750,7 +145,7 @@ xfs_unmount_flush( if (error == EFSCORRUPTED) goto fscorrupt_out; - ASSERT(vn_count(XFS_ITOV(rbmip)) == 1); + ASSERT(vn_count(VFS_I(rbmip)) == 1); rsumip = mp->m_rsumip; xfs_ilock(rsumip, XFS_ILOCK_EXCL); @@ -761,7 +156,7 @@ xfs_unmount_flush( if (error == EFSCORRUPTED) goto fscorrupt_out; - ASSERT(vn_count(XFS_ITOV(rsumip)) == 1); + ASSERT(vn_count(VFS_I(rsumip)) == 1); } /* @@ -771,7 +166,7 @@ xfs_unmount_flush( if (error == EFSCORRUPTED) goto fscorrupt_out2; - if (vn_count(rvp) != 1 && !relocation) { + if (vn_count(VFS_I(rip)) != 1 && !relocation) { xfs_iunlock(rip, XFS_ILOCK_EXCL); return XFS_ERROR(EBUSY); } @@ -888,7 +283,7 @@ xfs_sync_inodes( int *bypassed) { xfs_inode_t *ip = NULL; - bhv_vnode_t *vp = NULL; + struct inode *vp = NULL; int error; int last_error; uint64_t fflag; @@ -1008,7 +403,7 @@ xfs_sync_inodes( continue; } - vp = XFS_ITOV_NULL(ip); + vp = VFS_I(ip); /* * If the vnode is gone then this is being torn down, @@ -1048,7 +443,7 @@ xfs_sync_inodes( if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) { XFS_MOUNT_IUNLOCK(mp); - kmem_free(ipointer, sizeof(xfs_iptr_t)); + kmem_free(ipointer); return 0; } @@ -1083,7 +478,7 @@ xfs_sync_inodes( IPOINTER_INSERT(ip, mp); xfs_ilock(ip, lock_flags); - ASSERT(vp == XFS_ITOV(ip)); + ASSERT(vp == VFS_I(ip)); ASSERT(ip->i_mount == mp); vnode_refed = B_TRUE; @@ -1194,7 +589,7 @@ xfs_sync_inodes( } XFS_MOUNT_IUNLOCK(mp); ASSERT(ipointer_in == B_FALSE); - kmem_free(ipointer, sizeof(xfs_iptr_t)); + kmem_free(ipointer); return XFS_ERROR(error); } @@ -1224,7 +619,7 @@ xfs_sync_inodes( ASSERT(ipointer_in == B_FALSE); - kmem_free(ipointer, sizeof(xfs_iptr_t)); + kmem_free(ipointer); return XFS_ERROR(last_error); } diff --git a/fs/xfs/xfs_vfsops.h b/fs/xfs/xfs_vfsops.h index 1688817c55ed..a74b05087da4 100644 --- a/fs/xfs/xfs_vfsops.h +++ b/fs/xfs/xfs_vfsops.h @@ -8,11 +8,6 @@ struct kstatfs; struct xfs_mount; struct xfs_mount_args; -int xfs_mount(struct xfs_mount *mp, struct xfs_mount_args *args, - struct cred *credp); -int xfs_unmount(struct xfs_mount *mp, int flags, struct cred *credp); -int xfs_mntupdate(struct xfs_mount *mp, int *flags, - struct xfs_mount_args *args); int xfs_sync(struct xfs_mount *mp, int flags); void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, int lnnum); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index e475e3717eb3..8b6812f66a15 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -75,26 +75,23 @@ xfs_open( return 0; } -/* - * xfs_setattr - */ int xfs_setattr( - xfs_inode_t *ip, - bhv_vattr_t *vap, + struct xfs_inode *ip, + struct iattr *iattr, int flags, cred_t *credp) { xfs_mount_t *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + int mask = iattr->ia_valid; xfs_trans_t *tp; - int mask; int code; uint lock_flags; uint commit_flags=0; uid_t uid=0, iuid=0; gid_t gid=0, igid=0; int timeflags = 0; - xfs_prid_t projid=0, iprojid=0; struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; int file_owner; int need_iolock = 1; @@ -104,30 +101,9 @@ xfs_setattr( if (mp->m_flags & XFS_MOUNT_RDONLY) return XFS_ERROR(EROFS); - /* - * Cannot set certain attributes. - */ - mask = vap->va_mask; - if (mask & XFS_AT_NOSET) { - return XFS_ERROR(EINVAL); - } - if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - /* - * Timestamps do not need to be logged and hence do not - * need to be done within a transaction. - */ - if (mask & XFS_AT_UPDTIMES) { - ASSERT((mask & ~XFS_AT_UPDTIMES) == 0); - timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) | - ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) | - ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0); - xfs_ichgtime(ip, timeflags); - return 0; - } - olddquot1 = olddquot2 = NULL; udqp = gdqp = NULL; @@ -139,28 +115,22 @@ xfs_setattr( * If the IDs do change before we take the ilock, we're covered * because the i_*dquot fields will get updated anyway. */ - if (XFS_IS_QUOTA_ON(mp) && - (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID))) { + if (XFS_IS_QUOTA_ON(mp) && (mask & (ATTR_UID|ATTR_GID))) { uint qflags = 0; - if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) { - uid = vap->va_uid; + if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) { + uid = iattr->ia_uid; qflags |= XFS_QMOPT_UQUOTA; } else { uid = ip->i_d.di_uid; } - if ((mask & XFS_AT_GID) && XFS_IS_GQUOTA_ON(mp)) { - gid = vap->va_gid; + if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { + gid = iattr->ia_gid; qflags |= XFS_QMOPT_GQUOTA; } else { gid = ip->i_d.di_gid; } - if ((mask & XFS_AT_PROJID) && XFS_IS_PQUOTA_ON(mp)) { - projid = vap->va_projid; - qflags |= XFS_QMOPT_PQUOTA; - } else { - projid = ip->i_d.di_projid; - } + /* * We take a reference when we initialize udqp and gdqp, * so it is important that we never blindly double trip on @@ -168,8 +138,8 @@ xfs_setattr( */ ASSERT(udqp == NULL); ASSERT(gdqp == NULL); - code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, projid, qflags, - &udqp, &gdqp); + code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, ip->i_d.di_projid, + qflags, &udqp, &gdqp); if (code) return code; } @@ -180,10 +150,10 @@ xfs_setattr( */ tp = NULL; lock_flags = XFS_ILOCK_EXCL; - if (flags & ATTR_NOLOCK) + if (flags & XFS_ATTR_NOLOCK) need_iolock = 0; - if (!(mask & XFS_AT_SIZE)) { - if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) || + if (!(mask & ATTR_SIZE)) { + if ((mask != (ATTR_CTIME|ATTR_ATIME|ATTR_MTIME)) || (mp->m_flags & XFS_MOUNT_WSYNC)) { tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); commit_flags = 0; @@ -196,10 +166,10 @@ xfs_setattr( } } else { if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) && - !(flags & ATTR_DMI)) { + !(flags & XFS_ATTR_DMI)) { int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR; code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip, - vap->va_size, 0, dmflags, NULL); + iattr->ia_size, 0, dmflags, NULL); if (code) { lock_flags = 0; goto error_return; @@ -212,16 +182,14 @@ xfs_setattr( xfs_ilock(ip, lock_flags); /* boolean: are we the file owner? */ - file_owner = (current_fsuid(credp) == ip->i_d.di_uid); + file_owner = (current_fsuid() == ip->i_d.di_uid); /* * Change various properties of a file. * Only the owner or users with CAP_FOWNER * capability may do these things. */ - if (mask & - (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID| - XFS_AT_GID|XFS_AT_PROJID)) { + if (mask & (ATTR_MODE|ATTR_UID|ATTR_GID)) { /* * CAP_FOWNER overrides the following restrictions: * @@ -245,21 +213,21 @@ xfs_setattr( * IDs of the calling process shall match the group owner of * the file when setting the set-group-ID bit on that file */ - if (mask & XFS_AT_MODE) { + if (mask & ATTR_MODE) { mode_t m = 0; - if ((vap->va_mode & S_ISUID) && !file_owner) + if ((iattr->ia_mode & S_ISUID) && !file_owner) m |= S_ISUID; - if ((vap->va_mode & S_ISGID) && + if ((iattr->ia_mode & S_ISGID) && !in_group_p((gid_t)ip->i_d.di_gid)) m |= S_ISGID; #if 0 /* Linux allows this, Irix doesn't. */ - if ((vap->va_mode & S_ISVTX) && !S_ISDIR(ip->i_d.di_mode)) + if ((iattr->ia_mode & S_ISVTX) && !S_ISDIR(ip->i_d.di_mode)) m |= S_ISVTX; #endif if (m && !capable(CAP_FSETID)) - vap->va_mode &= ~m; + iattr->ia_mode &= ~m; } } @@ -270,7 +238,7 @@ xfs_setattr( * and can change the group id only to a group of which he * or she is a member. */ - if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) { + if (mask & (ATTR_UID|ATTR_GID)) { /* * These IDs could have changed since we last looked at them. * But, we're assured that if the ownership did change @@ -278,12 +246,9 @@ xfs_setattr( * would have changed also. */ iuid = ip->i_d.di_uid; - iprojid = ip->i_d.di_projid; igid = ip->i_d.di_gid; - gid = (mask & XFS_AT_GID) ? vap->va_gid : igid; - uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid; - projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid : - iprojid; + gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; + uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; /* * CAP_CHOWN overrides the following restrictions: @@ -303,11 +268,10 @@ xfs_setattr( goto error_return; } /* - * Do a quota reservation only if uid/projid/gid is actually + * Do a quota reservation only if uid/gid is actually * going to change. */ if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) || - (XFS_IS_PQUOTA_ON(mp) && iprojid != projid) || (XFS_IS_GQUOTA_ON(mp) && igid != gid)) { ASSERT(tp); code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp, @@ -321,13 +285,13 @@ xfs_setattr( /* * Truncate file. Must have write permission and not be a directory. */ - if (mask & XFS_AT_SIZE) { + if (mask & ATTR_SIZE) { /* Short circuit the truncate case for zero length files */ - if ((vap->va_size == 0) && - (ip->i_size == 0) && (ip->i_d.di_nextents == 0)) { + if (iattr->ia_size == 0 && + ip->i_size == 0 && ip->i_d.di_nextents == 0) { xfs_iunlock(ip, XFS_ILOCK_EXCL); lock_flags &= ~XFS_ILOCK_EXCL; - if (mask & XFS_AT_CTIME) + if (mask & ATTR_CTIME) xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); code = 0; goto error_return; @@ -350,9 +314,9 @@ xfs_setattr( /* * Change file access or modified times. */ - if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) { + if (mask & (ATTR_ATIME|ATTR_MTIME)) { if (!file_owner) { - if ((flags & ATTR_UTIME) && + if ((mask & (ATTR_MTIME_SET|ATTR_ATIME_SET)) && !capable(CAP_FOWNER)) { code = XFS_ERROR(EPERM); goto error_return; @@ -361,90 +325,23 @@ xfs_setattr( } /* - * Change extent size or realtime flag. - */ - if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) { - /* - * Can't change extent size if any extents are allocated. - */ - if (ip->i_d.di_nextents && (mask & XFS_AT_EXTSIZE) && - ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != - vap->va_extsize) ) { - code = XFS_ERROR(EINVAL); /* EFBIG? */ - goto error_return; - } - - /* - * Can't change realtime flag if any extents are allocated. - */ - if ((ip->i_d.di_nextents || ip->i_delayed_blks) && - (mask & XFS_AT_XFLAGS) && - (XFS_IS_REALTIME_INODE(ip)) != - (vap->va_xflags & XFS_XFLAG_REALTIME)) { - code = XFS_ERROR(EINVAL); /* EFBIG? */ - goto error_return; - } - /* - * Extent size must be a multiple of the appropriate block - * size, if set at all. - */ - if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) { - xfs_extlen_t size; - - if (XFS_IS_REALTIME_INODE(ip) || - ((mask & XFS_AT_XFLAGS) && - (vap->va_xflags & XFS_XFLAG_REALTIME))) { - size = mp->m_sb.sb_rextsize << - mp->m_sb.sb_blocklog; - } else { - size = mp->m_sb.sb_blocksize; - } - if (vap->va_extsize % size) { - code = XFS_ERROR(EINVAL); - goto error_return; - } - } - /* - * If realtime flag is set then must have realtime data. - */ - if ((mask & XFS_AT_XFLAGS) && - (vap->va_xflags & XFS_XFLAG_REALTIME)) { - if ((mp->m_sb.sb_rblocks == 0) || - (mp->m_sb.sb_rextsize == 0) || - (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) { - code = XFS_ERROR(EINVAL); - goto error_return; - } - } - - /* - * Can't modify an immutable/append-only file unless - * we have appropriate permission. - */ - if ((mask & XFS_AT_XFLAGS) && - (ip->i_d.di_flags & - (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) || - (vap->va_xflags & - (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && - !capable(CAP_LINUX_IMMUTABLE)) { - code = XFS_ERROR(EPERM); - goto error_return; - } - } - - /* * Now we can make the changes. Before we join the inode - * to the transaction, if XFS_AT_SIZE is set then take care of + * to the transaction, if ATTR_SIZE is set then take care of * the part of the truncation that must be done without the * inode lock. This needs to be done before joining the inode * to the transaction, because the inode cannot be unlocked * once it is a part of the transaction. */ - if (mask & XFS_AT_SIZE) { + if (mask & ATTR_SIZE) { code = 0; - if ((vap->va_size > ip->i_size) && - (flags & ATTR_NOSIZETOK) == 0) { - code = xfs_igrow_start(ip, vap->va_size, credp); + if (iattr->ia_size > ip->i_size) { + /* + * Do the first part of growing a file: zero any data + * in the last block that is beyond the old EOF. We + * need to do this before the inode is joined to the + * transaction to modify the i_size. + */ + code = xfs_zero_eof(ip, iattr->ia_size, ip->i_size); } xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -461,10 +358,10 @@ xfs_setattr( * not within the range we care about here. */ if (!code && - (ip->i_size != ip->i_d.di_size) && - (vap->va_size > ip->i_d.di_size)) { + ip->i_size != ip->i_d.di_size && + iattr->ia_size > ip->i_d.di_size) { code = xfs_flush_pages(ip, - ip->i_d.di_size, vap->va_size, + ip->i_d.di_size, iattr->ia_size, XFS_B_ASYNC, FI_NONE); } @@ -472,7 +369,7 @@ xfs_setattr( vn_iowait(ip); if (!code) - code = xfs_itruncate_data(ip, vap->va_size); + code = xfs_itruncate_data(ip, iattr->ia_size); if (code) { ASSERT(tp == NULL); lock_flags &= ~XFS_ILOCK_EXCL; @@ -501,28 +398,30 @@ xfs_setattr( /* * Truncate file. Must have write permission and not be a directory. */ - if (mask & XFS_AT_SIZE) { + if (mask & ATTR_SIZE) { /* * Only change the c/mtime if we are changing the size * or we are explicitly asked to change it. This handles * the semantic difference between truncate() and ftruncate() * as implemented in the VFS. */ - if (vap->va_size != ip->i_size || (mask & XFS_AT_CTIME)) + if (iattr->ia_size != ip->i_size || (mask & ATTR_CTIME)) timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; - if (vap->va_size > ip->i_size) { - xfs_igrow_finish(tp, ip, vap->va_size, - !(flags & ATTR_DMI)); - } else if ((vap->va_size <= ip->i_size) || - ((vap->va_size == 0) && ip->i_d.di_nextents)) { + if (iattr->ia_size > ip->i_size) { + ip->i_d.di_size = iattr->ia_size; + ip->i_size = iattr->ia_size; + if (!(flags & XFS_ATTR_DMI)) + xfs_ichgtime(ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + } else if (iattr->ia_size <= ip->i_size || + (iattr->ia_size == 0 && ip->i_d.di_nextents)) { /* * signal a sync transaction unless * we're truncating an already unlinked * file on a wsync filesystem */ - code = xfs_itruncate_finish(&tp, ip, - (xfs_fsize_t)vap->va_size, + code = xfs_itruncate_finish(&tp, ip, iattr->ia_size, XFS_DATA_FORK, ((ip->i_d.di_nlink != 0 || !(mp->m_flags & XFS_MOUNT_WSYNC)) @@ -544,9 +443,12 @@ xfs_setattr( /* * Change file access modes. */ - if (mask & XFS_AT_MODE) { + if (mask & ATTR_MODE) { ip->i_d.di_mode &= S_IFMT; - ip->i_d.di_mode |= vap->va_mode & ~S_IFMT; + ip->i_d.di_mode |= iattr->ia_mode & ~S_IFMT; + + inode->i_mode &= S_IFMT; + inode->i_mode |= iattr->ia_mode & ~S_IFMT; xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); timeflags |= XFS_ICHGTIME_CHG; @@ -559,7 +461,7 @@ xfs_setattr( * and can change the group id only to a group of which he * or she is a member. */ - if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) { + if (mask & (ATTR_UID|ATTR_GID)) { /* * CAP_FSETID overrides the following restrictions: * @@ -577,39 +479,24 @@ xfs_setattr( */ if (iuid != uid) { if (XFS_IS_UQUOTA_ON(mp)) { - ASSERT(mask & XFS_AT_UID); + ASSERT(mask & ATTR_UID); ASSERT(udqp); olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip, &ip->i_udquot, udqp); } ip->i_d.di_uid = uid; + inode->i_uid = uid; } if (igid != gid) { if (XFS_IS_GQUOTA_ON(mp)) { ASSERT(!XFS_IS_PQUOTA_ON(mp)); - ASSERT(mask & XFS_AT_GID); + ASSERT(mask & ATTR_GID); ASSERT(gdqp); olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip, &ip->i_gdquot, gdqp); } ip->i_d.di_gid = gid; - } - if (iprojid != projid) { - if (XFS_IS_PQUOTA_ON(mp)) { - ASSERT(!XFS_IS_GQUOTA_ON(mp)); - ASSERT(mask & XFS_AT_PROJID); - ASSERT(gdqp); - olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip, - &ip->i_gdquot, gdqp); - } - ip->i_d.di_projid = projid; - /* - * We may have to rev the inode as well as - * the superblock version number since projids didn't - * exist before DINODE_VERSION_2 and SB_VERSION_NLINK. - */ - if (ip->i_d.di_version == XFS_DINODE_VERSION_1) - xfs_bump_ino_vers2(tp, ip); + inode->i_gid = gid; } xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); @@ -620,82 +507,33 @@ xfs_setattr( /* * Change file access or modified times. */ - if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) { - if (mask & XFS_AT_ATIME) { - ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec; - ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec; + if (mask & (ATTR_ATIME|ATTR_MTIME)) { + if (mask & ATTR_ATIME) { + inode->i_atime = iattr->ia_atime; + ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; + ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; ip->i_update_core = 1; - timeflags &= ~XFS_ICHGTIME_ACC; } - if (mask & XFS_AT_MTIME) { - ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec; + if (mask & ATTR_MTIME) { + inode->i_mtime = iattr->ia_mtime; + ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; timeflags &= ~XFS_ICHGTIME_MOD; timeflags |= XFS_ICHGTIME_CHG; } - if (tp && (flags & ATTR_UTIME)) + if (tp && (mask & (ATTR_MTIME_SET|ATTR_ATIME_SET))) xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); } /* - * Change XFS-added attributes. - */ - if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) { - if (mask & XFS_AT_EXTSIZE) { - /* - * Converting bytes to fs blocks. - */ - ip->i_d.di_extsize = vap->va_extsize >> - mp->m_sb.sb_blocklog; - } - if (mask & XFS_AT_XFLAGS) { - uint di_flags; - - /* can't set PREALLOC this way, just preserve it */ - di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); - if (vap->va_xflags & XFS_XFLAG_IMMUTABLE) - di_flags |= XFS_DIFLAG_IMMUTABLE; - if (vap->va_xflags & XFS_XFLAG_APPEND) - di_flags |= XFS_DIFLAG_APPEND; - if (vap->va_xflags & XFS_XFLAG_SYNC) - di_flags |= XFS_DIFLAG_SYNC; - if (vap->va_xflags & XFS_XFLAG_NOATIME) - di_flags |= XFS_DIFLAG_NOATIME; - if (vap->va_xflags & XFS_XFLAG_NODUMP) - di_flags |= XFS_DIFLAG_NODUMP; - if (vap->va_xflags & XFS_XFLAG_PROJINHERIT) - di_flags |= XFS_DIFLAG_PROJINHERIT; - if (vap->va_xflags & XFS_XFLAG_NODEFRAG) - di_flags |= XFS_DIFLAG_NODEFRAG; - if (vap->va_xflags & XFS_XFLAG_FILESTREAM) - di_flags |= XFS_DIFLAG_FILESTREAM; - if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { - if (vap->va_xflags & XFS_XFLAG_RTINHERIT) - di_flags |= XFS_DIFLAG_RTINHERIT; - if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS) - di_flags |= XFS_DIFLAG_NOSYMLINKS; - if (vap->va_xflags & XFS_XFLAG_EXTSZINHERIT) - di_flags |= XFS_DIFLAG_EXTSZINHERIT; - } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { - if (vap->va_xflags & XFS_XFLAG_REALTIME) - di_flags |= XFS_DIFLAG_REALTIME; - if (vap->va_xflags & XFS_XFLAG_EXTSIZE) - di_flags |= XFS_DIFLAG_EXTSIZE; - } - ip->i_d.di_flags = di_flags; - } - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - timeflags |= XFS_ICHGTIME_CHG; - } - - /* - * Change file inode change time only if XFS_AT_CTIME set + * Change file inode change time only if ATTR_CTIME set * AND we have been called by a DMI function. */ - if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) { - ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec; - ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec; + if ((flags & XFS_ATTR_DMI) && (mask & ATTR_CTIME)) { + inode->i_ctime = iattr->ia_ctime; + ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; ip->i_update_core = 1; timeflags &= ~XFS_ICHGTIME_CHG; } @@ -704,7 +542,7 @@ xfs_setattr( * Send out timestamp changes that need to be set to the * current time. Not done when called by a DMI function. */ - if (timeflags && !(flags & ATTR_DMI)) + if (timeflags && !(flags & XFS_ATTR_DMI)) xfs_ichgtime(ip, timeflags); XFS_STATS_INC(xs_ig_attrchg); @@ -742,7 +580,7 @@ xfs_setattr( } if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) && - !(flags & ATTR_DMI)) { + !(flags & XFS_ATTR_DMI)) { (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, NULL, NULL, 0, 0, AT_DELAY_FLAG(flags)); @@ -875,7 +713,7 @@ xfs_fsync( return XFS_ERROR(EIO); /* capture size updates in I/O completion before writing the inode. */ - error = filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping); + error = filemap_fdatawait(VFS_I(ip)->i_mapping); if (error) return XFS_ERROR(error); @@ -1321,7 +1159,6 @@ int xfs_release( xfs_inode_t *ip) { - bhv_vnode_t *vp = XFS_ITOV(ip); xfs_mount_t *mp = ip->i_mount; int error; @@ -1356,13 +1193,13 @@ xfs_release( * be exposed to that problem. */ truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); - if (truncated && VN_DIRTY(vp) && ip->i_delayed_blks > 0) + if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE); } if (ip->i_d.di_nlink != 0) { if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && - ((ip->i_size > 0) || (VN_CACHED(vp) > 0 || + ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && (ip->i_df.if_flags & XFS_IFEXTENTS)) && (!(ip->i_d.di_flags & @@ -1388,7 +1225,6 @@ int xfs_inactive( xfs_inode_t *ip) { - bhv_vnode_t *vp = XFS_ITOV(ip); xfs_bmap_free_t free_list; xfs_fsblock_t first_block; int committed; @@ -1403,7 +1239,7 @@ xfs_inactive( * If the inode is already free, then there can be nothing * to clean up here. */ - if (ip->i_d.di_mode == 0 || VN_BAD(vp)) { + if (ip->i_d.di_mode == 0 || VN_BAD(VFS_I(ip))) { ASSERT(ip->i_df.if_real_bytes == 0); ASSERT(ip->i_df.if_broot_bytes == 0); return VN_INACTIVE_CACHE; @@ -1433,7 +1269,7 @@ xfs_inactive( if (ip->i_d.di_nlink != 0) { if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && - ((ip->i_size > 0) || (VN_CACHED(vp) > 0 || + ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && (ip->i_df.if_flags & XFS_IFEXTENTS) && (!(ip->i_d.di_flags & @@ -1601,12 +1437,18 @@ xfs_inactive( return VN_INACTIVE_CACHE; } - +/* + * Lookups up an inode from "name". If ci_name is not NULL, then a CI match + * is allowed, otherwise it has to be an exact match. If a CI match is found, + * ci_name->name will point to a the actual name (caller must free) or + * will be set to NULL if an exact match is found. + */ int xfs_lookup( xfs_inode_t *dp, struct xfs_name *name, - xfs_inode_t **ipp) + xfs_inode_t **ipp, + struct xfs_name *ci_name) { xfs_ino_t inum; int error; @@ -1618,7 +1460,7 @@ xfs_lookup( return XFS_ERROR(EIO); lock_mode = xfs_ilock_map_shared(dp); - error = xfs_dir_lookup(NULL, dp, name, &inum); + error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); xfs_iunlock_map_shared(dp, lock_mode); if (error) @@ -1626,12 +1468,15 @@ xfs_lookup( error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp, 0); if (error) - goto out; + goto out_free_name; xfs_itrace_ref(*ipp); return 0; - out: +out_free_name: + if (ci_name) + kmem_free(ci_name->name); +out: *ipp = NULL; return error; } @@ -1688,7 +1533,7 @@ xfs_create( * Make sure that we have allocated dquot(s) on disk. */ error = XFS_QM_DQVOPALLOC(mp, dp, - current_fsuid(credp), current_fsgid(credp), prid, + current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp); if (error) goto std_return; @@ -1860,111 +1705,6 @@ std_return: } #ifdef DEBUG -/* - * Some counters to see if (and how often) we are hitting some deadlock - * prevention code paths. - */ - -int xfs_rm_locks; -int xfs_rm_lock_delays; -int xfs_rm_attempts; -#endif - -/* - * The following routine will lock the inodes associated with the - * directory and the named entry in the directory. The locks are - * acquired in increasing inode number. - * - * If the entry is "..", then only the directory is locked. The - * vnode ref count will still include that from the .. entry in - * this case. - * - * There is a deadlock we need to worry about. If the locked directory is - * in the AIL, it might be blocking up the log. The next inode we lock - * could be already locked by another thread waiting for log space (e.g - * a permanent log reservation with a long running transaction (see - * xfs_itruncate_finish)). To solve this, we must check if the directory - * is in the ail and use lock_nowait. If we can't lock, we need to - * drop the inode lock on the directory and try again. xfs_iunlock will - * potentially push the tail if we were holding up the log. - */ -STATIC int -xfs_lock_dir_and_entry( - xfs_inode_t *dp, - xfs_inode_t *ip) /* inode of entry 'name' */ -{ - int attempts; - xfs_ino_t e_inum; - xfs_inode_t *ips[2]; - xfs_log_item_t *lp; - -#ifdef DEBUG - xfs_rm_locks++; -#endif - attempts = 0; - -again: - xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); - - e_inum = ip->i_ino; - - xfs_itrace_ref(ip); - - /* - * We want to lock in increasing inum. Since we've already - * acquired the lock on the directory, we may need to release - * if if the inum of the entry turns out to be less. - */ - if (e_inum > dp->i_ino) { - /* - * We are already in the right order, so just - * lock on the inode of the entry. - * We need to use nowait if dp is in the AIL. - */ - - lp = (xfs_log_item_t *)dp->i_itemp; - if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { - if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { - attempts++; -#ifdef DEBUG - xfs_rm_attempts++; -#endif - - /* - * Unlock dp and try again. - * xfs_iunlock will try to push the tail - * if the inode is in the AIL. - */ - - xfs_iunlock(dp, XFS_ILOCK_EXCL); - - if ((attempts % 5) == 0) { - delay(1); /* Don't just spin the CPU */ -#ifdef DEBUG - xfs_rm_lock_delays++; -#endif - } - goto again; - } - } else { - xfs_ilock(ip, XFS_ILOCK_EXCL); - } - } else if (e_inum < dp->i_ino) { - xfs_iunlock(dp, XFS_ILOCK_EXCL); - - ips[0] = ip; - ips[1] = dp; - xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL); - } - /* else e_inum == dp->i_ino */ - /* This can happen if we're asked to lock /x/.. - * the entry is "..", which is also the parent directory. - */ - - return 0; -} - -#ifdef DEBUG int xfs_locked_n; int xfs_small_retries; int xfs_middle_retries; @@ -2098,12 +1838,52 @@ again: #endif } -#ifdef DEBUG -#define REMOVE_DEBUG_TRACE(x) {remove_which_error_return = (x);} -int remove_which_error_return = 0; -#else /* ! DEBUG */ -#define REMOVE_DEBUG_TRACE(x) -#endif /* ! DEBUG */ +/* + * xfs_lock_two_inodes() can only be used to lock one type of lock + * at a time - the iolock or the ilock, but not both at once. If + * we lock both at once, lockdep will report false positives saying + * we have violated locking orders. + */ +void +xfs_lock_two_inodes( + xfs_inode_t *ip0, + xfs_inode_t *ip1, + uint lock_mode) +{ + xfs_inode_t *temp; + int attempts = 0; + xfs_log_item_t *lp; + + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) + ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0); + ASSERT(ip0->i_ino != ip1->i_ino); + + if (ip0->i_ino > ip1->i_ino) { + temp = ip0; + ip0 = ip1; + ip1 = temp; + } + + again: + xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0)); + + /* + * If the first lock we have locked is in the AIL, we must TRY to get + * the second lock. If we can't get it, we must release the first one + * and try again. + */ + lp = (xfs_log_item_t *)ip0->i_itemp; + if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { + if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) { + xfs_iunlock(ip0, lock_mode); + if ((++attempts % 5) == 0) + delay(1); /* Don't just spin the CPU */ + goto again; + } + } else { + xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1)); + } +} int xfs_remove( @@ -2113,6 +1893,7 @@ xfs_remove( { xfs_mount_t *mp = dp->i_mount; xfs_trans_t *tp = NULL; + int is_dir = S_ISDIR(ip->i_d.di_mode); int error = 0; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; @@ -2120,8 +1901,10 @@ xfs_remove( int committed; int link_zero; uint resblks; + uint log_count; xfs_itrace_entry(dp); + xfs_itrace_entry(ip); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); @@ -2134,19 +1917,23 @@ xfs_remove( return error; } - xfs_itrace_entry(ip); - xfs_itrace_ref(ip); - error = XFS_QM_DQATTACH(mp, dp, 0); - if (!error) - error = XFS_QM_DQATTACH(mp, ip, 0); - if (error) { - REMOVE_DEBUG_TRACE(__LINE__); + if (error) goto std_return; - } - tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); + error = XFS_QM_DQATTACH(mp, ip, 0); + if (error) + goto std_return; + + if (is_dir) { + tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); + log_count = XFS_DEFAULT_LOG_COUNT; + } else { + tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); + log_count = XFS_REMOVE_LOG_COUNT; + } cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + /* * We try to get the real space reservation first, * allowing for directory btree deletion(s) implying @@ -2158,25 +1945,19 @@ xfs_remove( */ resblks = XFS_REMOVE_SPACE_RES(mp); error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT); + XFS_TRANS_PERM_LOG_RES, log_count); if (error == ENOSPC) { resblks = 0; error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT); + XFS_TRANS_PERM_LOG_RES, log_count); } if (error) { ASSERT(error != ENOSPC); - REMOVE_DEBUG_TRACE(__LINE__); - xfs_trans_cancel(tp, 0); - return error; + cancel_flags = 0; + goto out_trans_cancel; } - error = xfs_lock_dir_and_entry(dp, ip); - if (error) { - REMOVE_DEBUG_TRACE(__LINE__); - xfs_trans_cancel(tp, cancel_flags); - goto std_return; - } + xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); /* * At this point, we've gotten both the directory and the entry @@ -2189,46 +1970,83 @@ xfs_remove( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* - * Entry must exist since we did a lookup in xfs_lock_dir_and_entry. + * If we're removing a directory perform some additional validation. */ + if (is_dir) { + ASSERT(ip->i_d.di_nlink >= 2); + if (ip->i_d.di_nlink != 2) { + error = XFS_ERROR(ENOTEMPTY); + goto out_trans_cancel; + } + if (!xfs_dir_isempty(ip)) { + error = XFS_ERROR(ENOTEMPTY); + goto out_trans_cancel; + } + } + XFS_BMAP_INIT(&free_list, &first_block); error = xfs_dir_removename(tp, dp, name, ip->i_ino, &first_block, &free_list, resblks); if (error) { ASSERT(error != ENOENT); - REMOVE_DEBUG_TRACE(__LINE__); - goto error1; + goto out_bmap_cancel; } xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + /* + * Bump the in memory generation count on the parent + * directory so that other can know that it has changed. + */ dp->i_gen++; xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); - error = xfs_droplink(tp, ip); - if (error) { - REMOVE_DEBUG_TRACE(__LINE__); - goto error1; + if (is_dir) { + /* + * Drop the link from ip's "..". + */ + error = xfs_droplink(tp, dp); + if (error) + goto out_bmap_cancel; + + /* + * Drop the link from dp to ip. + */ + error = xfs_droplink(tp, ip); + if (error) + goto out_bmap_cancel; + } else { + /* + * When removing a non-directory we need to log the parent + * inode here for the i_gen update. For a directory this is + * done implicitly by the xfs_droplink call for the ".." entry. + */ + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); } - /* Determine if this is the last link while + /* + * Drop the "." link from ip to self. + */ + error = xfs_droplink(tp, ip); + if (error) + goto out_bmap_cancel; + + /* + * Determine if this is the last link while * we are in the transaction. */ - link_zero = (ip)->i_d.di_nlink==0; + link_zero = (ip->i_d.di_nlink == 0); /* * If this is a synchronous mount, make sure that the * remove transaction goes to disk before returning to * the user. */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - } error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { - REMOVE_DEBUG_TRACE(__LINE__); - goto error_rele; - } + if (error) + goto out_bmap_cancel; error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); if (error) @@ -2240,38 +2058,26 @@ xfs_remove( * will get killed on last close in xfs_close() so we don't * have to worry about that. */ - if (link_zero && xfs_inode_is_filestream(ip)) + if (!is_dir && link_zero && xfs_inode_is_filestream(ip)) xfs_filestream_deassociate(ip); xfs_itrace_exit(ip); + xfs_itrace_exit(dp); -/* Fall through to std_return with error = 0 */ std_return: if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) { - (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, - dp, DM_RIGHT_NULL, - NULL, DM_RIGHT_NULL, - name->name, NULL, ip->i_d.di_mode, error, 0); + XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, dp, DM_RIGHT_NULL, + NULL, DM_RIGHT_NULL, name->name, NULL, + ip->i_d.di_mode, error, 0); } - return error; - error1: - xfs_bmap_cancel(&free_list); - cancel_flags |= XFS_TRANS_ABORT; - xfs_trans_cancel(tp, cancel_flags); - goto std_return; + return error; - error_rele: - /* - * In this case make sure to not release the inode until after - * the current transaction is aborted. Releasing it beforehand - * can cause us to go to xfs_inactive and start a recursive - * transaction which can easily deadlock with the current one. - */ + out_bmap_cancel: xfs_bmap_cancel(&free_list); cancel_flags |= XFS_TRANS_ABORT; + out_trans_cancel: xfs_trans_cancel(tp, cancel_flags); - goto std_return; } @@ -2283,7 +2089,6 @@ xfs_link( { xfs_mount_t *mp = tdp->i_mount; xfs_trans_t *tp; - xfs_inode_t *ips[2]; int error; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; @@ -2331,15 +2136,7 @@ xfs_link( goto error_return; } - if (sip->i_ino < tdp->i_ino) { - ips[0] = sip; - ips[1] = tdp; - } else { - ips[0] = tdp; - ips[1] = sip; - } - - xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL); + xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); /* * Increment vnode ref counts since xfs_trans_commit & @@ -2480,7 +2277,7 @@ xfs_mkdir( * Make sure that we have allocated dquot(s) on disk. */ error = XFS_QM_DQVOPALLOC(mp, dp, - current_fsuid(credp), current_fsgid(credp), prid, + current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); if (error) goto std_return; @@ -2638,186 +2435,6 @@ std_return: } int -xfs_rmdir( - xfs_inode_t *dp, - struct xfs_name *name, - xfs_inode_t *cdp) -{ - xfs_mount_t *mp = dp->i_mount; - xfs_trans_t *tp; - int error; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - int cancel_flags; - int committed; - int last_cdp_link; - uint resblks; - - xfs_itrace_entry(dp); - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, - dp, DM_RIGHT_NULL, - NULL, DM_RIGHT_NULL, name->name, - NULL, cdp->i_d.di_mode, 0, 0); - if (error) - return XFS_ERROR(error); - } - - /* - * Get the dquots for the inodes. - */ - error = XFS_QM_DQATTACH(mp, dp, 0); - if (!error) - error = XFS_QM_DQATTACH(mp, cdp, 0); - if (error) { - REMOVE_DEBUG_TRACE(__LINE__); - goto std_return; - } - - tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - /* - * We try to get the real space reservation first, - * allowing for directory btree deletion(s) implying - * possible bmap insert(s). If we can't get the space - * reservation then we use 0 instead, and avoid the bmap - * btree insert(s) in the directory code by, if the bmap - * insert tries to happen, instead trimming the LAST - * block from the directory. - */ - resblks = XFS_REMOVE_SPACE_RES(mp); - error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT); - if (error == ENOSPC) { - resblks = 0; - error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT); - } - if (error) { - ASSERT(error != ENOSPC); - cancel_flags = 0; - goto error_return; - } - XFS_BMAP_INIT(&free_list, &first_block); - - /* - * Now lock the child directory inode and the parent directory - * inode in the proper order. This will take care of validating - * that the directory entry for the child directory inode has - * not changed while we were obtaining a log reservation. - */ - error = xfs_lock_dir_and_entry(dp, cdp); - if (error) { - xfs_trans_cancel(tp, cancel_flags); - goto std_return; - } - - IHOLD(dp); - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - - IHOLD(cdp); - xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL); - - ASSERT(cdp->i_d.di_nlink >= 2); - if (cdp->i_d.di_nlink != 2) { - error = XFS_ERROR(ENOTEMPTY); - goto error_return; - } - if (!xfs_dir_isempty(cdp)) { - error = XFS_ERROR(ENOTEMPTY); - goto error_return; - } - - error = xfs_dir_removename(tp, dp, name, cdp->i_ino, - &first_block, &free_list, resblks); - if (error) - goto error1; - - xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - - /* - * Bump the in memory generation count on the parent - * directory so that other can know that it has changed. - */ - dp->i_gen++; - - /* - * Drop the link from cdp's "..". - */ - error = xfs_droplink(tp, dp); - if (error) { - goto error1; - } - - /* - * Drop the link from dp to cdp. - */ - error = xfs_droplink(tp, cdp); - if (error) { - goto error1; - } - - /* - * Drop the "." link from cdp to self. - */ - error = xfs_droplink(tp, cdp); - if (error) { - goto error1; - } - - /* Determine these before committing transaction */ - last_cdp_link = (cdp)->i_d.di_nlink==0; - - /* - * If this is a synchronous mount, make sure that the - * rmdir transaction goes to disk before returning to - * the user. - */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { - xfs_trans_set_sync(tp); - } - - error = xfs_bmap_finish (&tp, &free_list, &committed); - if (error) { - xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT)); - goto std_return; - } - - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - if (error) { - goto std_return; - } - - - /* Fall through to std_return with error = 0 or the errno - * from xfs_trans_commit. */ - std_return: - if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) { - (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, - dp, DM_RIGHT_NULL, - NULL, DM_RIGHT_NULL, - name->name, NULL, cdp->i_d.di_mode, - error, 0); - } - return error; - - error1: - xfs_bmap_cancel(&free_list); - cancel_flags |= XFS_TRANS_ABORT; - /* FALLTHROUGH */ - - error_return: - xfs_trans_cancel(tp, cancel_flags); - goto std_return; -} - -int xfs_symlink( xfs_inode_t *dp, struct xfs_name *link_name, @@ -2886,7 +2503,7 @@ xfs_symlink( * Make sure that we have allocated dquot(s) on disk. */ error = XFS_QM_DQVOPALLOC(mp, dp, - current_fsuid(credp), current_fsgid(credp), prid, + current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); if (error) goto std_return; @@ -3181,14 +2798,13 @@ int xfs_reclaim( xfs_inode_t *ip) { - bhv_vnode_t *vp = XFS_ITOV(ip); xfs_itrace_entry(ip); - ASSERT(!VN_MAPPED(vp)); + ASSERT(!VN_MAPPED(VFS_I(ip))); /* bad inode, get out here ASAP */ - if (VN_BAD(vp)) { + if (VN_BAD(VFS_I(ip))) { xfs_ireclaim(ip); return 0; } @@ -3225,7 +2841,7 @@ xfs_reclaim( XFS_MOUNT_ILOCK(mp); spin_lock(&ip->i_flags_lock); __xfs_iflags_set(ip, XFS_IRECLAIMABLE); - vn_to_inode(vp)->i_private = NULL; + VFS_I(ip)->i_private = NULL; ip->i_vnode = NULL; spin_unlock(&ip->i_flags_lock); list_add_tail(&ip->i_reclaim, &mp->m_del_inodes); @@ -3241,8 +2857,7 @@ xfs_finish_reclaim( int sync_mode) { xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino); - bhv_vnode_t *vp = XFS_ITOV_NULL(ip); - int error; + struct inode *vp = VFS_I(ip); if (vp && VN_BAD(vp)) goto reclaim; @@ -3285,29 +2900,16 @@ xfs_finish_reclaim( xfs_iflock(ip); } - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - if (ip->i_update_core || - ((ip->i_itemp != NULL) && - (ip->i_itemp->ili_format.ilf_fields != 0))) { - error = xfs_iflush(ip, sync_mode); - /* - * If we hit an error, typically because of filesystem - * shutdown, we don't need to let vn_reclaim to know - * because we're gonna reclaim the inode anyway. - */ - if (error) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto reclaim; - } - xfs_iflock(ip); /* synchronize with xfs_iflush_done */ - } - - ASSERT(ip->i_update_core == 0); - ASSERT(ip->i_itemp == NULL || - ip->i_itemp->ili_format.ilf_fields == 0); + /* + * In the case of a forced shutdown we rely on xfs_iflush() to + * wait for the inode to be unpinned before returning an error. + */ + if (xfs_iflush(ip, sync_mode) == 0) { + /* synchronize with xfs_iflush_done */ + xfs_iflock(ip); + xfs_ifunlock(ip); } - xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); reclaim: @@ -3418,7 +3020,7 @@ xfs_alloc_file_space( /* Generate a DMAPI event if needed. */ if (alloc_type != 0 && offset < ip->i_size && - (attr_flags&ATTR_DMI) == 0 && + (attr_flags & XFS_ATTR_DMI) == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) { xfs_off_t end_dmi_offset; @@ -3532,7 +3134,7 @@ retry: allocatesize_fsb -= allocated_fsb; } dmapi_enospc_check: - if (error == ENOSPC && (attr_flags & ATTR_DMI) == 0 && + if (error == ENOSPC && (attr_flags & XFS_ATTR_DMI) == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) { error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE, ip, DM_RIGHT_NULL, @@ -3558,6 +3160,13 @@ error1: /* Just cancel transaction */ /* * Zero file bytes between startoff and endoff inclusive. * The iolock is held exclusive and no blocks are buffered. + * + * This function is used by xfs_free_file_space() to zero + * partial blocks when the range to free is not block aligned. + * When unreserving space with boundaries that are not block + * aligned we round up the start and round down the end + * boundaries and then use this function to zero the parts of + * the blocks that got dropped during the rounding. */ STATIC int xfs_zero_remaining_bytes( @@ -3574,6 +3183,17 @@ xfs_zero_remaining_bytes( int nimap; int error = 0; + /* + * Avoid doing I/O beyond eof - it's not necessary + * since nothing can read beyond eof. The space will + * be zeroed when the file is extended anyway. + */ + if (startoff >= ip->i_size) + return 0; + + if (endoff > ip->i_size) + endoff = ip->i_size; + bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp); @@ -3643,7 +3263,6 @@ xfs_free_file_space( xfs_off_t len, int attr_flags) { - bhv_vnode_t *vp; int committed; int done; xfs_off_t end_dmi_offset; @@ -3663,7 +3282,6 @@ xfs_free_file_space( xfs_trans_t *tp; int need_iolock = 1; - vp = XFS_ITOV(ip); mp = ip->i_mount; xfs_itrace_entry(ip); @@ -3679,7 +3297,7 @@ xfs_free_file_space( end_dmi_offset = offset + len; endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset); - if (offset < ip->i_size && (attr_flags & ATTR_DMI) == 0 && + if (offset < ip->i_size && (attr_flags & XFS_ATTR_DMI) == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) { if (end_dmi_offset > ip->i_size) end_dmi_offset = ip->i_size; @@ -3690,7 +3308,7 @@ xfs_free_file_space( return error; } - if (attr_flags & ATTR_NOLOCK) + if (attr_flags & XFS_ATTR_NOLOCK) need_iolock = 0; if (need_iolock) { xfs_ilock(ip, XFS_IOLOCK_EXCL); @@ -3700,7 +3318,7 @@ xfs_free_file_space( rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); ioffset = offset & ~(rounding - 1); - if (VN_CACHED(vp) != 0) { + if (VN_CACHED(VFS_I(ip)) != 0) { xfs_inval_cached_trace(ip, ioffset, -1, ioffset, -1); error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED); if (error) @@ -3867,7 +3485,7 @@ xfs_change_file_space( xfs_off_t startoffset; xfs_off_t llen; xfs_trans_t *tp; - bhv_vattr_t va; + struct iattr iattr; xfs_itrace_entry(ip); @@ -3941,10 +3559,10 @@ xfs_change_file_space( break; } - va.va_mask = XFS_AT_SIZE; - va.va_size = startoffset; + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = startoffset; - error = xfs_setattr(ip, &va, attr_flags, credp); + error = xfs_setattr(ip, &iattr, attr_flags, credp); if (error) return error; @@ -3974,7 +3592,7 @@ xfs_change_file_space( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_ihold(tp, ip); - if ((attr_flags & ATTR_DMI) == 0) { + if ((attr_flags & XFS_ATTR_DMI) == 0) { ip->i_d.di_mode &= ~S_ISUID; /* diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 57335ba4ce53..e932a96bec54 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -2,9 +2,9 @@ #define _XFS_VNODEOPS_H 1 struct attrlist_cursor_kern; -struct bhv_vattr; struct cred; struct file; +struct iattr; struct inode; struct iovec; struct kiocb; @@ -15,14 +15,18 @@ struct xfs_iomap; int xfs_open(struct xfs_inode *ip); -int xfs_setattr(struct xfs_inode *ip, struct bhv_vattr *vap, int flags, +int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags, struct cred *credp); +#define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */ +#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */ +#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */ + int xfs_readlink(struct xfs_inode *ip, char *link); int xfs_fsync(struct xfs_inode *ip); int xfs_release(struct xfs_inode *ip); int xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, - struct xfs_inode **ipp); + struct xfs_inode **ipp, struct xfs_name *ci_name); int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp, struct cred *credp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, @@ -31,8 +35,6 @@ int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, struct xfs_name *target_name); int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name, mode_t mode, struct xfs_inode **ipp, struct cred *credp); -int xfs_rmdir(struct xfs_inode *dp, struct xfs_name *name, - struct xfs_inode *cdp); int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, xfs_off_t *offset, filldir_t filldir); int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, |