From 0a32c26e720a8b38971d0685976f4a7d63f9e2ef Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 5 Jun 2013 12:09:08 +1000 Subject: xfs: inode unlinked list needs to recalculate the inode CRC The inode unlinked list manipulations operate directly on the inode buffer, and so bypass the inode CRC calculation mechanisms. Hence an inode on the unlinked list has an invalid CRC. Fix this by recalculating the CRC whenever we modify an unlinked list pointer in an inode, ncluding during log recovery. This is trivial to do and results in unlinked list operations always leaving a consistent inode in the buffer. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index efbe1accb6ca..7f7be5f98f52 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1638,6 +1638,10 @@ xfs_iunlink( dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -1723,6 +1727,10 @@ xfs_iunlink_remove( dip->di_next_unlinked = cpu_to_be32(NULLAGINO); offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -1796,6 +1804,10 @@ xfs_iunlink_remove( dip->di_next_unlinked = cpu_to_be32(NULLAGINO); offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -1809,6 +1821,10 @@ xfs_iunlink_remove( last_dip->di_next_unlinked = cpu_to_be32(next_agino); ASSERT(next_agino != 0); offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, last_dip); + xfs_trans_inode_buf(tp, last_ibp); xfs_trans_log_buf(tp, last_ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); -- cgit v1.2.3 From cca9f93a52d2ead50b5da59ca83d5f469ee4be5f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 27 Jun 2013 16:04:49 +1000 Subject: xfs: don't do IO when creating an new inode When we are allocating a new inode, we read the inode cluster off disk to increment the generation number. We are already using a random generation number for newly allocated inodes, so if we are not using the ikeep mode, we can just generate a new generation number when we initialise the newly allocated inode. This avoids the need for reading the inode buffer during inode creation. This will speed up allocation of inodes in cold, partially allocated clusters as they will no longer need to be read from disk during allocation. It will also reduce the CPU overhead of inode allocation by not having the process the buffer read, even on cache hits. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7f7be5f98f52..d1f76da6f8bf 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1028,6 +1028,11 @@ xfs_dinode_calc_crc( /* * Read the disk inode attributes into the in-core inode structure. + * + * If we are initialising a new inode and we are not utilising the + * XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new inode core + * with a random generation number. If we are keeping inodes around, we need to + * read the inode cluster to get the existing generation number off disk. */ int xfs_iread( @@ -1047,6 +1052,22 @@ xfs_iread( if (error) return error; + /* shortcut IO on inode allocation if possible */ + if ((iget_flags & XFS_IGET_CREATE) && + !(mp->m_flags & XFS_MOUNT_IKEEP)) { + /* initialise the on-disk inode core */ + memset(&ip->i_d, 0, sizeof(ip->i_d)); + ip->i_d.di_magic = XFS_DINODE_MAGIC; + ip->i_d.di_gen = prandom_u32(); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + ip->i_d.di_version = 3; + ip->i_d.di_ino = ip->i_ino; + uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid); + } else + ip->i_d.di_version = 2; + return 0; + } + /* * Get pointers to the on-disk inode and the buffer containing it. */ @@ -1133,17 +1154,16 @@ xfs_iread( xfs_buf_set_ref(bp, XFS_INO_REF); /* - * Use xfs_trans_brelse() to release the buffer containing the - * on-disk inode, because it was acquired with xfs_trans_read_buf() - * in xfs_imap_to_bp() above. If tp is NULL, this is just a normal + * Use xfs_trans_brelse() to release the buffer containing the on-disk + * inode, because it was acquired with xfs_trans_read_buf() in + * xfs_imap_to_bp() above. If tp is NULL, this is just a normal * brelse(). If we're within a transaction, then xfs_trans_brelse() * will only release the buffer if it is not dirty within the * transaction. It will be OK to release the buffer in this case, - * because inodes on disk are never destroyed and we will be - * locking the new in-core inode before putting it in the hash - * table where other processes can find it. Thus we don't have - * to worry about the inode being changed just because we released - * the buffer. + * because inodes on disk are never destroyed and we will be locking the + * new in-core inode before putting it in the cache where other + * processes can find it. Thus we don't have to worry about the inode + * being changed just because we released the buffer. */ out_brelse: xfs_trans_brelse(tp, bp); -- cgit v1.2.3 From 1baaed8fa955ab0d23aab24477dae566ed6a105b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 27 Jun 2013 16:04:50 +1000 Subject: xfs: xfs_ifree doesn't need to modify the inode buffer Long ago, bulkstat used to read inodes directly from the backing buffer for speed. This had the unfortunate problem of being cache incoherent with unlinks, and so xfs_ifree() had to mark the inode as free directly in the backing buffer. bulkstat was changed some time ago to use inode cache coherent lookups, and so will never see unlinked inodes in it's lookups. Hence xfs_ifree() does not need to touch the inode backing buffer anymore. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 32 ++++---------------------------- 1 file changed, 4 insertions(+), 28 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d1f76da6f8bf..9ecfe1e559fc 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2048,8 +2048,6 @@ xfs_ifree( int error; int delete; xfs_ino_t first_ino; - xfs_dinode_t *dip; - xfs_buf_t *ibp; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(ip->i_d.di_nlink == 0); @@ -2062,14 +2060,13 @@ xfs_ifree( * Pull the on-disk inode from the AGI unlinked list. */ error = xfs_iunlink_remove(tp, ip); - if (error != 0) { + if (error) return error; - } error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); - if (error != 0) { + if (error) return error; - } + ip->i_d.di_mode = 0; /* mark incore inode as free */ ip->i_d.di_flags = 0; ip->i_d.di_dmevmask = 0; @@ -2081,31 +2078,10 @@ xfs_ifree( * by reincarnations of this inode. */ ip->i_d.di_gen++; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &dip, &ibp, - 0, 0); - if (error) - return error; - - /* - * Clear the on-disk di_mode. This is to prevent xfs_bulkstat - * from picking up this inode when it is reclaimed (its incore state - * initialzed but not flushed to disk yet). The in-core di_mode is - * already cleared and a corresponding transaction logged. - * The hack here just synchronizes the in-core to on-disk - * di_mode value in advance before the actual inode sync to disk. - * This is OK because the inode is already unlinked and would never - * change its di_mode again for this inode generation. - * This is a temporary hack that would require a proper fix - * in the future. - */ - dip->di_mode = 0; - - if (delete) { + if (delete) error = xfs_ifree_cluster(ip, tp, first_ino); - } return error; } -- cgit v1.2.3 From a69c7c077228b0bef38ccb1d385c099e132fe54b Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 28 Mar 2012 12:21:11 -0500 Subject: xfs: use XFS_BMAP_BMDR_SPACE vs. XFS_BROOT_SIZE_ADJ XFS_BROOT_SIZE_ADJ is an undocumented macro which accounts for the difference in size between the on-disk and in-core btree root. It's much clearer to just use the newly-added XFS_BMAP_BMDR_SPACE macro which gives us the on-disk size directly. In one case, we must test that the if_broot exists before applying the macro, however. Signed-off-by: Eric Sandeen Reviewed-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9ecfe1e559fc..b78481f99d9d 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2156,8 +2156,8 @@ xfs_iroot_realloc( np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, (int)new_size); ifp->if_broot_bytes = (int)new_size; - ASSERT(ifp->if_broot_bytes <= - XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip)); + ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= + XFS_IFORK_SIZE(ip, whichfork)); memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); return; } @@ -2210,8 +2210,9 @@ xfs_iroot_realloc( kmem_free(ifp->if_broot); ifp->if_broot = new_broot; ifp->if_broot_bytes = (int)new_size; - ASSERT(ifp->if_broot_bytes <= - XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip)); + if (ifp->if_broot) + ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= + XFS_IFORK_SIZE(ip, whichfork)); return; } @@ -2522,9 +2523,8 @@ xfs_iflush_fork( if ((iip->ili_fields & brootflag[whichfork]) && (ifp->if_broot_bytes > 0)) { ASSERT(ifp->if_broot != NULL); - ASSERT(ifp->if_broot_bytes <= - (XFS_IFORK_SIZE(ip, whichfork) + - XFS_BROOT_SIZE_ADJ(ip))); + ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= + XFS_IFORK_SIZE(ip, whichfork)); xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, (xfs_bmdr_block_t *)cp, XFS_DFORK_SIZE(dip, mp, whichfork)); -- cgit v1.2.3 From e1b4271ac261b290fdab51446996fb13e68a57be Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 24 Jul 2013 15:47:30 +1000 Subject: xfs: di_flushiter considered harmful When we made all inode updates transactional, we no longer needed the log recovery detection for inodes being newer on disk than the transaction being replayed - it was redundant as replay of the log would always result in the latest version of the inode would be on disk. It was redundant, but left in place because it wasn't considered to be a problem. However, with the new "don't read inodes on create" optimisation, flushiter has come back to bite us. Essentially, the optimisation made always initialises flushiter to zero in the create transaction, and so if we then crash and run recovery and the inode already on disk has a non-zero flushiter it will skip recovery of that inode. As a result, log recovery does the wrong thing and we end up with a corrupt filesystem. Because we have to support old kernel to new kernel upgrades, we can't just get rid of the flushiter support in log recovery as we might be upgrading from a kernel that doesn't have fully transactional inode updates. Unfortunately, for v4 superblocks there is no way to guarantee that log recovery knows about this fact. We cannot add a new inode format flag to say it's a "special inode create" because it won't be understood by older kernels and so recovery could do the wrong thing on downgrade. We cannot specially detect the combination of zero mode/non-zero flushiter on disk to non-zero mode, zero flushiter in the log item during recovery because wrapping of the flushiter can result in false detection. Hence that makes this "don't use flushiter" optimisation limited to a disk format that guarantees that we don't need it. And that means the only fix here is to limit the "no read IO on create" optimisation to version 5 superblocks.... Reported-by: Markus Trippelsdorf Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers (cherry picked from commit e60896d8f2b81412421953e14d3feb14177edb56) --- fs/xfs/xfs_inode.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b78481f99d9d..bb262c25c8de 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -896,7 +896,6 @@ xfs_dinode_to_disk( to->di_projid_lo = cpu_to_be16(from->di_projid_lo); to->di_projid_hi = cpu_to_be16(from->di_projid_hi); memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); - to->di_flushiter = cpu_to_be16(from->di_flushiter); to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); @@ -924,6 +923,9 @@ xfs_dinode_to_disk( to->di_lsn = cpu_to_be64(from->di_lsn); memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &from->di_uuid); + to->di_flushiter = 0; + } else { + to->di_flushiter = cpu_to_be16(from->di_flushiter); } } @@ -1029,10 +1031,14 @@ xfs_dinode_calc_crc( /* * Read the disk inode attributes into the in-core inode structure. * - * If we are initialising a new inode and we are not utilising the - * XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new inode core - * with a random generation number. If we are keeping inodes around, we need to - * read the inode cluster to get the existing generation number off disk. + * For version 5 superblocks, if we are initialising a new inode and we are not + * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new + * inode core with a random generation number. If we are keeping inodes around, + * we need to read the inode cluster to get the existing generation number off + * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode + * format) then log recovery is dependent on the di_flushiter field being + * initialised from the current on-disk value and hence we must also read the + * inode off disk. */ int xfs_iread( @@ -1054,6 +1060,7 @@ xfs_iread( /* shortcut IO on inode allocation if possible */ if ((iget_flags & XFS_IGET_CREATE) && + xfs_sb_version_hascrc(&mp->m_sb) && !(mp->m_flags & XFS_MOUNT_IKEEP)) { /* initialise the on-disk inode core */ memset(&ip->i_d, 0, sizeof(ip->i_d)); @@ -2882,12 +2889,18 @@ xfs_iflush_int( __func__, ip->i_ino, ip->i_d.di_forkoff, ip); goto corrupt_out; } + /* - * bump the flush iteration count, used to detect flushes which - * postdate a log record during recovery. This is redundant as we now - * log every change and hence this can't happen. Still, it doesn't hurt. + * Inode item log recovery for v1/v2 inodes are dependent on the + * di_flushiter count for correct sequencing. We bump the flush + * iteration count so we can detect flushes which postdate a log record + * during recovery. This is redundant as we now log every change and + * hence this can't happen but we need to still do it to ensure + * backwards compatibility with old kernels that predate logging all + * inode changes. */ - ip->i_d.di_flushiter++; + if (ip->i_d.di_version < 3) + ip->i_d.di_flushiter++; /* * Copy the dirty parts of the inode into the on-disk -- cgit v1.2.3