From f5a7a6b0d9b6af7d46124ed3f6b3995225cb62d0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 28 Jan 2008 23:58:27 -0500 Subject: jbd2: Fix assertion failure in fs/jbd2/checkpoint.c Before we start committing a transaction, we call __journal_clean_checkpoint_list() to cleanup transaction's written-back buffers. If this call happens to remove all of them (and there were already some buffers), __journal_remove_checkpoint() will decide to free the transaction because it isn't (yet) a committing transaction and soon we fail some assertion - the transaction really isn't ready to be freed :). We change the check in __journal_remove_checkpoint() to free only a transaction in T_FINISHED state. The locking there is subtle though (as everywhere in JBD ;(). We use j_list_lock to protect the check and a subsequent call to __journal_drop_transaction() and do the same in the end of journal_commit_transaction() which is the only place where a transaction can get to T_FINISHED state. Probably I'm too paranoid here and such locking is not really necessary - checkpoint lists are processed only from log_do_checkpoint() where a transaction must be already committed to be processed or from __journal_clean_checkpoint_list() where kjournald itself calls it and thus transaction cannot change state either. Better be safe if something changes in future... Signed-off-by: Jan Kara Cc: Signed-off-by: Andrew Morton --- fs/jbd2/commit.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 6986f334c643..39b5cee3dd8a 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -867,10 +867,10 @@ restart_loop: } spin_unlock(&journal->j_list_lock); /* - * This is a bit sleazy. We borrow j_list_lock to protect - * journal->j_committing_transaction in __jbd2_journal_remove_checkpoint. - * Really, __jbd2_journal_remove_checkpoint should be using j_state_lock but - * it's a bit hassle to hold that across __jbd2_journal_remove_checkpoint + * This is a bit sleazy. We use j_list_lock to protect transition + * of a transaction into T_FINISHED state and calling + * __jbd2_journal_drop_transaction(). Otherwise we could race with + * other checkpointing code processing the transaction... */ spin_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); -- cgit v1.2.3 From 8e85fb3f305b24b79c6d9cb7a56d22b062335ad3 Mon Sep 17 00:00:00 2001 From: Johann Lombardi Date: Mon, 28 Jan 2008 23:58:27 -0500 Subject: jbd2: jbd2 stats through procfs The patch below updates the jbd stats patch to 2.6.20/jbd2. The initial patch was posted by Alex Tomas in December 2005 (http://marc.info/?l=linux-ext4&m=113538565128617&w=2). It provides statistics via procfs such as transaction lifetime and size. Sometimes, investigating performance problems, i find useful to have stats from jbd about transaction's lifetime, size, etc. here is a patch for review and inclusion probably. for example, stats after creation of 3M files in htree directory: [root@bob ~]# cat /proc/fs/jbd/sda/history R/C tid wait run lock flush log hndls block inlog ctime write drop close R 261 8260 2720 0 0 750 9892 8170 8187 C 259 750 0 4885 1 R 262 20 2200 10 0 770 9836 8170 8187 R 263 30 2200 10 0 3070 9812 8170 8187 R 264 0 5000 10 0 1340 0 0 0 C 261 8240 3212 4957 0 R 265 8260 1470 0 0 4640 9854 8170 8187 R 266 0 5000 10 0 1460 0 0 0 C 262 8210 2989 4868 0 R 267 8230 1490 10 0 4440 9875 8171 8188 R 268 0 5000 10 0 1260 0 0 0 C 263 7710 2937 4908 0 R 269 7730 1470 10 0 3330 9841 8170 8187 R 270 0 5000 10 0 830 0 0 0 C 265 8140 3234 4898 0 C 267 720 0 4849 1 R 271 8630 2740 20 0 740 9819 8170 8187 C 269 800 0 4214 1 R 272 40 2170 10 0 830 9716 8170 8187 R 273 40 2280 0 0 3530 9799 8170 8187 R 274 0 5000 10 0 990 0 0 0 where, R - line for transaction's life from T_RUNNING to T_FINISHED C - line for transaction's checkpointing tid - transaction's id wait - for how long we were waiting for new transaction to start (the longest period journal_start() took in this transaction) run - real transaction's lifetime (from T_RUNNING to T_LOCKED lock - how long we were waiting for all handles to close (time the transaction was in T_LOCKED) flush - how long it took to flush all data (data=ordered) log - how long it took to write the transaction to the log hndls - how many handles got to the transaction block - how many blocks got to the transaction inlog - how many blocks are written to the log (block + descriptors) ctime - how long it took to checkpoint the transaction write - how many blocks have been written during checkpointing drop - how many blocks have been dropped during checkpointing close - how many running transactions have been closed to checkpoint this one all times are in msec. [root@bob ~]# cat /proc/fs/jbd/sda/info 280 transaction, each upto 8192 blocks average: 1633ms waiting for transaction 3616ms running transaction 5ms transaction was being locked 1ms flushing data (in ordered mode) 1799ms logging transaction 11781 handles per transaction 5629 blocks per transaction 5641 logged blocks per transaction Signed-off-by: Johann Lombardi Signed-off-by: Mariusz Kozlowski Signed-off-by: Mingming Cao Signed-off-by: Eric Sandeen --- fs/jbd2/commit.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 39b5cee3dd8a..8749a86f4175 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -20,6 +20,7 @@ #include #include #include +#include /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -290,6 +291,7 @@ static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag, */ void jbd2_journal_commit_transaction(journal_t *journal) { + struct transaction_stats_s stats; transaction_t *commit_transaction; struct journal_head *jh, *new_jh, *descriptor; struct buffer_head **wbuf = journal->j_wbuf; @@ -337,6 +339,11 @@ void jbd2_journal_commit_transaction(journal_t *journal) spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; + stats.u.run.rs_wait = commit_transaction->t_max_wait; + stats.u.run.rs_locked = jiffies; + stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, + stats.u.run.rs_locked); + spin_lock(&commit_transaction->t_handle_lock); while (commit_transaction->t_updates) { DEFINE_WAIT(wait); @@ -407,6 +414,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) */ jbd2_journal_switch_revoke_table(journal); + stats.u.run.rs_flushing = jiffies; + stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked, + stats.u.run.rs_flushing); + commit_transaction->t_state = T_FLUSH; journal->j_committing_transaction = commit_transaction; journal->j_running_transaction = NULL; @@ -498,6 +509,12 @@ void jbd2_journal_commit_transaction(journal_t *journal) */ commit_transaction->t_state = T_COMMIT; + stats.u.run.rs_logging = jiffies; + stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing, + stats.u.run.rs_logging); + stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits; + stats.u.run.rs_blocks_logged = 0; + descriptor = NULL; bufs = 0; while (commit_transaction->t_buffers) { @@ -646,6 +663,7 @@ start_journal_io: submit_bh(WRITE, bh); } cond_resched(); + stats.u.run.rs_blocks_logged += bufs; /* Force a new descriptor to be generated next time round the loop. */ @@ -816,6 +834,7 @@ restart_loop: cp_transaction = jh->b_cp_transaction; if (cp_transaction) { JBUFFER_TRACE(jh, "remove from old cp transaction"); + cp_transaction->t_chp_stats.cs_dropped++; __jbd2_journal_remove_checkpoint(jh); } @@ -890,6 +909,36 @@ restart_loop: J_ASSERT(commit_transaction->t_state == T_COMMIT); + commit_transaction->t_start = jiffies; + stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging, + commit_transaction->t_start); + + /* + * File the transaction for history + */ + stats.ts_type = JBD2_STATS_RUN; + stats.ts_tid = commit_transaction->t_tid; + stats.u.run.rs_handle_count = commit_transaction->t_handle_count; + spin_lock(&journal->j_history_lock); + memcpy(journal->j_history + journal->j_history_cur, &stats, + sizeof(stats)); + if (++journal->j_history_cur == journal->j_history_max) + journal->j_history_cur = 0; + + /* + * Calculate overall stats + */ + journal->j_stats.ts_tid++; + journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait; + journal->j_stats.u.run.rs_running += stats.u.run.rs_running; + journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked; + journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing; + journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging; + journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count; + journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks; + journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged; + spin_unlock(&journal->j_history_lock); + commit_transaction->t_state = T_FINISHED; J_ASSERT(commit_transaction == journal->j_committing_transaction); journal->j_commit_sequence = commit_transaction->t_tid; -- cgit v1.2.3 From 818d276ceb83aa9fdebb5e0a53188290312de987 Mon Sep 17 00:00:00 2001 From: Girish Shilamkar Date: Mon, 28 Jan 2008 23:58:27 -0500 Subject: ext4: Add the journal checksum feature The journal checksum feature adds two new flags i.e JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT and JBD2_FEATURE_COMPAT_CHECKSUM. JBD2_FEATURE_CHECKSUM flag indicates that the commit block contains the checksum for the blocks described by the descriptor blocks. Due to checksums, writing of the commit record no longer needs to be synchronous. Now commit record can be sent to disk without waiting for descriptor blocks to be written to disk. This behavior is controlled using JBD2_FEATURE_ASYNC_COMMIT flag. Older kernels/e2fsck should not be able to recover the journal with _ASYNC_COMMIT hence it is made incompat. The commit header has been extended to hold the checksum along with the type of the checksum. For recovery in pass scan checksums are verified to ensure the sanity and completeness(in case of _ASYNC_COMMIT) of every transaction. Signed-off-by: Andreas Dilger Signed-off-by: Girish Shilamkar Signed-off-by: Dave Kleikamp Signed-off-by: Mingming Cao --- fs/jbd2/commit.c | 198 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 149 insertions(+), 49 deletions(-) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 8749a86f4175..da8d0eb3b7b9 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -21,6 +21,7 @@ #include #include #include +#include /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -93,19 +94,23 @@ static int inverted_lock(journal_t *journal, struct buffer_head *bh) return 1; } -/* Done it all: now write the commit record. We should have +/* + * Done it all: now submit the commit record. We should have * cleaned up our previous buffers by now, so if we are in abort * mode we can now just skip the rest of the journal write * entirely. * * Returns 1 if the journal needs to be aborted or 0 on success */ -static int journal_write_commit_record(journal_t *journal, - transaction_t *commit_transaction) +static int journal_submit_commit_record(journal_t *journal, + transaction_t *commit_transaction, + struct buffer_head **cbh, + __u32 crc32_sum) { struct journal_head *descriptor; + struct commit_header *tmp; struct buffer_head *bh; - int i, ret; + int ret; int barrier_done = 0; if (is_journal_aborted(journal)) @@ -117,21 +122,33 @@ static int journal_write_commit_record(journal_t *journal, bh = jh2bh(descriptor); - /* AKPM: buglet - add `i' to tmp! */ - for (i = 0; i < bh->b_size; i += 512) { - journal_header_t *tmp = (journal_header_t*)bh->b_data; - tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); - tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); - tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); + tmp = (struct commit_header *)bh->b_data; + tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); + tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); + tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); + + if (JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_COMPAT_CHECKSUM)) { + tmp->h_chksum_type = JBD2_CRC32_CHKSUM; + tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; + tmp->h_chksum[0] = cpu_to_be32(crc32_sum); } - JBUFFER_TRACE(descriptor, "write commit block"); + JBUFFER_TRACE(descriptor, "submit commit block"); + lock_buffer(bh); + set_buffer_dirty(bh); - if (journal->j_flags & JBD2_BARRIER) { + set_buffer_uptodate(bh); + bh->b_end_io = journal_end_buffer_io_sync; + + if (journal->j_flags & JBD2_BARRIER && + !JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { set_buffer_ordered(bh); barrier_done = 1; } - ret = sync_dirty_buffer(bh); + ret = submit_bh(WRITE, bh); + /* is it possible for another commit to fail at roughly * the same time as this one? If so, we don't want to * trust the barrier flag in the super, but instead want @@ -152,14 +169,72 @@ static int journal_write_commit_record(journal_t *journal, clear_buffer_ordered(bh); set_buffer_uptodate(bh); set_buffer_dirty(bh); - ret = sync_dirty_buffer(bh); + ret = submit_bh(WRITE, bh); } - put_bh(bh); /* One for getblk() */ - jbd2_journal_put_journal_head(descriptor); + *cbh = bh; + return ret; +} + +/* + * This function along with journal_submit_commit_record + * allows to write the commit record asynchronously. + */ +static int journal_wait_on_commit_record(struct buffer_head *bh) +{ + int ret = 0; + + clear_buffer_dirty(bh); + wait_on_buffer(bh); - return (ret == -EIO); + if (unlikely(!buffer_uptodate(bh))) + ret = -EIO; + put_bh(bh); /* One for getblk() */ + jbd2_journal_put_journal_head(bh2jh(bh)); + + return ret; } +/* + * Wait for all submitted IO to complete. + */ +static int journal_wait_on_locked_list(journal_t *journal, + transaction_t *commit_transaction) +{ + int ret = 0; + struct journal_head *jh; + + while (commit_transaction->t_locked_list) { + struct buffer_head *bh; + + jh = commit_transaction->t_locked_list->b_tprev; + bh = jh2bh(jh); + get_bh(bh); + if (buffer_locked(bh)) { + spin_unlock(&journal->j_list_lock); + wait_on_buffer(bh); + if (unlikely(!buffer_uptodate(bh))) + ret = -EIO; + spin_lock(&journal->j_list_lock); + } + if (!inverted_lock(journal, bh)) { + put_bh(bh); + spin_lock(&journal->j_list_lock); + continue; + } + if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { + __jbd2_journal_unfile_buffer(jh); + jbd_unlock_bh_state(bh); + jbd2_journal_remove_journal_head(bh); + put_bh(bh); + } else { + jbd_unlock_bh_state(bh); + } + put_bh(bh); + cond_resched_lock(&journal->j_list_lock); + } + return ret; + } + static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) { int i; @@ -275,7 +350,21 @@ write_out_data: journal_do_submit_data(wbuf, bufs); } -static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag, +static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) +{ + struct page *page = bh->b_page; + char *addr; + __u32 checksum; + + addr = kmap_atomic(page, KM_USER0); + checksum = crc32_be(crc32_sum, + (void *)(addr + offset_in_page(bh->b_data)), bh->b_size); + kunmap_atomic(addr, KM_USER0); + + return checksum; +} + +static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, unsigned long long block) { tag->t_blocknr = cpu_to_be32(block & (u32)~0); @@ -307,6 +396,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) int tag_flag; int i; int tag_bytes = journal_tag_bytes(journal); + struct buffer_head *cbh = NULL; /* For transactional checksums */ + __u32 crc32_sum = ~0; /* * First job: lock down the current transaction and wait for @@ -451,38 +542,15 @@ void jbd2_journal_commit_transaction(journal_t *journal) journal_submit_data_buffers(journal, commit_transaction); /* - * Wait for all previously submitted IO to complete. + * Wait for all previously submitted IO to complete if commit + * record is to be written synchronously. */ spin_lock(&journal->j_list_lock); - while (commit_transaction->t_locked_list) { - struct buffer_head *bh; + if (!JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) + err = journal_wait_on_locked_list(journal, + commit_transaction); - jh = commit_transaction->t_locked_list->b_tprev; - bh = jh2bh(jh); - get_bh(bh); - if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); - wait_on_buffer(bh); - if (unlikely(!buffer_uptodate(bh))) - err = -EIO; - spin_lock(&journal->j_list_lock); - } - if (!inverted_lock(journal, bh)) { - put_bh(bh); - spin_lock(&journal->j_list_lock); - continue; - } - if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { - __jbd2_journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - jbd2_journal_remove_journal_head(bh); - put_bh(bh); - } else { - jbd_unlock_bh_state(bh); - } - put_bh(bh); - cond_resched_lock(&journal->j_list_lock); - } spin_unlock(&journal->j_list_lock); if (err) @@ -656,6 +724,15 @@ void jbd2_journal_commit_transaction(journal_t *journal) start_journal_io: for (i = 0; i < bufs; i++) { struct buffer_head *bh = wbuf[i]; + /* + * Compute checksum. + */ + if (JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_COMPAT_CHECKSUM)) { + crc32_sum = + jbd2_checksum_data(crc32_sum, bh); + } + lock_buffer(bh); clear_buffer_dirty(bh); set_buffer_uptodate(bh); @@ -672,6 +749,23 @@ start_journal_io: } } + /* Done it all: now write the commit record asynchronously. */ + + if (JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + err = journal_submit_commit_record(journal, commit_transaction, + &cbh, crc32_sum); + if (err) + __jbd2_journal_abort_hard(journal); + + spin_lock(&journal->j_list_lock); + err = journal_wait_on_locked_list(journal, + commit_transaction); + spin_unlock(&journal->j_list_lock); + if (err) + __jbd2_journal_abort_hard(journal); + } + /* Lo and behold: we have just managed to send a transaction to the log. Before we can commit it, wait for the IO so far to complete. Control buffers being written are on the @@ -771,8 +865,14 @@ wait_for_iobuf: jbd_debug(3, "JBD: commit phase 6\n"); - if (journal_write_commit_record(journal, commit_transaction)) - err = -EIO; + if (!JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + err = journal_submit_commit_record(journal, commit_transaction, + &cbh, crc32_sum); + if (err) + __jbd2_journal_abort_hard(journal); + } + err = journal_wait_on_commit_record(cbh); if (err) jbd2_journal_abort(journal, err); -- cgit v1.2.3 From 95c354fe9f7d6decc08a92aa26eb233ecc2155bf Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 30 Jan 2008 13:31:20 +0100 Subject: spinlock: lockbreak cleanup The break_lock data structure and code for spinlocks is quite nasty. Not only does it double the size of a spinlock but it changes locking to a potentially less optimal trylock. Put all of that under CONFIG_GENERIC_LOCKBREAK, and introduce a __raw_spin_is_contended that uses the lock data itself to determine whether there are waiters on the lock, to be used if CONFIG_GENERIC_LOCKBREAK is not set. Rename need_lockbreak to spin_needbreak, make it use spin_is_contended to decouple it from the spinlock implementation, and make it typesafe (rwlocks do not have any need_lockbreak sites -- why do they even get bloated up with that break_lock then?). Signed-off-by: Nick Piggin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- fs/jbd2/commit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index da8d0eb3b7b9..4f302d279279 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -341,7 +341,7 @@ write_out_data: put_bh(bh); } - if (lock_need_resched(&journal->j_list_lock)) { + if (need_resched() || spin_needbreak(&journal->j_list_lock)) { spin_unlock(&journal->j_list_lock); goto write_out_data; } -- cgit v1.2.3