From 8b7128429235d9bd72cfd5ed20c77c4f3118f744 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 11 Jun 2008 16:50:36 -0400 Subject: Btrfs: Add async worker threads for pre and post IO checksumming Btrfs has been using workqueues to spread the checksumming load across other CPUs in the system. But, workqueues only schedule work on the same CPU that queued the work, giving them a limited benefit for systems with higher CPU counts. This code adds a generic facility to schedule work with pools of kthreads, and changes the bio submission code to queue bios up. The queueing is important to make sure large numbers of procs on the system don't turn streaming workloads into random workloads by sending IO down concurrently. The end result of all of this is much higher performance (and CPU usage) when doing checksumming on large machines. Two worker pools are created, one for writes and one for endio processing. The two could deadlock if we tried to service both from a single pool. Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 288 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 288 insertions(+) create mode 100644 fs/btrfs/async-thread.c (limited to 'fs/btrfs/async-thread.c') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c new file mode 100644 index 000000000000..2911b67bd6f7 --- /dev/null +++ b/fs/btrfs/async-thread.c @@ -0,0 +1,288 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include "async-thread.h" + +/* + * container for the kthread task pointer and the list of pending work + * One of these is allocated per thread. + */ +struct btrfs_worker_thread { + /* list of struct btrfs_work that are waiting for service */ + struct list_head pending; + + /* list of worker threads from struct btrfs_workers */ + struct list_head worker_list; + + /* kthread */ + struct task_struct *task; + + /* number of things on the pending list */ + atomic_t num_pending; + + /* protects the pending list. */ + spinlock_t lock; + + /* set to non-zero when this thread is already awake and kicking */ + int working; +}; + +/* + * main loop for servicing work items + */ +static int worker_loop(void *arg) +{ + struct btrfs_worker_thread *worker = arg; + struct list_head *cur; + struct btrfs_work *work; + do { + spin_lock_irq(&worker->lock); + while(!list_empty(&worker->pending)) { + cur = worker->pending.next; + work = list_entry(cur, struct btrfs_work, list); + list_del(&work->list); + clear_bit(0, &work->flags); + + work->worker = worker; + spin_unlock_irq(&worker->lock); + + work->func(work); + + atomic_dec(&worker->num_pending); + spin_lock_irq(&worker->lock); + } + worker->working = 0; + if (freezing(current)) { + refrigerator(); + } else { + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&worker->lock); + schedule(); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +/* + * this will wait for all the worker threads to shutdown + */ +int btrfs_stop_workers(struct btrfs_workers *workers) +{ + struct list_head *cur; + struct btrfs_worker_thread *worker; + + while(!list_empty(&workers->worker_list)) { + cur = workers->worker_list.next; + worker = 
list_entry(cur, struct btrfs_worker_thread, + worker_list); + kthread_stop(worker->task); + list_del(&worker->worker_list); + kfree(worker); + } + return 0; +} + +/* + * simple init on struct btrfs_workers + */ +void btrfs_init_workers(struct btrfs_workers *workers, int max) +{ + workers->num_workers = 0; + INIT_LIST_HEAD(&workers->worker_list); + workers->last = NULL; + spin_lock_init(&workers->lock); + workers->max_workers = max; +} + +/* + * starts new worker threads. This does not enforce the max worker + * count in case you need to temporarily go past it. + */ +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +{ + struct btrfs_worker_thread *worker; + int ret = 0; + int i; + + for (i = 0; i < num_workers; i++) { + worker = kzalloc(sizeof(*worker), GFP_NOFS); + if (!worker) { + ret = -ENOMEM; + goto fail; + } + + INIT_LIST_HEAD(&worker->pending); + INIT_LIST_HEAD(&worker->worker_list); + spin_lock_init(&worker->lock); + atomic_set(&worker->num_pending, 0); + worker->task = kthread_run(worker_loop, worker, "btrfs"); + if (IS_ERR(worker->task)) { + ret = PTR_ERR(worker->task); + goto fail; + } + + spin_lock_irq(&workers->lock); + list_add_tail(&worker->worker_list, &workers->worker_list); + workers->last = worker; + workers->num_workers++; + spin_unlock_irq(&workers->lock); + } + return 0; +fail: + btrfs_stop_workers(workers); + return ret; +} + +/* + * run through the list and find a worker thread that doesn't have a lot + * to do right now. This can return null if we aren't yet at the thread + * count limit and all of the threads are busy. 
+ */ +static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) +{ + struct btrfs_worker_thread *worker; + struct list_head *next; + struct list_head *start; + int enforce_min = workers->num_workers < workers->max_workers; + + /* start with the last thread if it isn't busy */ + worker = workers->last; + if (atomic_read(&worker->num_pending) < 64) + goto done; + + next = worker->worker_list.next; + start = &worker->worker_list; + + /* + * check all the workers for someone that is bored. FIXME, do + * something smart here + */ + while(next != start) { + if (next == &workers->worker_list) { + next = workers->worker_list.next; + continue; + } + worker = list_entry(next, struct btrfs_worker_thread, + worker_list); + if (atomic_read(&worker->num_pending) < 64 || !enforce_min) + goto done; + next = next->next; + } + /* + * nobody was bored, if we're already at the max thread count, + * use the last thread + */ + if (!enforce_min || atomic_read(&workers->last->num_pending) < 64) { + return workers->last; + } + return NULL; +done: + workers->last = worker; + return worker; +} + +static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) +{ + struct btrfs_worker_thread *worker; + unsigned long flags; + +again: + spin_lock_irqsave(&workers->lock, flags); + worker = next_worker(workers); + spin_unlock_irqrestore(&workers->lock, flags); + + if (!worker) { + spin_lock_irqsave(&workers->lock, flags); + if (workers->num_workers >= workers->max_workers) { + /* + * we have failed to find any workers, just + * return the force one + */ + worker = list_entry(workers->worker_list.next, + struct btrfs_worker_thread, worker_list); + spin_unlock_irqrestore(&workers->lock, flags); + } else { + spin_unlock_irqrestore(&workers->lock, flags); + /* we're below the limit, start another worker */ + btrfs_start_workers(workers, 1); + goto again; + } + } + return worker; +} + +/* + * btrfs_requeue_work just puts the work item back on the tail of the list + * 
it was taken from. It is intended for use with long running work functions + * that make some progress and want to give the cpu up for others. + */ +int btrfs_requeue_work(struct btrfs_work *work) +{ + struct btrfs_worker_thread *worker = work->worker; + unsigned long flags; + + if (test_and_set_bit(0, &work->flags)) + goto out; + + spin_lock_irqsave(&worker->lock, flags); + atomic_inc(&worker->num_pending); + list_add_tail(&work->list, &worker->pending); + spin_unlock_irqrestore(&worker->lock, flags); +out: + return 0; +} + +/* + * places a struct btrfs_work into the pending queue of one of the kthreads + */ +int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) +{ + struct btrfs_worker_thread *worker; + unsigned long flags; + int wake = 0; + + /* don't requeue something already on a list */ + if (test_and_set_bit(0, &work->flags)) + goto out; + + worker = find_worker(workers); + + spin_lock_irqsave(&worker->lock, flags); + atomic_inc(&worker->num_pending); + list_add_tail(&work->list, &worker->pending); + + /* + * avoid calling into wake_up_process if this thread has already + * been kicked + */ + if (!worker->working) + wake = 1; + worker->working = 1; + + spin_unlock_irqrestore(&worker->lock, flags); + + if (wake) + wake_up_process(worker->task); +out: + return 0; +} -- cgit v1.2.3 From d05e5a4dad985acc938e62d9752c813647dedd9d Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 11 Jun 2008 17:09:48 -0400 Subject: Btrfs: Add backport for the kthread work on kernels older than 2.6.20 Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/async-thread.c') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 2911b67bd6f7..8b9e2cf50606 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -16,10 +16,17 @@ * Boston, MA 021110-1307, USA. 
*/ +#include #include #include #include -#include + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20) +# include +#else +# include +#endif + #include "async-thread.h" /* -- cgit v1.2.3 From 35d8ba66294ff2a53c17337a1aa1ff6739492f41 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 11 Jun 2008 20:21:24 -0400 Subject: Btrfs: Worker thread optimizations This changes the worker thread pool to maintain a list of idle threads, avoiding a complex search for a good thread to wake up. Threads have two states: idle - we try to reuse the last thread used in hopes of improving the batching ratios busy - each time a new work item is added to a busy task, the task is rotated to the end of the line. Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 100 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 69 insertions(+), 31 deletions(-) (limited to 'fs/btrfs/async-thread.c') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 8b9e2cf50606..8d4cc4679d51 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -34,6 +34,9 @@ * One of these is allocated per thread. */ struct btrfs_worker_thread { + /* pool we belong to */ + struct btrfs_workers *workers; + /* list of struct btrfs_work that are waiting for service */ struct list_head pending; @@ -51,8 +54,44 @@ struct btrfs_worker_thread { /* set to non-zero when this thread is already awake and kicking */ int working; + + /* are we currently idle */ + int idle; }; +/* + * helper function to move a thread onto the idle list after it + * has finished some requests. 
+ */ +static void check_idle_worker(struct btrfs_worker_thread *worker) +{ + if (!worker->idle && atomic_read(&worker->num_pending) < + worker->workers->idle_thresh / 2) { + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 1; + list_move(&worker->worker_list, &worker->workers->idle_list); + spin_unlock_irqrestore(&worker->workers->lock, flags); + } +} + +/* + * helper function to move a thread off the idle list after new + * pending work is added. + */ +static void check_busy_worker(struct btrfs_worker_thread *worker) +{ + if (worker->idle && atomic_read(&worker->num_pending) >= + worker->workers->idle_thresh) { + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 0; + list_move_tail(&worker->worker_list, + &worker->workers->worker_list); + spin_unlock_irqrestore(&worker->workers->lock, flags); + } +} + /* * main loop for servicing work items */ @@ -76,6 +115,7 @@ static int worker_loop(void *arg) atomic_dec(&worker->num_pending); spin_lock_irq(&worker->lock); + check_idle_worker(worker); } worker->working = 0; if (freezing(current)) { @@ -98,6 +138,7 @@ int btrfs_stop_workers(struct btrfs_workers *workers) struct list_head *cur; struct btrfs_worker_thread *worker; + list_splice_init(&workers->idle_list, &workers->worker_list); while(!list_empty(&workers->worker_list)) { cur = workers->worker_list.next; worker = list_entry(cur, struct btrfs_worker_thread, @@ -116,9 +157,10 @@ void btrfs_init_workers(struct btrfs_workers *workers, int max) { workers->num_workers = 0; INIT_LIST_HEAD(&workers->worker_list); - workers->last = NULL; + INIT_LIST_HEAD(&workers->idle_list); spin_lock_init(&workers->lock); workers->max_workers = max; + workers->idle_thresh = 64; } /* @@ -143,14 +185,14 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) spin_lock_init(&worker->lock); atomic_set(&worker->num_pending, 0); worker->task = kthread_run(worker_loop, worker, "btrfs"); + 
worker->workers = workers; if (IS_ERR(worker->task)) { ret = PTR_ERR(worker->task); goto fail; } spin_lock_irq(&workers->lock); - list_add_tail(&worker->worker_list, &workers->worker_list); - workers->last = worker; + list_add_tail(&worker->worker_list, &workers->idle_list); workers->num_workers++; spin_unlock_irq(&workers->lock); } @@ -169,42 +211,30 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) { struct btrfs_worker_thread *worker; struct list_head *next; - struct list_head *start; int enforce_min = workers->num_workers < workers->max_workers; - /* start with the last thread if it isn't busy */ - worker = workers->last; - if (atomic_read(&worker->num_pending) < 64) - goto done; - - next = worker->worker_list.next; - start = &worker->worker_list; - /* - * check all the workers for someone that is bored. FIXME, do - * something smart here + * if we find an idle thread, don't move it to the end of the + * idle list. This improves the chance that the next submission + * will reuse the same thread, and maybe catch it while it is still + * working */ - while(next != start) { - if (next == &workers->worker_list) { - next = workers->worker_list.next; - continue; - } + if (!list_empty(&workers->idle_list)) { + next = workers->idle_list.next; worker = list_entry(next, struct btrfs_worker_thread, worker_list); - if (atomic_read(&worker->num_pending) < 64 || !enforce_min) - goto done; - next = next->next; + return worker; } + if (enforce_min || list_empty(&workers->worker_list)) + return NULL; + /* - * nobody was bored, if we're already at the max thread count, - * use the last thread + * if we pick a busy task, move the task to the end of the list. 
+ * hopefully this will keep things somewhat evenly balanced */ - if (!enforce_min || atomic_read(&workers->last->num_pending) < 64) { - return workers->last; - } - return NULL; -done: - workers->last = worker; + next = workers->worker_list.next; + worker = list_entry(next, struct btrfs_worker_thread, worker_list); + list_move_tail(next, &workers->worker_list); return worker; } @@ -221,11 +251,17 @@ again: if (!worker) { spin_lock_irqsave(&workers->lock, flags); if (workers->num_workers >= workers->max_workers) { + struct list_head *fallback = NULL; /* * we have failed to find any workers, just * return the force one */ - worker = list_entry(workers->worker_list.next, + if (!list_empty(&workers->worker_list)) + fallback = workers->worker_list.next; + if (!list_empty(&workers->idle_list)) + fallback = workers->idle_list.next; + BUG_ON(!fallback); + worker = list_entry(fallback, struct btrfs_worker_thread, worker_list); spin_unlock_irqrestore(&workers->lock, flags); } else { @@ -254,6 +290,7 @@ int btrfs_requeue_work(struct btrfs_work *work) spin_lock_irqsave(&worker->lock, flags); atomic_inc(&worker->num_pending); list_add_tail(&work->list, &worker->pending); + check_busy_worker(worker); spin_unlock_irqrestore(&worker->lock, flags); out: return 0; @@ -276,6 +313,7 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) spin_lock_irqsave(&worker->lock, flags); atomic_inc(&worker->num_pending); + check_busy_worker(worker); list_add_tail(&work->list, &worker->pending); /* -- cgit v1.2.3 From 3bf10418675cb424724b5cb9d7725b234defe1fd Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 30 Jul 2008 09:24:37 -0400 Subject: Btrfs: async-thread: fix possible memory leak When kthread_run() returns failure, this worker hasn't been added to the list, so btrfs_stop_workers() won't free it. 
Signed-off-by: Li Zefan Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/async-thread.c') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 8d4cc4679d51..5fe6a0d532ed 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -187,6 +187,7 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) worker->task = kthread_run(worker_loop, worker, "btrfs"); worker->workers = workers; if (IS_ERR(worker->task)) { + kfree(worker); ret = PTR_ERR(worker->task); goto fail; } -- cgit v1.2.3 From 61b4944018449003ac5f9757f4d125dce519cf51 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 31 Jul 2008 15:42:53 -0400 Subject: Btrfs: Fix streaming read performance with checksumming on Large streaming reads make for large bios, which means each entry on the list async work queues represents a large amount of data. IO congestion throttling on the device was kicking in before the async worker threads decided a single thread was busy and needed some help. The end result was that a streaming read would result in a single CPU running at 100% instead of balancing the work off to other CPUs. This patch also changes the pre-IO checksum lookup done by reads to work on a per-bio basis instead of a per-page basis. The per-page lookup resulted in many extra btree lookups on large streaming reads. Doing the checksum lookup right before bio submit allows us to reuse searches while processing adjacent offsets. 
Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/async-thread.c') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 5fe6a0d532ed..bc2980c433ef 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -160,7 +160,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, int max) INIT_LIST_HEAD(&workers->idle_list); spin_lock_init(&workers->lock); workers->max_workers = max; - workers->idle_thresh = 64; + workers->idle_thresh = 32; } /* -- cgit v1.2.3 From 5443be45f5cb57d02fd895a0bcaf7e7d9890b1df Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 15 Aug 2008 15:34:16 -0400 Subject: Btrfs: Give all the worker threads descriptive names Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/async-thread.c') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index bc2980c433ef..5f2f5a8c2289 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -153,7 +153,7 @@ int btrfs_stop_workers(struct btrfs_workers *workers) /* * simple init on struct btrfs_workers */ -void btrfs_init_workers(struct btrfs_workers *workers, int max) +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) { workers->num_workers = 0; INIT_LIST_HEAD(&workers->worker_list); @@ -161,6 +161,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, int max) spin_lock_init(&workers->lock); workers->max_workers = max; workers->idle_thresh = 32; + workers->name = name; } /* @@ -184,7 +185,9 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) INIT_LIST_HEAD(&worker->worker_list); spin_lock_init(&worker->lock); atomic_set(&worker->num_pending, 0); - worker->task = kthread_run(worker_loop, worker, "btrfs"); + worker->task = kthread_run(worker_loop, worker, + "btrfs-%s-%d", workers->name, + workers->num_workers + i); worker->workers = workers; 
if (IS_ERR(worker->task)) { kfree(worker); -- cgit v1.2.3 From 4854ddd0ed0a687fc2d7c45a529c406232e31e7b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 15 Aug 2008 15:34:17 -0400 Subject: Btrfs: Wait for kernel threads to make progress during async submission Before this change, btrfs would use a bdi congestion function to make sure there weren't too many pending async checksum work items. This change makes the process creating async work items wait instead, leading to fewer congestion returns from the bdi. This improves pdflush background_writeout scanning. Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/async-thread.c') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 5f2f5a8c2289..958cd8b5f0d7 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -48,6 +48,7 @@ struct btrfs_worker_thread { /* number of things on the pending list */ atomic_t num_pending; + unsigned long sequence; /* protects the pending list. 
*/ spinlock_t lock; @@ -197,6 +198,7 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) spin_lock_irq(&workers->lock); list_add_tail(&worker->worker_list, &workers->idle_list); + worker->idle = 1; workers->num_workers++; spin_unlock_irq(&workers->lock); } @@ -238,7 +240,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) */ next = workers->worker_list.next; worker = list_entry(next, struct btrfs_worker_thread, worker_list); - list_move_tail(next, &workers->worker_list); + atomic_inc(&worker->num_pending); + worker->sequence++; + if (worker->sequence % 4 == 0) + list_move_tail(next, &workers->worker_list); return worker; } -- cgit v1.2.3 From 53863232ef961778aa414b700ed88a48e8e871e6 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 15 Aug 2008 15:34:18 -0400 Subject: Btrfs: Lower contention on the csum mutex This takes the csum mutex deeper in the call chain and releases it more often. Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/async-thread.c') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 958cd8b5f0d7..2ee301740195 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -48,6 +48,7 @@ struct btrfs_worker_thread { /* number of things on the pending list */ atomic_t num_pending; + unsigned long sequence; /* protects the pending list. 
*/ @@ -242,7 +243,7 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) worker = list_entry(next, struct btrfs_worker_thread, worker_list); atomic_inc(&worker->num_pending); worker->sequence++; - if (worker->sequence % 4 == 0) + if (worker->sequence % workers->idle_thresh == 0) list_move_tail(next, &workers->worker_list); return worker; } -- cgit v1.2.3 From 2b1f55b0f0d0d1a66470ef4ea2696cd5dd741a12 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 24 Sep 2008 11:48:04 -0400 Subject: Remove Btrfs compat code for older kernels Btrfs had compatibility code for kernels back to 2.6.18. These have been removed, and will be maintained in a separate backport git tree from now on. Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs/btrfs/async-thread.c') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 2ee301740195..4e780b279de6 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -20,13 +20,7 @@ #include #include #include - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20) # include -#else -# include -#endif - #include "async-thread.h" /* -- cgit v1.2.3 From d352ac68148b69937d39ca5d48bcc4478e118dbf Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 29 Sep 2008 15:18:18 -0400 Subject: Btrfs: add and improve comments This improves the comments at the top of many functions. It didn't dive into the guts of functions because I was trying to avoid merging problems with the new allocator and back reference work. extent-tree.c and volumes.c were both skipped, and there is definitely more work todo in cleaning and commenting the code. 
Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/async-thread.c') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 4e780b279de6..04fb9702d14c 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -231,17 +231,25 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) /* * if we pick a busy task, move the task to the end of the list. - * hopefully this will keep things somewhat evenly balanced + * hopefully this will keep things somewhat evenly balanced. + * Do the move in batches based on the sequence number. This groups + * requests submitted at roughly the same time onto the same worker. */ next = workers->worker_list.next; worker = list_entry(next, struct btrfs_worker_thread, worker_list); atomic_inc(&worker->num_pending); worker->sequence++; + if (worker->sequence % workers->idle_thresh == 0) list_move_tail(next, &workers->worker_list); return worker; } +/* + * selects a worker thread to take the next job. This will either find + * an idle worker, start a new worker up to the max count, or just return + * one of the existing busy workers. + */ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) { struct btrfs_worker_thread *worker; -- cgit v1.2.3 From 75ccf47d13bfb66de7faf596bfe497b9af7aaa40 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 30 Sep 2008 19:24:06 -0400 Subject: Btrfs: fix multi-device code to use raid policies set by mkfs When reading in block groups, a global mask of the available raid policies should be adjusted based on the types of block groups found on disk. This global mask is then used to decide which raid policy to use for new block groups. The recent allocator changes dropped the call that updated the global mask, making all the block groups allocated at run time single striped onto a single drive. 
This also fixes the async worker threads to set any thread that uses the requeue mechanism as busy. This allows us to avoid blocking on get_request_wait for the async bio submission threads. Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/async-thread.c') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 04fb9702d14c..d82efd722a48 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -302,8 +302,20 @@ int btrfs_requeue_work(struct btrfs_work *work) spin_lock_irqsave(&worker->lock, flags); atomic_inc(&worker->num_pending); list_add_tail(&work->list, &worker->pending); - check_busy_worker(worker); + + /* by definition we're busy, take ourselves off the idle + * list + */ + if (worker->idle) { + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 0; + list_move_tail(&worker->worker_list, + &worker->workers->worker_list); + spin_unlock_irqrestore(&worker->workers->lock, flags); + } + spin_unlock_irqrestore(&worker->lock, flags); + out: return 0; } -- cgit v1.2.3 From 4a69a41009c4ac691f7d9c289f5f37fabeddce46 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 6 Nov 2008 22:03:00 -0500 Subject: Btrfs: Add ordered async work queues Btrfs uses kernel threads to create async work queues for cpu intensive operations such as checksumming and decompression. These work well, but they make it difficult to keep IO order intact. A single writepages call from pdflush or fsync will turn into a number of bios, and each bio is checksummed in parallel. Once the checksum is computed, the bio is sent down to the disk, and since we don't control the order in which the parallel operations happen, they might go down to the disk in almost any order. The code deals with this somewhat by having deep work queues for a single kernel thread, making it very likely that a single thread will process all the bios for a single inode. 
This patch introduces an explicitly ordered work queue. As work structs are placed into the queue they are put onto the tail of a list. They have three callbacks: ->func (cpu intensive processing here) ->ordered_func (order sensitive processing here) ->ordered_free (free the work struct, all processing is done) The work struct has three callbacks. The func callback does the cpu intensive work, and when it completes the work struct is marked as done. Every time a work struct completes, the list is checked to see if the head is marked as done. If so the ordered_func callback is used to do the order sensitive processing and the ordered_free callback is used to do any cleanup. Then we loop back and check the head of the list again. This patch also changes the checksumming code to use the ordered workqueues. On a 4 drive array, it increases streaming writes from 280MB/s to 350MB/s. Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 64 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/async-thread.c') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index d82efd722a48..e1e49715459e 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -23,6 +23,10 @@ # include #include "async-thread.h" +#define WORK_QUEUED_BIT 0 +#define WORK_DONE_BIT 1 +#define WORK_ORDER_DONE_BIT 2 + /* * container for the kthread task pointer and the list of pending work * One of these is allocated per thread. 
@@ -88,6 +92,47 @@ static void check_busy_worker(struct btrfs_worker_thread *worker) } } +static noinline int run_ordered_completions(struct btrfs_workers *workers, + struct btrfs_work *work) +{ + unsigned long flags; + + if (!workers->ordered) + return 0; + + set_bit(WORK_DONE_BIT, &work->flags); + + spin_lock_irqsave(&workers->lock, flags); + + while(!list_empty(&workers->order_list)) { + work = list_entry(workers->order_list.next, + struct btrfs_work, order_list); + + if (!test_bit(WORK_DONE_BIT, &work->flags)) + break; + + /* we are going to call the ordered done function, but + * we leave the work item on the list as a barrier so + * that later work items that are done don't have their + * functions called before this one returns + */ + if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) + break; + + spin_unlock_irqrestore(&workers->lock, flags); + + work->ordered_func(work); + + /* now take the lock again and call the freeing code */ + spin_lock_irqsave(&workers->lock, flags); + list_del(&work->order_list); + work->ordered_free(work); + } + + spin_unlock_irqrestore(&workers->lock, flags); + return 0; +} + /* * main loop for servicing work items */ @@ -102,7 +147,7 @@ static int worker_loop(void *arg) cur = worker->pending.next; work = list_entry(cur, struct btrfs_work, list); list_del(&work->list); - clear_bit(0, &work->flags); + clear_bit(WORK_QUEUED_BIT, &work->flags); work->worker = worker; spin_unlock_irq(&worker->lock); @@ -110,8 +155,15 @@ static int worker_loop(void *arg) work->func(work); atomic_dec(&worker->num_pending); + /* + * unless this is an ordered work queue, + * 'work' was probably freed by func above. 
+ */ + run_ordered_completions(worker->workers, work); + spin_lock_irq(&worker->lock); check_idle_worker(worker); + } worker->working = 0; if (freezing(current)) { @@ -154,10 +206,12 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) workers->num_workers = 0; INIT_LIST_HEAD(&workers->worker_list); INIT_LIST_HEAD(&workers->idle_list); + INIT_LIST_HEAD(&workers->order_list); spin_lock_init(&workers->lock); workers->max_workers = max; workers->idle_thresh = 32; workers->name = name; + workers->ordered = 0; } /* @@ -296,7 +350,7 @@ int btrfs_requeue_work(struct btrfs_work *work) struct btrfs_worker_thread *worker = work->worker; unsigned long flags; - if (test_and_set_bit(0, &work->flags)) + if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) goto out; spin_lock_irqsave(&worker->lock, flags); @@ -330,10 +384,17 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) int wake = 0; /* don't requeue something already on a list */ - if (test_and_set_bit(0, &work->flags)) + if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) goto out; worker = find_worker(workers); + if (workers->ordered) { + spin_lock_irqsave(&workers->lock, flags); + list_add_tail(&work->order_list, &workers->order_list); + spin_unlock_irqrestore(&workers->lock, flags); + } else { + INIT_LIST_HEAD(&work->order_list); + } spin_lock_irqsave(&worker->lock, flags); atomic_inc(&worker->num_pending); -- cgit v1.2.3 From 0df49b911db2b22ea808b596070b1cc65c23d148 Mon Sep 17 00:00:00 2001 From: yanhai zhu Date: Wed, 12 Nov 2008 14:36:58 -0500 Subject: Btrfs: Check kthread_should_stop() before schedule() in worker_loop In worker_loop(), the func should check whether it has been requested to stop before it decides to schedule out. 
Otherwise, if the stop request (also the last wake_up()) sent by btrfs_stop_workers() arrives while worker_loop() is running after the "while" check and before schedule(), worker_loop() will schedule away and never be woken up, which will also cause btrfs_stop_workers() to wait forever. Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/async-thread.c') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index e1e49715459e..4229450b7596 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -171,7 +171,8 @@ static int worker_loop(void *arg) } else { set_current_state(TASK_INTERRUPTIBLE); spin_unlock_irq(&worker->lock); - schedule(); + if (!kthread_should_stop()) + schedule(); __set_current_state(TASK_RUNNING); } } while (!kthread_should_stop()); -- cgit v1.2.3 From d397712bcc6a759a560fd247e6053ecae091f958 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 5 Jan 2009 21:25:51 -0500 Subject: Btrfs: Fix checkpatch.pl warnings There were many, most are fixed now. struct-funcs.c generates some warnings but these are bogus. 
Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/async-thread.c') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 4229450b7596..8e2fec05dbe0 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -104,7 +104,7 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers, spin_lock_irqsave(&workers->lock, flags); - while(!list_empty(&workers->order_list)) { + while (!list_empty(&workers->order_list)) { work = list_entry(workers->order_list.next, struct btrfs_work, order_list); @@ -143,7 +143,7 @@ static int worker_loop(void *arg) struct btrfs_work *work; do { spin_lock_irq(&worker->lock); - while(!list_empty(&worker->pending)) { + while (!list_empty(&worker->pending)) { cur = worker->pending.next; work = list_entry(cur, struct btrfs_work, list); list_del(&work->list); @@ -188,7 +188,7 @@ int btrfs_stop_workers(struct btrfs_workers *workers) struct btrfs_worker_thread *worker; list_splice_init(&workers->idle_list, &workers->worker_list); - while(!list_empty(&workers->worker_list)) { + while (!list_empty(&workers->worker_list)) { cur = workers->worker_list.next; worker = list_entry(cur, struct btrfs_worker_thread, worker_list); -- cgit v1.2.3