From 8b7128429235d9bd72cfd5ed20c77c4f3118f744 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 11 Jun 2008 16:50:36 -0400
Subject: Btrfs: Add async worker threads for pre and post IO checksumming

Btrfs has been using workqueues to spread the checksumming load across
other CPUs in the system.  But, workqueues only schedule work on the
same CPU that queued the work, giving them a limited benefit for systems with
higher CPU counts.

This code adds a generic facility to schedule work with pools of kthreads,
and changes the bio submission code to queue bios up.  The queueing is
important to make sure large numbers of procs on the system don't
turn streaming workloads into random workloads by sending IO down
concurrently.

The end result of all of this is much higher performance (and CPU usage) when
doing checksumming on large machines.  Two worker pools are created,
one for writes and one for endio processing.  The two could deadlock if
we tried to service both from a single pool.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 288 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 288 insertions(+)
 create mode 100644 fs/btrfs/async-thread.c

(limited to 'fs/btrfs/async-thread.c')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
new file mode 100644
index 000000000000..2911b67bd6f7
--- /dev/null
+++ b/fs/btrfs/async-thread.c
@@ -0,0 +1,288 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/freezer.h>
+#include "async-thread.h"
+
+/*
+ * container for the kthread task pointer and the list of pending work
+ * One of these is allocated per thread.
+ */
+struct btrfs_worker_thread {
+	/* list of struct btrfs_work that are waiting for service */
+	struct list_head pending;
+
+	/* list of worker threads from struct btrfs_workers */
+	struct list_head worker_list;
+
+	/* kthread */
+	struct task_struct *task;
+
+	/* number of things on the pending list */
+	atomic_t num_pending;
+
+	/* protects the pending list. */
+	spinlock_t lock;
+
+	/* set to non-zero when this thread is already awake and kicking */
+	int working;
+};
+
+/*
+ * main loop for servicing work items
+ */
+static int worker_loop(void *arg)
+{
+	struct btrfs_worker_thread *worker = arg;
+	struct list_head *cur;
+	struct btrfs_work *work;
+	do {
+		spin_lock_irq(&worker->lock);
+		while(!list_empty(&worker->pending)) {
+			cur = worker->pending.next;
+			work = list_entry(cur, struct btrfs_work, list);
+			list_del(&work->list);
+			clear_bit(0, &work->flags);
+
+			work->worker = worker;
+			spin_unlock_irq(&worker->lock);
+
+			work->func(work);
+
+			atomic_dec(&worker->num_pending);
+			spin_lock_irq(&worker->lock);
+		}
+		worker->working = 0;
+		if (freezing(current)) {
+			refrigerator();
+		} else {
+			set_current_state(TASK_INTERRUPTIBLE);
+			spin_unlock_irq(&worker->lock);
+			schedule();
+			__set_current_state(TASK_RUNNING);
+		}
+	} while (!kthread_should_stop());
+	return 0;
+}
+
+/*
+ * this will wait for all the worker threads to shutdown
+ */
+int btrfs_stop_workers(struct btrfs_workers *workers)
+{
+	struct list_head *cur;
+	struct btrfs_worker_thread *worker;
+
+	while(!list_empty(&workers->worker_list)) {
+		cur = workers->worker_list.next;
+		worker = list_entry(cur, struct btrfs_worker_thread,
+				    worker_list);
+		kthread_stop(worker->task);
+		list_del(&worker->worker_list);
+		kfree(worker);
+	}
+	return 0;
+}
+
+/*
+ * simple init on struct btrfs_workers
+ */
+void btrfs_init_workers(struct btrfs_workers *workers, int max)
+{
+	workers->num_workers = 0;
+	INIT_LIST_HEAD(&workers->worker_list);
+	workers->last = NULL;
+	spin_lock_init(&workers->lock);
+	workers->max_workers = max;
+}
+
+/*
+ * starts new worker threads.  This does not enforce the max worker
+ * count in case you need to temporarily go past it.
+ */
+int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+{
+	struct btrfs_worker_thread *worker;
+	int ret = 0;
+	int i;
+
+	for (i = 0; i < num_workers; i++) {
+		worker = kzalloc(sizeof(*worker), GFP_NOFS);
+		if (!worker) {
+			ret = -ENOMEM;
+			goto fail;
+		}
+
+		INIT_LIST_HEAD(&worker->pending);
+		INIT_LIST_HEAD(&worker->worker_list);
+		spin_lock_init(&worker->lock);
+		atomic_set(&worker->num_pending, 0);
+		worker->task = kthread_run(worker_loop, worker, "btrfs");
+		if (IS_ERR(worker->task)) {
+			ret = PTR_ERR(worker->task);
+			goto fail;
+		}
+
+		spin_lock_irq(&workers->lock);
+		list_add_tail(&worker->worker_list, &workers->worker_list);
+		workers->last = worker;
+		workers->num_workers++;
+		spin_unlock_irq(&workers->lock);
+	}
+	return 0;
+fail:
+	btrfs_stop_workers(workers);
+	return ret;
+}
+
+/*
+ * run through the list and find a worker thread that doesn't have a lot
+ * to do right now.  This can return null if we aren't yet at the thread
+ * count limit and all of the threads are busy.
+ */
+static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
+{
+	struct btrfs_worker_thread *worker;
+	struct list_head *next;
+	struct list_head *start;
+	int enforce_min = workers->num_workers < workers->max_workers;
+
+	/* start with the last thread if it isn't busy */
+	worker = workers->last;
+	if (atomic_read(&worker->num_pending) < 64)
+		goto done;
+
+	next = worker->worker_list.next;
+	start = &worker->worker_list;
+
+	/*
+	 * check all the workers for someone that is bored.  FIXME, do
+	 * something smart here
+	 */
+	while(next != start) {
+		if (next == &workers->worker_list) {
+			next = workers->worker_list.next;
+			continue;
+		}
+		worker = list_entry(next, struct btrfs_worker_thread,
+				    worker_list);
+		if (atomic_read(&worker->num_pending) < 64 || !enforce_min)
+			goto done;
+		next = next->next;
+	}
+	/*
+	 * nobody was bored, if we're already at the max thread count,
+	 * use the last thread
+	 */
+	if (!enforce_min || atomic_read(&workers->last->num_pending) < 64) {
+		return workers->last;
+	}
+	return NULL;
+done:
+	workers->last = worker;
+	return worker;
+}
+
+static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
+{
+	struct btrfs_worker_thread *worker;
+	unsigned long flags;
+
+again:
+	spin_lock_irqsave(&workers->lock, flags);
+	worker = next_worker(workers);
+	spin_unlock_irqrestore(&workers->lock, flags);
+
+	if (!worker) {
+		spin_lock_irqsave(&workers->lock, flags);
+		if (workers->num_workers >= workers->max_workers) {
+			/*
+			 * we have failed to find any workers, just
+			 * return the force one
+			 */
+			worker = list_entry(workers->worker_list.next,
+				  struct btrfs_worker_thread, worker_list);
+			spin_unlock_irqrestore(&workers->lock, flags);
+		} else {
+			spin_unlock_irqrestore(&workers->lock, flags);
+			/* we're below the limit, start another worker */
+			btrfs_start_workers(workers, 1);
+			goto again;
+		}
+	}
+	return worker;
+}
+
+/*
+ * btrfs_requeue_work just puts the work item back on the tail of the list
+ * it was taken from.  It is intended for use with long running work functions
+ * that make some progress and want to give the cpu up for others.
+ */
+int btrfs_requeue_work(struct btrfs_work *work)
+{
+	struct btrfs_worker_thread *worker = work->worker;
+	unsigned long flags;
+
+	if (test_and_set_bit(0, &work->flags))
+		goto out;
+
+	spin_lock_irqsave(&worker->lock, flags);
+	atomic_inc(&worker->num_pending);
+	list_add_tail(&work->list, &worker->pending);
+	spin_unlock_irqrestore(&worker->lock, flags);
+out:
+	return 0;
+}
+
+/*
+ * places a struct btrfs_work into the pending queue of one of the kthreads
+ */
+int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
+{
+	struct btrfs_worker_thread *worker;
+	unsigned long flags;
+	int wake = 0;
+
+	/* don't requeue something already on a list */
+	if (test_and_set_bit(0, &work->flags))
+		goto out;
+
+	worker = find_worker(workers);
+
+	spin_lock_irqsave(&worker->lock, flags);
+	atomic_inc(&worker->num_pending);
+	list_add_tail(&work->list, &worker->pending);
+
+	/*
+	 * avoid calling into wake_up_process if this thread has already
+	 * been kicked
+	 */
+	if (!worker->working)
+		wake = 1;
+	worker->working = 1;
+
+	spin_unlock_irqrestore(&worker->lock, flags);
+
+	if (wake)
+		wake_up_process(worker->task);
+out:
+	return 0;
+}
-- 
cgit v1.2.3


From d05e5a4dad985acc938e62d9752c813647dedd9d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 11 Jun 2008 17:09:48 -0400
Subject: Btrfs: Add backport for the kthread work on kernels older than 2.6.20

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/async-thread.c')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 2911b67bd6f7..8b9e2cf50606 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -16,10 +16,17 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/version.h>
 #include <linux/kthread.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
-#include <linux/freezer.h>
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+# include <linux/freezer.h>
+#else
+# include <linux/sched.h>
+#endif
+
 #include "async-thread.h"
 
 /*
-- 
cgit v1.2.3


From 35d8ba66294ff2a53c17337a1aa1ff6739492f41 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 11 Jun 2008 20:21:24 -0400
Subject: Btrfs: Worker thread optimizations

This changes the worker thread pool to maintain a list of idle threads,
avoiding a complex search for a good thread to wake up.

Threads have two states:

idle - we try to reuse the last thread used in hopes of improving the batching
ratios

busy - each time a new work item is added to a busy task, the task is
rotated to the end of the line.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 100 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 69 insertions(+), 31 deletions(-)

(limited to 'fs/btrfs/async-thread.c')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 8b9e2cf50606..8d4cc4679d51 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -34,6 +34,9 @@
  * One of these is allocated per thread.
  */
 struct btrfs_worker_thread {
+	/* pool we belong to */
+	struct btrfs_workers *workers;
+
 	/* list of struct btrfs_work that are waiting for service */
 	struct list_head pending;
 
@@ -51,8 +54,44 @@ struct btrfs_worker_thread {
 
 	/* set to non-zero when this thread is already awake and kicking */
 	int working;
+
+	/* are we currently idle */
+	int idle;
 };
 
+/*
+ * helper function to move a thread onto the idle list after it
+ * has finished some requests.
+ */
+static void check_idle_worker(struct btrfs_worker_thread *worker)
+{
+	if (!worker->idle && atomic_read(&worker->num_pending) <
+	    worker->workers->idle_thresh / 2) {
+		unsigned long flags;
+		spin_lock_irqsave(&worker->workers->lock, flags);
+		worker->idle = 1;
+		list_move(&worker->worker_list, &worker->workers->idle_list);
+		spin_unlock_irqrestore(&worker->workers->lock, flags);
+	}
+}
+
+/*
+ * helper function to move a thread off the idle list after new
+ * pending work is added.
+ */
+static void check_busy_worker(struct btrfs_worker_thread *worker)
+{
+	if (worker->idle && atomic_read(&worker->num_pending) >=
+	    worker->workers->idle_thresh) {
+		unsigned long flags;
+		spin_lock_irqsave(&worker->workers->lock, flags);
+		worker->idle = 0;
+		list_move_tail(&worker->worker_list,
+			       &worker->workers->worker_list);
+		spin_unlock_irqrestore(&worker->workers->lock, flags);
+	}
+}
+
 /*
  * main loop for servicing work items
  */
@@ -76,6 +115,7 @@ static int worker_loop(void *arg)
 
 			atomic_dec(&worker->num_pending);
 			spin_lock_irq(&worker->lock);
+			check_idle_worker(worker);
 		}
 		worker->working = 0;
 		if (freezing(current)) {
@@ -98,6 +138,7 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
 	struct list_head *cur;
 	struct btrfs_worker_thread *worker;
 
+	list_splice_init(&workers->idle_list, &workers->worker_list);
 	while(!list_empty(&workers->worker_list)) {
 		cur = workers->worker_list.next;
 		worker = list_entry(cur, struct btrfs_worker_thread,
@@ -116,9 +157,10 @@ void btrfs_init_workers(struct btrfs_workers *workers, int max)
 {
 	workers->num_workers = 0;
 	INIT_LIST_HEAD(&workers->worker_list);
-	workers->last = NULL;
+	INIT_LIST_HEAD(&workers->idle_list);
 	spin_lock_init(&workers->lock);
 	workers->max_workers = max;
+	workers->idle_thresh = 64;
 }
 
 /*
@@ -143,14 +185,14 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
 		spin_lock_init(&worker->lock);
 		atomic_set(&worker->num_pending, 0);
 		worker->task = kthread_run(worker_loop, worker, "btrfs");
+		worker->workers = workers;
 		if (IS_ERR(worker->task)) {
 			ret = PTR_ERR(worker->task);
 			goto fail;
 		}
 
 		spin_lock_irq(&workers->lock);
-		list_add_tail(&worker->worker_list, &workers->worker_list);
-		workers->last = worker;
+		list_add_tail(&worker->worker_list, &workers->idle_list);
 		workers->num_workers++;
 		spin_unlock_irq(&workers->lock);
 	}
@@ -169,42 +211,30 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
 {
 	struct btrfs_worker_thread *worker;
 	struct list_head *next;
-	struct list_head *start;
 	int enforce_min = workers->num_workers < workers->max_workers;
 
-	/* start with the last thread if it isn't busy */
-	worker = workers->last;
-	if (atomic_read(&worker->num_pending) < 64)
-		goto done;
-
-	next = worker->worker_list.next;
-	start = &worker->worker_list;
-
 	/*
-	 * check all the workers for someone that is bored.  FIXME, do
-	 * something smart here
+	 * if we find an idle thread, don't move it to the end of the
+	 * idle list.  This improves the chance that the next submission
+	 * will reuse the same thread, and maybe catch it while it is still
+	 * working
 	 */
-	while(next != start) {
-		if (next == &workers->worker_list) {
-			next = workers->worker_list.next;
-			continue;
-		}
+	if (!list_empty(&workers->idle_list)) {
+		next = workers->idle_list.next;
 		worker = list_entry(next, struct btrfs_worker_thread,
 				    worker_list);
-		if (atomic_read(&worker->num_pending) < 64 || !enforce_min)
-			goto done;
-		next = next->next;
+		return worker;
 	}
+	if (enforce_min || list_empty(&workers->worker_list))
+		return NULL;
+
 	/*
-	 * nobody was bored, if we're already at the max thread count,
-	 * use the last thread
+	 * if we pick a busy task, move the task to the end of the list.
+	 * hopefully this will keep things somewhat evenly balanced
 	 */
-	if (!enforce_min || atomic_read(&workers->last->num_pending) < 64) {
-		return workers->last;
-	}
-	return NULL;
-done:
-	workers->last = worker;
+	next = workers->worker_list.next;
+	worker = list_entry(next, struct btrfs_worker_thread, worker_list);
+	list_move_tail(next, &workers->worker_list);
 	return worker;
 }
 
@@ -221,11 +251,17 @@ again:
 	if (!worker) {
 		spin_lock_irqsave(&workers->lock, flags);
 		if (workers->num_workers >= workers->max_workers) {
+			struct list_head *fallback = NULL;
 			/*
 			 * we have failed to find any workers, just
 			 * return the force one
 			 */
-			worker = list_entry(workers->worker_list.next,
+			if (!list_empty(&workers->worker_list))
+				fallback = workers->worker_list.next;
+			if (!list_empty(&workers->idle_list))
+				fallback = workers->idle_list.next;
+			BUG_ON(!fallback);
+			worker = list_entry(fallback,
 				  struct btrfs_worker_thread, worker_list);
 			spin_unlock_irqrestore(&workers->lock, flags);
 		} else {
@@ -254,6 +290,7 @@ int btrfs_requeue_work(struct btrfs_work *work)
 	spin_lock_irqsave(&worker->lock, flags);
 	atomic_inc(&worker->num_pending);
 	list_add_tail(&work->list, &worker->pending);
+	check_busy_worker(worker);
 	spin_unlock_irqrestore(&worker->lock, flags);
 out:
 	return 0;
@@ -276,6 +313,7 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 
 	spin_lock_irqsave(&worker->lock, flags);
 	atomic_inc(&worker->num_pending);
+	check_busy_worker(worker);
 	list_add_tail(&work->list, &worker->pending);
 
 	/*
-- 
cgit v1.2.3


From 3bf10418675cb424724b5cb9d7725b234defe1fd Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 30 Jul 2008 09:24:37 -0400
Subject: Btrfs: async-thread: fix possible memory leak

When kthread_run() returns failure, this worker hasn't been
added to the list, so btrfs_stop_workers() won't free it.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/btrfs/async-thread.c')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 8d4cc4679d51..5fe6a0d532ed 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -187,6 +187,7 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
 		worker->task = kthread_run(worker_loop, worker, "btrfs");
 		worker->workers = workers;
 		if (IS_ERR(worker->task)) {
+			kfree(worker);
 			ret = PTR_ERR(worker->task);
 			goto fail;
 		}
-- 
cgit v1.2.3


From 61b4944018449003ac5f9757f4d125dce519cf51 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 31 Jul 2008 15:42:53 -0400
Subject: Btrfs: Fix streaming read performance with checksumming on

Large streaming reads make for large bios, which means each entry on the
list async work queues represents a large amount of data.  IO
congestion throttling on the device was kicking in before the async
worker threads decided a single thread was busy and needed some help.

The end result was that a streaming read would result in a single CPU
running at 100% instead of balancing the work off to other CPUs.

This patch also changes the pre-IO checksum lookup done by reads to
work on a per-bio basis instead of a per-page.  This results in many
extra btree lookups on large streaming reads.  Doing the checksum lookup
right before bio submit allows us to reuse searches while processing
adjacent offsets.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/async-thread.c')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 5fe6a0d532ed..bc2980c433ef 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -160,7 +160,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, int max)
 	INIT_LIST_HEAD(&workers->idle_list);
 	spin_lock_init(&workers->lock);
 	workers->max_workers = max;
-	workers->idle_thresh = 64;
+	workers->idle_thresh = 32;
 }
 
 /*
-- 
cgit v1.2.3


From 5443be45f5cb57d02fd895a0bcaf7e7d9890b1df Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 15 Aug 2008 15:34:16 -0400
Subject: Btrfs: Give all the worker threads descriptive names

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/async-thread.c')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index bc2980c433ef..5f2f5a8c2289 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -153,7 +153,7 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
 /*
  * simple init on struct btrfs_workers
  */
-void btrfs_init_workers(struct btrfs_workers *workers, int max)
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
 {
 	workers->num_workers = 0;
 	INIT_LIST_HEAD(&workers->worker_list);
@@ -161,6 +161,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, int max)
 	spin_lock_init(&workers->lock);
 	workers->max_workers = max;
 	workers->idle_thresh = 32;
+	workers->name = name;
 }
 
 /*
@@ -184,7 +185,9 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
 		INIT_LIST_HEAD(&worker->worker_list);
 		spin_lock_init(&worker->lock);
 		atomic_set(&worker->num_pending, 0);
-		worker->task = kthread_run(worker_loop, worker, "btrfs");
+		worker->task = kthread_run(worker_loop, worker,
+					   "btrfs-%s-%d", workers->name,
+					   workers->num_workers + i);
 		worker->workers = workers;
 		if (IS_ERR(worker->task)) {
 			kfree(worker);
-- 
cgit v1.2.3


From 4854ddd0ed0a687fc2d7c45a529c406232e31e7b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 15 Aug 2008 15:34:17 -0400
Subject: Btrfs: Wait for kernel threads to make progress during async
 submission

Before this change, btrfs would use a bdi congestion function to make
sure there weren't too many pending async checksum work items.

This change makes the process creating async work items wait instead,
leading to fewer congestion returns from the bdi.  This improves
pdflush background_writeout scanning.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/async-thread.c')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 5f2f5a8c2289..958cd8b5f0d7 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -48,6 +48,7 @@ struct btrfs_worker_thread {
 
 	/* number of things on the pending list */
 	atomic_t num_pending;
+	unsigned long sequence;
 
 	/* protects the pending list. */
 	spinlock_t lock;
@@ -197,6 +198,7 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
 
 		spin_lock_irq(&workers->lock);
 		list_add_tail(&worker->worker_list, &workers->idle_list);
+		worker->idle = 1;
 		workers->num_workers++;
 		spin_unlock_irq(&workers->lock);
 	}
@@ -238,7 +240,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
 	 */
 	next = workers->worker_list.next;
 	worker = list_entry(next, struct btrfs_worker_thread, worker_list);
-	list_move_tail(next, &workers->worker_list);
+	atomic_inc(&worker->num_pending);
+	worker->sequence++;
+	if (worker->sequence % 4 == 0)
+		list_move_tail(next, &workers->worker_list);
 	return worker;
 }
 
-- 
cgit v1.2.3


From 53863232ef961778aa414b700ed88a48e8e871e6 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 15 Aug 2008 15:34:18 -0400
Subject: Btrfs: Lower contention on the csum mutex

This takes the csum mutex deeper in the call chain and releases it
more often.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/async-thread.c')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 958cd8b5f0d7..2ee301740195 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -48,6 +48,7 @@ struct btrfs_worker_thread {
 
 	/* number of things on the pending list */
 	atomic_t num_pending;
+
 	unsigned long sequence;
 
 	/* protects the pending list. */
@@ -242,7 +243,7 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
 	worker = list_entry(next, struct btrfs_worker_thread, worker_list);
 	atomic_inc(&worker->num_pending);
 	worker->sequence++;
-	if (worker->sequence % 4 == 0)
+	if (worker->sequence % workers->idle_thresh == 0)
 		list_move_tail(next, &workers->worker_list);
 	return worker;
 }
-- 
cgit v1.2.3


From 2b1f55b0f0d0d1a66470ef4ea2696cd5dd741a12 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 24 Sep 2008 11:48:04 -0400
Subject: Remove Btrfs compat code for older kernels

Btrfs had compatibility code for kernels back to 2.6.18.  These have
been removed, and will be maintained in a separate backport
git tree from now on.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'fs/btrfs/async-thread.c')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 2ee301740195..4e780b279de6 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -20,13 +20,7 @@
 #include <linux/kthread.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
 # include <linux/freezer.h>
-#else
-# include <linux/sched.h>
-#endif
-
 #include "async-thread.h"
 
 /*
-- 
cgit v1.2.3


From d352ac68148b69937d39ca5d48bcc4478e118dbf Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 29 Sep 2008 15:18:18 -0400
Subject: Btrfs: add and improve comments

This improves the comments at the top of many functions.  It didn't
dive into the guts of functions because I was trying to
avoid merging problems with the new allocator and back reference work.

extent-tree.c and volumes.c were both skipped, and there is definitely
more work todo in cleaning and commenting the code.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/async-thread.c')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 4e780b279de6..04fb9702d14c 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -231,17 +231,25 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
 
 	/*
 	 * if we pick a busy task, move the task to the end of the list.
-	 * hopefully this will keep things somewhat evenly balanced
+	 * hopefully this will keep things somewhat evenly balanced.
+	 * Do the move in batches based on the sequence number.  This groups
+	 * requests submitted at roughly the same time onto the same worker.
 	 */
 	next = workers->worker_list.next;
 	worker = list_entry(next, struct btrfs_worker_thread, worker_list);
 	atomic_inc(&worker->num_pending);
 	worker->sequence++;
+
 	if (worker->sequence % workers->idle_thresh == 0)
 		list_move_tail(next, &workers->worker_list);
 	return worker;
 }
 
+/*
+ * selects a worker thread to take the next job.  This will either find
+ * an idle worker, start a new worker up to the max count, or just return
+ * one of the existing busy workers.
+ */
 static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
 {
 	struct btrfs_worker_thread *worker;
-- 
cgit v1.2.3


From 75ccf47d13bfb66de7faf596bfe497b9af7aaa40 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 30 Sep 2008 19:24:06 -0400
Subject: Btrfs: fix multi-device code to use raid policies set by mkfs

When reading in block groups, a global mask of the available raid policies
should be adjusted based on the types of block groups found on disk.  This
global mask is then used to decide which raid policy to use for new
block groups.

The recent allocator changes dropped the call that updated the global
mask, making all the block groups allocated at run time single striped
onto a single drive.

This also fixes the async worker threads to set any thread that uses
the requeue mechanism as busy.  This allows us to avoid blocking
on get_request_wait for the async bio submission threads.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/async-thread.c')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 04fb9702d14c..d82efd722a48 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -302,8 +302,20 @@ int btrfs_requeue_work(struct btrfs_work *work)
 	spin_lock_irqsave(&worker->lock, flags);
 	atomic_inc(&worker->num_pending);
 	list_add_tail(&work->list, &worker->pending);
-	check_busy_worker(worker);
+
+	/* by definition we're busy, take ourselves off the idle
+	 * list
+	 */
+	if (worker->idle) {
+		spin_lock_irqsave(&worker->workers->lock, flags);
+		worker->idle = 0;
+		list_move_tail(&worker->worker_list,
+			       &worker->workers->worker_list);
+		spin_unlock_irqrestore(&worker->workers->lock, flags);
+	}
+
 	spin_unlock_irqrestore(&worker->lock, flags);
+
 out:
 	return 0;
 }
-- 
cgit v1.2.3


From 4a69a41009c4ac691f7d9c289f5f37fabeddce46 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 6 Nov 2008 22:03:00 -0500
Subject: Btrfs: Add ordered async work queues

Btrfs uses kernel threads to create async work queues for cpu intensive
operations such as checksumming and decompression.  These work well,
but they make it difficult to keep IO order intact.

A single writepages call from pdflush or fsync will turn into a number
of bios, and each bio is checksummed in parallel.  Once the checksum is
computed, the bio is sent down to the disk, and since we don't control
the order in which the parallel operations happen, they might go down to
the disk in almost any order.

The code deals with this somewhat by having deep work queues for a single
kernel thread, making it very likely that a single thread will process all
the bios for a single inode.

This patch introduces an explicitly ordered work queue.  As work structs
are placed into the queue they are put onto the tail of a list.  They have
three callbacks:

->func (cpu intensive processing here)
->ordered_func (order sensitive processing here)
->ordered_free (free the work struct, all processing is done)

The work struct has three callbacks.  The func callback does the cpu intensive
work, and when it completes the work struct is marked as done.

Every time a work struct completes, the list is checked to see if the head
is marked as done.  If so the ordered_func callback is used to do the
order sensitive processing and the ordered_free callback is used to do
any cleanup.  Then we loop back and check the head of the list again.

This patch also changes the checksumming code to use the ordered workqueues.
One a 4 drive array, it increases streaming writes from 280MB/s to 350MB/s.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 64 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs/async-thread.c')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index d82efd722a48..e1e49715459e 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -23,6 +23,10 @@
 # include <linux/freezer.h>
 #include "async-thread.h"
 
+#define WORK_QUEUED_BIT 0
+#define WORK_DONE_BIT 1
+#define WORK_ORDER_DONE_BIT 2
+
 /*
  * container for the kthread task pointer and the list of pending work
  * One of these is allocated per thread.
@@ -88,6 +92,47 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
 	}
 }
 
+static noinline int run_ordered_completions(struct btrfs_workers *workers,
+					    struct btrfs_work *work)
+{
+	unsigned long flags;
+
+	if (!workers->ordered)
+		return 0;
+
+	set_bit(WORK_DONE_BIT, &work->flags);
+
+	spin_lock_irqsave(&workers->lock, flags);
+
+	while(!list_empty(&workers->order_list)) {
+		work = list_entry(workers->order_list.next,
+				  struct btrfs_work, order_list);
+
+		if (!test_bit(WORK_DONE_BIT, &work->flags))
+			break;
+
+		/* we are going to call the ordered done function, but
+		 * we leave the work item on the list as a barrier so
+		 * that later work items that are done don't have their
+		 * functions called before this one returns
+		 */
+		if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
+			break;
+
+		spin_unlock_irqrestore(&workers->lock, flags);
+
+		work->ordered_func(work);
+
+		/* now take the lock again and call the freeing code */
+		spin_lock_irqsave(&workers->lock, flags);
+		list_del(&work->order_list);
+		work->ordered_free(work);
+	}
+
+	spin_unlock_irqrestore(&workers->lock, flags);
+	return 0;
+}
+
 /*
  * main loop for servicing work items
  */
@@ -102,7 +147,7 @@ static int worker_loop(void *arg)
 			cur = worker->pending.next;
 			work = list_entry(cur, struct btrfs_work, list);
 			list_del(&work->list);
-			clear_bit(0, &work->flags);
+			clear_bit(WORK_QUEUED_BIT, &work->flags);
 
 			work->worker = worker;
 			spin_unlock_irq(&worker->lock);
@@ -110,8 +155,15 @@ static int worker_loop(void *arg)
 			work->func(work);
 
 			atomic_dec(&worker->num_pending);
+			/*
+			 * unless this is an ordered work queue,
+			 * 'work' was probably freed by func above.
+			 */
+			run_ordered_completions(worker->workers, work);
+
 			spin_lock_irq(&worker->lock);
 			check_idle_worker(worker);
+
 		}
 		worker->working = 0;
 		if (freezing(current)) {
@@ -154,10 +206,12 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
 	workers->num_workers = 0;
 	INIT_LIST_HEAD(&workers->worker_list);
 	INIT_LIST_HEAD(&workers->idle_list);
+	INIT_LIST_HEAD(&workers->order_list);
 	spin_lock_init(&workers->lock);
 	workers->max_workers = max;
 	workers->idle_thresh = 32;
 	workers->name = name;
+	workers->ordered = 0;
 }
 
 /*
@@ -296,7 +350,7 @@ int btrfs_requeue_work(struct btrfs_work *work)
 	struct btrfs_worker_thread *worker = work->worker;
 	unsigned long flags;
 
-	if (test_and_set_bit(0, &work->flags))
+	if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
 		goto out;
 
 	spin_lock_irqsave(&worker->lock, flags);
@@ -330,10 +384,17 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 	int wake = 0;
 
 	/* don't requeue something already on a list */
-	if (test_and_set_bit(0, &work->flags))
+	if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
 		goto out;
 
 	worker = find_worker(workers);
+	if (workers->ordered) {
+		spin_lock_irqsave(&workers->lock, flags);
+		list_add_tail(&work->order_list, &workers->order_list);
+		spin_unlock_irqrestore(&workers->lock, flags);
+	} else {
+		INIT_LIST_HEAD(&work->order_list);
+	}
 
 	spin_lock_irqsave(&worker->lock, flags);
 	atomic_inc(&worker->num_pending);
-- 
cgit v1.2.3


From 0df49b911db2b22ea808b596070b1cc65c23d148 Mon Sep 17 00:00:00 2001
From: yanhai zhu <zhu.yanhai@gmail.com>
Date: Wed, 12 Nov 2008 14:36:58 -0500
Subject: Btrfs: Check kthread_should_stop() before schedule() in worker_loop

In worker_loop(), the func should check whether it has been requested to stop
before it decides to schedule out.

Otherwise if the stop request(also the last wake_up()) sent by
btrfs_stop_workers() happens when worker_loop() running after the "while"
judgement and before schedule(), woker_loop() will schedule away and never be
woken up, which will also cause btrfs_stop_workers() wait forever.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/async-thread.c')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index e1e49715459e..4229450b7596 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -171,7 +171,8 @@ static int worker_loop(void *arg)
 		} else {
 			set_current_state(TASK_INTERRUPTIBLE);
 			spin_unlock_irq(&worker->lock);
-			schedule();
+			if (!kthread_should_stop())
+				schedule();
 			__set_current_state(TASK_RUNNING);
 		}
 	} while (!kthread_should_stop());
-- 
cgit v1.2.3


From d397712bcc6a759a560fd247e6053ecae091f958 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 5 Jan 2009 21:25:51 -0500
Subject: Btrfs: Fix checkpatch.pl warnings

There were many, most are fixed now.  struct-funcs.c generates some warnings
but these are bogus.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs/async-thread.c')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 4229450b7596..8e2fec05dbe0 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -104,7 +104,7 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
 
 	spin_lock_irqsave(&workers->lock, flags);
 
-	while(!list_empty(&workers->order_list)) {
+	while (!list_empty(&workers->order_list)) {
 		work = list_entry(workers->order_list.next,
 				  struct btrfs_work, order_list);
 
@@ -143,7 +143,7 @@ static int worker_loop(void *arg)
 	struct btrfs_work *work;
 	do {
 		spin_lock_irq(&worker->lock);
-		while(!list_empty(&worker->pending)) {
+		while (!list_empty(&worker->pending)) {
 			cur = worker->pending.next;
 			work = list_entry(cur, struct btrfs_work, list);
 			list_del(&work->list);
@@ -188,7 +188,7 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
 	struct btrfs_worker_thread *worker;
 
 	list_splice_init(&workers->idle_list, &workers->worker_list);
-	while(!list_empty(&workers->worker_list)) {
+	while (!list_empty(&workers->worker_list)) {
 		cur = workers->worker_list.next;
 		worker = list_entry(cur, struct btrfs_worker_thread,
 				    worker_list);
-- 
cgit v1.2.3