From 66e188fc3173eaeb32d3479758685e34889aff14 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 21 Jan 2014 15:48:22 -0800 Subject: ocfs2: add DLM recovery callbacks These are the callbacks called by the fs/dlm code in case the membership changes. If there is a failure while/during calling any of these, the DLM creates a new membership and relays to the rest of the nodes. - recover_prep() is called when DLM understands a node is down. - recover_slot() is called once all nodes have acknowledged recover_prep and recovery can begin. - recover_done() is called once the recovery is complete. It returns the new membership. Signed-off-by: Goldwyn Rodrigues Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stack_user.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'fs/ocfs2/stack_user.c') diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 286edf1e231f..4111855a4def 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -110,6 +110,8 @@ struct ocfs2_live_connection { struct list_head oc_list; struct ocfs2_cluster_connection *oc_conn; + atomic_t oc_this_node; + int oc_our_slot; }; struct ocfs2_control_private { @@ -799,6 +801,42 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing, return 0; } +static void user_recover_prep(void *arg) +{ +} + +static void user_recover_slot(void *arg, struct dlm_slot *slot) +{ + struct ocfs2_cluster_connection *conn = arg; + printk(KERN_INFO "ocfs2: Node %d/%d down. Initiating recovery.\n", + slot->nodeid, slot->slot); + conn->cc_recovery_handler(slot->nodeid, conn->cc_recovery_data); + +} + +static void user_recover_done(void *arg, struct dlm_slot *slots, + int num_slots, int our_slot, + uint32_t generation) +{ + struct ocfs2_cluster_connection *conn = arg; + struct ocfs2_live_connection *lc = conn->cc_private; + int i; + + for (i = 0; i < num_slots; i++) + if (slots[i].slot == our_slot) { + atomic_set(&lc->oc_this_node, slots[i].nodeid); + break; + } + + lc->oc_our_slot = our_slot; +} + +const struct dlm_lockspace_ops ocfs2_ls_ops = { + .recover_prep = user_recover_prep, + .recover_slot = user_recover_slot, + .recover_done = user_recover_done, +}; + static int user_cluster_connect(struct ocfs2_cluster_connection *conn) { dlm_lockspace_t *fsdlm; -- cgit v1.2.3 From 24aa338611e9b02d045a1a99050135b0d49f41b5 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 21 Jan 2014 15:48:23 -0800 Subject: ocfs2: shift allocation ocfs2_live_connection to user_connect() We perform this because the DLM recovery callbacks will require the ocfs2_live_connection structure to record the node information when dlm_new_lockspace() is updated (in the last patch of the series). Before calling dlm_new_lockspace(), we need the structure ready for the .recover_done() callback, which would set oc_this_node. This is the reason we allocate ocfs2_live_connection beforehand in user_connect(). [AKPM] rc initialization is not required because it assigned in case of errors. It will be cleared by compiler anyways. Signed-off-by: Goldwyn Rodrigues Reveiwed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stack_user.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) (limited to 'fs/ocfs2/stack_user.c') diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 4111855a4def..0afb86d9b279 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -200,15 +200,10 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name) * mount path. Since the VFS prevents multiple calls to * fill_super(), we can't get dupes here. */ -static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, - struct ocfs2_live_connection **c_ret) +static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn, + struct ocfs2_live_connection *c) { int rc = 0; - struct ocfs2_live_connection *c; - - c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); - if (!c) - return -ENOMEM; mutex_lock(&ocfs2_control_lock); c->oc_conn = conn; @@ -222,12 +217,6 @@ static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, } mutex_unlock(&ocfs2_control_lock); - - if (!rc) - *c_ret = c; - else - kfree(c); - return rc; } @@ -840,12 +829,18 @@ const struct dlm_lockspace_ops ocfs2_ls_ops = { static int user_cluster_connect(struct ocfs2_cluster_connection *conn) { dlm_lockspace_t *fsdlm; - struct ocfs2_live_connection *uninitialized_var(control); - int rc = 0; + struct ocfs2_live_connection *lc; + int rc; BUG_ON(conn == NULL); - rc = ocfs2_live_connection_new(conn, &control); + lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); + if (!lc) { + rc = -ENOMEM; + goto out; + } + + rc = ocfs2_live_connection_attach(conn, lc); if (rc) goto out; @@ -861,20 +856,24 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) conn->cc_version.pv_major, conn->cc_version.pv_minor, running_proto.pv_major, running_proto.pv_minor); rc = -EPROTO; - ocfs2_live_connection_drop(control); + ocfs2_live_connection_drop(lc); + lc = NULL; goto out; } rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN, NULL, NULL, NULL, &fsdlm); if (rc) { - ocfs2_live_connection_drop(control); + ocfs2_live_connection_drop(lc); + lc = NULL; goto out; } - conn->cc_private = control; + conn->cc_private = lc; conn->cc_lockspace = fsdlm; out: + if (rc && lc) + kfree(lc); return rc; } -- cgit v1.2.3 From 3e8341516409d026636be4d7534b84e6e90bef37 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 21 Jan 2014 15:48:24 -0800 Subject: ocfs2: pass ocfs2_cluster_connection to ocfs2_this_node This is done to differentiate between using and not using controld and use the connection information accordingly. We need to be backward compatible. So, we use a new enum ocfs2_connection_type to identify when controld is used and when it is not. Signed-off-by: Goldwyn Rodrigues Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stack_user.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'fs/ocfs2/stack_user.c') diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 0afb86d9b279..22b95acc2a82 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -103,6 +103,11 @@ #define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2 #define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8 +enum ocfs2_connection_type { + WITH_CONTROLD, + NO_CONTROLD +}; + /* * ocfs2_live_connection is refcounted because the filesystem and * miscdevice sides can detach in different order. Let's just be safe. @@ -110,6 +115,7 @@ struct ocfs2_live_connection { struct list_head oc_list; struct ocfs2_cluster_connection *oc_conn; + enum ocfs2_connection_type oc_type; atomic_t oc_this_node; int oc_our_slot; }; @@ -840,6 +846,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) goto out; } + lc->oc_type = WITH_CONTROLD; rc = ocfs2_live_connection_attach(conn, lc); if (rc) goto out; @@ -886,11 +893,16 @@ static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) return 0; } -static int user_cluster_this_node(unsigned int *this_node) +static int user_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *this_node) { int rc; + struct ocfs2_live_connection *lc = conn->cc_private; - rc = ocfs2_control_get_this_node(); + if (lc->oc_type == WITH_CONTROLD) + rc = ocfs2_control_get_this_node(); + else + rc = -EINVAL; if (rc < 0) return rc; -- cgit v1.2.3 From 415036303319c0a46caf9059711710bfa34e3bf3 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 21 Jan 2014 15:48:25 -0800 Subject: ocfs2: framework for version LVB Use the native DLM locks for version control negotiation. Most of the framework is taken from gfs2/lock_dlm.c Signed-off-by: Goldwyn Rodrigues Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stack_user.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) (limited to 'fs/ocfs2/stack_user.c') diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 22b95acc2a82..d3867100aca8 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -102,6 +102,7 @@ #define OCFS2_TEXT_UUID_LEN 32 #define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2 #define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8 +#define VERSION_LOCK "version_lock" enum ocfs2_connection_type { WITH_CONTROLD, @@ -118,6 +119,9 @@ struct ocfs2_live_connection { enum ocfs2_connection_type oc_type; atomic_t oc_this_node; int oc_our_slot; + struct dlm_lksb oc_version_lksb; + char oc_lvb[DLM_LVB_LEN]; + struct completion oc_sync_wait; }; struct ocfs2_control_private { @@ -796,6 +800,103 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing, return 0; } +static void lvb_to_version(char *lvb, struct ocfs2_protocol_version *ver) +{ + struct ocfs2_protocol_version *pv = + (struct ocfs2_protocol_version *)lvb; + /* + * ocfs2_protocol_version has two u8 variables, so we don't + * need any endian conversion. + */ + ver->pv_major = pv->pv_major; + ver->pv_minor = pv->pv_minor; +} + +static void version_to_lvb(struct ocfs2_protocol_version *ver, char *lvb) +{ + struct ocfs2_protocol_version *pv = + (struct ocfs2_protocol_version *)lvb; + /* + * ocfs2_protocol_version has two u8 variables, so we don't + * need any endian conversion. + */ + pv->pv_major = ver->pv_major; + pv->pv_minor = ver->pv_minor; +} + +static void sync_wait_cb(void *arg) +{ + struct ocfs2_cluster_connection *conn = arg; + struct ocfs2_live_connection *lc = conn->cc_private; + complete(&lc->oc_sync_wait); +} + +static int sync_unlock(struct ocfs2_cluster_connection *conn, + struct dlm_lksb *lksb, char *name) +{ + int error; + struct ocfs2_live_connection *lc = conn->cc_private; + + error = dlm_unlock(conn->cc_lockspace, lksb->sb_lkid, 0, lksb, conn); + if (error) { + printk(KERN_ERR "%s lkid %x error %d\n", + name, lksb->sb_lkid, error); + return error; + } + + wait_for_completion(&lc->oc_sync_wait); + + if (lksb->sb_status != -DLM_EUNLOCK) { + printk(KERN_ERR "%s lkid %x status %d\n", + name, lksb->sb_lkid, lksb->sb_status); + return -1; + } + return 0; +} + +static int sync_lock(struct ocfs2_cluster_connection *conn, + int mode, uint32_t flags, + struct dlm_lksb *lksb, char *name) +{ + int error, status; + struct ocfs2_live_connection *lc = conn->cc_private; + + error = dlm_lock(conn->cc_lockspace, mode, lksb, flags, + name, strlen(name), + 0, sync_wait_cb, conn, NULL); + if (error) { + printk(KERN_ERR "%s lkid %x flags %x mode %d error %d\n", + name, lksb->sb_lkid, flags, mode, error); + return error; + } + + wait_for_completion(&lc->oc_sync_wait); + + status = lksb->sb_status; + + if (status && status != -EAGAIN) { + printk(KERN_ERR "%s lkid %x flags %x mode %d status %d\n", + name, lksb->sb_lkid, flags, mode, status); + } + + return status; +} + + +static int version_lock(struct ocfs2_cluster_connection *conn, int mode, + int flags) +{ + struct ocfs2_live_connection *lc = conn->cc_private; + return sync_lock(conn, mode, flags, + &lc->oc_version_lksb, VERSION_LOCK); +} + +static int version_unlock(struct ocfs2_cluster_connection *conn) +{ + struct ocfs2_live_connection *lc = conn->cc_private; + return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK); +} + static void user_recover_prep(void *arg) { } -- cgit v1.2.3 From c994c2ebdbbc391a42f177c8eb7882ebf3f142d8 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 21 Jan 2014 15:48:32 -0800 Subject: ocfs2: use the new DLM operation callbacks while requesting new lockspace Attempt to use the new DLM operations. If it is not supported, use the traditional ocfs2_controld. To exchange ocfs2 versioning, we use the LVB of the version dlm lock. It first attempts to take the lock in EX mode (non-blocking). If successful (which means it is the first mount), it writes the version number and downconverts to PR lock. If it is unsuccessful, it reads the version from the lock. If this becomes the standard (with o2cb as well), it could simplify userspace tools to check if the filesystem is mounted on other nodes. Dan: Since ocfs2_protocol_version are two u8 values, the additional checks with LONG* don't make sense. Signed-off-by: Goldwyn Rodrigues Signed-off-by: Dan Carpenter Reviewed-by: Mark Fasheh Cc: Joel Becker Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stack_user.c | 126 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 102 insertions(+), 24 deletions(-) (limited to 'fs/ocfs2/stack_user.c') diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index d3867100aca8..673ab40eb81b 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "stackglue.h" @@ -122,6 +123,7 @@ struct ocfs2_live_connection { struct dlm_lksb oc_version_lksb; char oc_lvb[DLM_LVB_LEN]; struct completion oc_sync_wait; + wait_queue_head_t oc_wait; }; struct ocfs2_control_private { @@ -218,7 +220,7 @@ static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn, mutex_lock(&ocfs2_control_lock); c->oc_conn = conn; - if (atomic_read(&ocfs2_control_opened)) + if ((c->oc_type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened)) list_add(&c->oc_list, &ocfs2_live_connection_list); else { printk(KERN_ERR @@ -897,6 +899,55 @@ static int version_unlock(struct ocfs2_cluster_connection *conn) return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK); } +/* get_protocol_version() + * + * To exchange ocfs2 versioning, we use the LVB of the version dlm lock. + * The algorithm is: + * 1. Attempt to take the lock in EX mode (non-blocking). + * 2. If successful (which means it is the first mount), write the + * version number and downconvert to PR lock. + * 3. If unsuccessful (returns -EAGAIN), read the version from the LVB after + * taking the PR lock. + */ + +static int get_protocol_version(struct ocfs2_cluster_connection *conn) +{ + int ret; + struct ocfs2_live_connection *lc = conn->cc_private; + struct ocfs2_protocol_version pv; + + running_proto.pv_major = + ocfs2_user_plugin.sp_max_proto.pv_major; + running_proto.pv_minor = + ocfs2_user_plugin.sp_max_proto.pv_minor; + + lc->oc_version_lksb.sb_lvbptr = lc->oc_lvb; + ret = version_lock(conn, DLM_LOCK_EX, + DLM_LKF_VALBLK|DLM_LKF_NOQUEUE); + if (!ret) { + conn->cc_version.pv_major = running_proto.pv_major; + conn->cc_version.pv_minor = running_proto.pv_minor; + version_to_lvb(&running_proto, lc->oc_lvb); + version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK); + } else if (ret == -EAGAIN) { + ret = version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK); + if (ret) + goto out; + lvb_to_version(lc->oc_lvb, &pv); + + if ((pv.pv_major != running_proto.pv_major) || + (pv.pv_minor > running_proto.pv_minor)) { + ret = -EINVAL; + goto out; + } + + conn->cc_version.pv_major = pv.pv_major; + conn->cc_version.pv_minor = pv.pv_minor; + } +out: + return ret; +} + static void user_recover_prep(void *arg) { } @@ -925,6 +976,7 @@ static void user_recover_done(void *arg, struct dlm_slot *slots, } lc->oc_our_slot = our_slot; + wake_up(&lc->oc_wait); } const struct dlm_lockspace_ops ocfs2_ls_ops = { @@ -933,11 +985,21 @@ const struct dlm_lockspace_ops ocfs2_ls_ops = { .recover_done = user_recover_done, }; +static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) +{ + version_unlock(conn); + dlm_release_lockspace(conn->cc_lockspace, 2); + conn->cc_lockspace = NULL; + ocfs2_live_connection_drop(conn->cc_private); + conn->cc_private = NULL; + return 0; +} + static int user_cluster_connect(struct ocfs2_cluster_connection *conn) { dlm_lockspace_t *fsdlm; struct ocfs2_live_connection *lc; - int rc; + int rc, ops_rv; BUG_ON(conn == NULL); @@ -947,11 +1009,44 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) goto out; } - lc->oc_type = WITH_CONTROLD; + init_waitqueue_head(&lc->oc_wait); + init_completion(&lc->oc_sync_wait); + atomic_set(&lc->oc_this_node, 0); + conn->cc_private = lc; + lc->oc_type = NO_CONTROLD; + + rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name, + DLM_LSFL_FS, DLM_LVB_LEN, + &ocfs2_ls_ops, conn, &ops_rv, &fsdlm); + if (rc) + goto out; + + if (ops_rv == -EOPNOTSUPP) { + lc->oc_type = WITH_CONTROLD; + printk(KERN_NOTICE "ocfs2: You seem to be using an older " + "version of dlm_controld and/or ocfs2-tools." + " Please consider upgrading.\n"); + } else if (ops_rv) { + rc = ops_rv; + goto out; + } + conn->cc_lockspace = fsdlm; + rc = ocfs2_live_connection_attach(conn, lc); if (rc) goto out; + if (lc->oc_type == NO_CONTROLD) { + rc = get_protocol_version(conn); + if (rc) { + printk(KERN_ERR "ocfs2: Could not determine" + " locking version\n"); + user_cluster_disconnect(conn); + goto out; + } + wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0)); + } + /* * running_proto must have been set before we allowed any mounts * to proceed. @@ -959,40 +1054,20 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) if (fs_protocol_compare(&running_proto, &conn->cc_version)) { printk(KERN_ERR "Unable to mount with fs locking protocol version " - "%u.%u because the userspace control daemon has " - "negotiated %u.%u\n", + "%u.%u because negotiated protocol is %u.%u\n", conn->cc_version.pv_major, conn->cc_version.pv_minor, running_proto.pv_major, running_proto.pv_minor); rc = -EPROTO; ocfs2_live_connection_drop(lc); lc = NULL; - goto out; - } - - rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN, - NULL, NULL, NULL, &fsdlm); - if (rc) { - ocfs2_live_connection_drop(lc); - lc = NULL; - goto out; } - conn->cc_private = lc; - conn->cc_lockspace = fsdlm; out: if (rc && lc) kfree(lc); return rc; } -static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) -{ - dlm_release_lockspace(conn->cc_lockspace, 2); - conn->cc_lockspace = NULL; - ocfs2_live_connection_drop(conn->cc_private); - conn->cc_private = NULL; - return 0; -} static int user_cluster_this_node(struct ocfs2_cluster_connection *conn, unsigned int *this_node) @@ -1002,8 +1077,11 @@ static int user_cluster_this_node(struct ocfs2_cluster_connection *conn, if (lc->oc_type == WITH_CONTROLD) rc = ocfs2_control_get_this_node(); + else if (lc->oc_type == NO_CONTROLD) + rc = atomic_read(&lc->oc_this_node); else rc = -EINVAL; + if (rc < 0) return rc; -- cgit v1.2.3 From 16eac4be46fdd19e4e0bcd06e77e947266d2cf35 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 21 Jan 2014 15:48:37 -0800 Subject: ocfs2: fix sparse non static symbol warning Fixes the following sparse warning: fs/ocfs2/stack_user.c:930:32: warning: symbol 'ocfs2_ls_ops' was not declared. Should it be static? Signed-off-by: Wei Yongjun Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stack_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ocfs2/stack_user.c') diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 673ab40eb81b..13a8537d8e8b 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -979,7 +979,7 @@ static void user_recover_done(void *arg, struct dlm_slot *slots, wake_up(&lc->oc_wait); } -const struct dlm_lockspace_ops ocfs2_ls_ops = { +static const struct dlm_lockspace_ops ocfs2_ls_ops = { .recover_prep = user_recover_prep, .recover_slot = user_recover_slot, .recover_done = user_recover_done, -- cgit v1.2.3