diff options
Diffstat (limited to 'net/sunrpc')
-rw-r--r-- | net/sunrpc/auth.c | 38 | ||||
-rw-r--r-- | net/sunrpc/auth_generic.c | 20 | ||||
-rw-r--r-- | net/sunrpc/auth_gss/auth_gss.c | 295 | ||||
-rw-r--r-- | net/sunrpc/auth_gss/gss_generic_token.c | 6 | ||||
-rw-r--r-- | net/sunrpc/auth_gss/gss_mech_switch.c | 18 | ||||
-rw-r--r-- | net/sunrpc/auth_gss/svcauth_gss.c | 28 | ||||
-rw-r--r-- | net/sunrpc/clnt.c | 28 | ||||
-rw-r--r-- | net/sunrpc/rpc_pipe.c | 42 | ||||
-rw-r--r-- | net/sunrpc/rpcb_clnt.c | 57 | ||||
-rw-r--r-- | net/sunrpc/svcauth_unix.c | 24 | ||||
-rw-r--r-- | net/sunrpc/svcsock.c | 15 | ||||
-rw-r--r-- | net/sunrpc/xdr.c | 50 | ||||
-rw-r--r-- | net/sunrpc/xprt.c | 12 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/rpc_rdma.c | 29 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 4 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_sendto.c | 2 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_transport.c | 16 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/transport.c | 55 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/verbs.c | 747 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/xprt_rdma.h | 17 | ||||
-rw-r--r-- | net/sunrpc/xprtsock.c | 99 |
21 files changed, 1118 insertions, 484 deletions
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 6bfea9ed6869..0c431c277af5 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -83,10 +83,8 @@ rpcauth_create(rpc_authflavor_t pseudoflavor, struct rpc_clnt *clnt) if (flavor >= RPC_AUTH_MAXFLAVOR) goto out; -#ifdef CONFIG_KMOD if ((ops = auth_flavors[flavor]) == NULL) request_module("rpc-auth-%u", flavor); -#endif spin_lock(&rpc_authflavor_lock); ops = auth_flavors[flavor]; if (ops == NULL || !try_module_get(ops->owner)) { @@ -230,19 +228,21 @@ static int rpcauth_prune_expired(struct list_head *free, int nr_to_scan) { spinlock_t *cache_lock; - struct rpc_cred *cred; + struct rpc_cred *cred, *next; unsigned long expired = jiffies - RPC_AUTH_EXPIRY_MORATORIUM; - while (!list_empty(&cred_unused)) { - cred = list_entry(cred_unused.next, struct rpc_cred, cr_lru); + list_for_each_entry_safe(cred, next, &cred_unused, cr_lru) { + + /* Enforce a 60 second garbage collection moratorium */ + if (time_in_range_open(cred->cr_expire, expired, jiffies) && + test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) + continue; + list_del_init(&cred->cr_lru); number_cred_unused--; if (atomic_read(&cred->cr_count) != 0) continue; - /* Enforce a 5 second garbage collection moratorium */ - if (time_in_range(cred->cr_expire, expired, jiffies) && - test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) - continue; + cache_lock = &cred->cr_auth->au_credcache->lock; spin_lock(cache_lock); if (atomic_read(&cred->cr_count) == 0) { @@ -350,16 +350,18 @@ EXPORT_SYMBOL_GPL(rpcauth_lookup_credcache); struct rpc_cred * rpcauth_lookupcred(struct rpc_auth *auth, int flags) { - struct auth_cred acred = { - .uid = current->fsuid, - .gid = current->fsgid, - .group_info = current->group_info, - }; + struct auth_cred acred; struct rpc_cred *ret; + const struct cred *cred = current_cred(); dprintk("RPC: looking up %s cred\n", auth->au_ops->au_name); - get_group_info(acred.group_info); + + memset(&acred, 0, sizeof(acred)); + acred.uid = cred->fsuid; + acred.gid = cred->fsgid; + acred.group_info = get_group_info(((struct cred *)cred)->group_info); + ret = auth->au_ops->lookup_cred(auth, &acred, flags); put_group_info(acred.group_info); return ret; @@ -455,7 +457,7 @@ need_lock: } if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0) rpcauth_unhash_cred(cred); - else if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) { + if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) { cred->cr_expire = jiffies; list_add_tail(&cred->cr_lru, &cred_unused); number_cred_unused++; @@ -513,7 +515,7 @@ rpcauth_wrap_req(struct rpc_task *task, kxdrproc_t encode, void *rqstp, if (cred->cr_ops->crwrap_req) return cred->cr_ops->crwrap_req(task, encode, rqstp, data, obj); /* By default, we encode the arguments normally. */ - return rpc_call_xdrproc(encode, rqstp, data, obj); + return encode(rqstp, data, obj); } int @@ -528,7 +530,7 @@ rpcauth_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp, return cred->cr_ops->crunwrap_resp(task, decode, rqstp, data, obj); /* By default, we decode the arguments normally. */ - return rpc_call_xdrproc(decode, rqstp, data, obj); + return decode(rqstp, data, obj); } int diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 744b79fdcb19..4028502f0528 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -133,13 +133,29 @@ static int generic_match(struct auth_cred *acred, struct rpc_cred *cred, int flags) { struct generic_cred *gcred = container_of(cred, struct generic_cred, gc_base); + int i; if (gcred->acred.uid != acred->uid || gcred->acred.gid != acred->gid || - gcred->acred.group_info != acred->group_info || gcred->acred.machine_cred != acred->machine_cred) - return 0; + goto out_nomatch; + + /* Optimisation in the case where pointers are identical... */ + if (gcred->acred.group_info == acred->group_info) + goto out_match; + + /* Slow path... */ + if (gcred->acred.group_info->ngroups != acred->group_info->ngroups) + goto out_nomatch; + for (i = 0; i < gcred->acred.group_info->ngroups; i++) { + if (GROUP_AT(gcred->acred.group_info, i) != + GROUP_AT(acred->group_info, i)) + goto out_nomatch; + } +out_match: return 1; +out_nomatch: + return 0; } void __init rpc_init_generic_auth(void) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 853a4142cea1..e630b38a6047 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -72,11 +72,25 @@ struct gss_auth { struct gss_api_mech *mech; enum rpc_gss_svc service; struct rpc_clnt *client; - struct dentry *dentry; + /* + * There are two upcall pipes; dentry[1], named "gssd", is used + * for the new text-based upcall; dentry[0] is named after the + * mechanism (for example, "krb5") and exists for + * backwards-compatibility with older gssd's. + */ + struct dentry *dentry[2]; }; +/* pipe_version >= 0 if and only if someone has a pipe open. */ +static int pipe_version = -1; +static atomic_t pipe_users = ATOMIC_INIT(0); +static DEFINE_SPINLOCK(pipe_version_lock); +static struct rpc_wait_queue pipe_version_rpc_waitqueue; +static DECLARE_WAIT_QUEUE_HEAD(pipe_version_waitqueue); + static void gss_free_ctx(struct gss_cl_ctx *); -static struct rpc_pipe_ops gss_upcall_ops; +static struct rpc_pipe_ops gss_upcall_ops_v0; +static struct rpc_pipe_ops gss_upcall_ops_v1; static inline struct gss_cl_ctx * gss_get_ctx(struct gss_cl_ctx *ctx) @@ -220,6 +234,7 @@ err: return p; } +#define UPCALL_BUF_LEN 128 struct gss_upcall_msg { atomic_t count; @@ -227,16 +242,41 @@ struct gss_upcall_msg { struct rpc_pipe_msg msg; struct list_head list; struct gss_auth *auth; + struct rpc_inode *inode; struct rpc_wait_queue rpc_waitqueue; wait_queue_head_t waitqueue; struct gss_cl_ctx *ctx; + char databuf[UPCALL_BUF_LEN]; }; +static int get_pipe_version(void) +{ + int ret; + + spin_lock(&pipe_version_lock); + if (pipe_version >= 0) { + atomic_inc(&pipe_users); + ret = pipe_version; + } else + ret = -EAGAIN; + spin_unlock(&pipe_version_lock); + return ret; +} + +static void put_pipe_version(void) +{ + if (atomic_dec_and_lock(&pipe_users, &pipe_version_lock)) { + pipe_version = -1; + spin_unlock(&pipe_version_lock); + } +} + static void gss_release_msg(struct gss_upcall_msg *gss_msg) { if (!atomic_dec_and_test(&gss_msg->count)) return; + put_pipe_version(); BUG_ON(!list_empty(&gss_msg->list)); if (gss_msg->ctx != NULL) gss_put_ctx(gss_msg->ctx); @@ -266,8 +306,8 @@ __gss_find_upcall(struct rpc_inode *rpci, uid_t uid) static inline struct gss_upcall_msg * gss_add_msg(struct gss_auth *gss_auth, struct gss_upcall_msg *gss_msg) { - struct inode *inode = gss_auth->dentry->d_inode; - struct rpc_inode *rpci = RPC_I(inode); + struct rpc_inode *rpci = gss_msg->inode; + struct inode *inode = &rpci->vfs_inode; struct gss_upcall_msg *old; spin_lock(&inode->i_lock); @@ -293,8 +333,7 @@ __gss_unhash_msg(struct gss_upcall_msg *gss_msg) static void gss_unhash_msg(struct gss_upcall_msg *gss_msg) { - struct gss_auth *gss_auth = gss_msg->auth; - struct inode *inode = gss_auth->dentry->d_inode; + struct inode *inode = &gss_msg->inode->vfs_inode; if (list_empty(&gss_msg->list)) return; @@ -310,7 +349,7 @@ gss_upcall_callback(struct rpc_task *task) struct gss_cred *gss_cred = container_of(task->tk_msg.rpc_cred, struct gss_cred, gc_base); struct gss_upcall_msg *gss_msg = gss_cred->gc_upcall; - struct inode *inode = gss_msg->auth->dentry->d_inode; + struct inode *inode = &gss_msg->inode->vfs_inode; spin_lock(&inode->i_lock); if (gss_msg->ctx) @@ -323,22 +362,75 @@ gss_upcall_callback(struct rpc_task *task) gss_release_msg(gss_msg); } +static void gss_encode_v0_msg(struct gss_upcall_msg *gss_msg) +{ + gss_msg->msg.data = &gss_msg->uid; + gss_msg->msg.len = sizeof(gss_msg->uid); +} + +static void gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, + struct rpc_clnt *clnt, int machine_cred) +{ + char *p = gss_msg->databuf; + int len = 0; + + gss_msg->msg.len = sprintf(gss_msg->databuf, "mech=%s uid=%d ", + gss_msg->auth->mech->gm_name, + gss_msg->uid); + p += gss_msg->msg.len; + if (clnt->cl_principal) { + len = sprintf(p, "target=%s ", clnt->cl_principal); + p += len; + gss_msg->msg.len += len; + } + if (machine_cred) { + len = sprintf(p, "service=* "); + p += len; + gss_msg->msg.len += len; + } else if (!strcmp(clnt->cl_program->name, "nfs4_cb")) { + len = sprintf(p, "service=nfs "); + p += len; + gss_msg->msg.len += len; + } + len = sprintf(p, "\n"); + gss_msg->msg.len += len; + + gss_msg->msg.data = gss_msg->databuf; + BUG_ON(gss_msg->msg.len > UPCALL_BUF_LEN); +} + +static void gss_encode_msg(struct gss_upcall_msg *gss_msg, + struct rpc_clnt *clnt, int machine_cred) +{ + if (pipe_version == 0) + gss_encode_v0_msg(gss_msg); + else /* pipe_version == 1 */ + gss_encode_v1_msg(gss_msg, clnt, machine_cred); +} + static inline struct gss_upcall_msg * -gss_alloc_msg(struct gss_auth *gss_auth, uid_t uid) +gss_alloc_msg(struct gss_auth *gss_auth, uid_t uid, struct rpc_clnt *clnt, + int machine_cred) { struct gss_upcall_msg *gss_msg; + int vers; gss_msg = kzalloc(sizeof(*gss_msg), GFP_NOFS); - if (gss_msg != NULL) { - INIT_LIST_HEAD(&gss_msg->list); - rpc_init_wait_queue(&gss_msg->rpc_waitqueue, "RPCSEC_GSS upcall waitq"); - init_waitqueue_head(&gss_msg->waitqueue); - atomic_set(&gss_msg->count, 1); - gss_msg->msg.data = &gss_msg->uid; - gss_msg->msg.len = sizeof(gss_msg->uid); - gss_msg->uid = uid; - gss_msg->auth = gss_auth; + if (gss_msg == NULL) + return ERR_PTR(-ENOMEM); + vers = get_pipe_version(); + if (vers < 0) { + kfree(gss_msg); + return ERR_PTR(vers); } + gss_msg->inode = RPC_I(gss_auth->dentry[vers]->d_inode); + INIT_LIST_HEAD(&gss_msg->list); + rpc_init_wait_queue(&gss_msg->rpc_waitqueue, "RPCSEC_GSS upcall waitq"); + init_waitqueue_head(&gss_msg->waitqueue); + atomic_set(&gss_msg->count, 1); + gss_msg->uid = uid; + gss_msg->auth = gss_auth; + gss_encode_msg(gss_msg, clnt, machine_cred); return gss_msg; } @@ -350,16 +442,13 @@ gss_setup_upcall(struct rpc_clnt *clnt, struct gss_auth *gss_auth, struct rpc_cr struct gss_upcall_msg *gss_new, *gss_msg; uid_t uid = cred->cr_uid; - /* Special case: rpc.gssd assumes that uid == 0 implies machine creds */ - if (gss_cred->gc_machine_cred != 0) - uid = 0; - - gss_new = gss_alloc_msg(gss_auth, uid); - if (gss_new == NULL) - return ERR_PTR(-ENOMEM); + gss_new = gss_alloc_msg(gss_auth, uid, clnt, gss_cred->gc_machine_cred); + if (IS_ERR(gss_new)) + return gss_new; gss_msg = gss_add_msg(gss_auth, gss_new); if (gss_msg == gss_new) { - int res = rpc_queue_upcall(gss_auth->dentry->d_inode, &gss_new->msg); + struct inode *inode = &gss_new->inode->vfs_inode; + int res = rpc_queue_upcall(inode, &gss_new->msg); if (res) { gss_unhash_msg(gss_new); gss_msg = ERR_PTR(res); @@ -369,6 +458,18 @@ gss_setup_upcall(struct rpc_clnt *clnt, struct gss_auth *gss_auth, struct rpc_cr return gss_msg; } +static void warn_gssd(void) +{ + static unsigned long ratelimit; + unsigned long now = jiffies; + + if (time_after(now, ratelimit)) { + printk(KERN_WARNING "RPC: AUTH_GSS upcall timed out.\n" + "Please check user daemon is running.\n"); + ratelimit = now + 15*HZ; + } +} + static inline int gss_refresh_upcall(struct rpc_task *task) { @@ -378,16 +479,25 @@ gss_refresh_upcall(struct rpc_task *task) struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_upcall_msg *gss_msg; - struct inode *inode = gss_auth->dentry->d_inode; + struct inode *inode; int err = 0; dprintk("RPC: %5u gss_refresh_upcall for uid %u\n", task->tk_pid, cred->cr_uid); gss_msg = gss_setup_upcall(task->tk_client, gss_auth, cred); + if (IS_ERR(gss_msg) == -EAGAIN) { + /* XXX: warning on the first, under the assumption we + * shouldn't normally hit this case on a refresh. */ + warn_gssd(); + task->tk_timeout = 15*HZ; + rpc_sleep_on(&pipe_version_rpc_waitqueue, task, NULL); + return 0; + } if (IS_ERR(gss_msg)) { err = PTR_ERR(gss_msg); goto out; } + inode = &gss_msg->inode->vfs_inode; spin_lock(&inode->i_lock); if (gss_cred->gc_upcall != NULL) rpc_sleep_on(&gss_cred->gc_upcall->rpc_waitqueue, task, NULL); @@ -414,18 +524,29 @@ out: static inline int gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred) { - struct inode *inode = gss_auth->dentry->d_inode; + struct inode *inode; struct rpc_cred *cred = &gss_cred->gc_base; struct gss_upcall_msg *gss_msg; DEFINE_WAIT(wait); int err = 0; dprintk("RPC: gss_upcall for uid %u\n", cred->cr_uid); +retry: gss_msg = gss_setup_upcall(gss_auth->client, gss_auth, cred); + if (PTR_ERR(gss_msg) == -EAGAIN) { + err = wait_event_interruptible_timeout(pipe_version_waitqueue, + pipe_version >= 0, 15*HZ); + if (err) + goto out; + if (pipe_version < 0) + warn_gssd(); + goto retry; + } if (IS_ERR(gss_msg)) { err = PTR_ERR(gss_msg); goto out; } + inode = &gss_msg->inode->vfs_inode; for (;;) { prepare_to_wait(&gss_msg->waitqueue, &wait, TASK_INTERRUPTIBLE); spin_lock(&inode->i_lock); @@ -543,6 +664,38 @@ out: return err; } +static int gss_pipe_open(struct inode *inode, int new_version) +{ + int ret = 0; + + spin_lock(&pipe_version_lock); + if (pipe_version < 0) { + /* First open of any gss pipe determines the version: */ + pipe_version = new_version; + rpc_wake_up(&pipe_version_rpc_waitqueue); + wake_up(&pipe_version_waitqueue); + } else if (pipe_version != new_version) { + /* Trying to open a pipe of a different version */ + ret = -EBUSY; + goto out; + } + atomic_inc(&pipe_users); +out: + spin_unlock(&pipe_version_lock); + return ret; + +} + +static int gss_pipe_open_v0(struct inode *inode) +{ + return gss_pipe_open(inode, 0); +} + +static int gss_pipe_open_v1(struct inode *inode) +{ + return gss_pipe_open(inode, 1); +} + static void gss_pipe_release(struct inode *inode) { @@ -562,27 +715,22 @@ gss_pipe_release(struct inode *inode) spin_lock(&inode->i_lock); } spin_unlock(&inode->i_lock); + + put_pipe_version(); } static void gss_pipe_destroy_msg(struct rpc_pipe_msg *msg) { struct gss_upcall_msg *gss_msg = container_of(msg, struct gss_upcall_msg, msg); - static unsigned long ratelimit; if (msg->errno < 0) { dprintk("RPC: gss_pipe_destroy_msg releasing msg %p\n", gss_msg); atomic_inc(&gss_msg->count); gss_unhash_msg(gss_msg); - if (msg->errno == -ETIMEDOUT) { - unsigned long now = jiffies; - if (time_after(now, ratelimit)) { - printk(KERN_WARNING "RPC: AUTH_GSS upcall timed out.\n" - "Please check user daemon is running!\n"); - ratelimit = now + 15*HZ; - } - } + if (msg->errno == -ETIMEDOUT) + warn_gssd(); gss_release_msg(gss_msg); } } @@ -623,20 +771,38 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor) atomic_set(&auth->au_count, 1); kref_init(&gss_auth->kref); - gss_auth->dentry = rpc_mkpipe(clnt->cl_dentry, gss_auth->mech->gm_name, - clnt, &gss_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN); - if (IS_ERR(gss_auth->dentry)) { - err = PTR_ERR(gss_auth->dentry); + /* + * Note: if we created the old pipe first, then someone who + * examined the directory at the right moment might conclude + * that we supported only the old pipe. So we instead create + * the new pipe first. + */ + gss_auth->dentry[1] = rpc_mkpipe(clnt->cl_dentry, + "gssd", + clnt, &gss_upcall_ops_v1, + RPC_PIPE_WAIT_FOR_OPEN); + if (IS_ERR(gss_auth->dentry[1])) { + err = PTR_ERR(gss_auth->dentry[1]); goto err_put_mech; } + gss_auth->dentry[0] = rpc_mkpipe(clnt->cl_dentry, + gss_auth->mech->gm_name, + clnt, &gss_upcall_ops_v0, + RPC_PIPE_WAIT_FOR_OPEN); + if (IS_ERR(gss_auth->dentry[0])) { + err = PTR_ERR(gss_auth->dentry[0]); + goto err_unlink_pipe_1; + } err = rpcauth_init_credcache(auth); if (err) - goto err_unlink_pipe; + goto err_unlink_pipe_0; return auth; -err_unlink_pipe: - rpc_unlink(gss_auth->dentry); +err_unlink_pipe_0: + rpc_unlink(gss_auth->dentry[0]); +err_unlink_pipe_1: + rpc_unlink(gss_auth->dentry[1]); err_put_mech: gss_mech_put(gss_auth->mech); err_free: @@ -649,8 +815,8 @@ out_dec: static void gss_free(struct gss_auth *gss_auth) { - rpc_unlink(gss_auth->dentry); - gss_auth->dentry = NULL; + rpc_unlink(gss_auth->dentry[1]); + rpc_unlink(gss_auth->dentry[0]); gss_mech_put(gss_auth->mech); kfree(gss_auth); @@ -693,7 +859,7 @@ gss_destroying_context(struct rpc_cred *cred) struct rpc_task *task; if (gss_cred->gc_ctx == NULL || - test_and_clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0) + test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0) return 0; gss_cred->gc_ctx->gc_proc = RPC_GSS_PROC_DESTROY; @@ -757,14 +923,12 @@ gss_free_cred_callback(struct rcu_head *head) } static void -gss_destroy_cred(struct rpc_cred *cred) +gss_destroy_nullcred(struct rpc_cred *cred) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth); struct gss_cl_ctx *ctx = gss_cred->gc_ctx; - if (gss_destroying_context(cred)) - return; rcu_assign_pointer(gss_cred->gc_ctx, NULL); call_rcu(&cred->cr_rcu, gss_free_cred_callback); if (ctx) @@ -772,6 +936,15 @@ gss_destroy_cred(struct rpc_cred *cred) kref_put(&gss_auth->kref, gss_free_callback); } +static void +gss_destroy_cred(struct rpc_cred *cred) +{ + + if (gss_destroying_context(cred)) + return; + gss_destroy_nullcred(cred); +} + /* * Lookup RPCSEC_GSS cred for the current process */ @@ -1017,7 +1190,7 @@ gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; *p++ = htonl(rqstp->rq_seqno); - status = rpc_call_xdrproc(encode, rqstp, p, obj); + status = encode(rqstp, p, obj); if (status) return status; @@ -1111,7 +1284,7 @@ gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; *p++ = htonl(rqstp->rq_seqno); - status = rpc_call_xdrproc(encode, rqstp, p, obj); + status = encode(rqstp, p, obj); if (status) return status; @@ -1170,12 +1343,12 @@ gss_wrap_req(struct rpc_task *task, /* The spec seems a little ambiguous here, but I think that not * wrapping context destruction requests makes the most sense. */ - status = rpc_call_xdrproc(encode, rqstp, p, obj); + status = encode(rqstp, p, obj); goto out; } switch (gss_cred->gc_service) { case RPC_GSS_SVC_NONE: - status = rpc_call_xdrproc(encode, rqstp, p, obj); + status = encode(rqstp, p, obj); break; case RPC_GSS_SVC_INTEGRITY: status = gss_wrap_req_integ(cred, ctx, encode, @@ -1291,7 +1464,7 @@ gss_unwrap_resp(struct rpc_task *task, cred->cr_auth->au_rslack = cred->cr_auth->au_verfsize + (p - savedp) + (savedlen - head->iov_len); out_decode: - status = rpc_call_xdrproc(decode, rqstp, p, obj); + status = decode(rqstp, p, obj); out: gss_put_ctx(ctx); dprintk("RPC: %5u gss_unwrap_resp returning %d\n", task->tk_pid, @@ -1324,7 +1497,7 @@ static const struct rpc_credops gss_credops = { static const struct rpc_credops gss_nullops = { .cr_name = "AUTH_GSS", - .crdestroy = gss_destroy_cred, + .crdestroy = gss_destroy_nullcred, .crbind = rpcauth_generic_bind_cred, .crmatch = gss_match, .crmarshal = gss_marshal, @@ -1334,10 +1507,19 @@ static const struct rpc_credops gss_nullops = { .crunwrap_resp = gss_unwrap_resp, }; -static struct rpc_pipe_ops gss_upcall_ops = { +static struct rpc_pipe_ops gss_upcall_ops_v0 = { + .upcall = gss_pipe_upcall, + .downcall = gss_pipe_downcall, + .destroy_msg = gss_pipe_destroy_msg, + .open_pipe = gss_pipe_open_v0, + .release_pipe = gss_pipe_release, +}; + +static struct rpc_pipe_ops gss_upcall_ops_v1 = { .upcall = gss_pipe_upcall, .downcall = gss_pipe_downcall, .destroy_msg = gss_pipe_destroy_msg, + .open_pipe = gss_pipe_open_v1, .release_pipe = gss_pipe_release, }; @@ -1354,6 +1536,7 @@ static int __init init_rpcsec_gss(void) err = gss_svc_init(); if (err) goto out_unregister; + rpc_init_wait_queue(&pipe_version_rpc_waitqueue, "gss pipe version"); return 0; out_unregister: rpcauth_unregister(&authgss_ops); diff --git a/net/sunrpc/auth_gss/gss_generic_token.c b/net/sunrpc/auth_gss/gss_generic_token.c index d83b881685fe..c0ba39c4f5f2 100644 --- a/net/sunrpc/auth_gss/gss_generic_token.c +++ b/net/sunrpc/auth_gss/gss_generic_token.c @@ -152,7 +152,7 @@ g_token_size(struct xdr_netobj *mech, unsigned int body_size) return(1 + der_length_size(body_size) + body_size); } -EXPORT_SYMBOL(g_token_size); +EXPORT_SYMBOL_GPL(g_token_size); /* fills in a buffer with the token header. The buffer is assumed to be the right size. buf is advanced past the token header */ @@ -167,7 +167,7 @@ g_make_token_header(struct xdr_netobj *mech, int body_size, unsigned char **buf) TWRITE_STR(*buf, mech->data, ((int) mech->len)); } -EXPORT_SYMBOL(g_make_token_header); +EXPORT_SYMBOL_GPL(g_make_token_header); /* * Given a buffer containing a token, reads and verifies the token, @@ -231,5 +231,5 @@ g_verify_token_header(struct xdr_netobj *mech, int *body_size, return(ret); } -EXPORT_SYMBOL(g_verify_token_header); +EXPORT_SYMBOL_GPL(g_verify_token_header); diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index bce9d527af08..6efbb0cd3c7c 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -117,7 +117,7 @@ gss_mech_register(struct gss_api_mech *gm) return 0; } -EXPORT_SYMBOL(gss_mech_register); +EXPORT_SYMBOL_GPL(gss_mech_register); void gss_mech_unregister(struct gss_api_mech *gm) @@ -129,7 +129,7 @@ gss_mech_unregister(struct gss_api_mech *gm) gss_mech_free(gm); } -EXPORT_SYMBOL(gss_mech_unregister); +EXPORT_SYMBOL_GPL(gss_mech_unregister); struct gss_api_mech * gss_mech_get(struct gss_api_mech *gm) @@ -138,7 +138,7 @@ gss_mech_get(struct gss_api_mech *gm) return gm; } -EXPORT_SYMBOL(gss_mech_get); +EXPORT_SYMBOL_GPL(gss_mech_get); struct gss_api_mech * gss_mech_get_by_name(const char *name) @@ -158,7 +158,7 @@ gss_mech_get_by_name(const char *name) } -EXPORT_SYMBOL(gss_mech_get_by_name); +EXPORT_SYMBOL_GPL(gss_mech_get_by_name); static inline int mech_supports_pseudoflavor(struct gss_api_mech *gm, u32 pseudoflavor) @@ -191,7 +191,7 @@ gss_mech_get_by_pseudoflavor(u32 pseudoflavor) return gm; } -EXPORT_SYMBOL(gss_mech_get_by_pseudoflavor); +EXPORT_SYMBOL_GPL(gss_mech_get_by_pseudoflavor); u32 gss_svc_to_pseudoflavor(struct gss_api_mech *gm, u32 service) @@ -205,7 +205,7 @@ gss_svc_to_pseudoflavor(struct gss_api_mech *gm, u32 service) } return RPC_AUTH_MAXFLAVOR; /* illegal value */ } -EXPORT_SYMBOL(gss_svc_to_pseudoflavor); +EXPORT_SYMBOL_GPL(gss_svc_to_pseudoflavor); u32 gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor) @@ -219,7 +219,7 @@ gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor) return 0; } -EXPORT_SYMBOL(gss_pseudoflavor_to_service); +EXPORT_SYMBOL_GPL(gss_pseudoflavor_to_service); char * gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service) @@ -233,7 +233,7 @@ gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service) return NULL; } -EXPORT_SYMBOL(gss_service_to_auth_domain_name); +EXPORT_SYMBOL_GPL(gss_service_to_auth_domain_name); void gss_mech_put(struct gss_api_mech * gm) @@ -242,7 +242,7 @@ gss_mech_put(struct gss_api_mech * gm) module_put(gm->gm_owner); } -EXPORT_SYMBOL(gss_mech_put); +EXPORT_SYMBOL_GPL(gss_mech_put); /* The mech could probably be determined from the token instead, but it's just * as easy for now to pass it in. */ diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 81ae3d62a0cc..2278a50c6444 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -332,6 +332,7 @@ struct rsc { struct svc_cred cred; struct gss_svc_seq_data seqdata; struct gss_ctx *mechctx; + char *client_name; }; static struct cache_head *rsc_table[RSC_HASHMAX]; @@ -346,6 +347,7 @@ static void rsc_free(struct rsc *rsci) gss_delete_sec_context(&rsci->mechctx); if (rsci->cred.cr_group_info) put_group_info(rsci->cred.cr_group_info); + kfree(rsci->client_name); } static void rsc_put(struct kref *ref) @@ -383,6 +385,7 @@ rsc_init(struct cache_head *cnew, struct cache_head *ctmp) tmp->handle.data = NULL; new->mechctx = NULL; new->cred.cr_group_info = NULL; + new->client_name = NULL; } static void @@ -397,6 +400,8 @@ update_rsc(struct cache_head *cnew, struct cache_head *ctmp) spin_lock_init(&new->seqdata.sd_lock); new->cred = tmp->cred; tmp->cred.cr_group_info = NULL; + new->client_name = tmp->client_name; + tmp->client_name = NULL; } static struct cache_head * @@ -486,6 +491,15 @@ static int rsc_parse(struct cache_detail *cd, status = gss_import_sec_context(buf, len, gm, &rsci.mechctx); if (status) goto out; + + /* get client name */ + len = qword_get(&mesg, buf, mlen); + if (len > 0) { + rsci.client_name = kstrdup(buf, GFP_KERNEL); + if (!rsci.client_name) + goto out; + } + } rsci.h.expiry_time = expiry; rscp = rsc_update(&rsci, rscp); @@ -746,7 +760,7 @@ u32 svcauth_gss_flavor(struct auth_domain *dom) return gd->pseudoflavor; } -EXPORT_SYMBOL(svcauth_gss_flavor); +EXPORT_SYMBOL_GPL(svcauth_gss_flavor); int svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name) @@ -780,7 +794,7 @@ out: return stat; } -EXPORT_SYMBOL(svcauth_gss_register_pseudoflavor); +EXPORT_SYMBOL_GPL(svcauth_gss_register_pseudoflavor); static inline int read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj) @@ -913,6 +927,16 @@ struct gss_svc_data { struct rsc *rsci; }; +char *svc_gss_principal(struct svc_rqst *rqstp) +{ + struct gss_svc_data *gd = (struct gss_svc_data *)rqstp->rq_auth_data; + + if (gd && gd->rsci) + return gd->rsci->client_name; + return NULL; +} +EXPORT_SYMBOL_GPL(svc_gss_principal); + static int svcauth_gss_set_client(struct svc_rqst *rqstp) { diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index da0789fa1b88..836f15c0c4a3 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -197,6 +197,12 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru clnt->cl_rtt = &clnt->cl_rtt_default; rpc_init_rtt(&clnt->cl_rtt_default, clnt->cl_timeout->to_initval); + clnt->cl_principal = NULL; + if (args->client_name) { + clnt->cl_principal = kstrdup(args->client_name, GFP_KERNEL); + if (!clnt->cl_principal) + goto out_no_principal; + } kref_init(&clnt->cl_kref); @@ -213,10 +219,10 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru } /* save the nodename */ - clnt->cl_nodelen = strlen(utsname()->nodename); + clnt->cl_nodelen = strlen(init_utsname()->nodename); if (clnt->cl_nodelen > UNX_MAXNODENAME) clnt->cl_nodelen = UNX_MAXNODENAME; - memcpy(clnt->cl_nodename, utsname()->nodename, clnt->cl_nodelen); + memcpy(clnt->cl_nodename, init_utsname()->nodename, clnt->cl_nodelen); rpc_register_client(clnt); return clnt; @@ -226,6 +232,8 @@ out_no_auth: rpc_put_mount(); } out_no_path: + kfree(clnt->cl_principal); +out_no_principal: rpc_free_iostats(clnt->cl_metrics); out_no_stats: if (clnt->cl_server != clnt->cl_inline_name) @@ -271,15 +279,15 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args) case AF_INET: { struct sockaddr_in *sin = (struct sockaddr_in *)args->address; - snprintf(servername, sizeof(servername), NIPQUAD_FMT, - NIPQUAD(sin->sin_addr.s_addr)); + snprintf(servername, sizeof(servername), "%pI4", + &sin->sin_addr.s_addr); break; } case AF_INET6: { struct sockaddr_in6 *sin = (struct sockaddr_in6 *)args->address; - snprintf(servername, sizeof(servername), NIP6_FMT, - NIP6(sin->sin6_addr)); + snprintf(servername, sizeof(servername), "%pI6", + &sin->sin6_addr); break; } default: @@ -354,6 +362,11 @@ rpc_clone_client(struct rpc_clnt *clnt) new->cl_metrics = rpc_alloc_iostats(clnt); if (new->cl_metrics == NULL) goto out_no_stats; + if (clnt->cl_principal) { + new->cl_principal = kstrdup(clnt->cl_principal, GFP_KERNEL); + if (new->cl_principal == NULL) + goto out_no_principal; + } kref_init(&new->cl_kref); err = rpc_setup_pipedir(new, clnt->cl_program->pipe_dir_name); if (err != 0) @@ -366,6 +379,8 @@ rpc_clone_client(struct rpc_clnt *clnt) rpciod_up(); return new; out_no_path: + kfree(new->cl_principal); +out_no_principal: rpc_free_iostats(new->cl_metrics); out_no_stats: kfree(new); @@ -417,6 +432,7 @@ rpc_free_client(struct kref *kref) out_free: rpc_unregister_client(clnt); rpc_free_iostats(clnt->cl_metrics); + kfree(clnt->cl_principal); clnt->cl_metrics = NULL; xprt_put(clnt->cl_xprt); rpciod_down(); diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 23a2b8f6dc49..192453248870 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -113,7 +113,7 @@ out: wake_up(&rpci->waitq); return res; } -EXPORT_SYMBOL(rpc_queue_upcall); +EXPORT_SYMBOL_GPL(rpc_queue_upcall); static inline void rpc_inode_setowner(struct inode *inode, void *private) @@ -126,13 +126,14 @@ rpc_close_pipes(struct inode *inode) { struct rpc_inode *rpci = RPC_I(inode); struct rpc_pipe_ops *ops; + int need_release; mutex_lock(&inode->i_mutex); ops = rpci->ops; if (ops != NULL) { LIST_HEAD(free_list); - spin_lock(&inode->i_lock); + need_release = rpci->nreaders != 0 || rpci->nwriters != 0; rpci->nreaders = 0; list_splice_init(&rpci->in_upcall, &free_list); list_splice_init(&rpci->pipe, &free_list); @@ -141,7 +142,7 @@ rpc_close_pipes(struct inode *inode) spin_unlock(&inode->i_lock); rpc_purge_list(rpci, &free_list, ops->destroy_msg, -EPIPE); rpci->nwriters = 0; - if (ops->release_pipe) + if (need_release && ops->release_pipe) ops->release_pipe(inode); cancel_delayed_work_sync(&rpci->queue_timeout); } @@ -169,16 +170,24 @@ static int rpc_pipe_open(struct inode *inode, struct file *filp) { struct rpc_inode *rpci = RPC_I(inode); + int first_open; int res = -ENXIO; mutex_lock(&inode->i_mutex); - if (rpci->ops != NULL) { - if (filp->f_mode & FMODE_READ) - rpci->nreaders ++; - if (filp->f_mode & FMODE_WRITE) - rpci->nwriters ++; - res = 0; + if (rpci->ops == NULL) + goto out; + first_open = rpci->nreaders == 0 && rpci->nwriters == 0; + if (first_open && rpci->ops->open_pipe) { + res = rpci->ops->open_pipe(inode); + if (res) + goto out; } + if (filp->f_mode & FMODE_READ) + rpci->nreaders++; + if (filp->f_mode & FMODE_WRITE) + rpci->nwriters++; + res = 0; +out: mutex_unlock(&inode->i_mutex); return res; } @@ -188,6 +197,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp) { struct rpc_inode *rpci = RPC_I(inode); struct rpc_pipe_msg *msg; + int last_close; mutex_lock(&inode->i_mutex); if (rpci->ops == NULL) @@ -214,7 +224,8 @@ rpc_pipe_release(struct inode *inode, struct file *filp) rpci->ops->destroy_msg, -EAGAIN); } } - if (rpci->ops->release_pipe) + last_close = rpci->nwriters == 0 && rpci->nreaders == 0; + if (last_close && rpci->ops->release_pipe) rpci->ops->release_pipe(inode); out: mutex_unlock(&inode->i_mutex); @@ -396,6 +407,7 @@ enum { RPCAUTH_nfs, RPCAUTH_portmap, RPCAUTH_statd, + RPCAUTH_nfsd4_cb, RPCAUTH_RootEOF }; @@ -429,6 +441,10 @@ static struct rpc_filelist files[] = { .name = "statd", .mode = S_IFDIR | S_IRUGO | S_IXUGO, }, + [RPCAUTH_nfsd4_cb] = { + .name = "nfsd4_cb", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + }, }; enum { @@ -748,7 +764,7 @@ rpc_rmdir(struct dentry *dentry) * @name: name of pipe * @private: private data to associate with the pipe, for the caller's use * @ops: operations defining the behavior of the pipe: upcall, downcall, - * release_pipe, and destroy_msg. + * release_pipe, open_pipe, and destroy_msg. * @flags: rpc_inode flags * * Data is made available for userspace to read by calls to @@ -808,7 +824,7 @@ err_dput: -ENOMEM); goto out; } -EXPORT_SYMBOL(rpc_mkpipe); +EXPORT_SYMBOL_GPL(rpc_mkpipe); /** * rpc_unlink - remove a pipe @@ -839,7 +855,7 @@ rpc_unlink(struct dentry *dentry) dput(parent); return error; } -EXPORT_SYMBOL(rpc_unlink); +EXPORT_SYMBOL_GPL(rpc_unlink); /* * populate the filesystem diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 34abc91058d8..03ae007641e4 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -270,10 +270,9 @@ static int rpcb_register_netid4(struct sockaddr_in *address_to_register, char buf[32]; /* Construct AF_INET universal address */ - snprintf(buf, sizeof(buf), - NIPQUAD_FMT".%u.%u", - NIPQUAD(address_to_register->sin_addr.s_addr), - port >> 8, port & 0xff); + snprintf(buf, sizeof(buf), "%pI4.%u.%u", + &address_to_register->sin_addr.s_addr, + port >> 8, port & 0xff); map->r_addr = buf; dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with " @@ -305,9 +304,9 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register, snprintf(buf, sizeof(buf), "::.%u.%u", port >> 8, port & 0xff); else - snprintf(buf, sizeof(buf), NIP6_FMT".%u.%u", - NIP6(address_to_register->sin6_addr), - port >> 8, port & 0xff); + snprintf(buf, sizeof(buf), "%pI6.%u.%u", + &address_to_register->sin6_addr, + port >> 8, port & 0xff); map->r_addr = buf; dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with " @@ -422,8 +421,8 @@ int rpcb_getport_sync(struct sockaddr_in *sin, u32 prog, u32 vers, int prot) struct rpc_clnt *rpcb_clnt; int status; - dprintk("RPC: %s(" NIPQUAD_FMT ", %u, %u, %d)\n", - __func__, NIPQUAD(sin->sin_addr.s_addr), prog, vers, prot); + dprintk("RPC: %s(%pI4, %u, %u, %d)\n", + __func__, &sin->sin_addr.s_addr, prog, vers, prot); rpcb_clnt = rpcb_create(NULL, (struct sockaddr *)sin, sizeof(*sin), prot, RPCBVERS_2); @@ -460,6 +459,28 @@ static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbi return rpc_run_task(&task_setup_data); } +/* + * In the case where rpc clients have been cloned, we want to make + * sure that we use the program number/version etc of the actual + * owner of the xprt. To do so, we walk back up the tree of parents + * to find whoever created the transport and/or whoever has the + * autobind flag set. + */ +static struct rpc_clnt *rpcb_find_transport_owner(struct rpc_clnt *clnt) +{ + struct rpc_clnt *parent = clnt->cl_parent; + + while (parent != clnt) { + if (parent->cl_xprt != clnt->cl_xprt) + break; + if (clnt->cl_autobind) + break; + clnt = parent; + parent = parent->cl_parent; + } + return clnt; +} + /** * rpcb_getport_async - obtain the port for a given RPC service on a given host * @task: task that is waiting for portmapper request @@ -469,10 +490,10 @@ static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbi */ void rpcb_getport_async(struct rpc_task *task) { - struct rpc_clnt *clnt = task->tk_client; + struct rpc_clnt *clnt; struct rpc_procinfo *proc; u32 bind_version; - struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_xprt *xprt; struct rpc_clnt *rpcb_clnt; static struct rpcbind_args *map; struct rpc_task *child; @@ -481,13 +502,13 @@ void rpcb_getport_async(struct rpc_task *task) size_t salen; int status; + clnt = rpcb_find_transport_owner(task->tk_client); + xprt = clnt->cl_xprt; + dprintk("RPC: %5u %s(%s, %u, %u, %d)\n", task->tk_pid, __func__, clnt->cl_server, clnt->cl_prog, clnt->cl_vers, xprt->prot); - /* Autobind on cloned rpc clients is discouraged */ - BUG_ON(clnt->cl_parent != clnt); - /* Put self on the wait queue to ensure we get notified if * some other task is already attempting to bind the port */ rpc_sleep_on(&xprt->binding, task, NULL); @@ -549,7 +570,7 @@ void rpcb_getport_async(struct rpc_task *task) status = -ENOMEM; dprintk("RPC: %5u %s: no memory available\n", task->tk_pid, __func__); - goto bailout_nofree; + goto bailout_release_client; } map->r_prog = clnt->cl_prog; map->r_vers = clnt->cl_vers; @@ -569,11 +590,13 @@ void rpcb_getport_async(struct rpc_task *task) task->tk_pid, __func__); return; } - rpc_put_task(child); - task->tk_xprt->stat.bind_count++; + xprt->stat.bind_count++; + rpc_put_task(child); return; +bailout_release_client: + rpc_release_client(rpcb_clnt); bailout_nofree: rpcb_wake_rpcbind_waiters(xprt, status); task->tk_status = status; diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index f24800f2c098..82240e6127b2 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -162,13 +162,9 @@ static void ip_map_request(struct cache_detail *cd, struct ip_map *im = container_of(h, struct ip_map, h); if (ipv6_addr_v4mapped(&(im->m_addr))) { - snprintf(text_addr, 20, NIPQUAD_FMT, - ntohl(im->m_addr.s6_addr32[3]) >> 24 & 0xff, - ntohl(im->m_addr.s6_addr32[3]) >> 16 & 0xff, - ntohl(im->m_addr.s6_addr32[3]) >> 8 & 0xff, - ntohl(im->m_addr.s6_addr32[3]) >> 0 & 0xff); + snprintf(text_addr, 20, "%pI4", &im->m_addr.s6_addr32[3]); } else { - snprintf(text_addr, 40, NIP6_FMT, NIP6(im->m_addr)); + snprintf(text_addr, 40, "%pI6", &im->m_addr); } qword_add(bpp, blen, im->m_class); qword_add(bpp, blen, text_addr); @@ -208,13 +204,13 @@ static int ip_map_parse(struct cache_detail *cd, len = qword_get(&mesg, buf, mlen); if (len <= 0) return -EINVAL; - if (sscanf(buf, NIPQUAD_FMT "%c", &b1, &b2, &b3, &b4, &c) == 4) { + if (sscanf(buf, "%u.%u.%u.%u%c", &b1, &b2, &b3, &b4, &c) == 4) { addr.s6_addr32[0] = 0; addr.s6_addr32[1] = 0; addr.s6_addr32[2] = htonl(0xffff); addr.s6_addr32[3] = htonl((((((b1<<8)|b2)<<8)|b3)<<8)|b4); - } else if (sscanf(buf, NIP6_FMT "%c", + } else if (sscanf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x%c", &b1, &b2, &b3, &b4, &b5, &b6, &b7, &b8, &c) == 8) { addr.s6_addr16[0] = htons(b1); addr.s6_addr16[1] = htons(b2); @@ -278,16 +274,10 @@ static int ip_map_show(struct seq_file *m, dom = im->m_client->h.name; if (ipv6_addr_v4mapped(&addr)) { - seq_printf(m, "%s " NIPQUAD_FMT " %s\n", - im->m_class, - ntohl(addr.s6_addr32[3]) >> 24 & 0xff, - ntohl(addr.s6_addr32[3]) >> 16 & 0xff, - ntohl(addr.s6_addr32[3]) >> 8 & 0xff, - ntohl(addr.s6_addr32[3]) >> 0 & 0xff, - dom); + seq_printf(m, "%s %pI4 %s\n", + im->m_class, &addr.s6_addr32[3], dom); } else { - seq_printf(m, "%s " NIP6_FMT " %s\n", - im->m_class, NIP6(addr), dom); + seq_printf(m, "%s %pI6 %s\n", im->m_class, &addr, dom); } return 0; } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 95293f549e9c..ef3238d665ee 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -250,10 +250,10 @@ static int one_sock_name(char *buf, struct svc_sock *svsk) switch(svsk->sk_sk->sk_family) { case AF_INET: - len = sprintf(buf, "ipv4 %s %u.%u.%u.%u %d\n", - svsk->sk_sk->sk_protocol==IPPROTO_UDP? + len = sprintf(buf, "ipv4 %s %pI4 %d\n", + svsk->sk_sk->sk_protocol == IPPROTO_UDP ? "udp" : "tcp", - NIPQUAD(inet_sk(svsk->sk_sk)->rcv_saddr), + &inet_sk(svsk->sk_sk)->rcv_saddr, inet_sk(svsk->sk_sk)->num); break; default: @@ -1183,7 +1183,11 @@ int svc_addsock(struct svc_serv *serv, else if (so->state > SS_UNCONNECTED) err = -EISCONN; else { - svsk = svc_setup_socket(serv, so, &err, SVC_SOCK_DEFAULTS); + if (!try_module_get(THIS_MODULE)) + err = -ENOENT; + else + svsk = svc_setup_socket(serv, so, &err, + SVC_SOCK_DEFAULTS); if (svsk) { struct sockaddr_storage addr; struct sockaddr *sin = (struct sockaddr *)&addr; @@ -1196,7 +1200,8 @@ int svc_addsock(struct svc_serv *serv, spin_unlock_bh(&serv->sv_lock); svc_xprt_received(&svsk->sk_xprt); err = 0; - } + } else + module_put(THIS_MODULE); } if (err) { sockfd_put(so); diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 79a55d56cc98..406e26de584e 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -28,7 +28,7 @@ xdr_encode_netobj(__be32 *p, const struct xdr_netobj *obj) memcpy(p, obj->data, obj->len); return p + XDR_QUADLEN(obj->len); } -EXPORT_SYMBOL(xdr_encode_netobj); +EXPORT_SYMBOL_GPL(xdr_encode_netobj); __be32 * xdr_decode_netobj(__be32 *p, struct xdr_netobj *obj) @@ -41,7 +41,7 @@ xdr_decode_netobj(__be32 *p, struct xdr_netobj *obj) obj->data = (u8 *) p; return p + XDR_QUADLEN(len); } -EXPORT_SYMBOL(xdr_decode_netobj); +EXPORT_SYMBOL_GPL(xdr_decode_netobj); /** * xdr_encode_opaque_fixed - Encode fixed length opaque data @@ -71,7 +71,7 @@ __be32 *xdr_encode_opaque_fixed(__be32 *p, const void *ptr, unsigned int nbytes) } return p; } -EXPORT_SYMBOL(xdr_encode_opaque_fixed); +EXPORT_SYMBOL_GPL(xdr_encode_opaque_fixed); /** * xdr_encode_opaque - Encode variable length opaque data @@ -86,14 +86,14 @@ __be32 *xdr_encode_opaque(__be32 *p, const void *ptr, unsigned int nbytes) *p++ = htonl(nbytes); return xdr_encode_opaque_fixed(p, ptr, nbytes); } -EXPORT_SYMBOL(xdr_encode_opaque); +EXPORT_SYMBOL_GPL(xdr_encode_opaque); __be32 * xdr_encode_string(__be32 *p, const char *string) { return xdr_encode_array(p, string, strlen(string)); } -EXPORT_SYMBOL(xdr_encode_string); +EXPORT_SYMBOL_GPL(xdr_encode_string); __be32 * xdr_decode_string_inplace(__be32 *p, char **sp, @@ -108,7 +108,7 @@ xdr_decode_string_inplace(__be32 *p, char **sp, *sp = (char *) p; return p + XDR_QUADLEN(len); } -EXPORT_SYMBOL(xdr_decode_string_inplace); +EXPORT_SYMBOL_GPL(xdr_decode_string_inplace); void xdr_encode_pages(struct xdr_buf *xdr, struct page **pages, unsigned int base, @@ -136,7 +136,7 @@ xdr_encode_pages(struct xdr_buf *xdr, struct page **pages, unsigned int base, xdr->buflen += len; xdr->len += len; } -EXPORT_SYMBOL(xdr_encode_pages); +EXPORT_SYMBOL_GPL(xdr_encode_pages); void xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset, @@ -158,7 +158,7 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset, xdr->buflen += len; } -EXPORT_SYMBOL(xdr_inline_pages); +EXPORT_SYMBOL_GPL(xdr_inline_pages); /* * Helper routines for doing 'memmove' like operations on a struct xdr_buf @@ -428,7 +428,7 @@ xdr_shift_buf(struct xdr_buf *buf, size_t len) { xdr_shrink_bufhead(buf, len); } -EXPORT_SYMBOL(xdr_shift_buf); +EXPORT_SYMBOL_GPL(xdr_shift_buf); /** * xdr_init_encode - Initialize a struct xdr_stream for sending data. @@ -465,7 +465,7 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p) iov->iov_len += len; } } -EXPORT_SYMBOL(xdr_init_encode); +EXPORT_SYMBOL_GPL(xdr_init_encode); /** * xdr_reserve_space - Reserve buffer space for sending @@ -492,7 +492,7 @@ __be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes) xdr->buf->len += nbytes; return p; } -EXPORT_SYMBOL(xdr_reserve_space); +EXPORT_SYMBOL_GPL(xdr_reserve_space); /** * xdr_write_pages - Insert a list of pages into an XDR buffer for sending @@ -527,7 +527,7 @@ void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int b buf->buflen += len; buf->len += len; } -EXPORT_SYMBOL(xdr_write_pages); +EXPORT_SYMBOL_GPL(xdr_write_pages); /** * xdr_init_decode - Initialize an xdr_stream for decoding data. @@ -547,7 +547,7 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p) xdr->p = p; xdr->end = (__be32 *)((char *)iov->iov_base + len); } -EXPORT_SYMBOL(xdr_init_decode); +EXPORT_SYMBOL_GPL(xdr_init_decode); /** * xdr_inline_decode - Retrieve non-page XDR data to decode @@ -569,7 +569,7 @@ __be32 * xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes) xdr->p = q; return p; } -EXPORT_SYMBOL(xdr_inline_decode); +EXPORT_SYMBOL_GPL(xdr_inline_decode); /** * xdr_read_pages - Ensure page-based XDR data to decode is aligned at current pointer position @@ -613,7 +613,7 @@ void xdr_read_pages(struct xdr_stream *xdr, unsigned int len) xdr->p = (__be32 *)((char *)iov->iov_base + padding); xdr->end = (__be32 *)((char *)iov->iov_base + end); } -EXPORT_SYMBOL(xdr_read_pages); +EXPORT_SYMBOL_GPL(xdr_read_pages); /** * xdr_enter_page - decode data from the XDR page @@ -638,7 +638,7 @@ void xdr_enter_page(struct xdr_stream *xdr, unsigned int len) xdr->p = (__be32 *)(kaddr + xdr->buf->page_base); xdr->end = (__be32 *)((char *)xdr->p + len); } -EXPORT_SYMBOL(xdr_enter_page); +EXPORT_SYMBOL_GPL(xdr_enter_page); static struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0}; @@ -650,7 +650,7 @@ xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf) buf->page_len = 0; buf->buflen = buf->len = iov->iov_len; } -EXPORT_SYMBOL(xdr_buf_from_iov); +EXPORT_SYMBOL_GPL(xdr_buf_from_iov); /* Sets subbuf to the portion of buf of length len beginning base bytes * from the start of buf. Returns -1 if base of length are out of bounds. */ @@ -699,7 +699,7 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, return -1; return 0; } -EXPORT_SYMBOL(xdr_buf_subsegment); +EXPORT_SYMBOL_GPL(xdr_buf_subsegment); static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len) { @@ -730,7 +730,7 @@ int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, u __read_bytes_from_xdr_buf(&subbuf, obj, len); return 0; } -EXPORT_SYMBOL(read_bytes_from_xdr_buf); +EXPORT_SYMBOL_GPL(read_bytes_from_xdr_buf); static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len) { @@ -774,7 +774,7 @@ xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj) *obj = ntohl(raw); return 0; } -EXPORT_SYMBOL(xdr_decode_word); +EXPORT_SYMBOL_GPL(xdr_decode_word); int xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) @@ -783,7 +783,7 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) return write_bytes_to_xdr_buf(buf, base, &raw, sizeof(obj)); } -EXPORT_SYMBOL(xdr_encode_word); +EXPORT_SYMBOL_GPL(xdr_encode_word); /* If the netobj starting offset bytes from the start of xdr_buf is contained * entirely in the head or the tail, set object to point to it; otherwise @@ -821,7 +821,7 @@ int xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, unsigned in __read_bytes_from_xdr_buf(&subbuf, obj->data, obj->len); return 0; } -EXPORT_SYMBOL(xdr_buf_read_netobj); +EXPORT_SYMBOL_GPL(xdr_buf_read_netobj); /* Returns 0 on success, or else a negative error code. */ static int @@ -1027,7 +1027,7 @@ xdr_decode_array2(struct xdr_buf *buf, unsigned int base, return xdr_xcode_array2(buf, base, desc, 0); } -EXPORT_SYMBOL(xdr_decode_array2); +EXPORT_SYMBOL_GPL(xdr_decode_array2); int xdr_encode_array2(struct xdr_buf *buf, unsigned int base, @@ -1039,7 +1039,7 @@ xdr_encode_array2(struct xdr_buf *buf, unsigned int base, return xdr_xcode_array2(buf, base, desc, 1); } -EXPORT_SYMBOL(xdr_encode_array2); +EXPORT_SYMBOL_GPL(xdr_encode_array2); int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, @@ -1106,5 +1106,5 @@ xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, out: return ret; } -EXPORT_SYMBOL(xdr_process_buf); +EXPORT_SYMBOL_GPL(xdr_process_buf); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 99a52aabe332..29e401bb612e 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -108,13 +108,10 @@ int xprt_register_transport(struct xprt_class *transport) goto out; } - result = -EINVAL; - if (try_module_get(THIS_MODULE)) { - list_add_tail(&transport->list, &xprt_list); - printk(KERN_INFO "RPC: Registered %s transport module.\n", - transport->name); - result = 0; - } + list_add_tail(&transport->list, &xprt_list); + printk(KERN_INFO "RPC: Registered %s transport module.\n", + transport->name); + result = 0; out: spin_unlock(&xprt_list_lock); @@ -143,7 +140,6 @@ int xprt_unregister_transport(struct xprt_class *transport) "RPC: Unregistered %s transport module.\n", transport->name); list_del_init(&transport->list); - module_put(THIS_MODULE); goto out; } } diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 5c1954d28d09..14106d26bb95 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -118,6 +118,10 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, } if (xdrbuf->tail[0].iov_len) { + /* the rpcrdma protocol allows us to omit any trailing + * xdr pad bytes, saving the server an RDMA operation. */ + if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) + return n; if (n == nsegs) return 0; seg[n].mr_page = NULL; @@ -508,8 +512,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) if (hdrlen == 0) return -1; - dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd\n" - " headerp 0x%p base 0x%p lkey 0x%x\n", + dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" + " headerp 0x%p base 0x%p lkey 0x%x\n", __func__, transfertypes[wtype], hdrlen, rpclen, padlen, headerp, base, req->rl_iov.lkey); @@ -594,7 +598,7 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b * Scatter inline received data back into provided iov's. */ static void -rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len) +rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) { int i, npages, curlen, olen; char *destp; @@ -660,6 +664,13 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len) } else rqst->rq_rcv_buf.tail[0].iov_len = 0; + if (pad) { + /* implicit padding on terminal chunk */ + unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base; + while (pad--) + p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0; + } + if (copy_len) dprintk("RPC: %s: %d bytes in" " %d extra segments (%d lost)\n", @@ -681,12 +692,14 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep) struct rpc_xprt *xprt = ep->rep_xprt; spin_lock_bh(&xprt->transport_lock); + if (++xprt->connect_cookie == 0) /* maintain a reserved value */ + ++xprt->connect_cookie; if (ep->rep_connected > 0) { if (!xprt_test_and_set_connected(xprt)) xprt_wake_pending_tasks(xprt, 0); } else { if (xprt_test_and_clear_connected(xprt)) - xprt_wake_pending_tasks(xprt, ep->rep_connected); + xprt_wake_pending_tasks(xprt, -ENOTCONN); } spin_unlock_bh(&xprt->transport_lock); } @@ -792,14 +805,20 @@ repost: ((unsigned char *)iptr - (unsigned char *)headerp); status = rep->rr_len + rdmalen; r_xprt->rx_stats.total_rdma_reply += rdmalen; + /* special case - last chunk may omit padding */ + if (rdmalen &= 3) { + rdmalen = 4 - rdmalen; + status += rdmalen; + } } else { /* else ordinary inline */ + rdmalen = 0; iptr = (__be32 *)((unsigned char *)headerp + 28); rep->rr_len -= 28; /*sizeof *headerp;*/ status = rep->rr_len; } /* Fix up the rpc results for upper layer */ - rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len); + rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen); break; case htonl(RDMA_NOMSG): diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index a4756576d687..629a28764da9 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -646,8 +646,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) ret = rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt); if (ret > 0) { /* read-list posted, defer until data received from client. */ - svc_xprt_received(xprt); - return 0; + goto defer; } if (ret < 0) { /* Post of read-list failed, free context. */ @@ -679,6 +678,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) * close bit and call svc_xprt_delete */ set_bit(XPT_CLOSE, &xprt->xpt_flags); +defer: svc_xprt_received(xprt); return 0; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 9a7a8e7ae038..a3334e3b73cc 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -69,7 +69,7 @@ * array is only concerned with the reply we are assured that we have * on extra page for the RPCRMDA header. */ -int fast_reg_xdr(struct svcxprt_rdma *xprt, +static int fast_reg_xdr(struct svcxprt_rdma *xprt, struct xdr_buf *xdr, struct svc_rdma_req_map *vec) { diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 6fb493cbd29f..3d810e7df3fb 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -61,7 +61,7 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt); static void rq_cq_reap(struct svcxprt_rdma *xprt); static void sq_cq_reap(struct svcxprt_rdma *xprt); -DECLARE_TASKLET(dto_tasklet, dto_tasklet_func, 0UL); +static DECLARE_TASKLET(dto_tasklet, dto_tasklet_func, 0UL); static DEFINE_SPINLOCK(dto_lock); static LIST_HEAD(dto_xprt_q); @@ -827,7 +827,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct rdma_conn_param conn_param; struct ib_qp_init_attr qp_attr; struct ib_device_attr devattr; - int dma_mr_acc; + int uninitialized_var(dma_mr_acc); int need_dma_mr; int ret; int i; @@ -1048,21 +1048,21 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk("svcrdma: new connection %p accepted with the following " "attributes:\n" - " local_ip : %d.%d.%d.%d\n" + " local_ip : %pI4\n" " local_port : %d\n" - " remote_ip : %d.%d.%d.%d\n" + " remote_ip : %pI4\n" " remote_port : %d\n" " max_sge : %d\n" " sq_depth : %d\n" " max_requests : %d\n" " ord : %d\n", newxprt, - NIPQUAD(((struct sockaddr_in *)&newxprt->sc_cm_id-> - route.addr.src_addr)->sin_addr.s_addr), + &((struct sockaddr_in *)&newxprt->sc_cm_id-> + route.addr.src_addr)->sin_addr.s_addr, ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id-> route.addr.src_addr)->sin_port), - NIPQUAD(((struct sockaddr_in *)&newxprt->sc_cm_id-> - route.addr.dst_addr)->sin_addr.s_addr), + &((struct sockaddr_in *)&newxprt->sc_cm_id-> + route.addr.dst_addr)->sin_addr.s_addr, ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id-> route.addr.dst_addr)->sin_port), newxprt->sc_max_sge, diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index a564c1a39ec5..1dd6123070e9 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -70,11 +70,8 @@ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_inline_write_padding; -#if !RPCRDMA_PERSISTENT_REGISTRATION -static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_REGISTER; /* FMR? */ -#else -static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_ALLPHYSICAL; -#endif +static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; + int xprt_rdma_pad_optimize = 0; #ifdef RPC_DEBUG @@ -140,6 +137,14 @@ static ctl_table xr_tunables_table[] = { .extra2 = &max_memreg, }, { + .ctl_name = CTL_UNNUMBERED, + .procname = "rdma_pad_optimize", + .data = &xprt_rdma_pad_optimize, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0, }, }; @@ -169,7 +174,7 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt) buf = kzalloc(20, GFP_KERNEL); if (buf) - snprintf(buf, 20, NIPQUAD_FMT, NIPQUAD(addr->sin_addr.s_addr)); + snprintf(buf, 20, "%pI4", &addr->sin_addr.s_addr); xprt->address_strings[RPC_DISPLAY_ADDR] = buf; buf = kzalloc(8, GFP_KERNEL); @@ -181,8 +186,8 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt) buf = kzalloc(48, GFP_KERNEL); if (buf) - snprintf(buf, 48, "addr="NIPQUAD_FMT" port=%u proto=%s", - NIPQUAD(addr->sin_addr.s_addr), + snprintf(buf, 48, "addr=%pI4 port=%u proto=%s", + &addr->sin_addr.s_addr, ntohs(addr->sin_port), "rdma"); xprt->address_strings[RPC_DISPLAY_ALL] = buf; @@ -199,8 +204,8 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt) buf = kzalloc(30, GFP_KERNEL); if (buf) - snprintf(buf, 30, NIPQUAD_FMT".%u.%u", - NIPQUAD(addr->sin_addr.s_addr), + snprintf(buf, 30, "%pI4.%u.%u", + &addr->sin_addr.s_addr, ntohs(addr->sin_port) >> 8, ntohs(addr->sin_port) & 0xff); xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR] = buf; @@ -364,8 +369,8 @@ xprt_setup_rdma(struct xprt_create *args) if (ntohs(sin->sin_port) != 0) xprt_set_bound(xprt); - dprintk("RPC: %s: %u.%u.%u.%u:%u\n", __func__, - NIPQUAD(sin->sin_addr.s_addr), ntohs(sin->sin_port)); + dprintk("RPC: %s: %pI4:%u\n", + __func__, &sin->sin_addr.s_addr, ntohs(sin->sin_port)); /* Set max requests */ cdata.max_requests = xprt->max_reqs; @@ -458,6 +463,8 @@ xprt_rdma_close(struct rpc_xprt *xprt) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); dprintk("RPC: %s: closing\n", __func__); + if (r_xprt->rx_ep.rep_connected > 0) + xprt->reestablish_timeout = 0; xprt_disconnect_done(xprt); (void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia); } @@ -485,6 +492,11 @@ xprt_rdma_connect(struct rpc_task *task) /* Reconnect */ schedule_delayed_work(&r_xprt->rdma_connect, xprt->reestablish_timeout); + xprt->reestablish_timeout <<= 1; + if (xprt->reestablish_timeout > (30 * HZ)) + xprt->reestablish_timeout = (30 * HZ); + else if (xprt->reestablish_timeout < (5 * HZ)) + xprt->reestablish_timeout = (5 * HZ); } else { schedule_delayed_work(&r_xprt->rdma_connect, 0); if (!RPC_IS_ASYNC(task)) @@ -591,6 +603,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) } dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); out: + req->rl_connect_cookie = 0; /* our reserved value */ return req->rl_xdr_buf; outfail: @@ -694,13 +707,21 @@ xprt_rdma_send_request(struct rpc_task *task) req->rl_reply->rr_xprt = xprt; } - if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) { - xprt_disconnect_done(xprt); - return -ENOTCONN; /* implies disconnect */ - } + /* Must suppress retransmit to maintain credits */ + if (req->rl_connect_cookie == xprt->connect_cookie) + goto drop_connection; + req->rl_connect_cookie = xprt->connect_cookie; + + if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) + goto drop_connection; + task->tk_bytes_sent += rqst->rq_snd_buf.len; rqst->rq_bytes_sent = 0; return 0; + +drop_connection: + xprt_disconnect_done(xprt); + return -ENOTCONN; /* implies disconnect */ } static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) @@ -770,7 +791,7 @@ static void __exit xprt_rdma_cleanup(void) { int rc; - dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n"); + dprintk(KERN_INFO "RPCRDMA Module Removed, deregister RPC RDMA transport\n"); #ifdef RPC_DEBUG if (sunrpc_table_header) { unregister_sysctl_table(sunrpc_table_header); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 8ea283ecc522..3b21e0cc5e69 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -276,7 +276,9 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) struct rpcrdma_xprt *xprt = id->context; struct rpcrdma_ia *ia = &xprt->rx_ia; struct rpcrdma_ep *ep = &xprt->rx_ep; +#ifdef RPC_DEBUG struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr; +#endif struct ib_qp_attr attr; struct ib_qp_init_attr iattr; int connstate = 0; @@ -284,6 +286,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: case RDMA_CM_EVENT_ROUTE_RESOLVED: + ia->ri_async_rc = 0; complete(&ia->ri_done); break; case RDMA_CM_EVENT_ADDR_ERROR: @@ -322,12 +325,11 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_DEVICE_REMOVAL: connstate = -ENODEV; connected: - dprintk("RPC: %s: %s: %u.%u.%u.%u:%u" - " (ep 0x%p event 0x%x)\n", + dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n", __func__, (event->event <= 11) ? conn[event->event] : "unknown connection error", - NIPQUAD(addr->sin_addr.s_addr), + &addr->sin_addr.s_addr, ntohs(addr->sin_port), ep, event->event); atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1); @@ -338,13 +340,31 @@ connected: wake_up_all(&ep->rep_connect_wait); break; default: - ia->ri_async_rc = -EINVAL; - dprintk("RPC: %s: unexpected CM event %X\n", + dprintk("RPC: %s: unexpected CM event %d\n", __func__, event->event); - complete(&ia->ri_done); break; } +#ifdef RPC_DEBUG + if (connstate == 1) { + int ird = attr.max_dest_rd_atomic; + int tird = ep->rep_remote_cma.responder_resources; + printk(KERN_INFO "rpcrdma: connection to %pI4:%u " + "on %s, memreg %d slots %d ird %d%s\n", + &addr->sin_addr.s_addr, + ntohs(addr->sin_port), + ia->ri_id->device->name, + ia->ri_memreg_strategy, + xprt->rx_buf.rb_max_requests, + ird, ird < 4 && ird < tird / 2 ? " (low!)" : ""); + } else if (connstate < 0) { + printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n", + &addr->sin_addr.s_addr, + ntohs(addr->sin_port), + connstate); + } +#endif + return 0; } @@ -355,6 +375,8 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rdma_cm_id *id; int rc; + init_completion(&ia->ri_done); + id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP); if (IS_ERR(id)) { rc = PTR_ERR(id); @@ -363,26 +385,28 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, return id; } - ia->ri_async_rc = 0; + ia->ri_async_rc = -ETIMEDOUT; rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT); if (rc) { dprintk("RPC: %s: rdma_resolve_addr() failed %i\n", __func__, rc); goto out; } - wait_for_completion(&ia->ri_done); + wait_for_completion_interruptible_timeout(&ia->ri_done, + msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); rc = ia->ri_async_rc; if (rc) goto out; - ia->ri_async_rc = 0; + ia->ri_async_rc = -ETIMEDOUT; rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); if (rc) { dprintk("RPC: %s: rdma_resolve_route() failed %i\n", __func__, rc); goto out; } - wait_for_completion(&ia->ri_done); + wait_for_completion_interruptible_timeout(&ia->ri_done, + msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); rc = ia->ri_async_rc; if (rc) goto out; @@ -423,11 +447,10 @@ rpcrdma_clean_cq(struct ib_cq *cq) int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) { - int rc; + int rc, mem_priv; + struct ib_device_attr devattr; struct rpcrdma_ia *ia = &xprt->rx_ia; - init_completion(&ia->ri_done); - ia->ri_id = rpcrdma_create_id(xprt, ia, addr); if (IS_ERR(ia->ri_id)) { rc = PTR_ERR(ia->ri_id); @@ -443,6 +466,73 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) } /* + * Query the device to determine if the requested memory + * registration strategy is supported. If it isn't, set the + * strategy to a globally supported model. + */ + rc = ib_query_device(ia->ri_id->device, &devattr); + if (rc) { + dprintk("RPC: %s: ib_query_device failed %d\n", + __func__, rc); + goto out2; + } + + if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) { + ia->ri_have_dma_lkey = 1; + ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey; + } + + switch (memreg) { + case RPCRDMA_MEMWINDOWS: + case RPCRDMA_MEMWINDOWS_ASYNC: + if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) { + dprintk("RPC: %s: MEMWINDOWS registration " + "specified but not supported by adapter, " + "using slower RPCRDMA_REGISTER\n", + __func__); + memreg = RPCRDMA_REGISTER; + } + break; + case RPCRDMA_MTHCAFMR: + if (!ia->ri_id->device->alloc_fmr) { +#if RPCRDMA_PERSISTENT_REGISTRATION + dprintk("RPC: %s: MTHCAFMR registration " + "specified but not supported by adapter, " + "using riskier RPCRDMA_ALLPHYSICAL\n", + __func__); + memreg = RPCRDMA_ALLPHYSICAL; +#else + dprintk("RPC: %s: MTHCAFMR registration " + "specified but not supported by adapter, " + "using slower RPCRDMA_REGISTER\n", + __func__); + memreg = RPCRDMA_REGISTER; +#endif + } + break; + case RPCRDMA_FRMR: + /* Requires both frmr reg and local dma lkey */ + if ((devattr.device_cap_flags & + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { +#if RPCRDMA_PERSISTENT_REGISTRATION + dprintk("RPC: %s: FRMR registration " + "specified but not supported by adapter, " + "using riskier RPCRDMA_ALLPHYSICAL\n", + __func__); + memreg = RPCRDMA_ALLPHYSICAL; +#else + dprintk("RPC: %s: FRMR registration " + "specified but not supported by adapter, " + "using slower RPCRDMA_REGISTER\n", + __func__); + memreg = RPCRDMA_REGISTER; +#endif + } + break; + } + + /* * Optionally obtain an underlying physical identity mapping in * order to do a memory window-based bind. This base registration * is protected from remote access - that is enabled only by binding @@ -450,22 +540,28 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) * revoked after the corresponding completion similar to a storage * adapter. */ - if (memreg > RPCRDMA_REGISTER) { - int mem_priv = IB_ACCESS_LOCAL_WRITE; - switch (memreg) { + switch (memreg) { + case RPCRDMA_BOUNCEBUFFERS: + case RPCRDMA_REGISTER: + case RPCRDMA_FRMR: + break; #if RPCRDMA_PERSISTENT_REGISTRATION - case RPCRDMA_ALLPHYSICAL: - mem_priv |= IB_ACCESS_REMOTE_WRITE; - mem_priv |= IB_ACCESS_REMOTE_READ; - break; + case RPCRDMA_ALLPHYSICAL: + mem_priv = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ; + goto register_setup; #endif - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - mem_priv |= IB_ACCESS_MW_BIND; - break; - default: + case RPCRDMA_MEMWINDOWS_ASYNC: + case RPCRDMA_MEMWINDOWS: + mem_priv = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_MW_BIND; + goto register_setup; + case RPCRDMA_MTHCAFMR: + if (ia->ri_have_dma_lkey) break; - } + mem_priv = IB_ACCESS_LOCAL_WRITE; + register_setup: ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); if (IS_ERR(ia->ri_bind_mem)) { printk(KERN_ALERT "%s: ib_get_dma_mr for " @@ -475,7 +571,15 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) memreg = RPCRDMA_REGISTER; ia->ri_bind_mem = NULL; } + break; + default: + printk(KERN_ERR "%s: invalid memory registration mode %d\n", + __func__, memreg); + rc = -EINVAL; + goto out2; } + dprintk("RPC: %s: memory registration strategy is %d\n", + __func__, memreg); /* Else will do memory reg/dereg for each chunk */ ia->ri_memreg_strategy = memreg; @@ -483,6 +587,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) return 0; out2: rdma_destroy_id(ia->ri_id); + ia->ri_id = NULL; out1: return rc; } @@ -503,15 +608,17 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia) dprintk("RPC: %s: ib_dereg_mr returned %i\n", __func__, rc); } - if (ia->ri_id != NULL && !IS_ERR(ia->ri_id) && ia->ri_id->qp) - rdma_destroy_qp(ia->ri_id); + if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { + if (ia->ri_id->qp) + rdma_destroy_qp(ia->ri_id); + rdma_destroy_id(ia->ri_id); + ia->ri_id = NULL; + } if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) { rc = ib_dealloc_pd(ia->ri_pd); dprintk("RPC: %s: ib_dealloc_pd returned %i\n", __func__, rc); } - if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) - rdma_destroy_id(ia->ri_id); } /* @@ -541,6 +648,12 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.srq = NULL; ep->rep_attr.cap.max_send_wr = cdata->max_requests; switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + /* Add room for frmr register and invalidate WRs */ + ep->rep_attr.cap.max_send_wr *= 3; + if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) + return -EINVAL; + break; case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: /* Add room for mw_binds+unbinds - overkill! */ @@ -617,29 +730,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_remote_cma.private_data_len = 0; /* Client offers RDMA Read but does not initiate */ - switch (ia->ri_memreg_strategy) { - case RPCRDMA_BOUNCEBUFFERS: + ep->rep_remote_cma.initiator_depth = 0; + if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS) ep->rep_remote_cma.responder_resources = 0; - break; - case RPCRDMA_MTHCAFMR: - case RPCRDMA_REGISTER: - ep->rep_remote_cma.responder_resources = cdata->max_requests * - (RPCRDMA_MAX_DATA_SEGS / 8); - break; - case RPCRDMA_MEMWINDOWS: - case RPCRDMA_MEMWINDOWS_ASYNC: -#if RPCRDMA_PERSISTENT_REGISTRATION - case RPCRDMA_ALLPHYSICAL: -#endif - ep->rep_remote_cma.responder_resources = cdata->max_requests * - (RPCRDMA_MAX_DATA_SEGS / 2); - break; - default: - break; - } - if (ep->rep_remote_cma.responder_resources > devattr.max_qp_rd_atom) + else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ + ep->rep_remote_cma.responder_resources = 32; + else ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom; - ep->rep_remote_cma.initiator_depth = 0; ep->rep_remote_cma.retry_count = 7; ep->rep_remote_cma.flow_control = 0; @@ -679,21 +776,16 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) if (rc) dprintk("RPC: %s: rpcrdma_ep_disconnect" " returned %i\n", __func__, rc); + rdma_destroy_qp(ia->ri_id); + ia->ri_id->qp = NULL; } - ep->rep_func = NULL; - /* padding - could be done in rpcrdma_buffer_destroy... */ if (ep->rep_pad_mr) { rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad); ep->rep_pad_mr = NULL; } - if (ia->ri_id->qp) { - rdma_destroy_qp(ia->ri_id); - ia->ri_id->qp = NULL; - } - rpcrdma_clean_cq(ep->rep_cq); rc = ib_destroy_cq(ep->rep_cq); if (rc) @@ -712,9 +804,8 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) struct rdma_cm_id *id; int rc = 0; int retry_count = 0; - int reconnect = (ep->rep_connected != 0); - if (reconnect) { + if (ep->rep_connected != 0) { struct rpcrdma_xprt *xprt; retry: rc = rpcrdma_ep_disconnect(ep, ia); @@ -745,6 +836,7 @@ retry: goto out; } /* END TEMP */ + rdma_destroy_qp(ia->ri_id); rdma_destroy_id(ia->ri_id); ia->ri_id = id; } @@ -769,14 +861,6 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) { } } - /* Theoretically a client initiator_depth > 0 is not needed, - * but many peers fail to complete the connection unless they - * == responder_resources! */ - if (ep->rep_remote_cma.initiator_depth != - ep->rep_remote_cma.responder_resources) - ep->rep_remote_cma.initiator_depth = - ep->rep_remote_cma.responder_resources; - ep->rep_connected = 0; rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); @@ -786,9 +870,6 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) { goto out; } - if (reconnect) - return 0; - wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); /* @@ -805,14 +886,16 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) { if (ep->rep_connected <= 0) { /* Sometimes, the only way to reliably connect to remote * CMs is to use same nonzero values for ORD and IRD. */ - ep->rep_remote_cma.initiator_depth = - ep->rep_remote_cma.responder_resources; - if (ep->rep_remote_cma.initiator_depth == 0) - ++ep->rep_remote_cma.initiator_depth; - if (ep->rep_remote_cma.responder_resources == 0) - ++ep->rep_remote_cma.responder_resources; - if (retry_count++ == 0) + if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 && + (ep->rep_remote_cma.responder_resources == 0 || + ep->rep_remote_cma.initiator_depth != + ep->rep_remote_cma.responder_resources)) { + if (ep->rep_remote_cma.responder_resources == 0) + ep->rep_remote_cma.responder_resources = 1; + ep->rep_remote_cma.initiator_depth = + ep->rep_remote_cma.responder_resources; goto retry; + } rc = ep->rep_connected; } else { dprintk("RPC: %s: connected\n", __func__); @@ -863,6 +946,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, char *p; size_t len; int i, rc; + struct rpcrdma_mw *r; buf->rb_max_requests = cdata->max_requests; spin_lock_init(&buf->rb_lock); @@ -873,7 +957,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, * 2. arrays of struct rpcrdma_req to fill in pointers * 3. array of struct rpcrdma_rep for replies * 4. padding, if any - * 5. mw's, if any + * 5. mw's, fmr's or frmr's, if any * Send/recv buffers in req/rep need to be registered */ @@ -881,6 +965,10 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *)); len += cdata->padding; switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + len += buf->rb_max_requests * RPCRDMA_MAX_SEGS * + sizeof(struct rpcrdma_mw); + break; case RPCRDMA_MTHCAFMR: /* TBD we are perhaps overallocating here */ len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * @@ -927,15 +1015,37 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, * and also reduce unbind-to-bind collision. */ INIT_LIST_HEAD(&buf->rb_mws); + r = (struct rpcrdma_mw *)p; switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) { + r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, + RPCRDMA_MAX_SEGS); + if (IS_ERR(r->r.frmr.fr_mr)) { + rc = PTR_ERR(r->r.frmr.fr_mr); + dprintk("RPC: %s: ib_alloc_fast_reg_mr" + " failed %i\n", __func__, rc); + goto out; + } + r->r.frmr.fr_pgl = + ib_alloc_fast_reg_page_list(ia->ri_id->device, + RPCRDMA_MAX_SEGS); + if (IS_ERR(r->r.frmr.fr_pgl)) { + rc = PTR_ERR(r->r.frmr.fr_pgl); + dprintk("RPC: %s: " + "ib_alloc_fast_reg_page_list " + "failed %i\n", __func__, rc); + goto out; + } + list_add(&r->mw_list, &buf->rb_mws); + ++r; + } + break; case RPCRDMA_MTHCAFMR: - { - struct rpcrdma_mw *r = (struct rpcrdma_mw *)p; - struct ib_fmr_attr fa = { - RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT - }; /* TBD we are perhaps overallocating here */ for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { + static struct ib_fmr_attr fa = + { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT }; r->r.fmr = ib_alloc_fmr(ia->ri_pd, IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ, &fa); @@ -948,12 +1058,9 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, list_add(&r->mw_list, &buf->rb_mws); ++r; } - } break; case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: - { - struct rpcrdma_mw *r = (struct rpcrdma_mw *)p; /* Allocate one extra request's worth, for full cycling */ for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { r->r.mw = ib_alloc_mw(ia->ri_pd); @@ -966,7 +1073,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, list_add(&r->mw_list, &buf->rb_mws); ++r; } - } break; default: break; @@ -1046,6 +1152,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { int rc, i; struct rpcrdma_ia *ia = rdmab_to_ia(buf); + struct rpcrdma_mw *r; /* clean up in reverse order from create * 1. recv mr memory (mr free, then kfree) @@ -1065,11 +1172,19 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) } if (buf->rb_send_bufs && buf->rb_send_bufs[i]) { while (!list_empty(&buf->rb_mws)) { - struct rpcrdma_mw *r; r = list_entry(buf->rb_mws.next, struct rpcrdma_mw, mw_list); list_del(&r->mw_list); switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + rc = ib_dereg_mr(r->r.frmr.fr_mr); + if (rc) + dprintk("RPC: %s:" + " ib_dereg_mr" + " failed %i\n", + __func__, rc); + ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); + break; case RPCRDMA_MTHCAFMR: rc = ib_dealloc_fmr(r->r.fmr); if (rc) @@ -1115,6 +1230,8 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) { struct rpcrdma_req *req; unsigned long flags; + int i; + struct rpcrdma_mw *r; spin_lock_irqsave(&buffers->rb_lock, flags); if (buffers->rb_send_index == buffers->rb_max_requests) { @@ -1135,9 +1252,8 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) } buffers->rb_send_bufs[buffers->rb_send_index++] = NULL; if (!list_empty(&buffers->rb_mws)) { - int i = RPCRDMA_MAX_SEGS - 1; + i = RPCRDMA_MAX_SEGS - 1; do { - struct rpcrdma_mw *r; r = list_entry(buffers->rb_mws.next, struct rpcrdma_mw, mw_list); list_del(&r->mw_list); @@ -1171,6 +1287,7 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) req->rl_reply = NULL; } switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: case RPCRDMA_MTHCAFMR: case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: @@ -1252,7 +1369,11 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, va, len, DMA_BIDIRECTIONAL); iov->length = len; - if (ia->ri_bind_mem != NULL) { + if (ia->ri_have_dma_lkey) { + *mrp = NULL; + iov->lkey = ia->ri_dma_lkey; + return 0; + } else if (ia->ri_bind_mem != NULL) { *mrp = NULL; iov->lkey = ia->ri_bind_mem->lkey; return 0; @@ -1329,15 +1450,292 @@ rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg) seg->mr_dma, seg->mr_dmalen, seg->mr_dir); } +static int +rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia, + struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_mr_seg *seg1 = seg; + struct ib_send_wr frmr_wr, *bad_wr; + u8 key; + int len, pageoff; + int i, rc; + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (*nsegs > RPCRDMA_MAX_DATA_SEGS) + *nsegs = RPCRDMA_MAX_DATA_SEGS; + for (i = 0; i < *nsegs;) { + rpcrdma_map_one(ia, seg, writing); + seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < *nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + dprintk("RPC: %s: Using frmr %p to map %d segments\n", + __func__, seg1->mr_chunk.rl_mw, i); + + /* Bump the key */ + key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); + ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); + + /* Prepare FRMR WR */ + memset(&frmr_wr, 0, sizeof frmr_wr); + frmr_wr.opcode = IB_WR_FAST_REG_MR; + frmr_wr.send_flags = 0; /* unsignaled */ + frmr_wr.wr.fast_reg.iova_start = (unsigned long)seg1->mr_dma; + frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl; + frmr_wr.wr.fast_reg.page_list_len = i; + frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT; + frmr_wr.wr.fast_reg.access_flags = (writing ? + IB_ACCESS_REMOTE_WRITE : IB_ACCESS_REMOTE_READ); + frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + + rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr); + + if (rc) { + dprintk("RPC: %s: failed ib_post_send for register," + " status %i\n", __func__, rc); + while (i--) + rpcrdma_unmap_one(ia, --seg); + } else { + seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + } + *nsegs = i; + return rc; +} + +static int +rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_mr_seg *seg1 = seg; + struct ib_send_wr invalidate_wr, *bad_wr; + int rc; + + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + + memset(&invalidate_wr, 0, sizeof invalidate_wr); + invalidate_wr.opcode = IB_WR_LOCAL_INV; + invalidate_wr.send_flags = 0; /* unsignaled */ + invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + + rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); + if (rc) + dprintk("RPC: %s: failed ib_post_send for invalidate," + " status %i\n", __func__, rc); + return rc; +} + +static int +rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia) +{ + struct rpcrdma_mr_seg *seg1 = seg; + u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; + int len, pageoff, i, rc; + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (*nsegs > RPCRDMA_MAX_DATA_SEGS) + *nsegs = RPCRDMA_MAX_DATA_SEGS; + for (i = 0; i < *nsegs;) { + rpcrdma_map_one(ia, seg, writing); + physaddrs[i] = seg->mr_dma; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < *nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr, + physaddrs, i, seg1->mr_dma); + if (rc) { + dprintk("RPC: %s: failed ib_map_phys_fmr " + "%u@0x%llx+%i (%d)... status %i\n", __func__, + len, (unsigned long long)seg1->mr_dma, + pageoff, i, rc); + while (i--) + rpcrdma_unmap_one(ia, --seg); + } else { + seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + } + *nsegs = i; + return rc; +} + +static int +rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia) +{ + struct rpcrdma_mr_seg *seg1 = seg; + LIST_HEAD(l); + int rc; + + list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l); + rc = ib_unmap_fmr(&l); + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + if (rc) + dprintk("RPC: %s: failed ib_unmap_fmr," + " status %i\n", __func__, rc); + return rc; +} + +static int +rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia, + struct rpcrdma_xprt *r_xprt) +{ + int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE : + IB_ACCESS_REMOTE_READ); + struct ib_mw_bind param; + int rc; + + *nsegs = 1; + rpcrdma_map_one(ia, seg, writing); + param.mr = ia->ri_bind_mem; + param.wr_id = 0ULL; /* no send cookie */ + param.addr = seg->mr_dma; + param.length = seg->mr_len; + param.send_flags = 0; + param.mw_access_flags = mem_priv; + + DECR_CQCOUNT(&r_xprt->rx_ep); + rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m); + if (rc) { + dprintk("RPC: %s: failed ib_bind_mw " + "%u@0x%llx status %i\n", + __func__, seg->mr_len, + (unsigned long long)seg->mr_dma, rc); + rpcrdma_unmap_one(ia, seg); + } else { + seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey; + seg->mr_base = param.addr; + seg->mr_nsegs = 1; + } + return rc; +} + +static int +rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia, + struct rpcrdma_xprt *r_xprt, void **r) +{ + struct ib_mw_bind param; + LIST_HEAD(l); + int rc; + + BUG_ON(seg->mr_nsegs != 1); + param.mr = ia->ri_bind_mem; + param.addr = 0ULL; /* unbind */ + param.length = 0; + param.mw_access_flags = 0; + if (*r) { + param.wr_id = (u64) (unsigned long) *r; + param.send_flags = IB_SEND_SIGNALED; + INIT_CQCOUNT(&r_xprt->rx_ep); + } else { + param.wr_id = 0ULL; + param.send_flags = 0; + DECR_CQCOUNT(&r_xprt->rx_ep); + } + rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m); + rpcrdma_unmap_one(ia, seg); + if (rc) + dprintk("RPC: %s: failed ib_(un)bind_mw," + " status %i\n", __func__, rc); + else + *r = NULL; /* will upcall on completion */ + return rc; +} + +static int +rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia) +{ + int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE : + IB_ACCESS_REMOTE_READ); + struct rpcrdma_mr_seg *seg1 = seg; + struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS]; + int len, i, rc = 0; + + if (*nsegs > RPCRDMA_MAX_DATA_SEGS) + *nsegs = RPCRDMA_MAX_DATA_SEGS; + for (len = 0, i = 0; i < *nsegs;) { + rpcrdma_map_one(ia, seg, writing); + ipb[i].addr = seg->mr_dma; + ipb[i].size = seg->mr_len; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < *nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) + break; + } + seg1->mr_base = seg1->mr_dma; + seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd, + ipb, i, mem_priv, &seg1->mr_base); + if (IS_ERR(seg1->mr_chunk.rl_mr)) { + rc = PTR_ERR(seg1->mr_chunk.rl_mr); + dprintk("RPC: %s: failed ib_reg_phys_mr " + "%u@0x%llx (%d)... status %i\n", + __func__, len, + (unsigned long long)seg1->mr_dma, i, rc); + while (i--) + rpcrdma_unmap_one(ia, --seg); + } else { + seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey; + seg1->mr_nsegs = i; + seg1->mr_len = len; + } + *nsegs = i; + return rc; +} + +static int +rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia) +{ + struct rpcrdma_mr_seg *seg1 = seg; + int rc; + + rc = ib_dereg_mr(seg1->mr_chunk.rl_mr); + seg1->mr_chunk.rl_mr = NULL; + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + if (rc) + dprintk("RPC: %s: failed ib_dereg_mr," + " status %i\n", __func__, rc); + return rc; +} + int rpcrdma_register_external(struct rpcrdma_mr_seg *seg, int nsegs, int writing, struct rpcrdma_xprt *r_xprt) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE : - IB_ACCESS_REMOTE_READ); - struct rpcrdma_mr_seg *seg1 = seg; - int i; int rc = 0; switch (ia->ri_memreg_strategy) { @@ -1352,114 +1750,25 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, break; #endif - /* Registration using fast memory registration */ + /* Registration using frmr registration */ + case RPCRDMA_FRMR: + rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt); + break; + + /* Registration using fmr memory registration */ case RPCRDMA_MTHCAFMR: - { - u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; - int len, pageoff = offset_in_page(seg->mr_offset); - seg1->mr_offset -= pageoff; /* start of page */ - seg1->mr_len += pageoff; - len = -pageoff; - if (nsegs > RPCRDMA_MAX_DATA_SEGS) - nsegs = RPCRDMA_MAX_DATA_SEGS; - for (i = 0; i < nsegs;) { - rpcrdma_map_one(ia, seg, writing); - physaddrs[i] = seg->mr_dma; - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) - break; - } - nsegs = i; - rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr, - physaddrs, nsegs, seg1->mr_dma); - if (rc) { - dprintk("RPC: %s: failed ib_map_phys_fmr " - "%u@0x%llx+%i (%d)... status %i\n", __func__, - len, (unsigned long long)seg1->mr_dma, - pageoff, nsegs, rc); - while (nsegs--) - rpcrdma_unmap_one(ia, --seg); - } else { - seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey; - seg1->mr_base = seg1->mr_dma + pageoff; - seg1->mr_nsegs = nsegs; - seg1->mr_len = len; - } - } + rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia); break; /* Registration using memory windows */ case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: - { - struct ib_mw_bind param; - rpcrdma_map_one(ia, seg, writing); - param.mr = ia->ri_bind_mem; - param.wr_id = 0ULL; /* no send cookie */ - param.addr = seg->mr_dma; - param.length = seg->mr_len; - param.send_flags = 0; - param.mw_access_flags = mem_priv; - - DECR_CQCOUNT(&r_xprt->rx_ep); - rc = ib_bind_mw(ia->ri_id->qp, - seg->mr_chunk.rl_mw->r.mw, ¶m); - if (rc) { - dprintk("RPC: %s: failed ib_bind_mw " - "%u@0x%llx status %i\n", - __func__, seg->mr_len, - (unsigned long long)seg->mr_dma, rc); - rpcrdma_unmap_one(ia, seg); - } else { - seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey; - seg->mr_base = param.addr; - seg->mr_nsegs = 1; - nsegs = 1; - } - } + rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt); break; /* Default registration each time */ default: - { - struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS]; - int len = 0; - if (nsegs > RPCRDMA_MAX_DATA_SEGS) - nsegs = RPCRDMA_MAX_DATA_SEGS; - for (i = 0; i < nsegs;) { - rpcrdma_map_one(ia, seg, writing); - ipb[i].addr = seg->mr_dma; - ipb[i].size = seg->mr_len; - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) - break; - } - nsegs = i; - seg1->mr_base = seg1->mr_dma; - seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd, - ipb, nsegs, mem_priv, &seg1->mr_base); - if (IS_ERR(seg1->mr_chunk.rl_mr)) { - rc = PTR_ERR(seg1->mr_chunk.rl_mr); - dprintk("RPC: %s: failed ib_reg_phys_mr " - "%u@0x%llx (%d)... status %i\n", - __func__, len, - (unsigned long long)seg1->mr_dma, nsegs, rc); - while (nsegs--) - rpcrdma_unmap_one(ia, --seg); - } else { - seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey; - seg1->mr_nsegs = nsegs; - seg1->mr_len = len; - } - } + rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia); break; } if (rc) @@ -1473,7 +1782,6 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, struct rpcrdma_xprt *r_xprt, void *r) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_mr_seg *seg1 = seg; int nsegs = seg->mr_nsegs, rc; switch (ia->ri_memreg_strategy) { @@ -1486,56 +1794,21 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, break; #endif + case RPCRDMA_FRMR: + rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt); + break; + case RPCRDMA_MTHCAFMR: - { - LIST_HEAD(l); - list_add(&seg->mr_chunk.rl_mw->r.fmr->list, &l); - rc = ib_unmap_fmr(&l); - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - } - if (rc) - dprintk("RPC: %s: failed ib_unmap_fmr," - " status %i\n", __func__, rc); + rc = rpcrdma_deregister_fmr_external(seg, ia); break; case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: - { - struct ib_mw_bind param; - BUG_ON(nsegs != 1); - param.mr = ia->ri_bind_mem; - param.addr = 0ULL; /* unbind */ - param.length = 0; - param.mw_access_flags = 0; - if (r) { - param.wr_id = (u64) (unsigned long) r; - param.send_flags = IB_SEND_SIGNALED; - INIT_CQCOUNT(&r_xprt->rx_ep); - } else { - param.wr_id = 0ULL; - param.send_flags = 0; - DECR_CQCOUNT(&r_xprt->rx_ep); - } - rc = ib_bind_mw(ia->ri_id->qp, - seg->mr_chunk.rl_mw->r.mw, ¶m); - rpcrdma_unmap_one(ia, seg); - } - if (rc) - dprintk("RPC: %s: failed ib_(un)bind_mw," - " status %i\n", __func__, rc); - else - r = NULL; /* will upcall on completion */ + rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r); break; default: - rc = ib_dereg_mr(seg1->mr_chunk.rl_mr); - seg1->mr_chunk.rl_mr = NULL; - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - if (rc) - dprintk("RPC: %s: failed ib_dereg_mr," - " status %i\n", __func__, rc); + rc = rpcrdma_deregister_default_external(seg, ia); break; } if (r) { diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 2427822f8bd4..c7a7eba991bc 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -51,6 +51,9 @@ #include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */ #include <linux/sunrpc/xprtrdma.h> /* xprt parameters */ +#define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */ +#define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ + /* * Interface Adapter -- one per transport instance */ @@ -58,6 +61,8 @@ struct rpcrdma_ia { struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; struct ib_mr *ri_bind_mem; + u32 ri_dma_lkey; + int ri_have_dma_lkey; struct completion ri_done; int ri_async_rc; enum rpcrdma_memreg ri_memreg_strategy; @@ -156,6 +161,10 @@ struct rpcrdma_mr_seg { /* chunk descriptors */ union { struct ib_mw *mw; struct ib_fmr *fmr; + struct { + struct ib_fast_reg_page_list *fr_pgl; + struct ib_mr *fr_mr; + } frmr; } r; struct list_head mw_list; } *rl_mw; @@ -175,6 +184,7 @@ struct rpcrdma_req { size_t rl_size; /* actual length of buffer */ unsigned int rl_niovs; /* 0, 2 or 4 */ unsigned int rl_nchunks; /* non-zero if chunks */ + unsigned int rl_connect_cookie; /* retry detection */ struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */ @@ -198,7 +208,7 @@ struct rpcrdma_buffer { atomic_t rb_credits; /* most recent server credits */ unsigned long rb_cwndscale; /* cached framework rpc_cwndscale */ int rb_max_requests;/* client max requests */ - struct list_head rb_mws; /* optional memory windows/fmrs */ + struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ int rb_send_index; struct rpcrdma_req **rb_send_bufs; int rb_recv_index; @@ -273,6 +283,11 @@ struct rpcrdma_xprt { #define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt) #define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data) +/* Setting this to 0 ensures interoperability with early servers. + * Setting this to 1 enhances certain unaligned read/write performance. + * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */ +extern int xprt_rdma_pad_optimize; + /* * Interface Adapter calls - xprtrdma/verbs.c */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 9a288d5eea64..5cbb404c4cdf 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -249,6 +249,7 @@ struct sock_xprt { void (*old_data_ready)(struct sock *, int); void (*old_state_change)(struct sock *); void (*old_write_space)(struct sock *); + void (*old_error_report)(struct sock *); }; /* @@ -283,8 +284,7 @@ static void xs_format_ipv4_peer_addresses(struct rpc_xprt *xprt, buf = kzalloc(20, GFP_KERNEL); if (buf) { - snprintf(buf, 20, NIPQUAD_FMT, - NIPQUAD(addr->sin_addr.s_addr)); + snprintf(buf, 20, "%pI4", &addr->sin_addr.s_addr); } xprt->address_strings[RPC_DISPLAY_ADDR] = buf; @@ -299,8 +299,8 @@ static void xs_format_ipv4_peer_addresses(struct rpc_xprt *xprt, buf = kzalloc(48, GFP_KERNEL); if (buf) { - snprintf(buf, 48, "addr="NIPQUAD_FMT" port=%u proto=%s", - NIPQUAD(addr->sin_addr.s_addr), + snprintf(buf, 48, "addr=%pI4 port=%u proto=%s", + &addr->sin_addr.s_addr, ntohs(addr->sin_port), protocol); } @@ -322,8 +322,8 @@ static void xs_format_ipv4_peer_addresses(struct rpc_xprt *xprt, buf = kzalloc(30, GFP_KERNEL); if (buf) { - snprintf(buf, 30, NIPQUAD_FMT".%u.%u", - NIPQUAD(addr->sin_addr.s_addr), + snprintf(buf, 30, "%pI4.%u.%u", + &addr->sin_addr.s_addr, ntohs(addr->sin_port) >> 8, ntohs(addr->sin_port) & 0xff); } @@ -341,8 +341,7 @@ static void xs_format_ipv6_peer_addresses(struct rpc_xprt *xprt, buf = kzalloc(40, GFP_KERNEL); if (buf) { - snprintf(buf, 40, NIP6_FMT, - NIP6(addr->sin6_addr)); + snprintf(buf, 40, "%pI6",&addr->sin6_addr); } xprt->address_strings[RPC_DISPLAY_ADDR] = buf; @@ -357,18 +356,17 @@ static void xs_format_ipv6_peer_addresses(struct rpc_xprt *xprt, buf = kzalloc(64, GFP_KERNEL); if (buf) { - snprintf(buf, 64, "addr="NIP6_FMT" port=%u proto=%s", - NIP6(addr->sin6_addr), + snprintf(buf, 64, "addr=%pI6 port=%u proto=%s", + &addr->sin6_addr, ntohs(addr->sin6_port), protocol); } xprt->address_strings[RPC_DISPLAY_ALL] = buf; buf = kzalloc(36, GFP_KERNEL); - if (buf) { - snprintf(buf, 36, NIP6_SEQFMT, - NIP6(addr->sin6_addr)); - } + if (buf) + snprintf(buf, 36, "%pi6", &addr->sin6_addr); + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = buf; buf = kzalloc(8, GFP_KERNEL); @@ -380,10 +378,10 @@ static void xs_format_ipv6_peer_addresses(struct rpc_xprt *xprt, buf = kzalloc(50, GFP_KERNEL); if (buf) { - snprintf(buf, 50, NIP6_FMT".%u.%u", - NIP6(addr->sin6_addr), - ntohs(addr->sin6_port) >> 8, - ntohs(addr->sin6_port) & 0xff); + snprintf(buf, 50, "%pI6.%u.%u", + &addr->sin6_addr, + ntohs(addr->sin6_port) >> 8, + ntohs(addr->sin6_port) & 0xff); } xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR] = buf; @@ -698,8 +696,9 @@ static int xs_tcp_send_request(struct rpc_task *task) case -EAGAIN: xs_nospace(task); break; - case -ECONNREFUSED: case -ECONNRESET: + xs_tcp_shutdown(xprt); + case -ECONNREFUSED: case -ENOTCONN: case -EPIPE: status = -ENOTCONN; @@ -742,6 +741,22 @@ out_release: xprt_release_xprt(xprt, task); } +static void xs_save_old_callbacks(struct sock_xprt *transport, struct sock *sk) +{ + transport->old_data_ready = sk->sk_data_ready; + transport->old_state_change = sk->sk_state_change; + transport->old_write_space = sk->sk_write_space; + transport->old_error_report = sk->sk_error_report; +} + +static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *sk) +{ + sk->sk_data_ready = transport->old_data_ready; + sk->sk_state_change = transport->old_state_change; + sk->sk_write_space = transport->old_write_space; + sk->sk_error_report = transport->old_error_report; +} + /** * xs_close - close a socket * @xprt: transport @@ -765,9 +780,8 @@ static void xs_close(struct rpc_xprt *xprt) transport->sock = NULL; sk->sk_user_data = NULL; - sk->sk_data_ready = transport->old_data_ready; - sk->sk_state_change = transport->old_state_change; - sk->sk_write_space = transport->old_write_space; + + xs_restore_old_callbacks(transport, sk); write_unlock_bh(&sk->sk_callback_lock); sk->sk_no_check = 0; @@ -1180,6 +1194,28 @@ static void xs_tcp_state_change(struct sock *sk) } /** + * xs_tcp_error_report - callback mainly for catching RST events + * @sk: socket + */ +static void xs_tcp_error_report(struct sock *sk) +{ + struct rpc_xprt *xprt; + + read_lock(&sk->sk_callback_lock); + if (sk->sk_err != ECONNRESET || sk->sk_state != TCP_ESTABLISHED) + goto out; + if (!(xprt = xprt_from_sock(sk))) + goto out; + dprintk("RPC: %s client %p...\n" + "RPC: error %d\n", + __func__, xprt, sk->sk_err); + + xprt_force_disconnect(xprt); +out: + read_unlock(&sk->sk_callback_lock); +} + +/** * xs_udp_write_space - callback invoked when socket buffer space * becomes available * @sk: socket whose state has changed @@ -1376,8 +1412,8 @@ static int xs_bind4(struct sock_xprt *transport, struct socket *sock) if (port > last) nloop++; } while (err == -EADDRINUSE && nloop != 2); - dprintk("RPC: %s "NIPQUAD_FMT":%u: %s (%d)\n", - __func__, NIPQUAD(myaddr.sin_addr), + dprintk("RPC: %s %pI4:%u: %s (%d)\n", + __func__, &myaddr.sin_addr, port, err ? "failed" : "ok", err); return err; } @@ -1409,8 +1445,8 @@ static int xs_bind6(struct sock_xprt *transport, struct socket *sock) if (port > last) nloop++; } while (err == -EADDRINUSE && nloop != 2); - dprintk("RPC: xs_bind6 "NIP6_FMT":%u: %s (%d)\n", - NIP6(myaddr.sin6_addr), port, err ? "failed" : "ok", err); + dprintk("RPC: xs_bind6 %pI6:%u: %s (%d)\n", + &myaddr.sin6_addr, port, err ? "failed" : "ok", err); return err; } @@ -1454,10 +1490,9 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) write_lock_bh(&sk->sk_callback_lock); + xs_save_old_callbacks(transport, sk); + sk->sk_user_data = xprt; - transport->old_data_ready = sk->sk_data_ready; - transport->old_state_change = sk->sk_state_change; - transport->old_write_space = sk->sk_write_space; sk->sk_data_ready = xs_udp_data_ready; sk->sk_write_space = xs_udp_write_space; sk->sk_no_check = UDP_CSUM_NORCV; @@ -1589,13 +1624,13 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) write_lock_bh(&sk->sk_callback_lock); + xs_save_old_callbacks(transport, sk); + sk->sk_user_data = xprt; - transport->old_data_ready = sk->sk_data_ready; - transport->old_state_change = sk->sk_state_change; - transport->old_write_space = sk->sk_write_space; sk->sk_data_ready = xs_tcp_data_ready; sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; + sk->sk_error_report = xs_tcp_error_report; sk->sk_allocation = GFP_ATOMIC; /* socket options */ |