From 7f1bda447c9bd48b415acedba6b830f61591601f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 18 Dec 2017 14:39:13 -0500 Subject: [PATCH 01/69] NFS: Add a cond_resched() to nfs_commit_release_pages() The commit list can get very large, and so we need a cond_resched() in nfs_commit_release_pages() in order to ensure we don't hog the CPU for excessive periods of time. Reported-by: Mike Galbraith Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 4a379d7918f2..cf61108f8f8d 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1837,6 +1837,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags); next: nfs_unlock_and_release_request(req); + /* Latency breaker */ + cond_resched(); } nfss = NFS_SERVER(data->inode); if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) From b8b8d221090f2b469027f04e451ef1877cb1be08 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 7 Nov 2017 10:51:37 -0500 Subject: [PATCH 02/69] NFSv4: Convert CLOSE to use nfs4_async_handle_exception() Convert CLOSE so that it specifies the correct stateid, state and inode for the error handling. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 56fa5a16e097..ea2b7e8db437 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3148,6 +3148,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data) struct nfs4_state *state = calldata->state; struct nfs_server *server = NFS_SERVER(calldata->inode); nfs4_stateid *res_stateid = NULL; + struct nfs4_exception exception = { + .state = state, + .inode = calldata->inode, + .stateid = &calldata->arg.stateid, + }; dprintk("%s: begin!\n", __func__); if (!nfs4_sequence_done(task, &calldata->res.seq_res)) @@ -3213,7 +3218,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data) case -NFS4ERR_BAD_STATEID: break; default: - if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) + task->tk_status = nfs4_async_handle_exception(task, + server, task->tk_status, &exception); + if (exception.retry) goto out_restart; } nfs_clear_open_stateid(state, &calldata->arg.stateid, From e0dba0128a662ab8540fcea76aca8ae27774c7da Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 7 Nov 2017 11:02:32 -0500 Subject: [PATCH 03/69] NFSv4: Convert DELEGRETURN to use nfs4_async_handle_exception() Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ea2b7e8db437..b704e08b390c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5764,6 +5764,10 @@ struct nfs4_delegreturndata { static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) { struct nfs4_delegreturndata *data = calldata; + struct nfs4_exception exception = { + .inode = data->inode, + .stateid = &data->stateid, + }; if (!nfs4_sequence_done(task, &data->res.seq_res)) return; @@ -5825,10 +5829,11 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) } /* Fallthrough */ default: - if (nfs4_async_handle_error(task, data->res.server, - NULL, NULL) == -EAGAIN) { + task->tk_status = nfs4_async_handle_exception(task, + data->res.server, task->tk_status, + &exception); + if (exception.retry) goto out_restart; - } } data->rpc_status =
task->tk_status; return; From 82571552a034f7531b0c8e76c1fd593c8755c519 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 7 Nov 2017 11:14:49 -0500 Subject: [PATCH 04/69] NFSv4: Convert LOCKU to use nfs4_async_handle_exception() Convert LOCKU so that it specifies the correct stateid and inode for the error handling. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b704e08b390c..b927fda32e74 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6071,6 +6071,10 @@ static void nfs4_locku_release_calldata(void *data) static void nfs4_locku_done(struct rpc_task *task, void *data) { struct nfs4_unlockdata *calldata = data; + struct nfs4_exception exception = { + .inode = calldata->lsp->ls_state->inode, + .stateid = &calldata->arg.stateid, + }; if (!nfs4_sequence_done(task, &calldata->res.seq_res)) return; @@ -6094,8 +6098,10 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) rpc_restart_call_prepare(task); break; default: - if (nfs4_async_handle_error(task, calldata->server, - NULL, NULL) == -EAGAIN) + task->tk_status = nfs4_async_handle_exception(task, + calldata->server, task->tk_status, + &exception); + if (exception.retry) rpc_restart_call_prepare(task); } nfs_release_seqid(calldata->arg.seqid); From 8634ef5e05311f32d7f2aee06f6b27a8834a3bd6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 6 Jan 2018 09:53:49 -0500 Subject: [PATCH 05/69] NFS: Fix nfsstat breakage due to LOOKUPP The LOOKUPP operation was inserted into the nfs4_procedures array rather than being appended, which put /proc/net/rpc/nfs out of whack, and broke the nfsstat utility. Fix by moving the LOOKUPP operation to the end of the array, and by ensuring that it keeps the same length whether or not NFSv4.1 and NFSv4.2 are compiled in.
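To see why ordering matters, note that the per-procedure counters behind /proc/net/rpc/nfs are emitted in array order, and nfsstat binds meaning to each column purely by position, so the array index is effectively ABI. A toy, userspace-style sketch of that positional contract (hypothetical names, not the kernel code):

#include <stdio.h>

/* Sketch: consumers such as nfsstat parse column i as operation i.
 * Inserting a new entry mid-array silently shifts the meaning of
 * every later column; appending keeps existing columns stable. */
enum { OP_READ, OP_WRITE, OP_COMMIT, /* append new ops here */ OP_MAX };

static unsigned int op_counts[OP_MAX];

int main(void)
{
	for (int i = 0; i < OP_MAX; i++)
		printf("%u ", op_counts[i]);	/* position is the contract */
	printf("\n");
	return 0;
}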
Fixes: 5b5faaf6df734 ("nfs4: add NFSv4 LOOKUPP handlers") Cc: stable@vger.kernel.org # v4.13+ Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 64 ++++++++++++++++++++++++++------------------ include/linux/nfs4.h | 12 ++++++--- 2 files changed, 46 insertions(+), 30 deletions(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 77c6729e57f0..65c9c4175145 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -7678,6 +7678,22 @@ nfs4_stat_to_errno(int stat) .p_name = #proc, \ } +#if defined(CONFIG_NFS_V4_1) +#define PROC41(proc, argtype, restype) \ + PROC(proc, argtype, restype) +#else +#define PROC41(proc, argtype, restype) \ + STUB(proc) +#endif + +#if defined(CONFIG_NFS_V4_2) +#define PROC42(proc, argtype, restype) \ + PROC(proc, argtype, restype) +#else +#define PROC42(proc, argtype, restype) \ + STUB(proc) +#endif + const struct rpc_procinfo nfs4_procedures[] = { PROC(READ, enc_read, dec_read), PROC(WRITE, enc_write, dec_write), @@ -7698,7 +7714,6 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC(ACCESS, enc_access, dec_access), PROC(GETATTR, enc_getattr, dec_getattr), PROC(LOOKUP, enc_lookup, dec_lookup), - PROC(LOOKUPP, enc_lookupp, dec_lookupp), PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root), PROC(REMOVE, enc_remove, dec_remove), PROC(RENAME, enc_rename, dec_rename), @@ -7717,33 +7732,30 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), PROC(SECINFO, enc_secinfo, dec_secinfo), PROC(FSID_PRESENT, enc_fsid_present, dec_fsid_present), -#if defined(CONFIG_NFS_V4_1) - PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), - PROC(CREATE_SESSION, enc_create_session, dec_create_session), - PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), - PROC(SEQUENCE, enc_sequence, dec_sequence), - PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), - PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), - PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), - PROC(LAYOUTGET, enc_layoutget, dec_layoutget), - PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), - PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), - PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), - PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), - PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), + PROC41(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), + PROC41(CREATE_SESSION, enc_create_session, dec_create_session), + PROC41(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), + PROC41(SEQUENCE, enc_sequence, dec_sequence), + PROC41(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), + PROC41(RECLAIM_COMPLETE,enc_reclaim_complete, dec_reclaim_complete), + PROC41(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), + PROC41(LAYOUTGET, enc_layoutget, dec_layoutget), + PROC41(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), + PROC41(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), + PROC41(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), + PROC41(TEST_STATEID, enc_test_stateid, dec_test_stateid), + PROC41(FREE_STATEID, enc_free_stateid, dec_free_stateid), STUB(GETDEVICELIST), - PROC(BIND_CONN_TO_SESSION, + PROC41(BIND_CONN_TO_SESSION, enc_bind_conn_to_session, dec_bind_conn_to_session), - PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid), -#endif /* CONFIG_NFS_V4_1 */ -#ifdef CONFIG_NFS_V4_2 - PROC(SEEK, enc_seek, dec_seek), - PROC(ALLOCATE, enc_allocate, dec_allocate), - PROC(DEALLOCATE, enc_deallocate, dec_deallocate), - 
PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats), - PROC(CLONE, enc_clone, dec_clone), - PROC(COPY, enc_copy, dec_copy), -#endif /* CONFIG_NFS_V4_2 */ + PROC41(DESTROY_CLIENTID,enc_destroy_clientid, dec_destroy_clientid), + PROC42(SEEK, enc_seek, dec_seek), + PROC42(ALLOCATE, enc_allocate, dec_allocate), + PROC42(DEALLOCATE, enc_deallocate, dec_deallocate), + PROC42(LAYOUTSTATS, enc_layoutstats, dec_layoutstats), + PROC42(CLONE, enc_clone, dec_clone), + PROC42(COPY, enc_copy, dec_copy), + PROC(LOOKUPP, enc_lookupp, dec_lookupp), }; static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)]; diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 47adac640191..57ffaa20d564 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -457,7 +457,12 @@ enum lock_type4 { #define NFS4_DEBUG 1 -/* Index of predefined Linux client operations */ +/* + * Index of predefined Linux client operations + * + * To ensure that /proc/net/rpc/nfs remains correctly ordered, please + * append only to this enum when adding new client operations. + */ enum { NFSPROC4_CLNT_NULL = 0, /* Unused */ @@ -480,7 +485,6 @@ enum { NFSPROC4_CLNT_ACCESS, NFSPROC4_CLNT_GETATTR, NFSPROC4_CLNT_LOOKUP, - NFSPROC4_CLNT_LOOKUPP, NFSPROC4_CLNT_LOOKUP_ROOT, NFSPROC4_CLNT_REMOVE, NFSPROC4_CLNT_RENAME, @@ -500,7 +504,6 @@ enum { NFSPROC4_CLNT_SECINFO, NFSPROC4_CLNT_FSID_PRESENT, - /* nfs41 */ NFSPROC4_CLNT_EXCHANGE_ID, NFSPROC4_CLNT_CREATE_SESSION, NFSPROC4_CLNT_DESTROY_SESSION, @@ -518,13 +521,14 @@ enum { NFSPROC4_CLNT_BIND_CONN_TO_SESSION, NFSPROC4_CLNT_DESTROY_CLIENTID, - /* nfs42 */ NFSPROC4_CLNT_SEEK, NFSPROC4_CLNT_ALLOCATE, NFSPROC4_CLNT_DEALLOCATE, NFSPROC4_CLNT_LAYOUTSTATS, NFSPROC4_CLNT_CLONE, NFSPROC4_CLNT_COPY, + + NFSPROC4_CLNT_LOOKUPP, }; /* nfs41 types */ From 9ccee940bd5b766b6dab6c1a80908b9490a4850d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 4 Jan 2018 17:46:09 -0500 Subject: [PATCH 06/69] Support statx() mask and query flags parameters Support the query flags AT_STATX_FORCE_SYNC by forcing an attribute revalidation, and AT_STATX_DONT_SYNC by returning cached attributes only. Use the mask to optimise away server revalidation for attributes that are not being requested by the user. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 48 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index b992d2382ffa..deeb7d1097d0 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -735,12 +735,20 @@ int nfs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; + struct nfs_server *server = NFS_SERVER(inode); + unsigned long cache_validity; int err = 0; + bool force_sync = query_flags & AT_STATX_FORCE_SYNC; + bool do_update = false; trace_nfs_getattr_enter(inode); + + if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) + goto out_no_update; + /* Flush out writes to the server in order to update c/mtime. 
*/ - if (S_ISREG(inode->i_mode)) { + if ((request_mask & (STATX_CTIME|STATX_MTIME)) && + S_ISREG(inode->i_mode)) { err = filemap_write_and_wait(inode->i_mapping); if (err) goto out; @@ -757,24 +765,42 @@ int nfs_getattr(const struct path *path, struct kstat *stat, */ if ((path->mnt->mnt_flags & MNT_NOATIME) || ((path->mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) - need_atime = 0; + request_mask &= ~STATX_ATIME; - if (need_atime || nfs_need_revalidate_inode(inode)) { - struct nfs_server *server = NFS_SERVER(inode); + /* Is the user requesting attributes that might need revalidation? */ + if (!(request_mask & (STATX_MODE|STATX_NLINK|STATX_ATIME|STATX_CTIME| + STATX_MTIME|STATX_UID|STATX_GID| + STATX_SIZE|STATX_BLOCKS))) + goto out_no_revalidate; + /* Check whether the cached attributes are stale */ + do_update |= force_sync || nfs_attribute_cache_expired(inode); + cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); + do_update |= cache_validity & + (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL); + if (request_mask & STATX_ATIME) + do_update |= cache_validity & NFS_INO_INVALID_ATIME; + if (request_mask & (STATX_CTIME|STATX_MTIME)) + do_update |= cache_validity & NFS_INO_REVAL_PAGECACHE; + if (do_update) { + /* Update the attribute cache */ if (!(server->flags & NFS_MOUNT_NOAC)) nfs_readdirplus_parent_cache_miss(path->dentry); else nfs_readdirplus_parent_cache_hit(path->dentry); err = __nfs_revalidate_inode(server, inode); + if (err) + goto out; } else nfs_readdirplus_parent_cache_hit(path->dentry); - if (!err) { - generic_fillattr(inode, stat); - stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); - if (S_ISDIR(inode->i_mode)) - stat->blksize = NFS_SERVER(inode)->dtsize; - } +out_no_revalidate: + /* Only return attributes that were revalidated. */ + stat->result_mask &= request_mask; +out_no_update: + generic_fillattr(inode, stat); + stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); + if (S_ISDIR(inode->i_mode)) + stat->blksize = NFS_SERVER(inode)->dtsize; out: trace_nfs_getattr_exit(inode, err); return err; From aaa1500894655427f1b964d1f31dbbd20baa80ab Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 22 Nov 2017 08:23:41 +1100 Subject: [PATCH 07/69] nfs: remove dead code from nfs_encode_fh() This code can never be used as the IS_AUTOMOUNT(inode) case has already been handled. So remove it to avoid confusion. Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/export.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 83fd09fc8f77..2b80a6652818 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -48,10 +48,6 @@ nfs_encode_fh(struct inode *inode, __u32 *p, int *max_len, struct inode *parent) *max_len = len; return FILEID_INVALID; } - if (IS_AUTOMOUNT(inode)) { - *max_len = FILEID_INVALID; - goto out; - } p[FILEID_HIGH_OFF] = NFS_FILEID(inode) >> 32; p[FILEID_LOW_OFF] = NFS_FILEID(inode); From dce2630c7da73b0634686bca557cc8945cc450c8 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 13 Dec 2017 09:57:09 +1100 Subject: [PATCH 08/69] NFSv4: always set NFS_LOCK_LOST when a lock is lost. There are 2 comments in the NFSv4 code which suggest that SIGLOST should possibly be sent to a process. In these cases a lock has been lost. The current practice is to set NFS_LOCK_LOST so that read/write returns EIO when a lock is lost. So change these comments to code which sets NFS_LOCK_LOST.
One case is when lock recovery after apparent server restart fails with NFS4ERR_DENIED, NFS4ERR_RECLAIM_BAD, or NFS4ERR_RECLAIM_CONFLICT. The other case is when a lock attempt as part of lease recovery fails with NFS4ERR_DENIED. In an ideal world, these should not happen. However, I have a packet trace showing an NFSv4.1 session getting NFS4ERR_BADSESSION after an extended network partition. The NFSv4.1 client treats this like server reboot until/unless it gets NFS4ERR_NO_GRACE, in which case it switches over to "nograce" recovery mode. In this network trace, the client attempts to recover a lock and the server (incorrectly) reports NFS4ERR_DENIED rather than NFS4ERR_NO_GRACE. This leads to the ineffective comment being reached, and the client then continues to write using the OPEN stateid. Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 12 ++++++++---- fs/nfs/nfs4state.c | 5 ++++- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b927fda32e74..f8a2b226f571 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2019,7 +2019,7 @@ static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *sta return ret; } -static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct nfs4_state *state, const nfs4_stateid *stateid, int err) +static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct nfs4_state *state, const nfs4_stateid *stateid, struct file_lock *fl, int err) { switch (err) { default: @@ -2066,7 +2066,11 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct return -EAGAIN; case -ENOMEM: case -NFS4ERR_DENIED: - /* kill_proc(fl->fl_pid, SIGLOST, 1); */ + if (fl) { + struct nfs4_lock_state *lsp = fl->fl_u.nfs4_fl.owner; + if (lsp) + set_bit(NFS_LOCK_LOST, &lsp->ls_flags); + } return 0; } return err; @@ -2102,7 +2106,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, err = nfs4_open_recover_helper(opendata, FMODE_READ); } nfs4_opendata_put(opendata); - return nfs4_handle_delegation_recall_error(server, state, stateid, err); + return nfs4_handle_delegation_recall_error(server, state, stateid, NULL, err); } static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata) @@ -6757,7 +6761,7 @@ int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, if (err != 0) return err; err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); - return nfs4_handle_delegation_recall_error(server, state, stateid, err); + return nfs4_handle_delegation_recall_error(server, state, stateid, fl, err); } struct nfs_release_lockowner_data { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index e4f4a09ed9f4..91a4d4eeb235 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1482,6 +1482,7 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ struct inode *inode = state->inode; struct nfs_inode *nfsi = NFS_I(inode); struct file_lock *fl; + struct nfs4_lock_state *lsp; int status = 0; struct file_lock_context *flctx = inode->i_flctx; struct list_head *list; @@ -1522,7 +1523,9 @@ restart: case -NFS4ERR_DENIED: case -NFS4ERR_RECLAIM_BAD: case -NFS4ERR_RECLAIM_CONFLICT: - /* kill_proc(fl->fl_pid, SIGLOST, 1); */ + lsp = fl->fl_u.nfs4_fl.owner; + if (lsp) + set_bit(NFS_LOCK_LOST, &lsp->ls_flags); status = 0; } spin_lock(&flctx->flc_lock); From e545735a3255a2f211d9c7efd2d67f2affcde1ba Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 8 Dec 2017 12:52:37 -0500
Subject: [PATCH 09/69] NFS: remove unused offset arg in nfs_pgio_rpcsetup nfs_pgio_rpcsetup() is always called with an offset of 0, so we should be able to drop the argument altogether. Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index d0543e19098a..18a7626ac638 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -537,7 +537,7 @@ EXPORT_SYMBOL_GPL(nfs_pgio_header_free); * @cinfo: Commit information for the call (writes only) */ static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, - unsigned int count, unsigned int offset, + unsigned int count, int how, struct nfs_commit_info *cinfo) { struct nfs_page *req = hdr->req; @@ -546,10 +546,10 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, * NB: take care not to mess about with hdr->commit et al. */ hdr->args.fh = NFS_FH(hdr->inode); - hdr->args.offset = req_offset(req) + offset; + hdr->args.offset = req_offset(req); /* pnfs_set_layoutcommit needs this */ hdr->mds_offset = hdr->args.offset; - hdr->args.pgbase = req->wb_pgbase + offset; + hdr->args.pgbase = req->wb_pgbase; hdr->args.pages = hdr->page_array.pagevec; hdr->args.count = count; hdr->args.context = get_nfs_open_context(req->wb_context); @@ -789,7 +789,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, desc->pg_ioflags &= ~FLUSH_COND_STABLE; /* Set up the argument struct */ - nfs_pgio_rpcsetup(hdr, mirror->pg_count, 0, desc->pg_ioflags, &cinfo); + nfs_pgio_rpcsetup(hdr, mirror->pg_count, desc->pg_ioflags, &cinfo); desc->pg_rpc_callops = &nfs_pgio_common_ops; return 0; } From ad6b0241c94e2732f928ec9ef5b3561d7448c8fe Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 8 Dec 2017 12:52:47 -0500 Subject: [PATCH 10/69] pnfs/blocklayout: Add module alias for LAYOUT4_SCSI The blocklayout module contains the client support for both block and SCSI layouts. Add a module alias for the SCSI layout type so that the module will be loaded for SCSI layouts. Signed-off-by: Benjamin Coddington Reviewed-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 995d707537da..ec110aa87634 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -967,6 +967,7 @@ static void __exit nfs4blocklayout_exit(void) } MODULE_ALIAS("nfs-layouttype4-3"); +MODULE_ALIAS("nfs-layouttype4-5"); module_init(nfs4blocklayout_init); module_exit(nfs4blocklayout_exit); From d78471d32bb60837930026e11828af596fb4bdac Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 8 Dec 2017 12:52:57 -0500 Subject: [PATCH 11/69] pnfs/blocklayout: set PNFS_LAYOUTRETURN_ON_ERROR If there's an error doing I/O to a block device, and the client resends the I/O to the MDS, the MDS must recall the layout from the client before processing the I/O. Let's preempt that exchange by returning the layout before falling back to the MDS when there's an error.
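For context, the generic pNFS error path consumes this flag roughly as in the sketch below (simplified from the shape of the existing pnfs_ld_handle_read/write_error paths, not the verbatim tree code):

/* Sketch: when layout-driver I/O fails and is about to be resent
 * through the MDS, a driver that sets PNFS_LAYOUTRET_ON_ERROR gets
 * the layout returned first, pre-empting a CB_LAYOUTRECALL. */
static void pnfs_ld_handle_error(struct nfs_pgio_header *hdr)
{
	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
	    PNFS_LAYOUTRET_ON_ERROR)
		pnfs_return_layout(hdr->inode);
	/* ... then redirty/requeue the pages for the MDS ... */
}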
Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 2 ++ fs/nfs/pnfs.h | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index ec110aa87634..334570888649 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -887,6 +887,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = { .name = "LAYOUT_BLOCK_VOLUME", .owner = THIS_MODULE, .flags = PNFS_LAYOUTRET_ON_SETATTR | + PNFS_LAYOUTRET_ON_ERROR | PNFS_READ_WHOLE_PAGE, .read_pagelist = bl_read_pagelist, .write_pagelist = bl_write_pagelist, @@ -910,6 +911,7 @@ static struct pnfs_layoutdriver_type scsilayout_type = { .name = "LAYOUT_SCSI", .owner = THIS_MODULE, .flags = PNFS_LAYOUTRET_ON_SETATTR | + PNFS_LAYOUTRET_ON_ERROR | PNFS_READ_WHOLE_PAGE, .read_pagelist = bl_read_pagelist, .write_pagelist = bl_write_pagelist, diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 8d507c361d98..29a19814e538 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -524,8 +524,10 @@ static inline int pnfs_return_layout(struct inode *ino) struct nfs_inode *nfsi = NFS_I(ino); struct nfs_server *nfss = NFS_SERVER(ino); - if (pnfs_enabled_sb(nfss) && nfsi->layout) + if (pnfs_enabled_sb(nfss) && nfsi->layout) { + set_bit(NFS_LAYOUT_RETURN_REQUESTED, &nfsi->layout->plh_flags); return _pnfs_return_layout(ino); + } return 0; } From b3dce6a2f0601be9b6781b394fdf6ceb63009a44 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 8 Dec 2017 12:52:59 -0500 Subject: [PATCH 12/69] pnfs/blocklayout: handle transient devices PNFS block/SCSI layouts should gracefully handle cases where block devices are not available when a layout is retrieved, or the block devices are removed while the client holds a layout. While setting up a layout segment, keep a record of an unavailable or un-parsable block device in cache with a flag so that subsequent layouts do not spam the server with GETDEVINFO. We can reuse the current NFS_DEVICEID_UNAVAILABLE handling with one variation: instead of reusing the device, we will discard it and send a fresh GETDEVINFO after the timeout, since the lookup and validation of the device occurs within the GETDEVINFO response handling. A lookup of a layout segment that references an unavailable device will return a segment with the NFS_LSEG_UNAVAILABLE flag set. This will allow the pgio layer to mark the layout with the appropriate fail bit, which forces subsequent IO to the MDS, and prevents spamming the server with LAYOUTGET, LAYOUTRETURN. Finally, when IO to a block device fails, look up the block device(s) referenced by the pgio header, and mark them as unavailable. 
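The jiffies arithmetic behind that retry window is easy to misread, so here it is in isolation: a sketch of the test performed by the new bl_find_get_deviceid() in the diff below, with PNFS_DEVICE_RETRY_TIMEOUT being 120*HZ (two minutes):

#include <linux/jiffies.h>

/* Sketch: a device marked unavailable keeps failing lookups until
 * PNFS_DEVICE_RETRY_TIMEOUT has elapsed since the failure was
 * recorded; after that it is deleted so a fresh GETDEVINFO is sent. */
static bool bl_device_in_backoff(unsigned long timestamp_unavailable)
{
	unsigned long end = jiffies;
	unsigned long start = end - PNFS_DEVICE_RETRY_TIMEOUT;

	return time_in_range(timestamp_unavailable, start, end);
}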
Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 82 ++++++++++++++++++++++++++++++-- fs/nfs/blocklayout/dev.c | 7 +-- fs/nfs/pnfs.c | 2 +- fs/nfs/pnfs.h | 2 + fs/nfs/pnfs_dev.c | 1 - 5 files changed, 82 insertions(+), 12 deletions(-) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 334570888649..ca6cf54b54df 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -184,6 +184,29 @@ retry: return bio; } +static void bl_mark_devices_unavailable(struct nfs_pgio_header *header, bool rw) +{ + struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg); + size_t bytes_left = header->args.count; + sector_t isect, extent_length = 0; + struct pnfs_block_extent be; + + isect = header->args.offset >> SECTOR_SHIFT; + bytes_left += header->args.offset - (isect << SECTOR_SHIFT); + + while (bytes_left > 0) { + if (!ext_tree_lookup(bl, isect, &be, rw)) + return; + extent_length = be.be_length - (isect - be.be_f_offset); + nfs4_mark_deviceid_unavailable(be.be_device); + isect += extent_length; + if (bytes_left > extent_length << SECTOR_SHIFT) + bytes_left -= extent_length << SECTOR_SHIFT; + else + bytes_left = 0; + } +} + static void bl_end_io_read(struct bio *bio) { struct parallel_io *par = bio->bi_private; @@ -194,6 +217,7 @@ static void bl_end_io_read(struct bio *bio) if (!header->pnfs_error) header->pnfs_error = -EIO; pnfs_set_lo_fail(header->lseg); + bl_mark_devices_unavailable(header, false); } bio_put(bio); @@ -323,6 +347,7 @@ static void bl_end_io_write(struct bio *bio) if (!header->pnfs_error) header->pnfs_error = -EIO; pnfs_set_lo_fail(header->lseg); + bl_mark_devices_unavailable(header, true); } bio_put(bio); put_parallel(par); @@ -552,6 +577,31 @@ static int decode_sector_number(__be32 **rp, sector_t *sp) return 0; } +static struct nfs4_deviceid_node * +bl_find_get_deviceid(struct nfs_server *server, + const struct nfs4_deviceid *id, struct rpc_cred *cred, + gfp_t gfp_mask) +{ + struct nfs4_deviceid_node *node; + unsigned long start, end; + +retry: + node = nfs4_find_get_deviceid(server, id, cred, gfp_mask); + if (!node) + return ERR_PTR(-ENODEV); + + if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0) + return node; + + end = jiffies; + start = end - PNFS_DEVICE_RETRY_TIMEOUT; + if (!time_in_range(node->timestamp_unavailable, start, end)) { + nfs4_delete_deviceid(node->ld, node->nfs_client, id); + goto retry; + } + return ERR_PTR(-ENODEV); +} + static int bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo, struct layout_verification *lv, struct list_head *extents, @@ -573,16 +623,18 @@ bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo, memcpy(&id, p, NFS4_DEVICEID4_SIZE); p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); - error = -EIO; - be->be_device = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id, + be->be_device = bl_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id, lo->plh_lc_cred, gfp_mask); - if (!be->be_device) + if (IS_ERR(be->be_device)) { + error = PTR_ERR(be->be_device); goto out_free_be; + } /* * The next three values are read in as bytes, but stored in the * extent structure in 512-byte granularity. 
*/ + error = -EIO; if (decode_sector_number(&p, &be->be_f_offset) < 0) goto out_put_deviceid; if (decode_sector_number(&p, &be->be_length) < 0) @@ -692,11 +744,16 @@ out_free_scratch: __free_page(scratch); out: dprintk("%s returns %d\n", __func__, status); - if (status) { + switch (status) { + case -ENODEV: + /* Our extent block devices are unavailable */ + set_bit(NFS_LSEG_UNAVAILABLE, &lseg->pls_flags); + case 0: + return lseg; + default: kfree(lseg); return ERR_PTR(status); } - return lseg; } static void @@ -798,6 +855,13 @@ bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) } pnfs_generic_pg_init_read(pgio, req); + + if (pgio->pg_lseg && + test_bit(NFS_LSEG_UNAVAILABLE, &pgio->pg_lseg->pls_flags)) { + pnfs_error_mark_layout_for_return(pgio->pg_inode, pgio->pg_lseg); + pnfs_set_lo_fail(pgio->pg_lseg); + nfs_pageio_reset_read_mds(pgio); + } } /* @@ -853,6 +917,14 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); pnfs_generic_pg_init_write(pgio, req, wb_size); + + if (pgio->pg_lseg && + test_bit(NFS_LSEG_UNAVAILABLE, &pgio->pg_lseg->pls_flags)) { + + pnfs_error_mark_layout_for_return(pgio->pg_inode, pgio->pg_lseg); + pnfs_set_lo_fail(pgio->pg_lseg); + nfs_pageio_reset_write_mds(pgio); + } } /* diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 95f74bd2c067..a7efd83779d2 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -533,14 +533,11 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, goto out_free_volumes; ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask); - if (ret) { - bl_free_device(top); - kfree(top); - goto out_free_volumes; - } node = &top->node; nfs4_init_deviceid_node(node, server, &pdev->dev_id); + if (ret) + nfs4_mark_deviceid_unavailable(node); out_free_volumes: kfree(volumes); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index d602fe9e1ac8..b3dae6ec2d39 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -655,7 +655,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, return 0; list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) if (pnfs_match_lseg_recall(lseg, recall_range, seq)) { - dprintk("%s: freeing lseg %p iomode %d seq %u" + dprintk("%s: freeing lseg %p iomode %d seq %u " "offset %llu length %llu\n", __func__, lseg, lseg->pls_range.iomode, lseg->pls_seq, lseg->pls_range.offset, lseg->pls_range.length); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 29a19814e538..daf6cbf5c15f 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -40,6 +40,7 @@ enum { NFS_LSEG_ROC, /* roc bit received from server */ NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */ NFS_LSEG_LAYOUTRETURN, /* layoutreturn bit set for layoutreturn */ + NFS_LSEG_UNAVAILABLE, /* unavailable bit set for temporary problem */ }; /* Individual ip address */ @@ -86,6 +87,7 @@ enum pnfs_try_status { */ #define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */ #define NFS4_DEF_DS_RETRANS 5 +#define PNFS_DEVICE_RETRY_TIMEOUT (120*HZ) /* error codes for internal use */ #define NFS4ERR_RESET_TO_MDS 12001 diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index 2961fcd7a2df..e8a07b3f9aaa 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -43,7 +43,6 @@ #define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) #define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) -#define PNFS_DEVICE_RETRY_TIMEOUT (120*HZ) static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE]; 
static DEFINE_SPINLOCK(nfs4_deviceid_lock); From ba4a76f703ab7eb72941fdaac848502073d6e9ee Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Fri, 15 Dec 2017 16:12:32 -0500 Subject: [PATCH 13/69] nfs/pnfs: fix nfs_direct_req ref leak when i/o falls back to the mds Currently when falling back to doing I/O through the MDS (via pnfs_{read|write}_through_mds), the client frees the nfs_pgio_header without releasing the reference taken on the dreq via pnfs_generic_pg_{read|write}pages -> nfs_pgheader_init -> nfs_direct_pgio_init. It then takes another reference on the dreq via nfs_generic_pg_pgios -> nfs_pgheader_init -> nfs_direct_pgio_init and as a result the requester will become stuck in inode_dio_wait. Once that happens, other processes accessing the inode will become stuck as well. Ensure that pnfs_read_through_mds() and pnfs_write_through_mds() clean up correctly by calling hdr->completion_ops->completion() instead of calling hdr->release() directly. This can be reproduced (sometimes) by performing "storage failover takeover" commands on NetApp filer while doing direct I/O from a client. This can also be reproduced using SystemTap to simulate a failure while doing direct I/O from a client (from Dave Wysochanski ): stap -v -g -e 'probe module("nfs_layout_nfsv41_files").function("nfs4_fl_prepare_ds").return { $return=NULL; exit(); }' Suggested-by: Trond Myklebust Signed-off-by: Scott Mayhew Fixes: 1ca018d28d ("pNFS: Fix a memory leak when attempted pnfs fails") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index b3dae6ec2d39..c13e826614b5 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2255,7 +2255,7 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_write_mds(desc); mirror->pg_recoalesce = 1; } - hdr->release(hdr); + hdr->completion_ops->completion(hdr); } static enum pnfs_try_status @@ -2378,7 +2378,7 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_read_mds(desc); mirror->pg_recoalesce = 1; } - hdr->release(hdr); + hdr->completion_ops->completion(hdr); } /* From fee21fb587f57748c3c971e3432c4a28d74146fc Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Wed, 29 Nov 2017 13:15:43 +0200 Subject: [PATCH 14/69] lockd: convert nlm_host.h_count from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable nlm_host.h_count is used as pure reference counter. Convert it to refcount_t and fix up the operations. **Important note for maintainers: Some functions from refcount_t API defined in lib/refcount.c have different memory ordering guarantees than their atomic counterparts. The full comparison can be seen in https://lkml.org/lkml/2017/11/15/57 and it is hopefully soon in state to be merged to the documentation tree. 
Normally the differences should not matter since refcount_t provides enough guarantees to satisfy the refcounting use cases, but in some rare cases it might matter. Please double check that you don't have some undocumented memory guarantees for this variable usage. For the nlm_host.h_count it might make a difference in following places: - nlmsvc_release_host(): decrement in refcount_dec() provides RELEASE ordering, while original atomic_dec() was fully unordered. Since the change is for better, it should not matter. - nlmclnt_release_host(): decrement in refcount_dec_and_test() only provides RELEASE ordering and control dependency on success vs. fully ordered atomic counterpart. It doesn't seem to matter in this case since object freeing happens under mutex lock anyway. Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Elena Reshetova Signed-off-by: Trond Myklebust --- fs/lockd/host.c | 14 +++++++------- include/linux/lockd/lockd.h | 3 ++- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 826a89184f90..11b6832277f6 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -151,7 +151,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni, host->h_state = 0; host->h_nsmstate = 0; host->h_pidcount = 0; - atomic_set(&host->h_count, 1); + refcount_set(&host->h_count, 1); mutex_init(&host->h_mutex); host->h_nextrebind = now + NLM_HOST_REBIND; host->h_expires = now + NLM_HOST_EXPIRE; @@ -290,7 +290,7 @@ void nlmclnt_release_host(struct nlm_host *host) WARN_ON_ONCE(host->h_server); - if (atomic_dec_and_test(&host->h_count)) { + if (refcount_dec_and_test(&host->h_count)) { WARN_ON_ONCE(!list_empty(&host->h_lockowners)); WARN_ON_ONCE(!list_empty(&host->h_granted)); WARN_ON_ONCE(!list_empty(&host->h_reclaim)); @@ -410,7 +410,7 @@ void nlmsvc_release_host(struct nlm_host *host) dprintk("lockd: release server host %s\n", host->h_name); WARN_ON_ONCE(!host->h_server); - atomic_dec(&host->h_count); + refcount_dec(&host->h_count); } /* @@ -504,7 +504,7 @@ struct nlm_host * nlm_get_host(struct nlm_host *host) { if (host) { dprintk("lockd: get host %s\n", host->h_name); - atomic_inc(&host->h_count); + refcount_inc(&host->h_count); host->h_expires = jiffies + NLM_HOST_EXPIRE; } return host; @@ -593,7 +593,7 @@ static void nlm_complain_hosts(struct net *net) if (net && host->net != net) continue; dprintk(" %s (cnt %d use %d exp %ld net %x)\n", - host->h_name, atomic_read(&host->h_count), + host->h_name, refcount_read(&host->h_count), host->h_inuse, host->h_expires, host->net->ns.inum); } } @@ -662,11 +662,11 @@ nlm_gc_hosts(struct net *net) for_each_host_safe(host, next, chain, nlm_server_hosts) { if (net && host->net != net) continue; - if (atomic_read(&host->h_count) || host->h_inuse + if (refcount_read(&host->h_count) || host->h_inuse || time_before(jiffies, host->h_expires)) { dprintk("nlm_gc_hosts skipping %s " "(cnt %d use %d exp %ld net %x)\n", - host->h_name, atomic_read(&host->h_count), + host->h_name, refcount_read(&host->h_count), host->h_inuse, host->h_expires, host->net->ns.inum); continue; diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index d7d313fb9cd4..39dfeea20963 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -58,7 +59,7 @@ struct nlm_host { u32 h_state; /* pseudo-state counter */ u32 h_nsmstate; /* true remote NSM state */ u32 h_pidcount; /* 
Pseudopids */ - atomic_t h_count; /* reference count */ + refcount_t h_count; /* reference count */ struct mutex h_mutex; /* mutex for pmap binding */ unsigned long h_nextrebind; /* next portmap call */ unsigned long h_expires; /* eligible for GC */ From c751082ceff7d5907f436729dd7cccb88cffc4de Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Wed, 29 Nov 2017 13:15:44 +0200 Subject: [PATCH 15/69] lockd: convert nsm_handle.sm_count from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable nsm_handle.sm_count is used as pure reference counter. Convert it to refcount_t and fix up the operations. **Important note for maintainers: Some functions from refcount_t API defined in lib/refcount.c have different memory ordering guarantees than their atomic counterparts. The full comparison can be seen in https://lkml.org/lkml/2017/11/15/57 and it is hopefully soon in state to be merged to the documentation tree. Normally the differences should not matter since refcount_t provides enough guarantees to satisfy the refcounting use cases, but in some rare cases it might matter. Please double check that you don't have some undocumented memory guarantees for this variable usage. For the nsm_handle.sm_count it might make a difference in following places: - nsm_release(): decrement in refcount_dec_and_lock() only provides RELEASE ordering, control dependency on success and holds a spin lock on success vs. fully ordered atomic counterpart. No change for the spin lock guarantees. 
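For readers following the series, the conversion pattern shared by all of these lockd patches reduces to the following minimal sketch (a generic object, not tree code):

#include <linux/refcount.h>
#include <linux/slab.h>

struct obj {
	refcount_t count;
};

static void obj_init(struct obj *o)
{
	refcount_set(&o->count, 1);	/* born holding one reference */
}

static void obj_get(struct obj *o)
{
	refcount_inc(&o->count);	/* WARNs and saturates on overflow */
}

static void obj_put(struct obj *o)
{
	/* true exactly once, when the last reference is dropped */
	if (refcount_dec_and_test(&o->count))
		kfree(o);
}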
Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Elena Reshetova Signed-off-by: Trond Myklebust --- fs/lockd/host.c | 2 +- fs/lockd/mon.c | 14 +++++++------- include/linux/lockd/lockd.h | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 11b6832277f6..7d6ab72bbe65 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -114,7 +114,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni, unsigned long now = jiffies; if (nsm != NULL) - atomic_inc(&nsm->sm_count); + refcount_inc(&nsm->sm_count); else { host = NULL; nsm = nsm_get_handle(ni->net, ni->sap, ni->salen, diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 96cfb2967ac7..654594ef4f94 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -191,7 +191,7 @@ void nsm_unmonitor(const struct nlm_host *host) struct nsm_res res; int status; - if (atomic_read(&nsm->sm_count) == 1 + if (refcount_read(&nsm->sm_count) == 1 && nsm->sm_monitored && !nsm->sm_sticky) { dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name); @@ -279,7 +279,7 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, if (unlikely(new == NULL)) return NULL; - atomic_set(&new->sm_count, 1); + refcount_set(&new->sm_count, 1); new->sm_name = (char *)(new + 1); memcpy(nsm_addr(new), sap, salen); new->sm_addrlen = salen; @@ -337,13 +337,13 @@ retry: cached = nsm_lookup_addr(&ln->nsm_handles, sap); if (cached != NULL) { - atomic_inc(&cached->sm_count); + refcount_inc(&cached->sm_count); spin_unlock(&nsm_lock); kfree(new); dprintk("lockd: found nsm_handle for %s (%s), " "cnt %d\n", cached->sm_name, cached->sm_addrbuf, - atomic_read(&cached->sm_count)); + refcount_read(&cached->sm_count)); return cached; } @@ -388,12 +388,12 @@ struct nsm_handle *nsm_reboot_lookup(const struct net *net, return cached; } - atomic_inc(&cached->sm_count); + refcount_inc(&cached->sm_count); spin_unlock(&nsm_lock); dprintk("lockd: host %s (%s) rebooted, cnt %d\n", cached->sm_name, cached->sm_addrbuf, - atomic_read(&cached->sm_count)); + refcount_read(&cached->sm_count)); return cached; } @@ -404,7 +404,7 @@ struct nsm_handle *nsm_reboot_lookup(const struct net *net, */ void nsm_release(struct nsm_handle *nsm) { - if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) { + if (refcount_dec_and_lock(&nsm->sm_count, &nsm_lock)) { list_del(&nsm->sm_link); spin_unlock(&nsm_lock); dprintk("lockd: destroyed nsm_handle for %s (%s)\n", diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 39dfeea20963..cded0ad9ca05 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -84,7 +84,7 @@ struct nlm_host { struct nsm_handle { struct list_head sm_link; - atomic_t sm_count; + refcount_t sm_count; char *sm_mon_name; char *sm_name; struct sockaddr_storage sm_addr; From 431f125b67d51a84b93095a7df6b3c30222753b1 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Wed, 29 Nov 2017 13:15:45 +0200 Subject: [PATCH 16/69] lockd: convert nlm_lockowner.count from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) 
Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable nlm_lockowner.count is used as pure reference counter. Convert it to refcount_t and fix up the operations. **Important note for maintainers: Some functions from refcount_t API defined in lib/refcount.c have different memory ordering guarantees than their atomic counterparts. The full comparison can be seen in https://lkml.org/lkml/2017/11/15/57 and it is hopefully soon in state to be merged to the documentation tree. Normally the differences should not matter since refcount_t provides enough guarantees to satisfy the refcounting use cases, but in some rare cases it might matter. Please double check that you don't have some undocumented memory guarantees for this variable usage. For the nlm_lockowner.count it might make a difference in following places: - nlm_put_lockowner(): decrement in refcount_dec_and_lock() only provides RELEASE ordering, control dependency on success and holds a spin lock on success vs. fully ordered atomic counterpart. No changes in spin lock guarantees. Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Elena Reshetova Signed-off-by: Trond Myklebust --- fs/lockd/clntproc.c | 6 +++--- include/linux/lockd/lockd.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 066ac313ae5c..112173dbea76 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -48,13 +48,13 @@ void nlmclnt_next_cookie(struct nlm_cookie *c) static struct nlm_lockowner *nlm_get_lockowner(struct nlm_lockowner *lockowner) { - atomic_inc(&lockowner->count); + refcount_inc(&lockowner->count); return lockowner; } static void nlm_put_lockowner(struct nlm_lockowner *lockowner) { - if (!atomic_dec_and_lock(&lockowner->count, &lockowner->host->h_lock)) + if (!refcount_dec_and_lock(&lockowner->count, &lockowner->host->h_lock)) return; list_del(&lockowner->list); spin_unlock(&lockowner->host->h_lock); @@ -105,7 +105,7 @@ static struct nlm_lockowner *nlm_find_lockowner(struct nlm_host *host, fl_owner_ res = __nlm_find_lockowner(host, owner); if (res == NULL && new != NULL) { res = new; - atomic_set(&new->count, 1); + refcount_set(&new->count, 1); new->owner = owner; new->pid = __nlm_alloc_pid(host); new->host = nlm_get_host(host); diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index cded0ad9ca05..86d012a76862 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -123,7 +123,7 @@ static inline struct sockaddr *nlm_srcaddr(const struct nlm_host *host) */ struct nlm_lockowner { struct list_head list; - atomic_t count; + refcount_t count; struct nlm_host *host; fl_owner_t owner; From fbca30c51350399f49b09421b5ee2ef8d00c05d8 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Wed, 29 Nov 2017 13:15:46 +0200 Subject: [PATCH 17/69] lockd: convert nlm_rqst.a_count from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) 
Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable nlm_rqst.a_count is used as pure reference counter. Convert it to refcount_t and fix up the operations. **Important note for maintainers: Some functions from refcount_t API defined in lib/refcount.c have different memory ordering guarantees than their atomic counterparts. The full comparison can be seen in https://lkml.org/lkml/2017/11/15/57 and it is hopefully soon in state to be merged to the documentation tree. Normally the differences should not matter since refcount_t provides enough guarantees to satisfy the refcounting use cases, but in some rare cases it might matter. Please double check that you don't have some undocumented memory guarantees for this variable usage. For the nlm_rqst.a_count it might make a difference in following places: - nlmclnt_release_call() and nlmsvc_release_call(): decrement in refcount_dec_and_test() only provides RELEASE ordering and control dependency on success vs. fully ordered atomic counterpart Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Elena Reshetova Signed-off-by: Trond Myklebust --- fs/lockd/clntproc.c | 8 ++++---- fs/lockd/svcproc.c | 2 +- include/linux/lockd/lockd.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 112173dbea76..a2c0dfc6fdc0 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -204,7 +204,7 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) for(;;) { call = kzalloc(sizeof(*call), GFP_KERNEL); if (call != NULL) { - atomic_set(&call->a_count, 1); + refcount_set(&call->a_count, 1); locks_init_lock(&call->a_args.lock.fl); locks_init_lock(&call->a_res.lock.fl); call->a_host = nlm_get_host(host); @@ -222,7 +222,7 @@ void nlmclnt_release_call(struct nlm_rqst *call) { const struct nlmclnt_operations *nlmclnt_ops = call->a_host->h_nlmclnt_ops; - if (!atomic_dec_and_test(&call->a_count)) + if (!refcount_dec_and_test(&call->a_count)) return; if (nlmclnt_ops && nlmclnt_ops->nlmclnt_release_call) nlmclnt_ops->nlmclnt_release_call(call->a_callback_data); @@ -678,7 +678,7 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) goto out; } - atomic_inc(&req->a_count); + refcount_inc(&req->a_count); status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops); if (status < 0) @@ -769,7 +769,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl nlmclnt_setlockargs(req, fl); req->a_args.block = block; - atomic_inc(&req->a_count); + refcount_inc(&req->a_count); status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req, NLMPROC_CANCEL, &nlmclnt_cancel_ops); if (status == 0 && req->a_res.status == nlm_lck_denied) diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 0d670c5c378f..ea77c66d3cc3 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -295,7 +295,7 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data) void nlmsvc_release_call(struct nlm_rqst *call) { - if (!atomic_dec_and_test(&call->a_count)) + if (!refcount_dec_and_test(&call->a_count)) return; nlmsvc_release_host(call->a_host); kfree(call); diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 86d012a76862..4fd95dbeb52f 100644 --- a/include/linux/lockd/lockd.h +++ 
b/include/linux/lockd/lockd.h @@ -137,7 +137,7 @@ struct nlm_wait; */ #define NLMCLNT_OHSIZE ((__NEW_UTS_LEN) + 10u) struct nlm_rqst { - atomic_t a_count; + refcount_t a_count; unsigned int a_flags; /* initial RPC task flags */ struct nlm_host * a_host; /* host handle */ struct nlm_args a_args; /* arguments */ From fb455baad6fc4de77d762e89dae75c2e2aa98559 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2017 14:13:30 -0500 Subject: [PATCH 18/69] nfs: Define NFS_RDMA_PORT The NFS/RDMA port assignment is specified in Section 9 of RFC 8267. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/uapi/linux/nfs.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/nfs.h b/include/uapi/linux/nfs.h index 057d22a48416..946cb62d64b0 100644 --- a/include/uapi/linux/nfs.h +++ b/include/uapi/linux/nfs.h @@ -12,6 +12,7 @@ #define NFS_PROGRAM 100003 #define NFS_PORT 2049 +#define NFS_RDMA_PORT 20049 #define NFS_MAXDATA 8192 #define NFS_MAXPATHLEN 1024 #define NFS_MAXNAMLEN 255 From 530ea4219231e62341f79a5517d7b4f12ec3b74f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2017 14:13:38 -0500 Subject: [PATCH 19/69] nfs: Referrals should use the same proto setting as their parent Helen Chao noticed that when a user traverses a referral on an NFS/RDMA mount, the resulting submount always uses TCP. This behavior does not match the vers= setting when traversing a referral (vers=4.1 is preserved). It also does not match the behavior of crossing from the pseudofs into a real filesystem (proto=rdma is preserved in that case). The Linux NFS client does not currently support the fs_locations_info attribute. The situation is similar for all NFSv4 servers I know of. Therefore until the community has broad support for fs_locations_info, when following a referral: - First try to connect with RPC-over-RDMA. This will fail quickly if the client has no RDMA-capable interfaces. - If connecting with RPC-over-RDMA fails, or the RPC-over-RDMA transport is not available, use TCP. Reported-by: Helen Chao Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs4client.c | 23 ++++++++++++++++++++--- fs/nfs/nfs4namespace.c | 2 -- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 65a7e5da508c..507f1f534b16 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -1123,19 +1123,36 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, /* Initialise the client representation from the parent server */ nfs_server_copy_userdata(server, parent_server); - /* Get a client representation. 
- * Note: NFSv4 always uses TCP, */ + /* Get a client representation */ +#ifdef CONFIG_SUNRPC_XPRT_RDMA + rpc_set_port(data->addr, NFS_RDMA_PORT); error = nfs4_set_client(server, data->hostname, data->addr, data->addrlen, parent_client->cl_ipaddr, - rpc_protocol(parent_server->client), + XPRT_TRANSPORT_RDMA, + parent_server->client->cl_timeout, + parent_client->cl_mvops->minor_version, + parent_client->cl_net); + if (!error) + goto init_server; +#endif /* CONFIG_SUNRPC_XPRT_RDMA */ + + rpc_set_port(data->addr, NFS_PORT); + error = nfs4_set_client(server, data->hostname, + data->addr, + data->addrlen, + parent_client->cl_ipaddr, + XPRT_TRANSPORT_TCP, parent_server->client->cl_timeout, parent_client->cl_mvops->minor_version, parent_client->cl_net); if (error < 0) goto error; +#ifdef CONFIG_SUNRPC_XPRT_RDMA +init_server: +#endif error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor); if (error < 0) goto error; diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 8c3f327d858d..24f06dcc2b08 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -270,8 +270,6 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, if (mountdata->addrlen == 0) continue; - rpc_set_port(mountdata->addr, NFS_PORT); - memcpy(page2, buf->data, buf->len); page2[buf->len] = '\0'; mountdata->hostname = page2; From 801b564309bafea3cfd71f9c535d88f4d58d2d34 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2017 14:13:46 -0500 Subject: [PATCH 20/69] nfs: Update server port after referral or migration After traversing a referral or recovering from a migration event, ensure that the server port reported in /proc/mounts is updated to the correct port setting for the new submount. Reported-by: Helen Chao Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs4client.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 507f1f534b16..04612c24d394 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -861,6 +861,7 @@ static int nfs4_set_client(struct nfs_server *server, set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); if (test_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status)) set_bit(NFS_CS_TSM_POSSIBLE, &cl_init.init_flags); + server->port = rpc_get_port(addr); /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init); From 024fbf9c2eae6fa10844a9a3588a3d61c89683e8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2017 14:13:55 -0500 Subject: [PATCH 21/69] SUNRPC: Remove rpc_protocol() Since nfs4_create_referral_server was the only call site of rpc_protocol, rpc_protocol can now be removed. 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 1 - net/sunrpc/clnt.c | 16 ---------------- 2 files changed, 17 deletions(-) diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 71c237e8240e..ed761f751ecb 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -179,7 +179,6 @@ struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int rpc_restart_call_prepare(struct rpc_task *); int rpc_restart_call(struct rpc_task *); void rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int); -int rpc_protocol(struct rpc_clnt *); struct net * rpc_net_ns(struct rpc_clnt *); size_t rpc_max_payload(struct rpc_clnt *); size_t rpc_max_bc_payload(struct rpc_clnt *); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index e2a4184f3c5d..6e432ecd7f99 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1375,22 +1375,6 @@ rpc_setbufsize(struct rpc_clnt *clnt, unsigned int sndsize, unsigned int rcvsize } EXPORT_SYMBOL_GPL(rpc_setbufsize); -/** - * rpc_protocol - Get transport protocol number for an RPC client - * @clnt: RPC client to query - * - */ -int rpc_protocol(struct rpc_clnt *clnt) -{ - int protocol; - - rcu_read_lock(); - protocol = rcu_dereference(clnt->cl_xprt)->prot; - rcu_read_unlock(); - return protocol; -} -EXPORT_SYMBOL_GPL(rpc_protocol); - /** * rpc_net_ns - Get the network namespace for this RPC client * @clnt: RPC client to query From 3d188805f83250409fc251bfb0a699810086885b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 14 Jan 2018 15:47:06 -0500 Subject: [PATCH 22/69] SUNRPC: Chunk reading of replies from the server Read the TCP data in chunks of at most RPC_TCP_READ_CHUNK_SZ (1.5MB) so that we do not hog the socket lock. Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 6d0cc3b8f932..620be573d78b 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -52,6 +52,8 @@ #include "sunrpc.h" +#define RPC_TCP_READ_CHUNK_SZ (3*512*1024) + static void xs_close(struct rpc_xprt *xprt); static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, struct socket *sock); @@ -1479,6 +1481,7 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns .offset = offset, .count = len, }; + size_t ret; dprintk("RPC: xs_tcp_data_recv started\n"); do { @@ -1507,9 +1510,14 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns /* Skip over any trailing bytes on short reads */ xs_tcp_read_discard(transport, &desc); } while (desc.count); + ret = len - desc.count; + if (ret < rd_desc->count) + rd_desc->count -= ret; + else + rd_desc->count = 0; trace_xs_tcp_data_recv(transport); dprintk("RPC: xs_tcp_data_recv done\n"); - return len - desc.count; + return ret; } static void xs_tcp_data_receive(struct sock_xprt *transport) { struct rpc_xprt *xprt = &transport->xprt; struct sock *sk; read_descriptor_t rd_desc = { - .count = 2*1024*1024, .arg.data = xprt, }; unsigned long total = 0; @@ -1531,16 +1538,16 @@ static void xs_tcp_data_receive(struct sock_xprt *transport) /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */ for (loop = 0; loop < 64; loop++) { + rd_desc.count = RPC_TCP_READ_CHUNK_SZ; lock_sock(sk); read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); - if (read <= 0) {
clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); release_sock(sk); break; } release_sock(sk); total += read; - rd_desc.count = 65536; } if (test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) queue_work(xprtiod_workqueue, &transport->recv_worker); From 0af3442af7c4c5b06e72708d7dd937974d1b4114 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 14 Jan 2018 15:28:29 -0500 Subject: [PATCH 23/69] SUNRPC: Add explicit rescheduling points in the receive path When reading the reply from the server, insert an explicit cond_resched() to avoid starving higher priority tasks. Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 620be573d78b..18803021f242 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1005,6 +1005,7 @@ static void xs_local_data_receive(struct sock_xprt *transport) struct sock *sk; int err; +restart: mutex_lock(&transport->recv_mutex); sk = transport->inet; if (sk == NULL) @@ -1018,6 +1019,11 @@ static void xs_local_data_receive(struct sock_xprt *transport) } if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) break; + if (need_resched()) { + mutex_unlock(&transport->recv_mutex); + cond_resched(); + goto restart; + } } out: mutex_unlock(&transport->recv_mutex); @@ -1096,6 +1102,7 @@ static void xs_udp_data_receive(struct sock_xprt *transport) struct sock *sk; int err; +restart: mutex_lock(&transport->recv_mutex); sk = transport->inet; if (sk == NULL) @@ -1109,6 +1116,11 @@ static void xs_udp_data_receive(struct sock_xprt *transport) } if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) break; + if (need_resched()) { + mutex_unlock(&transport->recv_mutex); + cond_resched(); + goto restart; + } } out: mutex_unlock(&transport->recv_mutex); @@ -1528,16 +1540,16 @@ static void xs_tcp_data_receive(struct sock_xprt *transport) .arg.data = xprt, }; unsigned long total = 0; - int loop; int read = 0; +restart: mutex_lock(&transport->recv_mutex); sk = transport->inet; if (sk == NULL) goto out; /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */ - for (loop = 0; loop < 64; loop++) { + for (;;) { rd_desc.count = RPC_TCP_READ_CHUNK_SZ; lock_sock(sk); read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); @@ -1548,6 +1560,11 @@ static void xs_tcp_data_receive(struct sock_xprt *transport) } release_sock(sk); total += read; + if (need_resched()) { + mutex_unlock(&transport->recv_mutex); + cond_resched(); + goto restart; + } } if (test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) queue_work(xprtiod_workqueue, &transport->recv_worker); From f96adf1ea08a37e3739b1f929ce8318eee63b1f4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 16 Jan 2018 10:33:14 +0100 Subject: [PATCH 24/69] nfs: remove unused label in nfs_encode_fh() The only reference to the label got removed, so we now get a harmless compiler warning: fs/nfs/export.c: In function 'nfs_encode_fh': fs/nfs/export.c:58:1: error: label 'out' defined but not used [-Werror=unused-label] Fixes: aaa150089465 ("nfs: remove dead code from nfs_encode_fh()") Signed-off-by: Arnd Bergmann Signed-off-by: Trond Myklebust --- fs/nfs/export.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 2b80a6652818..ab5de3246c5c 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -55,7 +55,6 @@ nfs_encode_fh(struct inode *inode, __u32 *p, int *max_len, struct inode *parent) p[len - 1] = 0; /* Padding 
*/ nfs_copy_fh(clnt_fh, server_fh); *max_len = len; -out: dprintk("%s: result fh fileid %llu mode %u size %d\n", __func__, NFS_FILEID(inode), inode->i_mode, *max_len); return *max_len; From 1b8d97b0a837beaf48a8449955b52c650a7114b4 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 16 Jan 2018 10:08:00 -0500 Subject: [PATCH 25/69] NFS: commit direct writes even if they fail partially If some of the WRITE calls making up an O_DIRECT write syscall fail, we neglect to commit, even if some of the WRITEs succeed. We also depend on the commit code to free the reference count on the nfs_page taken in the "if (request_commit)" case at the end of nfs_direct_write_completion(). The problem was originally noticed because ENOSPC's encountered partway through a write would result in a closed file being sillyrenamed when it should have been unlinked. Signed-off-by: J. Bruce Fields Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index d2972d537469..8c10b0562e75 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -775,10 +775,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) spin_lock(&dreq->lock); - if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { - dreq->flags = 0; + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) dreq->error = hdr->error; - } if (dreq->error == 0) { nfs_direct_good_bytes(dreq, hdr); if (nfs_write_need_commit(hdr)) { From 03ac1a76ce5e5a6052a421e1d6a5c97778e88a8c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 14 Dec 2017 20:56:01 -0500 Subject: [PATCH 26/69] xprtrdma: Fix buffer leak after transport set up failure This leak has been around forever, and is exceptionally rare. EINVAL causes mount to fail with "an incorrect mount option was specified" although it's not likely that one of the mount options is incorrect. Instead, return ENODEV in this case, as this appears to be an issue with system or device configuration rather than a specific mount option. Some obsolete comments are also removed. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 6ee1ad8978f3..7f9b62822807 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -414,20 +414,10 @@ xprt_setup_rdma(struct xprt_create *args) if (rc) goto out2; - /* - * Allocate pre-registered send and receive buffers for headers and - * any inline data. Also specify any padding which will be provided - * from a preregistered zero buffer. - */ rc = rpcrdma_buffer_create(new_xprt); if (rc) goto out3; - /* - * Register a callback for connection events. This is necessary because - * connection loss notification is async. We also catch connection loss - * when reaping receives. 
- */ INIT_DELAYED_WORK(&new_xprt->rx_connect_worker, xprt_rdma_connect_worker); @@ -448,8 +438,9 @@ xprt_setup_rdma(struct xprt_create *args) return xprt; out4: + rpcrdma_buffer_destroy(&new_xprt->rx_buf); xprt_rdma_free_addresses(xprt); - rc = -EINVAL; + rc = -ENODEV; out3: rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia); out2: From d698c4a02ee02053bbebe051322ff427a2dad56a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 14 Dec 2017 20:56:09 -0500 Subject: [PATCH 27/69] xprtrdma: Fix backchannel allocation of extra rpcrdma_reps The backchannel code uses rpcrdma_recv_buffer_put to add new reps to the free rep list. This also decrements rb_recv_count, which spoofs the receive overrun logic in rpcrdma_buffer_get_rep. Commit 9b06688bc3b9 ("xprtrdma: Fix additional uses of spin_lock_irqsave(rb_lock)") replaced the original open-coded list_add with a call to rpcrdma_recv_buffer_put(), but then a year later, commit 05c974669ece ("xprtrdma: Fix receive buffer accounting") added rep accounting to rpcrdma_recv_buffer_put. It was an oversight to let the backchannel continue to use this function. To fix this, let's combine the "add to free list" logic with rpcrdma_create_rep. Also, do not allocate RPCRDMA_MAX_BC_REQUESTS rpcrdma_reps in rpcrdma_buffer_create and then allocate additional rpcrdma_reps in rpcrdma_bc_setup_reps. Allocating the extra reps during backchannel set-up is sufficient. Fixes: 05c974669ece ("xprtrdma: Fix receive buffer accounting") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 12 ++---------- net/sunrpc/xprtrdma/verbs.c | 32 ++++++++++++++++++------------- net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 3 files changed, 22 insertions(+), 24 deletions(-) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 8b818bb3518a..256c67b433c1 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -74,21 +74,13 @@ out_fail: static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt, unsigned int count) { - struct rpcrdma_rep *rep; int rc = 0; while (count--) { - rep = rpcrdma_create_rep(r_xprt); - if (IS_ERR(rep)) { - pr_err("RPC: %s: reply buffer alloc failed\n", - __func__); - rc = PTR_ERR(rep); + rc = rpcrdma_create_rep(r_xprt); + if (rc) break; - } - - rpcrdma_recv_buffer_put(rep); } - return rc; } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 8607c029c0dd..6eecd9714051 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1093,10 +1093,17 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) return req; } -struct rpcrdma_rep * +/** + * rpcrdma_create_rep - Allocate an rpcrdma_rep object + * @r_xprt: controlling transport + * + * Returns 0 on success or a negative errno on failure.
+ */ +int rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_rep *rep; int rc; @@ -1121,12 +1128,18 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) rep->rr_recv_wr.wr_cqe = &rep->rr_cqe; rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; rep->rr_recv_wr.num_sge = 1; - return rep; + + spin_lock(&buf->rb_lock); + list_add(&rep->rr_list, &buf->rb_recv_bufs); + spin_unlock(&buf->rb_lock); + return 0; out_free: kfree(rep); out: - return ERR_PTR(rc); + dprintk("RPC: %s: reply buffer %d alloc failed\n", + __func__, rc); + return rc; } int @@ -1167,17 +1180,10 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) } INIT_LIST_HEAD(&buf->rb_recv_bufs); - for (i = 0; i < buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS; i++) { - struct rpcrdma_rep *rep; - - rep = rpcrdma_create_rep(r_xprt); - if (IS_ERR(rep)) { - dprintk("RPC: %s: reply buffer %d alloc failed\n", - __func__, i); - rc = PTR_ERR(rep); + for (i = 0; i <= buf->rb_max_requests; i++) { + rc = rpcrdma_create_rep(r_xprt); + if (rc) goto out; - } - list_add(&rep->rr_list, &buf->rb_recv_bufs); } rc = rpcrdma_sendctxs_create(r_xprt); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 1342f743f1c4..3b63e61feae2 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -564,8 +564,8 @@ int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_rep *); * Buffer calls - xprtrdma/verbs.c */ struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *); -struct rpcrdma_rep *rpcrdma_create_rep(struct rpcrdma_xprt *); void rpcrdma_destroy_req(struct rpcrdma_req *); +int rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt); int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf); From 42b9f5c58aa8c59c91ead0254f0c193e3438b020 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 14 Dec 2017 20:56:18 -0500 Subject: [PATCH 28/69] xprtrdma: Eliminate unnecessary lock cycle in xprt_rdma_send_request The rpcrdma_req is not shared yet, and its associated Send hasn't been posted, thus RMW should be safe. There's no need for the expense of a lock cycle here. Fixes: 0ba6f37012db ("xprtrdma: Refactor rpcrdma_deferred_completion") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 7f9b62822807..7db063f8dd8a 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -735,7 +735,7 @@ xprt_rdma_send_request(struct rpc_task *task) goto drop_connection; req->rl_connect_cookie = xprt->connect_cookie; - set_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); + __set_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) goto drop_connection; From c34416182f041e4107e531c6083c3df9a8af96f7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 14 Dec 2017 20:56:26 -0500 Subject: [PATCH 29/69] xprtrdma: Per-mode handling for Remote Invalidation Refactoring change: Remote Invalidation is particular to the memory registration mode that is in use. Use a callout instead of a generic function to handle Remote Invalidation. This gets rid of the 8-byte flags field in struct rpcrdma_mw, of which only a single bit flag has been allocated.
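As a sketch of the resulting flow (names as in the diff below), the deferred completion path now checks the Receive completion's flags and, only when the peer actually invalidated an rkey, dispatches through the per-mode vtable:

	/* Sketch: rpcrdma_deferred_completion after this change */
	if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
		r_xprt->rx_ia.ri_ops->ro_reminv(rep, &req->rl_registered);

frwr_op_reminv then walks the MR list once and releases the single remotely invalidated MR on the spot, instead of flagging it for a later pass.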
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 24 +++++++++++++++++++++--- net/sunrpc/xprtrdma/rpc_rdma.c | 24 ++++-------------------- net/sunrpc/xprtrdma/verbs.c | 1 - net/sunrpc/xprtrdma/xprt_rdma.h | 8 ++------ 4 files changed, 27 insertions(+), 30 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 773e66e10a15..e1f73037b554 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -450,6 +450,26 @@ out_senderr: return ERR_PTR(-ENOTCONN); } +/* Handle a remotely invalidated mw on the @mws list + */ +static void +frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mws) +{ + struct rpcrdma_mw *mw; + + list_for_each_entry(mw, mws, mw_list) + if (mw->mw_handle == rep->rr_inv_rkey) { + struct rpcrdma_xprt *r_xprt = mw->mw_xprt; + + list_del(&mw->mw_list); + mw->frmr.fr_state = FRMR_IS_INVALID; + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mw->mw_sg, mw->mw_nents, mw->mw_dir); + rpcrdma_put_mw(r_xprt, mw); + break; /* only one invalidated MR per RPC */ + } +} + /* Invalidate all memory regions that were registered for "req". * * Sleeps until it is safe for the host CPU to access the @@ -478,9 +498,6 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) list_for_each_entry(mw, mws, mw_list) { mw->frmr.fr_state = FRMR_IS_INVALID; - if (mw->mw_flags & RPCRDMA_MW_F_RI) - continue; - f = &mw->frmr; dprintk("RPC: %s: invalidating frmr %p\n", __func__, f); @@ -553,6 +570,7 @@ reset_mrs: const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_map = frwr_op_map, + .ro_reminv = frwr_op_reminv, .ro_unmap_sync = frwr_op_unmap_sync, .ro_recover_mr = frwr_op_recover_mr, .ro_open = frwr_op_open, diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index a3f2ab283aeb..d7463bce3df3 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -984,24 +984,6 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) return fixup_copy_count; } -/* Caller must guarantee @rep remains stable during this call. - */ -static void -rpcrdma_mark_remote_invalidation(struct list_head *mws, - struct rpcrdma_rep *rep) -{ - struct rpcrdma_mw *mw; - - if (!(rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)) - return; - - list_for_each_entry(mw, mws, mw_list) - if (mw->mw_handle == rep->rr_inv_rkey) { - mw->mw_flags = RPCRDMA_MW_F_RI; - break; /* only one invalidated MR per RPC */ - } -} - /* By convention, backchannel calls arrive via rdma_msg type * messages, and never populate the chunk lists. 
This makes * the RPC/RDMA header small and fixed in size, so it is @@ -1339,9 +1321,11 @@ void rpcrdma_deferred_completion(struct work_struct *work) struct rpcrdma_rep *rep = container_of(work, struct rpcrdma_rep, rr_work); struct rpcrdma_req *req = rpcr_to_rdmar(rep->rr_rqst); + struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; - rpcrdma_mark_remote_invalidation(&req->rl_registered, rep); - rpcrdma_release_rqst(rep->rr_rxprt, req); + if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) + r_xprt->rx_ia.ri_ops->ro_reminv(rep, &req->rl_registered); + rpcrdma_release_rqst(r_xprt, req); rpcrdma_complete_rqst(rep); } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 6eecd9714051..1cf1eb42786e 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1307,7 +1307,6 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt) if (!mw) goto out_nomws; - mw->mw_flags = 0; return mw; out_nomws: diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 3b63e61feae2..e787dda3b0f5 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -272,7 +272,6 @@ struct rpcrdma_mw { struct scatterlist *mw_sg; int mw_nents; enum dma_data_direction mw_dir; - unsigned long mw_flags; union { struct rpcrdma_fmr fmr; struct rpcrdma_frmr frmr; @@ -284,11 +283,6 @@ struct rpcrdma_mw { struct list_head mw_all; }; -/* mw_flags */ -enum { - RPCRDMA_MW_F_RI = 1, -}; - /* * struct rpcrdma_req -- structure central to the request/reply sequence. * @@ -485,6 +479,8 @@ struct rpcrdma_memreg_ops { (*ro_map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool, struct rpcrdma_mw **); + void (*ro_reminv)(struct rpcrdma_rep *rep, + struct list_head *mws); void (*ro_unmap_sync)(struct rpcrdma_xprt *, struct list_head *); void (*ro_recover_mr)(struct rpcrdma_mw *); From 3f0e3edd6ad209ee18a3ea5c2a6e2046d2a2e783 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 14 Dec 2017 20:56:34 -0500 Subject: [PATCH 30/69] xprtrdma: Remove ri_reminv_expected Clean up. Commit b5f0afbea4f2 ("xprtrdma: Per-connection pad optimization") should have removed this. 
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 2 -- net/sunrpc/xprtrdma/xprt_rdma.h | 1 - 2 files changed, 3 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 1cf1eb42786e..4a9b6f811859 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -192,7 +192,6 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, unsigned int rsize, wsize; /* Default settings for RPC-over-RDMA Version One */ - r_xprt->rx_ia.ri_reminv_expected = false; r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize; rsize = RPCRDMA_V1_DEF_INLINE_SIZE; wsize = RPCRDMA_V1_DEF_INLINE_SIZE; @@ -200,7 +199,6 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, if (pmsg && pmsg->cp_magic == rpcrdma_cmp_magic && pmsg->cp_version == RPCRDMA_CMP_VERSION) { - r_xprt->rx_ia.ri_reminv_expected = true; r_xprt->rx_ia.ri_implicit_roundup = true; rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size); wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index e787dda3b0f5..80ea3db054d8 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -77,7 +77,6 @@ struct rpcrdma_ia { unsigned int ri_max_inline_write; unsigned int ri_max_inline_read; unsigned int ri_max_send_sges; - bool ri_reminv_expected; bool ri_implicit_roundup; enum ib_mr_type ri_mrtype; unsigned long ri_flags; From 104927042cde1f86e9f3959ba9c8b4dae1616d69 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 14 Dec 2017 20:56:42 -0500 Subject: [PATCH 31/69] xprtrdma: Remove unused padding variables Clean up. Remove fields that should have been removed by commit b3221d6a53c4 ("xprtrdma: Remove logic that constructs RDMA_MSGP type calls"). 
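Note that the user-visible sysctl survives the cleanup: removing the rdma_inline_write_padding entry outright could break anything that still reads or sets it, so its ctl_table entry is kept but repointed at a write-only dummy. A trimmed sketch of the pattern, per the diff below:

	static unsigned int dummy;

	/* Tunable kept for compatibility; its value is now ignored. */
	{
		.procname	= "rdma_inline_write_padding",
		.data		= &dummy,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
	},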
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 9 +++------ net/sunrpc/xprtrdma/xprt_rdma.h | 1 - 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 7db063f8dd8a..dc9000d087a9 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -67,7 +67,6 @@ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; -static unsigned int xprt_rdma_inline_write_padding; unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; int xprt_rdma_pad_optimize; @@ -81,6 +80,7 @@ static unsigned int zero; static unsigned int max_padding = PAGE_SIZE; static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS; static unsigned int max_memreg = RPCRDMA_LAST - 1; +static unsigned int dummy; static struct ctl_table_header *sunrpc_table_header; @@ -114,7 +114,7 @@ static struct ctl_table xr_tunables_table[] = { }, { .procname = "rdma_inline_write_padding", - .data = &xprt_rdma_inline_write_padding, + .data = &dummy, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -387,8 +387,6 @@ xprt_setup_rdma(struct xprt_create *args) if (cdata.inline_rsize > cdata.rsize) cdata.inline_rsize = cdata.rsize; - cdata.padding = xprt_rdma_inline_write_padding; - /* * Create new transport instance, which includes initialized * o ia @@ -895,8 +893,7 @@ int xprt_rdma_init(void) "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n", xprt_rdma_slot_table_entries, xprt_rdma_max_inline_read, xprt_rdma_max_inline_write); - dprintk("\tPadding %d\n\tMemreg %d\n", - xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy); + dprintk("\tPadding 0\n\tMemreg %d\n", xprt_rdma_memreg_strategy); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) if (!sunrpc_table_header) diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 80ea3db054d8..375df3d3ff59 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -437,7 +437,6 @@ struct rpcrdma_create_data_internal { unsigned int wsize; /* mount wsize - max write hdr+data */ unsigned int inline_rsize; /* max non-rdma read data payload */ unsigned int inline_wsize; /* max non-rdma write data payload */ - unsigned int padding; /* non-rdma write header padding */ }; /* From d461f1f2fb91b5629019b3b405528bc88c49f863 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 14 Dec 2017 20:56:50 -0500 Subject: [PATCH 32/69] xprtrdma: Initialize the xprt address string array earlier This makes the address strings available for debugging messages in earlier stages of transport set up. The first benefit is to get rid of the single-use rep_remote_addr field, saving 128+ bytes in struct rpcrdma_ep. 
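A second benefit shows up at the call sites: with the strings formatted for the lifetime of the transport, connection-event handlers no longer need a sockaddr in scope to identify the peer. Taken from the rpcrdma_conn_upcall hunk below:

	pr_info("rpcrdma: removing device %s for %s:%s\n",
		ia->ri_device->name,
		rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt));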
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 5 ++--- net/sunrpc/xprtrdma/verbs.c | 16 +++++++--------- net/sunrpc/xprtrdma/xprt_rdma.h | 13 ++++++++++++- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index dc9000d087a9..f6d171e15d7d 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -373,6 +373,7 @@ xprt_setup_rdma(struct xprt_create *args) if (rpc_get_port(sap)) xprt_set_bound(xprt); + xprt_rdma_format_addresses(xprt, sap); cdata.max_requests = xprt->max_reqs; @@ -405,7 +406,6 @@ xprt_setup_rdma(struct xprt_create *args) */ new_xprt->rx_data = cdata; new_ep = &new_xprt->rx_ep; - new_ep->rep_remote_addr = cdata.addr; rc = rpcrdma_ep_create(&new_xprt->rx_ep, &new_xprt->rx_ia, &new_xprt->rx_data); @@ -419,7 +419,6 @@ xprt_setup_rdma(struct xprt_create *args) INIT_DELAYED_WORK(&new_xprt->rx_connect_worker, xprt_rdma_connect_worker); - xprt_rdma_format_addresses(xprt, sap); xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt); if (xprt->max_payload == 0) goto out4; @@ -437,13 +436,13 @@ xprt_setup_rdma(struct xprt_create *args) out4: rpcrdma_buffer_destroy(&new_xprt->rx_buf); - xprt_rdma_free_addresses(xprt); rc = -ENODEV; out3: rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia); out2: rpcrdma_ia_close(&new_xprt->rx_ia); out1: + xprt_rdma_free_addresses(xprt); xprt_free(xprt); return ERR_PTR(rc); } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 4a9b6f811859..0b4d6a3f6d02 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -219,9 +219,6 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) struct rpcrdma_xprt *xprt = id->context; struct rpcrdma_ia *ia = &xprt->rx_ia; struct rpcrdma_ep *ep = &xprt->rx_ep; -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr; -#endif int connstate = 0; switch (event->event) { @@ -244,9 +241,9 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) break; case RDMA_CM_EVENT_DEVICE_REMOVAL: #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - pr_info("rpcrdma: removing device %s for %pIS:%u\n", + pr_info("rpcrdma: removing device %s for %s:%s\n", ia->ri_device->name, - sap, rpc_get_port(sap)); + rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt)); #endif set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags); ep->rep_connected = -ENODEV; @@ -269,8 +266,8 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) connstate = -ENETDOWN; goto connected; case RDMA_CM_EVENT_REJECTED: - dprintk("rpcrdma: connection to %pIS:%u rejected: %s\n", - sap, rpc_get_port(sap), + dprintk("rpcrdma: connection to %s:%s rejected: %s\n", + rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt), rdma_reject_msg(id, event->status)); connstate = -ECONNREFUSED; if (event->status == IB_CM_REJ_STALE_CONN) @@ -285,8 +282,9 @@ connected: wake_up_all(&ep->rep_connect_wait); /*FALLTHROUGH*/ default: - dprintk("RPC: %s: %pIS:%u on %s/%s (ep 0x%p): %s\n", - __func__, sap, rpc_get_port(sap), + dprintk("RPC: %s: %s:%s on %s/%s (ep 0x%p): %s\n", + __func__, + rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt), ia->ri_device->name, ia->ri_ops->ro_displayname, ep, rdma_event_msg(event->event)); break; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 375df3d3ff59..0b28026e1502 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -100,7 +100,6 @@ 
struct rpcrdma_ep { wait_queue_head_t rep_connect_wait; struct rpcrdma_connect_private rep_cm_private; struct rdma_conn_param rep_remote_cma; - struct sockaddr_storage rep_remote_addr; struct delayed_work rep_connect_worker; }; @@ -519,6 +518,18 @@ struct rpcrdma_xprt { #define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, rx_xprt) #define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data) +static inline const char * +rpcrdma_addrstr(const struct rpcrdma_xprt *r_xprt) +{ + return r_xprt->rx_xprt.address_strings[RPC_DISPLAY_ADDR]; +} + +static inline const char * +rpcrdma_portstr(const struct rpcrdma_xprt *r_xprt) +{ + return r_xprt->rx_xprt.address_strings[RPC_DISPLAY_PORT]; +} + /* Setting this to 0 ensures interoperability with early servers. * Setting this to 1 enhances certain unaligned read/write performance. * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */ From dd229cee4ed2617ccddc0937608728cd87c934c2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 14 Dec 2017 20:56:58 -0500 Subject: [PATCH 33/69] xprtrdma: Remove another sockaddr_storage field (cdata::addr) Save more space in struct rpcrdma_xprt by removing the redundant "addr" field from struct rpcrdma_create_data_internal. Wherever we have rpcrdma_xprt, we also have the rpc_xprt, which has a sockaddr_storage field with the same content. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 8 ++------ net/sunrpc/xprtrdma/verbs.c | 20 +++++++++----------- net/sunrpc/xprtrdma/xprt_rdma.h | 3 +-- 3 files changed, 12 insertions(+), 19 deletions(-) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index f6d171e15d7d..8ba0aa8c566f 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -361,9 +361,7 @@ xprt_setup_rdma(struct xprt_create *args) /* * Set up RDMA-specific connect data. */ - - sap = (struct sockaddr *)&cdata.addr; - memcpy(sap, args->dstaddr, args->addrlen); + sap = args->dstaddr; /* Ensure xprt->addr holds valid server TCP (not RDMA) * address, for any side protocols which peek at it */ @@ -397,7 +395,7 @@ xprt_setup_rdma(struct xprt_create *args) new_xprt = rpcx_to_rdmax(xprt); - rc = rpcrdma_ia_open(new_xprt, sap); + rc = rpcrdma_ia_open(new_xprt); if (rc) goto out1; @@ -483,8 +481,6 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port) sap = (struct sockaddr_in *)&xprt->addr; sap->sin_port = htons(port); - sap = (struct sockaddr_in *)&rpcx_to_rdmad(xprt).addr; - sap->sin_port = htons(port); dprintk("RPC: %s: %u\n", __func__, port); } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 0b4d6a3f6d02..d6c737d4c36b 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -294,8 +294,7 @@ connected: } static struct rdma_cm_id * -rpcrdma_create_id(struct rpcrdma_xprt *xprt, - struct rpcrdma_ia *ia, struct sockaddr *addr) +rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) { unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1; struct rdma_cm_id *id; @@ -314,7 +313,9 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, } ia->ri_async_rc = -ETIMEDOUT; - rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT); + rc = rdma_resolve_addr(id, NULL, + (struct sockaddr *)&xprt->rx_xprt.addr, + RDMA_RESOLVE_TIMEOUT); if (rc) { dprintk("RPC: %s: rdma_resolve_addr() failed %i\n", __func__, rc); @@ -361,19 +362,18 @@ out: /** * rpcrdma_ia_open - Open and initialize an Interface Adapter. 
- * @xprt: controlling transport - * @addr: IP address of remote peer + * @xprt: transport with IA to (re)initialize * * Returns 0 on success, negative errno if an appropriate * Interface Adapter could not be found and opened. */ int -rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr) +rpcrdma_ia_open(struct rpcrdma_xprt *xprt) { struct rpcrdma_ia *ia = &xprt->rx_ia; int rc; - ia->ri_id = rpcrdma_create_id(xprt, ia, addr); + ia->ri_id = rpcrdma_create_id(xprt, ia); if (IS_ERR(ia->ri_id)) { rc = PTR_ERR(ia->ri_id); goto out_err; @@ -649,13 +649,12 @@ static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { - struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr; int rc, err; pr_info("%s: r_xprt = %p\n", __func__, r_xprt); rc = -EHOSTUNREACH; - if (rpcrdma_ia_open(r_xprt, sap)) + if (rpcrdma_ia_open(r_xprt)) goto out1; rc = -ENOMEM; @@ -687,7 +686,6 @@ static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { - struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr; struct rdma_cm_id *id, *old; int err, rc; @@ -696,7 +694,7 @@ rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, rpcrdma_ep_disconnect(ep, ia); rc = -EHOSTUNREACH; - id = rpcrdma_create_id(r_xprt, ia, sap); + id = rpcrdma_create_id(r_xprt, ia); if (IS_ERR(id)) goto out; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 0b28026e1502..7c09e2ae2542 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -430,7 +430,6 @@ struct rpcrdma_buffer { * This data should be set with mount options */ struct rpcrdma_create_data_internal { - struct sockaddr_storage addr; /* RDMA server address */ unsigned int max_requests; /* max requests (slots) in flight */ unsigned int rsize; /* mount rsize - max read hdr+data */ unsigned int wsize; /* mount wsize - max write hdr+data */ @@ -543,7 +542,7 @@ extern unsigned int xprt_rdma_memreg_strategy; /* * Interface Adapter calls - xprtrdma/verbs.c */ -int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr); +int rpcrdma_ia_open(struct rpcrdma_xprt *xprt); void rpcrdma_ia_remove(struct rpcrdma_ia *ia); void rpcrdma_ia_close(struct rpcrdma_ia *); bool frwr_is_supported(struct rpcrdma_ia *); From 20035edf3c349638a679cbc8e861d81fbc104b53 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 14 Dec 2017 20:57:06 -0500 Subject: [PATCH 34/69] xprtrdma: Support IPv6 in xprt_rdma_set_port Clean up a harmless oversight. xprtrdma's ->set_port method has never properly supported IPv6. This issue has never been a problem because NFS/RDMA mounts have always required "port=20049", thus so far, rpcbind is not invoked for these mounts. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 8ba0aa8c566f..cebcd027d085 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -474,14 +474,34 @@ xprt_rdma_close(struct rpc_xprt *xprt) rpcrdma_ep_disconnect(ep, ia); } +/** + * xprt_rdma_set_port - update server port with rpcbind result + * @xprt: controlling RPC transport + * @port: new port value + * + * Transport connect status is unchanged. 
+ */ static void xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port) { - struct sockaddr_in *sap; + struct sockaddr *sap = (struct sockaddr *)&xprt->addr; + char buf[8]; - sap = (struct sockaddr_in *)&xprt->addr; - sap->sin_port = htons(port); - dprintk("RPC: %s: %u\n", __func__, port); + dprintk("RPC: %s: setting port for xprt %p (%s:%s) to %u\n", + __func__, xprt, + xprt->address_strings[RPC_DISPLAY_ADDR], + xprt->address_strings[RPC_DISPLAY_PORT], + port); + + rpc_set_port(sap, port); + + kfree(xprt->address_strings[RPC_DISPLAY_PORT]); + snprintf(buf, sizeof(buf), "%u", port); + xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL); + + kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]); + snprintf(buf, sizeof(buf), "%4hx", port); + xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL); } /** From a2b6470b1c51dee7be4faf4f6b64803a6fcf637f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 14 Dec 2017 20:57:14 -0500 Subject: [PATCH 35/69] xprtrdma: Move unmap-safe logic to rpcrdma_marshal_req Clean up. This logic is related to marshaling the request, and I'd like to keep everything that touches req->rl_registered close together, for CPU cache efficiency. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 11 +++++++++++ net/sunrpc/xprtrdma/transport.c | 5 ----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index d7463bce3df3..dd7c0aab7535 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -821,6 +821,17 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) rtype = rpcrdma_areadch; } + /* If this is a retransmit, discard previously registered + * chunks. Very likely the connection has been replaced, + * so these registrations are invalid and unusable. + */ + while (unlikely(!list_empty(&req->rl_registered))) { + struct rpcrdma_mw *mw; + + mw = rpcrdma_pop_mw(&req->rl_registered); + rpcrdma_defer_mr_recovery(mw); + } + /* This implementation supports the following combinations * of chunk lists in one RPC-over-RDMA Call message: * diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index cebcd027d085..d77dee5a2748 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -731,11 +731,6 @@ xprt_rdma_send_request(struct rpc_task *task) if (!xprt_connected(xprt)) goto drop_connection; - /* On retransmit, remove any previously registered chunks */ - if (unlikely(!list_empty(&req->rl_registered))) - r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, - &req->rl_registered); - rc = rpcrdma_marshal_req(r_xprt, rqst); if (rc < 0) goto failed_marshal; From 6c537f2c7cc06da36f6701be4c9413d7b8b47bfb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 14 Dec 2017 20:57:23 -0500 Subject: [PATCH 36/69] xprtrdma: buf_free not called for CB replies Since commit 5a6d1db45569 ("SUNRPC: Add a transport-specific private field in rpc_rqst"), the rpc_rqst's for RPC-over-RDMA backchannel operations leave rq_buffer set to NULL. xprt_release does not invoke ->op->buf_free when rq_buffer is NULL. The RPCRDMA_REQ_F_BACKCHANNEL check in xprt_rdma_free is therefore redundant because xprt_rdma_free is not invoked for backchannel requests. 
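For reference, the generic-layer behavior this change relies on is the NULL check ahead of the ->buf_free callout; a paraphrased sketch of that guard in xprt_release() (net/sunrpc/xprt.c), with the surrounding code elided:

	/* Backchannel rqsts leave rq_buffer NULL, so the transport's
	 * free routine is never reached for them.
	 */
	if (req->rq_buffer)
		xprt->ops->buf_free(task);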
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 1 - net/sunrpc/xprtrdma/transport.c | 3 --- net/sunrpc/xprtrdma/xprt_rdma.h | 3 +-- 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 256c67b433c1..11fb38f4d70d 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -43,7 +43,6 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, req = rpcrdma_create_req(r_xprt); if (IS_ERR(req)) return PTR_ERR(req); - __set_bit(RPCRDMA_REQ_F_BACKCHANNEL, &req->rl_flags); rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE, DMA_TO_DEVICE, GFP_KERNEL); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index d77dee5a2748..d0cd6d411b64 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -686,9 +686,6 @@ xprt_rdma_free(struct rpc_task *task) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - if (test_bit(RPCRDMA_REQ_F_BACKCHANNEL, &req->rl_flags)) - return; - dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); if (test_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags)) diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 7c09e2ae2542..ed7e51304bc2 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -354,8 +354,7 @@ struct rpcrdma_req { /* rl_flags */ enum { - RPCRDMA_REQ_F_BACKCHANNEL = 0, - RPCRDMA_REQ_F_PENDING, + RPCRDMA_REQ_F_PENDING = 0, RPCRDMA_REQ_F_TX_RESOURCES, }; From cf73daf52750fca4b4af0ca812f542891c228066 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 14 Dec 2017 20:57:31 -0500 Subject: [PATCH 37/69] xprtrdma: Split xprt_rdma_send_request Clean up. @rqst is set up differently for backchannel Replies. For example, rqst->rq_task and task->tk_client are both NULL. So it is easier to understand and maintain this code path if it is separated. Also, we can get rid of the confusing rl_connect_cookie hack in rpcrdma_bc_receive_call. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 48 ++++++++++++++++++++++++------- net/sunrpc/xprtrdma/rpc_rdma.c | 5 ---- net/sunrpc/xprtrdma/transport.c | 27 +++++++---------- net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 4 files changed, 50 insertions(+), 32 deletions(-) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 11fb38f4d70d..6c66a4f85507 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -187,13 +187,7 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt) return maxmsg - RPCRDMA_HDRLEN_MIN; } -/** - * rpcrdma_bc_marshal_reply - Send backwards direction reply - * @rqst: buffer containing RPC reply data - * - * Returns zero on success. - */ -int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) +static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); @@ -220,6 +214,43 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) return 0; } +/** + * xprt_rdma_bc_send_reply - marshal and send a backchannel reply + * @rqst: RPC rqst with a backchannel RPC reply in rq_snd_buf + * + * Caller holds the transport's write lock. 
+ * + * Returns: + * %0 if the RPC message has been sent + * %-ENOTCONN if the caller should reconnect and call again + * %-EIO if a permanent error occurred and the request was not + * sent. Do not try to send this message again. + */ +int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + int rc; + + if (!xprt_connected(rqst->rq_xprt)) + goto drop_connection; + + rc = rpcrdma_bc_marshal_reply(rqst); + if (rc < 0) + goto failed_marshal; + + if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) + goto drop_connection; + return 0; + +failed_marshal: + if (rc != -ENOTCONN) + return rc; +drop_connection: + xprt_disconnect_done(rqst->rq_xprt); + return -ENOTCONN; +} + /** * xprt_rdma_bc_destroy - Release resources for handling backchannel requests * @xprt: transport associated with these backchannel resources @@ -330,9 +361,6 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, __func__, rep, req); req->rl_reply = rep; - /* Defeat the retransmit detection logic in send_request */ - req->rl_connect_cookie = 0; - /* Queue rqst for ULP's callback service */ bc_serv = xprt->bc_serv; spin_lock(&bc_serv->sv_cb_lock); diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index dd7c0aab7535..9207aeacd2c3 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -754,11 +754,6 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) __be32 *p; int ret; -#if defined(CONFIG_SUNRPC_BACKCHANNEL) - if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state)) - return rpcrdma_bc_marshal_reply(rqst); -#endif - rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); xdr_init_encode(xdr, &req->rl_hdrbuf, req->rl_rdmabuf->rg_base); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index d0cd6d411b64..be8c4e62d3f2 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -699,22 +699,12 @@ xprt_rdma_free(struct rpc_task *task) * * Caller holds the transport's write lock. * - * Return values: - * 0: The request has been sent - * ENOTCONN: Caller needs to invoke connect logic then call again - * ENOBUFS: Call again later to send the request - * EIO: A permanent error occurred. The request was not sent, - * and don't try it again - * - * send_request invokes the meat of RPC RDMA. It must do the following: - * - * 1. Marshal the RPC request into an RPC RDMA request, which means - * putting a header in front of data, and creating IOVs for RDMA - * from those in the request. - * 2. In marshaling, detect opportunities for RDMA, and use them. - * 3. Post a recv message to set up asynch completion, then send - * the request (rpcrdma_ep_post). - * 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP). + * Returns: + * %0 if the RPC message has been sent + * %-ENOTCONN if the caller should reconnect and call again + * %-ENOBUFS if the caller should call again later + * %-EIO if a permanent error occurred and the request was not + * sent. Do not try to send this message again. 
*/ static int xprt_rdma_send_request(struct rpc_task *task) @@ -725,6 +715,11 @@ xprt_rdma_send_request(struct rpc_task *task) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); int rc = 0; +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + if (unlikely(!rqst->rq_buffer)) + return xprt_rdma_bc_send_reply(rqst); +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ + if (!xprt_connected(xprt)) goto drop_connection; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index ed7e51304bc2..e084130d3d84 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -666,7 +666,7 @@ int xprt_rdma_bc_up(struct svc_serv *, struct net *); size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *); int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int); void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *); -int rpcrdma_bc_marshal_reply(struct rpc_rqst *); +int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst); void xprt_rdma_bc_free_rqst(struct rpc_rqst *); void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int); #endif /* CONFIG_SUNRPC_BACKCHANNEL */ From 30b5416bf0fd3bfef55543343ad1e85d32e32de4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 14 Dec 2017 20:57:39 -0500 Subject: [PATCH 38/69] xprtrdma: Don't clear RPC_BC_PA_IN_USE on pre-allocated rpc_rqst's No need for the overhead of atomically setting and clearing this bit flag for every use of a pre-allocated backchannel rpc_rqst. These are a distinct pool of rpc_rqsts that are used only for callback operations, so it is safe to simply leave the bit set. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 6c66a4f85507..3c7998a72191 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -120,6 +120,7 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) rqst->rq_xprt = &r_xprt->rx_xprt; INIT_LIST_HEAD(&rqst->rq_list); INIT_LIST_HEAD(&rqst->rq_bc_list); + __set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); if (rpcrdma_bc_setup_rqst(r_xprt, rqst)) goto out_free; @@ -284,11 +285,6 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) dprintk("RPC: %s: freeing rqst %p (req %p)\n", __func__, rqst, rpcr_to_rdmar(rqst)); - smp_mb__before_atomic(); - WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state)); - clear_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); - smp_mb__after_atomic(); - spin_lock_bh(&xprt->bc_pa_lock); list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); spin_unlock_bh(&xprt->bc_pa_lock); @@ -343,7 +339,6 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, rqst->rq_xid = *p; rqst->rq_private_buf.len = size; - set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); buf = &rqst->rq_rcv_buf; memset(buf, 0, sizeof(*buf)); From ce5b3717828356ce2c61e5a2a830df970fc90fb9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 14 Dec 2017 20:57:47 -0500 Subject: [PATCH 39/69] xprtrdma: Replace all usage of "frmr" with "frwr" Clean up: Over time, the industry has adopted the term "frwr" instead of "frmr". The term "frwr" is now more widely recognized. For the past couple of years I've attempted to add new code using "frwr" , but there still remains plenty of older code that still uses "frmr". Replace all usage of "frmr" to avoid confusion. While we're churning code, rename variables unhelpfully called "f" to "frwr", to improve code clarity. 
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprtrdma.h | 2 +- net/sunrpc/xprtrdma/frwr_ops.c | 172 ++++++++++++++++---------------- net/sunrpc/xprtrdma/transport.c | 2 +- net/sunrpc/xprtrdma/verbs.c | 2 +- net/sunrpc/xprtrdma/xprt_rdma.h | 18 ++-- 5 files changed, 98 insertions(+), 98 deletions(-) diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h index 221b7a2e5406..5859563e3c1f 100644 --- a/include/linux/sunrpc/xprtrdma.h +++ b/include/linux/sunrpc/xprtrdma.h @@ -64,7 +64,7 @@ enum rpcrdma_memreg { RPCRDMA_MEMWINDOWS, RPCRDMA_MEMWINDOWS_ASYNC, RPCRDMA_MTHCAFMR, - RPCRDMA_FRMR, + RPCRDMA_FRWR, RPCRDMA_ALLPHYSICAL, RPCRDMA_LAST }; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index e1f73037b554..185eb69e5fb5 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -1,11 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2015, 2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. */ /* Lightweight memory registration using Fast Registration Work - * Requests (FRWR). Also referred to sometimes as FRMR mode. + * Requests (FRWR). * * FRWR features ordered asynchronous registration and deregistration * of arbitrarily sized memory regions. This is the fastest and safest @@ -15,9 +15,9 @@ /* Normal operation * * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG - * Work Request (frmr_op_map). When the RDMA operation is finished, this + * Work Request (frwr_op_map). When the RDMA operation is finished, this * Memory Region is invalidated using a LOCAL_INV Work Request - * (frmr_op_unmap). + * (frwr_op_unmap). 
* * Typically these Work Requests are not signaled, and neither are RDMA * SEND Work Requests (with the exception of signaling occasionally to @@ -98,12 +98,12 @@ out_not_supported: static int frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) { - unsigned int depth = ia->ri_max_frmr_depth; - struct rpcrdma_frmr *f = &r->frmr; + unsigned int depth = ia->ri_max_frwr_depth; + struct rpcrdma_frwr *frwr = &r->frwr; int rc; - f->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); - if (IS_ERR(f->fr_mr)) + frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); + if (IS_ERR(frwr->fr_mr)) goto out_mr_err; r->mw_sg = kcalloc(depth, sizeof(*r->mw_sg), GFP_KERNEL); @@ -111,11 +111,11 @@ frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) goto out_list_err; sg_init_table(r->mw_sg, depth); - init_completion(&f->fr_linv_done); + init_completion(&frwr->fr_linv_done); return 0; out_mr_err: - rc = PTR_ERR(f->fr_mr); + rc = PTR_ERR(frwr->fr_mr); dprintk("RPC: %s: ib_alloc_mr status %i\n", __func__, rc); return rc; @@ -124,7 +124,7 @@ out_list_err: rc = -ENOMEM; dprintk("RPC: %s: sg allocation failure\n", __func__); - ib_dereg_mr(f->fr_mr); + ib_dereg_mr(frwr->fr_mr); return rc; } @@ -137,7 +137,7 @@ frwr_op_release_mr(struct rpcrdma_mw *r) if (!list_empty(&r->mw_list)) list_del(&r->mw_list); - rc = ib_dereg_mr(r->frmr.fr_mr); + rc = ib_dereg_mr(r->frwr.fr_mr); if (rc) pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", r, rc); @@ -148,41 +148,41 @@ frwr_op_release_mr(struct rpcrdma_mw *r) static int __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) { - struct rpcrdma_frmr *f = &r->frmr; + struct rpcrdma_frwr *frwr = &r->frwr; int rc; - rc = ib_dereg_mr(f->fr_mr); + rc = ib_dereg_mr(frwr->fr_mr); if (rc) { pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n", rc, r); return rc; } - f->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, - ia->ri_max_frmr_depth); - if (IS_ERR(f->fr_mr)) { + frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, + ia->ri_max_frwr_depth); + if (IS_ERR(frwr->fr_mr)) { pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n", - PTR_ERR(f->fr_mr), r); - return PTR_ERR(f->fr_mr); + PTR_ERR(frwr->fr_mr), r); + return PTR_ERR(frwr->fr_mr); } - dprintk("RPC: %s: recovered FRMR %p\n", __func__, f); - f->fr_state = FRMR_IS_INVALID; + dprintk("RPC: %s: recovered FRWR %p\n", __func__, frwr); + frwr->fr_state = FRWR_IS_INVALID; return 0; } -/* Reset of a single FRMR. Generate a fresh rkey by replacing the MR. +/* Reset of a single FRWR. Generate a fresh rkey by replacing the MR. 
*/ static void frwr_op_recover_mr(struct rpcrdma_mw *mw) { - enum rpcrdma_frmr_state state = mw->frmr.fr_state; + enum rpcrdma_frwr_state state = mw->frwr.fr_state; struct rpcrdma_xprt *r_xprt = mw->mw_xprt; struct rpcrdma_ia *ia = &r_xprt->rx_ia; int rc; rc = __frwr_reset_mr(ia, mw); - if (state != FRMR_FLUSHED_LI) + if (state != FRWR_FLUSHED_LI) ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir); if (rc) @@ -193,7 +193,7 @@ frwr_op_recover_mr(struct rpcrdma_mw *mw) return; out_release: - pr_err("rpcrdma: FRMR reset failed %d, %p release\n", rc, mw); + pr_err("rpcrdma: FRWR reset failed %d, %p release\n", rc, mw); r_xprt->rx_stats.mrs_orphaned++; spin_lock(&r_xprt->rx_buf.rb_mwlock); @@ -214,31 +214,31 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG) ia->ri_mrtype = IB_MR_TYPE_SG_GAPS; - ia->ri_max_frmr_depth = + ia->ri_max_frwr_depth = min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, attrs->max_fast_reg_page_list_len); dprintk("RPC: %s: device's max FR page list len = %u\n", - __func__, ia->ri_max_frmr_depth); + __func__, ia->ri_max_frwr_depth); - /* Add room for frmr register and invalidate WRs. - * 1. FRMR reg WR for head - * 2. FRMR invalidate WR for head - * 3. N FRMR reg WRs for pagelist - * 4. N FRMR invalidate WRs for pagelist - * 5. FRMR reg WR for tail - * 6. FRMR invalidate WR for tail + /* Add room for frwr register and invalidate WRs. + * 1. FRWR reg WR for head + * 2. FRWR invalidate WR for head + * 3. N FRWR reg WRs for pagelist + * 4. N FRWR invalidate WRs for pagelist + * 5. FRWR reg WR for tail + * 6. FRWR invalidate WR for tail * 7. The RDMA_SEND WR */ depth = 7; - /* Calculate N if the device max FRMR depth is smaller than + /* Calculate N if the device max FRWR depth is smaller than * RPCRDMA_MAX_DATA_SEGS. 
*/ - if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) { - delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frmr_depth; + if (ia->ri_max_frwr_depth < RPCRDMA_MAX_DATA_SEGS) { + delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frwr_depth; do { - depth += 2; /* FRMR reg + invalidate */ - delta -= ia->ri_max_frmr_depth; + depth += 2; /* FRWR reg + invalidate */ + delta -= ia->ri_max_frwr_depth; } while (delta > 0); } @@ -252,7 +252,7 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, } ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / - ia->ri_max_frmr_depth); + ia->ri_max_frwr_depth); return 0; } @@ -265,7 +265,7 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) struct rpcrdma_ia *ia = &r_xprt->rx_ia; return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, - RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frmr_depth); + RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frwr_depth); } static void @@ -286,14 +286,14 @@ __frwr_sendcompletion_flush(struct ib_wc *wc, const char *wr) static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) { - struct rpcrdma_frmr *frmr; + struct rpcrdma_frwr *frwr; struct ib_cqe *cqe; /* WARNING: Only wr_cqe and status are reliable at this point */ if (wc->status != IB_WC_SUCCESS) { cqe = wc->wr_cqe; - frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe); - frmr->fr_state = FRMR_FLUSHED_FR; + frwr = container_of(cqe, struct rpcrdma_frwr, fr_cqe); + frwr->fr_state = FRWR_FLUSHED_FR; __frwr_sendcompletion_flush(wc, "fastreg"); } } @@ -307,14 +307,14 @@ frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) { - struct rpcrdma_frmr *frmr; + struct rpcrdma_frwr *frwr; struct ib_cqe *cqe; /* WARNING: Only wr_cqe and status are reliable at this point */ if (wc->status != IB_WC_SUCCESS) { cqe = wc->wr_cqe; - frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe); - frmr->fr_state = FRMR_FLUSHED_LI; + frwr = container_of(cqe, struct rpcrdma_frwr, fr_cqe); + frwr->fr_state = FRWR_FLUSHED_LI; __frwr_sendcompletion_flush(wc, "localinv"); } } @@ -329,17 +329,17 @@ frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) { - struct rpcrdma_frmr *frmr; + struct rpcrdma_frwr *frwr; struct ib_cqe *cqe; /* WARNING: Only wr_cqe and status are reliable at this point */ cqe = wc->wr_cqe; - frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe); + frwr = container_of(cqe, struct rpcrdma_frwr, fr_cqe); if (wc->status != IB_WC_SUCCESS) { - frmr->fr_state = FRMR_FLUSHED_LI; + frwr->fr_state = FRWR_FLUSHED_LI; __frwr_sendcompletion_flush(wc, "localinv"); } - complete(&frmr->fr_linv_done); + complete(&frwr->fr_linv_done); } /* Post a REG_MR Work Request to register a memory region @@ -351,8 +351,8 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, { struct rpcrdma_ia *ia = &r_xprt->rx_ia; bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS; + struct rpcrdma_frwr *frwr; struct rpcrdma_mw *mw; - struct rpcrdma_frmr *frmr; struct ib_mr *mr; struct ib_reg_wr *reg_wr; struct ib_send_wr *bad_wr; @@ -366,14 +366,13 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mw = rpcrdma_get_mw(r_xprt); if (!mw) return ERR_PTR(-ENOBUFS); - } while (mw->frmr.fr_state != FRMR_IS_INVALID); - frmr = &mw->frmr; - frmr->fr_state = FRMR_IS_VALID; - mr = frmr->fr_mr; - reg_wr = &frmr->fr_regwr; + } while (mw->frwr.fr_state != FRWR_IS_INVALID); + frwr = &mw->frwr; + frwr->fr_state = FRWR_IS_VALID; + mr = frwr->fr_mr; - if (nsegs > ia->ri_max_frmr_depth) - nsegs = 
ia->ri_max_frmr_depth; + if (nsegs > ia->ri_max_frwr_depth) + nsegs = ia->ri_max_frwr_depth; for (i = 0; i < nsegs;) { if (seg->mr_page) sg_set_page(&mw->mw_sg[i], @@ -402,16 +401,17 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (unlikely(n != mw->mw_nents)) goto out_mapmr_err; - dprintk("RPC: %s: Using frmr %p to map %u segments (%llu bytes)\n", - __func__, frmr, mw->mw_nents, mr->length); + dprintk("RPC: %s: Using frwr %p to map %u segments (%llu bytes)\n", + __func__, frwr, mw->mw_nents, mr->length); key = (u8)(mr->rkey & 0x000000FF); ib_update_fast_reg_key(mr, ++key); + reg_wr = &frwr->fr_regwr; reg_wr->wr.next = NULL; reg_wr->wr.opcode = IB_WR_REG_MR; - frmr->fr_cqe.done = frwr_wc_fastreg; - reg_wr->wr.wr_cqe = &frmr->fr_cqe; + frwr->fr_cqe.done = frwr_wc_fastreg; + reg_wr->wr.wr_cqe = &frwr->fr_cqe; reg_wr->wr.num_sge = 0; reg_wr->wr.send_flags = 0; reg_wr->mr = mr; @@ -434,18 +434,18 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, out_dmamap_err: pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", mw->mw_sg, i); - frmr->fr_state = FRMR_IS_INVALID; + frwr->fr_state = FRWR_IS_INVALID; rpcrdma_put_mw(r_xprt, mw); return ERR_PTR(-EIO); out_mapmr_err: pr_err("rpcrdma: failed to map mr %p (%d/%d)\n", - frmr->fr_mr, n, mw->mw_nents); + frwr->fr_mr, n, mw->mw_nents); rpcrdma_defer_mr_recovery(mw); return ERR_PTR(-EIO); out_senderr: - pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc); + pr_err("rpcrdma: FRWR registration ib_post_send returned %i\n", rc); rpcrdma_defer_mr_recovery(mw); return ERR_PTR(-ENOTCONN); } @@ -462,7 +462,7 @@ frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mws) struct rpcrdma_xprt *r_xprt = mw->mw_xprt; list_del(&mw->mw_list); - mw->frmr.fr_state = FRMR_IS_INVALID; + mw->frwr.fr_state = FRWR_IS_INVALID; ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir); rpcrdma_put_mw(r_xprt, mw); @@ -483,7 +483,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) { struct ib_send_wr *first, **prev, *last, *bad_wr; struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_frmr *f; + struct rpcrdma_frwr *frwr; struct rpcrdma_mw *mw; int count, rc; @@ -492,20 +492,20 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) * Chain the LOCAL_INV Work Requests and post them with * a single ib_post_send() call. */ - f = NULL; + frwr = NULL; count = 0; prev = &first; list_for_each_entry(mw, mws, mw_list) { - mw->frmr.fr_state = FRMR_IS_INVALID; + mw->frwr.fr_state = FRWR_IS_INVALID; - f = &mw->frmr; - dprintk("RPC: %s: invalidating frmr %p\n", - __func__, f); + frwr = &mw->frwr; + dprintk("RPC: %s: invalidating frwr %p\n", + __func__, frwr); - f->fr_cqe.done = frwr_wc_localinv; - last = &f->fr_invwr; + frwr->fr_cqe.done = frwr_wc_localinv; + last = &frwr->fr_invwr; memset(last, 0, sizeof(*last)); - last->wr_cqe = &f->fr_cqe; + last->wr_cqe = &frwr->fr_cqe; last->opcode = IB_WR_LOCAL_INV; last->ex.invalidate_rkey = mw->mw_handle; count++; @@ -513,7 +513,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) *prev = last; prev = &last->next; } - if (!f) + if (!frwr) goto unmap; /* Strong send queue ordering guarantees that when the @@ -521,8 +521,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) * are complete. 
*/ last->send_flags = IB_SEND_SIGNALED; - f->fr_cqe.done = frwr_wc_localinv_wake; - reinit_completion(&f->fr_linv_done); + frwr->fr_cqe.done = frwr_wc_localinv_wake; + reinit_completion(&frwr->fr_linv_done); /* Transport disconnect drains the receive CQ before it * replaces the QP. The RPC reply handler won't call us @@ -532,7 +532,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) bad_wr = NULL; rc = ib_post_send(ia->ri_id->qp, first, &bad_wr); if (bad_wr != first) - wait_for_completion(&f->fr_linv_done); + wait_for_completion(&frwr->fr_linv_done); if (rc) goto reset_mrs; @@ -542,8 +542,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) unmap: while (!list_empty(mws)) { mw = rpcrdma_pop_mw(mws); - dprintk("RPC: %s: DMA unmapping frmr %p\n", - __func__, &mw->frmr); + dprintk("RPC: %s: DMA unmapping frwr %p\n", + __func__, &mw->frwr); ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir); rpcrdma_put_mw(r_xprt, mw); @@ -551,15 +551,15 @@ unmap: return; reset_mrs: - pr_err("rpcrdma: FRMR invalidate ib_post_send returned %i\n", rc); + pr_err("rpcrdma: FRWR invalidate ib_post_send returned %i\n", rc); /* Find and reset the MRs in the LOCAL_INV WRs that did not * get posted. */ while (bad_wr) { - f = container_of(bad_wr, struct rpcrdma_frmr, - fr_invwr); - mw = container_of(f, struct rpcrdma_mw, frmr); + frwr = container_of(bad_wr, struct rpcrdma_frwr, + fr_invwr); + mw = container_of(frwr, struct rpcrdma_mw, frwr); __frwr_reset_mr(ia, mw); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index be8c4e62d3f2..ddf0d87812ef 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -67,7 +67,7 @@ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; -unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; +unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRWR; int xprt_rdma_pad_optimize; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index d6c737d4c36b..840579919ad0 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -388,7 +388,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt) } switch (xprt_rdma_memreg_strategy) { - case RPCRDMA_FRMR: + case RPCRDMA_FRWR: if (frwr_is_supported(ia)) { ia->ri_ops = &rpcrdma_frwr_memreg_ops; break; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index e084130d3d84..f52269afaa09 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -73,7 +73,7 @@ struct rpcrdma_ia { struct completion ri_remove_done; int ri_async_rc; unsigned int ri_max_segs; - unsigned int ri_max_frmr_depth; + unsigned int ri_max_frwr_depth; unsigned int ri_max_inline_write; unsigned int ri_max_inline_read; unsigned int ri_max_send_sges; @@ -242,17 +242,17 @@ enum { * rpcrdma_deregister_external() uses this metadata to unmap and * release these resources when an RPC is complete. 
*/ -enum rpcrdma_frmr_state { - FRMR_IS_INVALID, /* ready to be used */ - FRMR_IS_VALID, /* in use */ - FRMR_FLUSHED_FR, /* flushed FASTREG WR */ - FRMR_FLUSHED_LI, /* flushed LOCALINV WR */ +enum rpcrdma_frwr_state { + FRWR_IS_INVALID, /* ready to be used */ + FRWR_IS_VALID, /* in use */ + FRWR_FLUSHED_FR, /* flushed FASTREG WR */ + FRWR_FLUSHED_LI, /* flushed LOCALINV WR */ }; -struct rpcrdma_frmr { +struct rpcrdma_frwr { struct ib_mr *fr_mr; struct ib_cqe fr_cqe; - enum rpcrdma_frmr_state fr_state; + enum rpcrdma_frwr_state fr_state; struct completion fr_linv_done; union { struct ib_reg_wr fr_regwr; @@ -272,7 +272,7 @@ struct rpcrdma_mw { enum dma_data_direction mw_dir; union { struct rpcrdma_fmr fmr; - struct rpcrdma_frmr frmr; + struct rpcrdma_frwr frwr; }; struct rpcrdma_xprt *mw_xprt; u32 mw_handle; From 96ceddea3710f61bb5a5f2af25e684b7e1466171 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 14 Dec 2017 20:57:55 -0500 Subject: [PATCH 40/69] xprtrdma: Remove usage of "mw" Clean up: struct rpcrdma_mw was named after Memory Windows, but xprtrdma no longer supports a Memory Window registration mode. Rename rpcrdma_mw and its fields to reduce confusion and make the code more sensible to read. Renaming "mw" was suggested by Tom Talpey, the author of the original xprtrdma implementation. It's a good idea, but I haven't done this until now because it's a huge diffstat for no benefit other than code readability. However, I'm about to introduce static trace points that expose a few of xprtrdma's internal data structures. They should make sense in the trace report, and it's reasonable to treat trace points as a kernel API contract which might be difficult to change later. While I'm churning things up, two additional changes: - rename variables unhelpfully called "r" to "mr", to improve code clarity, and - rename the MR-related helper functions using the form "rpcrdma_mr_", to be consistent with other areas of the code. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/fmr_ops.c | 148 +++++++++++++------------- net/sunrpc/xprtrdma/frwr_ops.c | 177 ++++++++++++++++---------------- net/sunrpc/xprtrdma/rpc_rdma.c | 64 ++++++------ net/sunrpc/xprtrdma/verbs.c | 119 +++++++++++---------- net/sunrpc/xprtrdma/xprt_rdma.h | 62 +++++------ 5 files changed, 292 insertions(+), 278 deletions(-) diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 29fc84c7ff98..8bd0399b3a1c 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2015, 2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. 
*/ @@ -47,7 +47,7 @@ fmr_is_supported(struct rpcrdma_ia *ia) } static int -fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw) +fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) { static struct ib_fmr_attr fmr_attr = { .max_pages = RPCRDMA_MAX_FMR_SGES, @@ -55,106 +55,106 @@ fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw) .page_shift = PAGE_SHIFT }; - mw->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, + mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, sizeof(u64), GFP_KERNEL); - if (!mw->fmr.fm_physaddrs) + if (!mr->fmr.fm_physaddrs) goto out_free; - mw->mw_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, - sizeof(*mw->mw_sg), GFP_KERNEL); - if (!mw->mw_sg) + mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, + sizeof(*mr->mr_sg), GFP_KERNEL); + if (!mr->mr_sg) goto out_free; - sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES); + sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES); - mw->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, + mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, &fmr_attr); - if (IS_ERR(mw->fmr.fm_mr)) + if (IS_ERR(mr->fmr.fm_mr)) goto out_fmr_err; return 0; out_fmr_err: dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, - PTR_ERR(mw->fmr.fm_mr)); + PTR_ERR(mr->fmr.fm_mr)); out_free: - kfree(mw->mw_sg); - kfree(mw->fmr.fm_physaddrs); + kfree(mr->mr_sg); + kfree(mr->fmr.fm_physaddrs); return -ENOMEM; } static int -__fmr_unmap(struct rpcrdma_mw *mw) +__fmr_unmap(struct rpcrdma_mr *mr) { LIST_HEAD(l); int rc; - list_add(&mw->fmr.fm_mr->list, &l); + list_add(&mr->fmr.fm_mr->list, &l); rc = ib_unmap_fmr(&l); - list_del(&mw->fmr.fm_mr->list); + list_del(&mr->fmr.fm_mr->list); return rc; } static void -fmr_op_release_mr(struct rpcrdma_mw *r) +fmr_op_release_mr(struct rpcrdma_mr *mr) { LIST_HEAD(unmap_list); int rc; /* Ensure MW is not on any rl_registered list */ - if (!list_empty(&r->mw_list)) - list_del(&r->mw_list); + if (!list_empty(&mr->mr_list)) + list_del(&mr->mr_list); - kfree(r->fmr.fm_physaddrs); - kfree(r->mw_sg); + kfree(mr->fmr.fm_physaddrs); + kfree(mr->mr_sg); /* In case this one was left mapped, try to unmap it * to prevent dealloc_fmr from failing with EBUSY */ - rc = __fmr_unmap(r); + rc = __fmr_unmap(mr); if (rc) pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", - r, rc); + mr, rc); - rc = ib_dealloc_fmr(r->fmr.fm_mr); + rc = ib_dealloc_fmr(mr->fmr.fm_mr); if (rc) pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n", - r, rc); + mr, rc); - kfree(r); + kfree(mr); } /* Reset of a single FMR. 
*/ static void -fmr_op_recover_mr(struct rpcrdma_mw *mw) +fmr_op_recover_mr(struct rpcrdma_mr *mr) { - struct rpcrdma_xprt *r_xprt = mw->mw_xprt; + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; int rc; /* ORDER: invalidate first */ - rc = __fmr_unmap(mw); + rc = __fmr_unmap(mr); /* ORDER: then DMA unmap */ ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, - mw->mw_sg, mw->mw_nents, mw->mw_dir); + mr->mr_sg, mr->mr_nents, mr->mr_dir); if (rc) goto out_release; - rpcrdma_put_mw(r_xprt, mw); + rpcrdma_mr_put(mr); r_xprt->rx_stats.mrs_recovered++; return; out_release: - pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mw); + pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mr); r_xprt->rx_stats.mrs_orphaned++; - spin_lock(&r_xprt->rx_buf.rb_mwlock); - list_del(&mw->mw_all); - spin_unlock(&r_xprt->rx_buf.rb_mwlock); + spin_lock(&r_xprt->rx_buf.rb_mrlock); + list_del(&mr->mr_all); + spin_unlock(&r_xprt->rx_buf.rb_mrlock); - fmr_op_release_mr(mw); + fmr_op_release_mr(mr); } static int @@ -180,15 +180,15 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) */ static struct rpcrdma_mr_seg * fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, - int nsegs, bool writing, struct rpcrdma_mw **out) + int nsegs, bool writing, struct rpcrdma_mr **out) { struct rpcrdma_mr_seg *seg1 = seg; int len, pageoff, i, rc; - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; u64 *dma_pages; - mw = rpcrdma_get_mw(r_xprt); - if (!mw) + mr = rpcrdma_mr_get(r_xprt); + if (!mr) return ERR_PTR(-ENOBUFS); pageoff = offset_in_page(seg1->mr_offset); @@ -199,12 +199,12 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, nsegs = RPCRDMA_MAX_FMR_SGES; for (i = 0; i < nsegs;) { if (seg->mr_page) - sg_set_page(&mw->mw_sg[i], + sg_set_page(&mr->mr_sg[i], seg->mr_page, seg->mr_len, offset_in_page(seg->mr_offset)); else - sg_set_buf(&mw->mw_sg[i], seg->mr_offset, + sg_set_buf(&mr->mr_sg[i], seg->mr_offset, seg->mr_len); len += seg->mr_len; ++seg; @@ -214,40 +214,40 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } - mw->mw_dir = rpcrdma_data_dir(writing); + mr->mr_dir = rpcrdma_data_dir(writing); - mw->mw_nents = ib_dma_map_sg(r_xprt->rx_ia.ri_device, - mw->mw_sg, i, mw->mw_dir); - if (!mw->mw_nents) + mr->mr_nents = ib_dma_map_sg(r_xprt->rx_ia.ri_device, + mr->mr_sg, i, mr->mr_dir); + if (!mr->mr_nents) goto out_dmamap_err; - for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++) - dma_pages[i] = sg_dma_address(&mw->mw_sg[i]); - rc = ib_map_phys_fmr(mw->fmr.fm_mr, dma_pages, mw->mw_nents, + for (i = 0, dma_pages = mr->fmr.fm_physaddrs; i < mr->mr_nents; i++) + dma_pages[i] = sg_dma_address(&mr->mr_sg[i]); + rc = ib_map_phys_fmr(mr->fmr.fm_mr, dma_pages, mr->mr_nents, dma_pages[0]); if (rc) goto out_maperr; - mw->mw_handle = mw->fmr.fm_mr->rkey; - mw->mw_length = len; - mw->mw_offset = dma_pages[0] + pageoff; + mr->mr_handle = mr->fmr.fm_mr->rkey; + mr->mr_length = len; + mr->mr_offset = dma_pages[0] + pageoff; - *out = mw; + *out = mr; return seg; out_dmamap_err: pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", - mw->mw_sg, i); - rpcrdma_put_mw(r_xprt, mw); + mr->mr_sg, i); + rpcrdma_mr_put(mr); return ERR_PTR(-EIO); out_maperr: pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", len, (unsigned long long)dma_pages[0], - pageoff, mw->mw_nents, rc); + pageoff, mr->mr_nents, rc); ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, - mw->mw_sg, mw->mw_nents, mw->mw_dir); - 
rpcrdma_put_mw(r_xprt, mw); + mr->mr_sg, mr->mr_nents, mr->mr_dir); + rpcrdma_mr_put(mr); return ERR_PTR(-EIO); } @@ -256,13 +256,13 @@ out_maperr: * Sleeps until it is safe for the host CPU to access the * previously mapped memory regions. * - * Caller ensures that @mws is not empty before the call. This + * Caller ensures that @mrs is not empty before the call. This * function empties the list. */ static void -fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) +fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) { - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; LIST_HEAD(unmap_list); int rc; @@ -271,10 +271,10 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) * ib_unmap_fmr() is slow, so use a single call instead * of one call per mapped FMR. */ - list_for_each_entry(mw, mws, mw_list) { + list_for_each_entry(mr, mrs, mr_list) { dprintk("RPC: %s: unmapping fmr %p\n", - __func__, &mw->fmr); - list_add_tail(&mw->fmr.fm_mr->list, &unmap_list); + __func__, &mr->fmr); + list_add_tail(&mr->fmr.fm_mr->list, &unmap_list); } r_xprt->rx_stats.local_inv_needed++; rc = ib_unmap_fmr(&unmap_list); @@ -284,14 +284,14 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) /* ORDER: Now DMA unmap all of the req's MRs, and return * them to the free MW list. */ - while (!list_empty(mws)) { - mw = rpcrdma_pop_mw(mws); + while (!list_empty(mrs)) { + mr = rpcrdma_mr_pop(mrs); dprintk("RPC: %s: DMA unmapping fmr %p\n", - __func__, &mw->fmr); - list_del(&mw->fmr.fm_mr->list); + __func__, &mr->fmr); + list_del(&mr->fmr.fm_mr->list); ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, - mw->mw_sg, mw->mw_nents, mw->mw_dir); - rpcrdma_put_mw(r_xprt, mw); + mr->mr_sg, mr->mr_nents, mr->mr_dir); + rpcrdma_mr_put(mr); } return; @@ -299,10 +299,10 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) out_reset: pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc); - while (!list_empty(mws)) { - mw = rpcrdma_pop_mw(mws); - list_del(&mw->fmr.fm_mr->list); - fmr_op_recover_mr(mw); + while (!list_empty(mrs)) { + mr = rpcrdma_mr_pop(mrs); + list_del(&mr->fmr.fm_mr->list); + fmr_op_recover_mr(mr); } } diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 185eb69e5fb5..8ba4b3388a98 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -17,7 +17,7 @@ * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG * Work Request (frwr_op_map). When the RDMA operation is finished, this * Memory Region is invalidated using a LOCAL_INV Work Request - * (frwr_op_unmap). + * (frwr_op_unmap_sync). * * Typically these Work Requests are not signaled, and neither are RDMA * SEND Work Requests (with the exception of signaling occasionally to @@ -26,7 +26,7 @@ * * As an optimization, frwr_op_unmap marks MRs INVALID before the * LOCAL_INV WR is posted. If posting succeeds, the MR is placed on - * rb_mws immediately so that no work (like managing a linked list + * rb_mrs immediately so that no work (like managing a linked list * under a spinlock) is needed in the completion upcall. * * But this means that frwr_op_map() can occasionally encounter an MR @@ -60,7 +60,7 @@ * When frwr_op_map encounters FLUSHED and VALID MRs, they are recovered * with ib_dereg_mr and then are re-initialized. 
Because MR recovery * allocates fresh resources, it is deferred to a workqueue, and the - * recovered MRs are placed back on the rb_mws list when recovery is + * recovered MRs are placed back on the rb_mrs list when recovery is * complete. frwr_op_map allocates another MR for the current RPC while * the broken MR is reset. * @@ -96,21 +96,21 @@ out_not_supported: } static int -frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) +frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) { unsigned int depth = ia->ri_max_frwr_depth; - struct rpcrdma_frwr *frwr = &r->frwr; + struct rpcrdma_frwr *frwr = &mr->frwr; int rc; frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); if (IS_ERR(frwr->fr_mr)) goto out_mr_err; - r->mw_sg = kcalloc(depth, sizeof(*r->mw_sg), GFP_KERNEL); - if (!r->mw_sg) + mr->mr_sg = kcalloc(depth, sizeof(*mr->mr_sg), GFP_KERNEL); + if (!mr->mr_sg) goto out_list_err; - sg_init_table(r->mw_sg, depth); + sg_init_table(mr->mr_sg, depth); init_completion(&frwr->fr_linv_done); return 0; @@ -129,32 +129,32 @@ out_list_err: } static void -frwr_op_release_mr(struct rpcrdma_mw *r) +frwr_op_release_mr(struct rpcrdma_mr *mr) { int rc; - /* Ensure MW is not on any rl_registered list */ - if (!list_empty(&r->mw_list)) - list_del(&r->mw_list); + /* Ensure MR is not on any rl_registered list */ + if (!list_empty(&mr->mr_list)) + list_del(&mr->mr_list); - rc = ib_dereg_mr(r->frwr.fr_mr); + rc = ib_dereg_mr(mr->frwr.fr_mr); if (rc) pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", - r, rc); - kfree(r->mw_sg); - kfree(r); + mr, rc); + kfree(mr->mr_sg); + kfree(mr); } static int -__frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) +__frwr_mr_reset(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) { - struct rpcrdma_frwr *frwr = &r->frwr; + struct rpcrdma_frwr *frwr = &mr->frwr; int rc; rc = ib_dereg_mr(frwr->fr_mr); if (rc) { pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n", - rc, r); + rc, mr); return rc; } @@ -162,7 +162,7 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) ia->ri_max_frwr_depth); if (IS_ERR(frwr->fr_mr)) { pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n", - PTR_ERR(frwr->fr_mr), r); + PTR_ERR(frwr->fr_mr), mr); return PTR_ERR(frwr->fr_mr); } @@ -174,33 +174,33 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) /* Reset of a single FRWR. Generate a fresh rkey by replacing the MR. 
*/ static void -frwr_op_recover_mr(struct rpcrdma_mw *mw) +frwr_op_recover_mr(struct rpcrdma_mr *mr) { - enum rpcrdma_frwr_state state = mw->frwr.fr_state; - struct rpcrdma_xprt *r_xprt = mw->mw_xprt; + enum rpcrdma_frwr_state state = mr->frwr.fr_state; + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; struct rpcrdma_ia *ia = &r_xprt->rx_ia; int rc; - rc = __frwr_reset_mr(ia, mw); + rc = __frwr_mr_reset(ia, mr); if (state != FRWR_FLUSHED_LI) ib_dma_unmap_sg(ia->ri_device, - mw->mw_sg, mw->mw_nents, mw->mw_dir); + mr->mr_sg, mr->mr_nents, mr->mr_dir); if (rc) goto out_release; - rpcrdma_put_mw(r_xprt, mw); + rpcrdma_mr_put(mr); r_xprt->rx_stats.mrs_recovered++; return; out_release: - pr_err("rpcrdma: FRWR reset failed %d, %p release\n", rc, mw); + pr_err("rpcrdma: FRWR reset failed %d, %p release\n", rc, mr); r_xprt->rx_stats.mrs_orphaned++; - spin_lock(&r_xprt->rx_buf.rb_mwlock); - list_del(&mw->mw_all); - spin_unlock(&r_xprt->rx_buf.rb_mwlock); + spin_lock(&r_xprt->rx_buf.rb_mrlock); + list_del(&mr->mr_all); + spin_unlock(&r_xprt->rx_buf.rb_mrlock); - frwr_op_release_mr(mw); + frwr_op_release_mr(mr); } static int @@ -347,40 +347,39 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) */ static struct rpcrdma_mr_seg * frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, - int nsegs, bool writing, struct rpcrdma_mw **out) + int nsegs, bool writing, struct rpcrdma_mr **out) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS; struct rpcrdma_frwr *frwr; - struct rpcrdma_mw *mw; - struct ib_mr *mr; + struct rpcrdma_mr *mr; + struct ib_mr *ibmr; struct ib_reg_wr *reg_wr; struct ib_send_wr *bad_wr; int rc, i, n; u8 key; - mw = NULL; + mr = NULL; do { - if (mw) - rpcrdma_defer_mr_recovery(mw); - mw = rpcrdma_get_mw(r_xprt); - if (!mw) + if (mr) + rpcrdma_mr_defer_recovery(mr); + mr = rpcrdma_mr_get(r_xprt); + if (!mr) return ERR_PTR(-ENOBUFS); - } while (mw->frwr.fr_state != FRWR_IS_INVALID); - frwr = &mw->frwr; + } while (mr->frwr.fr_state != FRWR_IS_INVALID); + frwr = &mr->frwr; frwr->fr_state = FRWR_IS_VALID; - mr = frwr->fr_mr; if (nsegs > ia->ri_max_frwr_depth) nsegs = ia->ri_max_frwr_depth; for (i = 0; i < nsegs;) { if (seg->mr_page) - sg_set_page(&mw->mw_sg[i], + sg_set_page(&mr->mr_sg[i], seg->mr_page, seg->mr_len, offset_in_page(seg->mr_offset)); else - sg_set_buf(&mw->mw_sg[i], seg->mr_offset, + sg_set_buf(&mr->mr_sg[i], seg->mr_offset, seg->mr_len); ++seg; @@ -391,21 +390,22 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } - mw->mw_dir = rpcrdma_data_dir(writing); + mr->mr_dir = rpcrdma_data_dir(writing); - mw->mw_nents = ib_dma_map_sg(ia->ri_device, mw->mw_sg, i, mw->mw_dir); - if (!mw->mw_nents) + mr->mr_nents = ib_dma_map_sg(ia->ri_device, mr->mr_sg, i, mr->mr_dir); + if (!mr->mr_nents) goto out_dmamap_err; - n = ib_map_mr_sg(mr, mw->mw_sg, mw->mw_nents, NULL, PAGE_SIZE); - if (unlikely(n != mw->mw_nents)) + ibmr = frwr->fr_mr; + n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE); + if (unlikely(n != mr->mr_nents)) goto out_mapmr_err; dprintk("RPC: %s: Using frwr %p to map %u segments (%llu bytes)\n", - __func__, frwr, mw->mw_nents, mr->length); + __func__, frwr, mr->mr_nents, ibmr->length); - key = (u8)(mr->rkey & 0x000000FF); - ib_update_fast_reg_key(mr, ++key); + key = (u8)(ibmr->rkey & 0x000000FF); + ib_update_fast_reg_key(ibmr, ++key); reg_wr = &frwr->fr_regwr; reg_wr->wr.next = NULL; @@ -414,8 +414,8 @@ 
frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, reg_wr->wr.wr_cqe = &frwr->fr_cqe; reg_wr->wr.num_sge = 0; reg_wr->wr.send_flags = 0; - reg_wr->mr = mr; - reg_wr->key = mr->rkey; + reg_wr->mr = ibmr; + reg_wr->key = ibmr->rkey; reg_wr->access = writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : IB_ACCESS_REMOTE_READ; @@ -424,48 +424,48 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (rc) goto out_senderr; - mw->mw_handle = mr->rkey; - mw->mw_length = mr->length; - mw->mw_offset = mr->iova; + mr->mr_handle = ibmr->rkey; + mr->mr_length = ibmr->length; + mr->mr_offset = ibmr->iova; - *out = mw; + *out = mr; return seg; out_dmamap_err: pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", - mw->mw_sg, i); + mr->mr_sg, i); frwr->fr_state = FRWR_IS_INVALID; - rpcrdma_put_mw(r_xprt, mw); + rpcrdma_mr_put(mr); return ERR_PTR(-EIO); out_mapmr_err: pr_err("rpcrdma: failed to map mr %p (%d/%d)\n", - frwr->fr_mr, n, mw->mw_nents); - rpcrdma_defer_mr_recovery(mw); + frwr->fr_mr, n, mr->mr_nents); + rpcrdma_mr_defer_recovery(mr); return ERR_PTR(-EIO); out_senderr: pr_err("rpcrdma: FRWR registration ib_post_send returned %i\n", rc); - rpcrdma_defer_mr_recovery(mw); + rpcrdma_mr_defer_recovery(mr); return ERR_PTR(-ENOTCONN); } -/* Handle a remotely invalidated mw on the @mws list +/* Handle a remotely invalidated mr on the @mrs list */ static void -frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mws) +frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) { - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; - list_for_each_entry(mw, mws, mw_list) - if (mw->mw_handle == rep->rr_inv_rkey) { - struct rpcrdma_xprt *r_xprt = mw->mw_xprt; + list_for_each_entry(mr, mrs, mr_list) + if (mr->mr_handle == rep->rr_inv_rkey) { + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - list_del(&mw->mw_list); - mw->frwr.fr_state = FRWR_IS_INVALID; + list_del(&mr->mr_list); + mr->frwr.fr_state = FRWR_IS_INVALID; ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, - mw->mw_sg, mw->mw_nents, mw->mw_dir); - rpcrdma_put_mw(r_xprt, mw); + mr->mr_sg, mr->mr_nents, mr->mr_dir); + rpcrdma_mr_put(mr); break; /* only one invalidated MR per RPC */ } } @@ -475,16 +475,16 @@ frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mws) * Sleeps until it is safe for the host CPU to access the * previously mapped memory regions. * - * Caller ensures that @mws is not empty before the call. This + * Caller ensures that @mrs is not empty before the call. This * function empties the list. 
*/ static void -frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) +frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) { struct ib_send_wr *first, **prev, *last, *bad_wr; struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_frwr *frwr; - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; int count, rc; /* ORDER: Invalidate all of the MRs first @@ -495,10 +495,11 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) frwr = NULL; count = 0; prev = &first; - list_for_each_entry(mw, mws, mw_list) { - mw->frwr.fr_state = FRWR_IS_INVALID; + list_for_each_entry(mr, mrs, mr_list) { + mr->frwr.fr_state = FRWR_IS_INVALID; + + frwr = &mr->frwr; - frwr = &mw->frwr; dprintk("RPC: %s: invalidating frwr %p\n", __func__, frwr); @@ -507,7 +508,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) memset(last, 0, sizeof(*last)); last->wr_cqe = &frwr->fr_cqe; last->opcode = IB_WR_LOCAL_INV; - last->ex.invalidate_rkey = mw->mw_handle; + last->ex.invalidate_rkey = mr->mr_handle; count++; *prev = last; @@ -537,16 +538,16 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) goto reset_mrs; /* ORDER: Now DMA unmap all of the MRs, and return - * them to the free MW list. + * them to the free MR list. */ unmap: - while (!list_empty(mws)) { - mw = rpcrdma_pop_mw(mws); + while (!list_empty(mrs)) { + mr = rpcrdma_mr_pop(mrs); dprintk("RPC: %s: DMA unmapping frwr %p\n", - __func__, &mw->frwr); + __func__, &mr->frwr); ib_dma_unmap_sg(ia->ri_device, - mw->mw_sg, mw->mw_nents, mw->mw_dir); - rpcrdma_put_mw(r_xprt, mw); + mr->mr_sg, mr->mr_nents, mr->mr_dir); + rpcrdma_mr_put(mr); } return; @@ -559,9 +560,9 @@ reset_mrs: while (bad_wr) { frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); - mw = container_of(frwr, struct rpcrdma_mw, frwr); + mr = container_of(frwr, struct rpcrdma_mr, frwr); - __frwr_reset_mr(ia, mw); + __frwr_mr_reset(ia, mr); bad_wr = bad_wr->next; } diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 9207aeacd2c3..9601af01653f 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -292,15 +292,15 @@ encode_item_not_present(struct xdr_stream *xdr) } static void -xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw) +xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr *mr) { - *iptr++ = cpu_to_be32(mw->mw_handle); - *iptr++ = cpu_to_be32(mw->mw_length); - xdr_encode_hyper(iptr, mw->mw_offset); + *iptr++ = cpu_to_be32(mr->mr_handle); + *iptr++ = cpu_to_be32(mr->mr_length); + xdr_encode_hyper(iptr, mr->mr_offset); } static int -encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw) +encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr) { __be32 *p; @@ -308,12 +308,12 @@ encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw) if (unlikely(!p)) return -EMSGSIZE; - xdr_encode_rdma_segment(p, mw); + xdr_encode_rdma_segment(p, mr); return 0; } static int -encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw, +encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr, u32 position) { __be32 *p; @@ -324,7 +324,7 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw, *p++ = xdr_one; /* Item present */ *p++ = cpu_to_be32(position); - xdr_encode_rdma_segment(p, mw); + xdr_encode_rdma_segment(p, mr); return 0; } @@ -348,7 +348,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, { struct xdr_stream *xdr = &req->rl_stream; struct 
rpcrdma_mr_seg *seg; - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; unsigned int pos; int nsegs; @@ -363,21 +363,21 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, do { seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, - false, &mw); + false, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); - rpcrdma_push_mw(mw, &req->rl_registered); + rpcrdma_mr_push(mr, &req->rl_registered); - if (encode_read_segment(xdr, mw, pos) < 0) + if (encode_read_segment(xdr, mr, pos) < 0) return -EMSGSIZE; dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n", rqst->rq_task->tk_pid, __func__, pos, - mw->mw_length, (unsigned long long)mw->mw_offset, - mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last"); + mr->mr_length, (unsigned long long)mr->mr_offset, + mr->mr_handle, mr->mr_nents < nsegs ? "more" : "last"); r_xprt->rx_stats.read_chunk_count++; - nsegs -= mw->mw_nents; + nsegs -= mr->mr_nents; } while (nsegs); return 0; @@ -404,7 +404,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, { struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; int nsegs, nchunks; __be32 *segcount; @@ -425,23 +425,23 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, nchunks = 0; do { seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, - true, &mw); + true, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); - rpcrdma_push_mw(mw, &req->rl_registered); + rpcrdma_mr_push(mr, &req->rl_registered); - if (encode_rdma_segment(xdr, mw) < 0) + if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n", rqst->rq_task->tk_pid, __func__, - mw->mw_length, (unsigned long long)mw->mw_offset, - mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last"); + mr->mr_length, (unsigned long long)mr->mr_offset, + mr->mr_handle, mr->mr_nents < nsegs ? "more" : "last"); r_xprt->rx_stats.write_chunk_count++; r_xprt->rx_stats.total_rdma_request += seg->mr_len; nchunks++; - nsegs -= mw->mw_nents; + nsegs -= mr->mr_nents; } while (nsegs); /* Update count of segments in this Write chunk */ @@ -468,7 +468,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, { struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; int nsegs, nchunks; __be32 *segcount; @@ -487,23 +487,23 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, nchunks = 0; do { seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, - true, &mw); + true, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); - rpcrdma_push_mw(mw, &req->rl_registered); + rpcrdma_mr_push(mr, &req->rl_registered); - if (encode_rdma_segment(xdr, mw) < 0) + if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n", rqst->rq_task->tk_pid, __func__, - mw->mw_length, (unsigned long long)mw->mw_offset, - mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last"); + mr->mr_length, (unsigned long long)mr->mr_offset, + mr->mr_handle, mr->mr_nents < nsegs ? "more" : "last"); r_xprt->rx_stats.reply_chunk_count++; r_xprt->rx_stats.total_rdma_request += seg->mr_len; nchunks++; - nsegs -= mw->mw_nents; + nsegs -= mr->mr_nents; } while (nsegs); /* Update count of segments in the Reply chunk */ @@ -821,10 +821,10 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) * so these registrations are invalid and unusable. 
*/ while (unlikely(!list_empty(&req->rl_registered))) { - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; - mw = rpcrdma_pop_mw(&req->rl_registered); - rpcrdma_defer_mr_recovery(mw); + mr = rpcrdma_mr_pop(&req->rl_registered); + rpcrdma_mr_defer_recovery(mr); } /* This implementation supports the following combinations diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 840579919ad0..2582729f8c64 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -71,8 +71,8 @@ /* * internal functions */ -static void rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt); -static void rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf); +static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); +static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf); static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb); struct workqueue_struct *rpcrdma_receive_wq __read_mostly; @@ -458,7 +458,7 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) rpcrdma_dma_unmap_regbuf(req->rl_sendbuf); rpcrdma_dma_unmap_regbuf(req->rl_recvbuf); } - rpcrdma_destroy_mrs(buf); + rpcrdma_mrs_destroy(buf); /* Allow waiters to continue */ complete(&ia->ri_remove_done); @@ -671,7 +671,7 @@ rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, goto out3; } - rpcrdma_create_mrs(r_xprt); + rpcrdma_mrs_create(r_xprt); return 0; out3: @@ -992,15 +992,15 @@ rpcrdma_mr_recovery_worker(struct work_struct *work) { struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, rb_recovery_worker.work); - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; spin_lock(&buf->rb_recovery_lock); while (!list_empty(&buf->rb_stale_mrs)) { - mw = rpcrdma_pop_mw(&buf->rb_stale_mrs); + mr = rpcrdma_mr_pop(&buf->rb_stale_mrs); spin_unlock(&buf->rb_recovery_lock); - dprintk("RPC: %s: recovering MR %p\n", __func__, mw); - mw->mw_xprt->rx_ia.ri_ops->ro_recover_mr(mw); + dprintk("RPC: %s: recovering MR %p\n", __func__, mr); + mr->mr_xprt->rx_ia.ri_ops->ro_recover_mr(mr); spin_lock(&buf->rb_recovery_lock); } @@ -1008,20 +1008,20 @@ rpcrdma_mr_recovery_worker(struct work_struct *work) } void -rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw) +rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr) { - struct rpcrdma_xprt *r_xprt = mw->mw_xprt; + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; struct rpcrdma_buffer *buf = &r_xprt->rx_buf; spin_lock(&buf->rb_recovery_lock); - rpcrdma_push_mw(mw, &buf->rb_stale_mrs); + rpcrdma_mr_push(mr, &buf->rb_stale_mrs); spin_unlock(&buf->rb_recovery_lock); schedule_delayed_work(&buf->rb_recovery_worker, 0); } static void -rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt) +rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_ia *ia = &r_xprt->rx_ia; @@ -1030,30 +1030,30 @@ rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt) LIST_HEAD(all); for (count = 0; count < 32; count++) { - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; int rc; - mw = kzalloc(sizeof(*mw), GFP_KERNEL); - if (!mw) + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) break; - rc = ia->ri_ops->ro_init_mr(ia, mw); + rc = ia->ri_ops->ro_init_mr(ia, mr); if (rc) { - kfree(mw); + kfree(mr); break; } - mw->mw_xprt = r_xprt; + mr->mr_xprt = r_xprt; - list_add(&mw->mw_list, &free); - list_add(&mw->mw_all, &all); + list_add(&mr->mr_list, &free); + list_add(&mr->mr_all, &all); } - spin_lock(&buf->rb_mwlock); - list_splice(&free, &buf->rb_mws); + spin_lock(&buf->rb_mrlock); + list_splice(&free, &buf->rb_mrs); list_splice(&all, &buf->rb_all); r_xprt->rx_stats.mrs_allocated += count; - 
spin_unlock(&buf->rb_mwlock); + spin_unlock(&buf->rb_mrlock); dprintk("RPC: %s: created %u MRs\n", __func__, count); } @@ -1066,7 +1066,7 @@ rpcrdma_mr_refresh_worker(struct work_struct *work) struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, rx_buf); - rpcrdma_create_mrs(r_xprt); + rpcrdma_mrs_create(r_xprt); } struct rpcrdma_req * @@ -1144,10 +1144,10 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) buf->rb_max_requests = r_xprt->rx_data.max_requests; buf->rb_bc_srv_max_requests = 0; - spin_lock_init(&buf->rb_mwlock); + spin_lock_init(&buf->rb_mrlock); spin_lock_init(&buf->rb_lock); spin_lock_init(&buf->rb_recovery_lock); - INIT_LIST_HEAD(&buf->rb_mws); + INIT_LIST_HEAD(&buf->rb_mrs); INIT_LIST_HEAD(&buf->rb_all); INIT_LIST_HEAD(&buf->rb_stale_mrs); INIT_DELAYED_WORK(&buf->rb_refresh_worker, @@ -1155,7 +1155,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) INIT_DELAYED_WORK(&buf->rb_recovery_worker, rpcrdma_mr_recovery_worker); - rpcrdma_create_mrs(r_xprt); + rpcrdma_mrs_create(r_xprt); INIT_LIST_HEAD(&buf->rb_send_bufs); INIT_LIST_HEAD(&buf->rb_allreqs); @@ -1229,26 +1229,26 @@ rpcrdma_destroy_req(struct rpcrdma_req *req) } static void -rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf) +rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) { struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, rx_buf); struct rpcrdma_ia *ia = rdmab_to_ia(buf); - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; unsigned int count; count = 0; - spin_lock(&buf->rb_mwlock); + spin_lock(&buf->rb_mrlock); while (!list_empty(&buf->rb_all)) { - mw = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); - list_del(&mw->mw_all); + mr = list_entry(buf->rb_all.next, struct rpcrdma_mr, mr_all); + list_del(&mr->mr_all); - spin_unlock(&buf->rb_mwlock); - ia->ri_ops->ro_release_mr(mw); + spin_unlock(&buf->rb_mrlock); + ia->ri_ops->ro_release_mr(mr); count++; - spin_lock(&buf->rb_mwlock); + spin_lock(&buf->rb_mrlock); } - spin_unlock(&buf->rb_mwlock); + spin_unlock(&buf->rb_mrlock); r_xprt->rx_stats.mrs_allocated = 0; dprintk("RPC: %s: released %u MRs\n", __func__, count); @@ -1285,26 +1285,33 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) spin_unlock(&buf->rb_reqslock); buf->rb_recv_count = 0; - rpcrdma_destroy_mrs(buf); + rpcrdma_mrs_destroy(buf); } -struct rpcrdma_mw * -rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt) +/** + * rpcrdma_mr_get - Allocate an rpcrdma_mr object + * @r_xprt: controlling transport + * + * Returns an initialized rpcrdma_mr or NULL if no free + * rpcrdma_mr objects are available. 
+ */ +struct rpcrdma_mr * +rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_mw *mw = NULL; + struct rpcrdma_mr *mr = NULL; - spin_lock(&buf->rb_mwlock); - if (!list_empty(&buf->rb_mws)) - mw = rpcrdma_pop_mw(&buf->rb_mws); - spin_unlock(&buf->rb_mwlock); + spin_lock(&buf->rb_mrlock); + if (!list_empty(&buf->rb_mrs)) + mr = rpcrdma_mr_pop(&buf->rb_mrs); + spin_unlock(&buf->rb_mrlock); - if (!mw) - goto out_nomws; - return mw; + if (!mr) + goto out_nomrs; + return mr; -out_nomws: - dprintk("RPC: %s: no MWs available\n", __func__); +out_nomrs: + dprintk("RPC: %s: no MRs available\n", __func__); if (r_xprt->rx_ep.rep_connected != -ENODEV) schedule_delayed_work(&buf->rb_refresh_worker, 0); @@ -1314,14 +1321,20 @@ out_nomws: return NULL; } +/** + * rpcrdma_mr_put - Release an rpcrdma_mr object + * @mr: object to release + * + */ void -rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) +rpcrdma_mr_put(struct rpcrdma_mr *mr) { + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - spin_lock(&buf->rb_mwlock); - rpcrdma_push_mw(mw, &buf->rb_mws); - spin_unlock(&buf->rb_mwlock); + spin_lock(&buf->rb_mrlock); + rpcrdma_mr_push(mr, &buf->rb_mrs); + spin_unlock(&buf->rb_mrlock); } static struct rpcrdma_rep * diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index f52269afaa09..530ace6ed125 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -230,12 +230,12 @@ enum { }; /* - * struct rpcrdma_mw - external memory region metadata + * struct rpcrdma_mr - external memory region metadata * * An external memory region is any buffer or page that is registered * on the fly (ie, not pre-registered). * - * Each rpcrdma_buffer has a list of free MWs anchored in rb_mws. During + * Each rpcrdma_buffer has a list of free MWs anchored in rb_mrs. During * call_allocate, rpcrdma_buffer_get() assigns one to each segment in * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep * track of registration metadata while each RPC is pending. 
@@ -265,20 +265,20 @@ struct rpcrdma_fmr { u64 *fm_physaddrs; }; -struct rpcrdma_mw { - struct list_head mw_list; - struct scatterlist *mw_sg; - int mw_nents; - enum dma_data_direction mw_dir; +struct rpcrdma_mr { + struct list_head mr_list; + struct scatterlist *mr_sg; + int mr_nents; + enum dma_data_direction mr_dir; union { struct rpcrdma_fmr fmr; struct rpcrdma_frwr frwr; }; - struct rpcrdma_xprt *mw_xprt; - u32 mw_handle; - u32 mw_length; - u64 mw_offset; - struct list_head mw_all; + struct rpcrdma_xprt *mr_xprt; + u32 mr_handle; + u32 mr_length; + u64 mr_offset; + struct list_head mr_all; }; /* @@ -371,19 +371,19 @@ rpcr_to_rdmar(struct rpc_rqst *rqst) } static inline void -rpcrdma_push_mw(struct rpcrdma_mw *mw, struct list_head *list) +rpcrdma_mr_push(struct rpcrdma_mr *mr, struct list_head *list) { - list_add_tail(&mw->mw_list, list); + list_add_tail(&mr->mr_list, list); } -static inline struct rpcrdma_mw * -rpcrdma_pop_mw(struct list_head *list) +static inline struct rpcrdma_mr * +rpcrdma_mr_pop(struct list_head *list) { - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; - mw = list_first_entry(list, struct rpcrdma_mw, mw_list); - list_del(&mw->mw_list); - return mw; + mr = list_first_entry(list, struct rpcrdma_mr, mr_list); + list_del(&mr->mr_list); + return mr; } /* @@ -393,8 +393,8 @@ rpcrdma_pop_mw(struct list_head *list) * One of these is associated with a transport instance */ struct rpcrdma_buffer { - spinlock_t rb_mwlock; /* protect rb_mws list */ - struct list_head rb_mws; + spinlock_t rb_mrlock; /* protect rb_mrs list */ + struct list_head rb_mrs; struct list_head rb_all; unsigned long rb_sc_head; @@ -473,19 +473,19 @@ struct rpcrdma_memreg_ops { struct rpcrdma_mr_seg * (*ro_map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool, - struct rpcrdma_mw **); + struct rpcrdma_mr **); void (*ro_reminv)(struct rpcrdma_rep *rep, - struct list_head *mws); + struct list_head *mrs); void (*ro_unmap_sync)(struct rpcrdma_xprt *, struct list_head *); - void (*ro_recover_mr)(struct rpcrdma_mw *); + void (*ro_recover_mr)(struct rpcrdma_mr *mr); int (*ro_open)(struct rpcrdma_ia *, struct rpcrdma_ep *, struct rpcrdma_create_data_internal *); size_t (*ro_maxpages)(struct rpcrdma_xprt *); int (*ro_init_mr)(struct rpcrdma_ia *, - struct rpcrdma_mw *); - void (*ro_release_mr)(struct rpcrdma_mw *); + struct rpcrdma_mr *); + void (*ro_release_mr)(struct rpcrdma_mr *mr); const char *ro_displayname; const int ro_send_w_inv_ok; }; @@ -574,15 +574,15 @@ void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf); void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc); -struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *); -void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *); +struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); +void rpcrdma_mr_put(struct rpcrdma_mr *mr); +void rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr); + struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); void rpcrdma_buffer_put(struct rpcrdma_req *); void rpcrdma_recv_buffer_get(struct rpcrdma_req *); void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); -void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *); - struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(size_t, enum dma_data_direction, gfp_t); bool __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *); From ec12e479e30653bf973ca1185bbb09158e9af0b7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 14 Dec 2017 20:58:04 -0500 Subject: 
[PATCH 41/69] xprtrdma: Introduce rpcrdma_mr_unmap_and_put Clean up: Code review suggested that a common bit of code can be placed into a helper function, and this gives us fewer places to stick an "I DMA unmapped something" trace point. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/fmr_ops.c | 19 ++++++++----------- net/sunrpc/xprtrdma/frwr_ops.c | 10 ++-------- net/sunrpc/xprtrdma/verbs.c | 28 +++++++++++++++++++++++----- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 4 files changed, 34 insertions(+), 24 deletions(-) diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 8bd0399b3a1c..7f2f2b774076 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -135,14 +135,12 @@ fmr_op_recover_mr(struct rpcrdma_mr *mr) /* ORDER: invalidate first */ rc = __fmr_unmap(mr); - - /* ORDER: then DMA unmap */ - ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); if (rc) goto out_release; - rpcrdma_mr_put(mr); + /* ORDER: then DMA unmap */ + rpcrdma_mr_unmap_and_put(mr); + r_xprt->rx_stats.mrs_recovered++; return; @@ -150,6 +148,9 @@ out_release: pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mr); r_xprt->rx_stats.mrs_orphaned++; + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mr->mr_sg, mr->mr_nents, mr->mr_dir); + spin_lock(&r_xprt->rx_buf.rb_mrlock); list_del(&mr->mr_all); spin_unlock(&r_xprt->rx_buf.rb_mrlock); @@ -245,9 +246,7 @@ out_maperr: pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", len, (unsigned long long)dma_pages[0], pageoff, mr->mr_nents, rc); - ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - rpcrdma_mr_put(mr); + rpcrdma_mr_unmap_and_put(mr); return ERR_PTR(-EIO); } @@ -289,9 +288,7 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) dprintk("RPC: %s: DMA unmapping fmr %p\n", __func__, &mr->fmr); list_del(&mr->fmr.fm_mr->list); - ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - rpcrdma_mr_put(mr); + rpcrdma_mr_unmap_and_put(mr); } return; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 8ba4b3388a98..35e3a54344cc 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -459,13 +459,9 @@ frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) list_for_each_entry(mr, mrs, mr_list) if (mr->mr_handle == rep->rr_inv_rkey) { - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - list_del(&mr->mr_list); mr->frwr.fr_state = FRWR_IS_INVALID; - ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - rpcrdma_mr_put(mr); + rpcrdma_mr_unmap_and_put(mr); break; /* only one invalidated MR per RPC */ } } @@ -545,9 +541,7 @@ unmap: mr = rpcrdma_mr_pop(mrs); dprintk("RPC: %s: DMA unmapping frwr %p\n", __func__, &mr->frwr); - ib_dma_unmap_sg(ia->ri_device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - rpcrdma_mr_put(mr); + rpcrdma_mr_unmap_and_put(mr); } return; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 2582729f8c64..9cc8abc09e14 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1321,6 +1321,14 @@ out_nomrs: return NULL; } +static void +__rpcrdma_mr_put(struct rpcrdma_buffer *buf, struct rpcrdma_mr *mr) +{ + spin_lock(&buf->rb_mrlock); + rpcrdma_mr_push(mr, &buf->rb_mrs); + spin_unlock(&buf->rb_mrlock); +} + /** * rpcrdma_mr_put - Release an rpcrdma_mr object * @mr: object to release @@ -1329,12 +1337,22 @@ out_nomrs: void
rpcrdma_mr_put(struct rpcrdma_mr *mr) { - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + __rpcrdma_mr_put(&mr->mr_xprt->rx_buf, mr); +} - spin_lock(&buf->rb_mrlock); - rpcrdma_mr_push(mr, &buf->rb_mrs); - spin_unlock(&buf->rb_mrlock); +/** + * rpcrdma_mr_unmap_and_put - DMA unmap an MR and release it + * @mr: object to release + * + */ +void +rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr) +{ + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; + + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mr->mr_sg, mr->mr_nents, mr->mr_dir); + __rpcrdma_mr_put(&r_xprt->rx_buf, mr); } static struct rpcrdma_rep * diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 530ace6ed125..28ae1fb1e2f3 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -576,6 +576,7 @@ void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc); struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); void rpcrdma_mr_put(struct rpcrdma_mr *mr); +void rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr); void rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr); struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); From 7ff4cff637aa0bd2abbd81f53b2a6206c50afd95 Mon Sep 17 00:00:00 2001 From: Tigran Mkrtchyan Date: Tue, 16 Jan 2018 22:38:50 +0100 Subject: [PATCH 42/69] nfs41: do not return ENOMEM on LAYOUTUNAVAILABLE A pNFS server may return LAYOUTUNAVAILABLE error on LAYOUTGET for files which don't have any layout. In this situation pnfs_update_layout currently returns NULL. As this NULL is converted into ENOMEM, IO requests fail instead of falling back to the MDS. Do not return ENOMEM on LAYOUTUNAVAILABLE and let the client retry through the MDS. Fixes 8d40b0f14846f. I suggest backporting this fix to the affected stable branches. Signed-off-by: Tigran Mkrtchyan [trondmy: Use IS_ERR_OR_NULL()] Fixes: 8d40b0f14846 ("NFS filelayout:call GETDEVICEINFO after...") Cc: stable@vger.kernel.org # v4.11+ Signed-off-by: Trond Myklebust --- fs/nfs/filelayout/filelayout.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 4e54d8b5413a..d175724ff566 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -895,9 +895,7 @@ fl_pnfs_update_layout(struct inode *ino, lseg = pnfs_update_layout(ino, ctx, pos, count, iomode, strict_iomode, gfp_flags); - if (!lseg) - lseg = ERR_PTR(-ENOMEM); - if (IS_ERR(lseg)) + if (IS_ERR_OR_NULL(lseg)) goto out; lo = NFS_I(ino)->layout; From 06e1902456f3d0e45c3e64b75124339d67c83e5b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 18 Jan 2018 14:55:01 -0500 Subject: [PATCH 43/69] nfs: Use proper enum definitions for nfs_show_stable Commit 8224b2734ab1 ("NFS: Add static NFS I/O tracepoints") had a hack to work around some odd behavior observed with __print_symbolic. I couldn't ever get it to display NFS_FILE_SYNC when using TRACE_DEFINE_ENUM macros to set up the enum values. I tracked down the actual bug that forced me to add the workaround. That issue will be addressed soon, so replace the hack with a proper implementation.
Fixes: 8224b2734ab1 ("NFS: Add static NFS I/O tracepoints") Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfstrace.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 093290c42d7c..c6dbd48896a1 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -796,15 +796,15 @@ TRACE_EVENT(nfs_readpage_done, ) ); -/* - * XXX: I tried using NFS_UNSTABLE and friends in this table, but they - * all evaluate to 0 for some reason, even if I include linux/nfs.h. - */ +TRACE_DEFINE_ENUM(NFS_UNSTABLE); +TRACE_DEFINE_ENUM(NFS_DATA_SYNC); +TRACE_DEFINE_ENUM(NFS_FILE_SYNC); + #define nfs_show_stable(stable) \ __print_symbolic(stable, \ - { 0, " (UNSTABLE)" }, \ - { 1, " (DATA_SYNC)" }, \ - { 2, " (FILE_SYNC)" }) + { NFS_UNSTABLE, "UNSTABLE" }, \ + { NFS_DATA_SYNC, "DATA_SYNC" }, \ + { NFS_FILE_SYNC, "FILE_SYNC" }) TRACE_EVENT(nfs_initiate_write, TP_PROTO( @@ -837,12 +837,12 @@ TRACE_EVENT(nfs_initiate_write, TP_printk( "fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%lu stable=%d%s", + "offset=%lld count=%lu stable=%s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->offset, __entry->count, - __entry->stable, nfs_show_stable(__entry->stable) + nfs_show_stable(__entry->stable) ) ); @@ -881,13 +881,13 @@ TRACE_EVENT(nfs_writeback_done, TP_printk( "fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld status=%d stable=%d%s " + "offset=%lld status=%d stable=%s " "verifier 0x%016llx", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->offset, __entry->status, - __entry->stable, nfs_show_stable(__entry->stable), + nfs_show_stable(__entry->stable), __entry->verifier ) ); From cbebc6ef4fc830f4040d4140bf53484812d5d5d9 Mon Sep 17 00:00:00 2001 From: Jan Chochol Date: Fri, 5 Jan 2018 08:39:12 +0100 Subject: [PATCH 44/69] nfs: Do not convert nfs_idmap_cache_timeout to jiffies Since commit 57e62324e469 ("NFS: Store the legacy idmapper result in the keyring") nfs_idmap_cache_timeout changed units from jiffies to seconds. Unfortunately the sysctl interface was not updated accordingly. As a result, updating /proc/sys/fs/nfs/idmap_cache_timeout with some value will incorrectly multiply this value by HZ. Also reading /proc/sys/fs/nfs/idmap_cache_timeout will show the real value divided by HZ. Fixes: 57e62324e469 ("NFS: Store the legacy idmapper result in the keyring") Signed-off-by: Jan Chochol Signed-off-by: Trond Myklebust --- fs/nfs/nfs4sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index 0d91d84e5822..c394e4447100 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -32,7 +32,7 @@ static struct ctl_table nfs4_cb_sysctls[] = { .data = &nfs_idmap_cache_timeout, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_jiffies, + .proc_handler = proc_dointvec, }, { } }; From 49686cbbb3ebafe42e63868222f269d8053ead00 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 19 Jan 2018 15:15:34 -0800 Subject: [PATCH 45/69] NFS: reject request for id_legacy key without auxdata nfs_idmap_legacy_upcall() is supposed to be called with 'aux' pointing to a 'struct idmap', via the call to request_key_with_auxdata() in nfs_idmap_request_key().
However it can also be reached via the request_key() system call in which case 'aux' will be NULL, causing a NULL pointer dereference in nfs_idmap_prepare_pipe_upcall(), assuming that the key description is valid enough to get that far. Fix this by making nfs_idmap_legacy_upcall() negate the key if no auxdata is provided. As usual, this bug was found by syzkaller. A simple reproducer using the command-line keyctl program is: keyctl request2 id_legacy uid:0 '' @s Fixes: 57e62324e469 ("NFS: Store the legacy idmapper result in the keyring") Reported-by: syzbot+5dfdbcf7b3eb5912abbb@syzkaller.appspotmail.com Cc: stable@vger.kernel.org # v3.4+ Signed-off-by: Eric Biggers Signed-off-by: Trond Myklebust --- fs/nfs/nfs4idmap.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 30426c1a1bbd..22dc30a679a0 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -568,9 +568,13 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons, struct idmap_msg *im; struct idmap *idmap = (struct idmap *)aux; struct key *key = cons->key; - int ret = -ENOMEM; + int ret = -ENOKEY; + + if (!aux) + goto out1; /* msg and im are freed in idmap_pipe_destroy_msg */ + ret = -ENOMEM; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) goto out1; From 6b3a60ae7b531dc45c737579d91452b4ebda14a8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 20 Jan 2018 11:16:25 -0500 Subject: [PATCH 46/69] rdma/ib: Add trace point macros to display human-readable values These can be shared with all kernel ULPs, and more can easily be added as needed. Note: checkpatch.pl has some heartburn with the TRACE_DEFINE_ENUM macros and the LIST macros. These follow the same style as other header files under include/trace/events, and thus should be considered acceptable exceptions. Signed-off-by: Chuck Lever Acked-by: Jason Gunthorpe Signed-off-by: Anna Schumaker --- include/trace/events/rdma.h | 129 ++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 include/trace/events/rdma.h diff --git a/include/trace/events/rdma.h b/include/trace/events/rdma.h new file mode 100644 index 000000000000..aa19afc73a4e --- /dev/null +++ b/include/trace/events/rdma.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2017 Oracle. All rights reserved.
+ */ + +/* + * enum ib_event_type, from include/rdma/ib_verbs.h + */ + +#define IB_EVENT_LIST \ + ib_event(CQ_ERR) \ + ib_event(QP_FATAL) \ + ib_event(QP_REQ_ERR) \ + ib_event(QP_ACCESS_ERR) \ + ib_event(COMM_EST) \ + ib_event(SQ_DRAINED) \ + ib_event(PATH_MIG) \ + ib_event(PATH_MIG_ERR) \ + ib_event(DEVICE_FATAL) \ + ib_event(PORT_ACTIVE) \ + ib_event(PORT_ERR) \ + ib_event(LID_CHANGE) \ + ib_event(PKEY_CHANGE) \ + ib_event(SM_CHANGE) \ + ib_event(SRQ_ERR) \ + ib_event(SRQ_LIMIT_REACHED) \ + ib_event(QP_LAST_WQE_REACHED) \ + ib_event(CLIENT_REREGISTER) \ + ib_event(GID_CHANGE) \ + ib_event_end(WQ_FATAL) + +#undef ib_event +#undef ib_event_end + +#define ib_event(x) TRACE_DEFINE_ENUM(IB_EVENT_##x); +#define ib_event_end(x) TRACE_DEFINE_ENUM(IB_EVENT_##x); + +IB_EVENT_LIST + +#undef ib_event +#undef ib_event_end + +#define ib_event(x) { IB_EVENT_##x, #x }, +#define ib_event_end(x) { IB_EVENT_##x, #x } + +#define rdma_show_ib_event(x) \ + __print_symbolic(x, IB_EVENT_LIST) + +/* + * enum ib_wc_status type, from include/rdma/ib_verbs.h + */ +#define IB_WC_STATUS_LIST \ + ib_wc_status(SUCCESS) \ + ib_wc_status(LOC_LEN_ERR) \ + ib_wc_status(LOC_QP_OP_ERR) \ + ib_wc_status(LOC_EEC_OP_ERR) \ + ib_wc_status(LOC_PROT_ERR) \ + ib_wc_status(WR_FLUSH_ERR) \ + ib_wc_status(MW_BIND_ERR) \ + ib_wc_status(BAD_RESP_ERR) \ + ib_wc_status(LOC_ACCESS_ERR) \ + ib_wc_status(REM_INV_REQ_ERR) \ + ib_wc_status(REM_ACCESS_ERR) \ + ib_wc_status(REM_OP_ERR) \ + ib_wc_status(RETRY_EXC_ERR) \ + ib_wc_status(RNR_RETRY_EXC_ERR) \ + ib_wc_status(LOC_RDD_VIOL_ERR) \ + ib_wc_status(REM_INV_RD_REQ_ERR) \ + ib_wc_status(REM_ABORT_ERR) \ + ib_wc_status(INV_EECN_ERR) \ + ib_wc_status(INV_EEC_STATE_ERR) \ + ib_wc_status(FATAL_ERR) \ + ib_wc_status(RESP_TIMEOUT_ERR) \ + ib_wc_status_end(GENERAL_ERR) + +#undef ib_wc_status +#undef ib_wc_status_end + +#define ib_wc_status(x) TRACE_DEFINE_ENUM(IB_WC_##x); +#define ib_wc_status_end(x) TRACE_DEFINE_ENUM(IB_WC_##x); + +IB_WC_STATUS_LIST + +#undef ib_wc_status +#undef ib_wc_status_end + +#define ib_wc_status(x) { IB_WC_##x, #x }, +#define ib_wc_status_end(x) { IB_WC_##x, #x } + +#define rdma_show_wc_status(x) \ + __print_symbolic(x, IB_WC_STATUS_LIST) + +/* + * enum rdma_cm_event_type, from include/rdma/rdma_cm.h + */ +#define RDMA_CM_EVENT_LIST \ + rdma_cm_event(ADDR_RESOLVED) \ + rdma_cm_event(ADDR_ERROR) \ + rdma_cm_event(ROUTE_RESOLVED) \ + rdma_cm_event(ROUTE_ERROR) \ + rdma_cm_event(CONNECT_REQUEST) \ + rdma_cm_event(CONNECT_RESPONSE) \ + rdma_cm_event(CONNECT_ERROR) \ + rdma_cm_event(UNREACHABLE) \ + rdma_cm_event(REJECTED) \ + rdma_cm_event(ESTABLISHED) \ + rdma_cm_event(DISCONNECTED) \ + rdma_cm_event(DEVICE_REMOVAL) \ + rdma_cm_event(MULTICAST_JOIN) \ + rdma_cm_event(MULTICAST_ERROR) \ + rdma_cm_event(ADDR_CHANGE) \ + rdma_cm_event_end(TIMEWAIT_EXIT) + +#undef rdma_cm_event +#undef rdma_cm_event_end + +#define rdma_cm_event(x) TRACE_DEFINE_ENUM(RDMA_CM_EVENT_##x); +#define rdma_cm_event_end(x) TRACE_DEFINE_ENUM(RDMA_CM_EVENT_##x); + +RDMA_CM_EVENT_LIST + +#undef rdma_cm_event +#undef rdma_cm_event_end + +#define rdma_cm_event(x) { RDMA_CM_EVENT_##x, #x }, +#define rdma_cm_event_end(x) { RDMA_CM_EVENT_##x, #x } + +#define rdma_show_cm_event(x) \ + __print_symbolic(x, RDMA_CM_EVENT_LIST) From e48f083e19c9082352a07d0aef6a4ea0dc72dde0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 20 Jan 2018 11:16:34 -0500 Subject: [PATCH 47/69] rpcrdma: infrastructure for static trace points in rpcrdma.ko Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- 
include/trace/events/rpcrdma.h | 16 ++++++++++++++++ net/sunrpc/xprtrdma/module.c | 12 +++++++----- net/sunrpc/xprtrdma/xprt_rdma.h | 2 ++ 3 files changed, 25 insertions(+), 5 deletions(-) create mode 100644 include/trace/events/rpcrdma.h diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h new file mode 100644 index 000000000000..8536d90bd286 --- /dev/null +++ b/include/trace/events/rpcrdma.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2017 Oracle. All rights reserved. + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rpcrdma + +#if !defined(_TRACE_RPCRDMA_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RPCRDMA_H + +#include <linux/tracepoint.h> +#include <trace/events/rdma.h> + +#endif /* _TRACE_RPCRDMA_H */ + +#include <trace/define_trace.h> diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c index 560712bd9fa2..a762d192372b 100644 --- a/net/sunrpc/xprtrdma/module.c +++ b/net/sunrpc/xprtrdma/module.c @@ -1,18 +1,20 @@ /* - * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2015, 2017 Oracle. All rights reserved. */ /* rpcrdma.ko module initialization */ +#include <linux/types.h> +#include <linux/compiler.h> #include <linux/module.h> #include <linux/init.h> #include <linux/sunrpc/svc_rdma.h> -#include "xprt_rdma.h" -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif +#include <asm/swab.h> + +#define CREATE_TRACE_POINTS +#include "xprt_rdma.h" MODULE_AUTHOR("Open Grid Computing and Network Appliance, Inc."); MODULE_DESCRIPTION("RPC/RDMA Transport"); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 28ae1fb1e2f3..a1ca1b638123 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -675,3 +675,5 @@ void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int); extern struct xprt_class xprt_rdma_bc; #endif /* _LINUX_SUNRPC_XPRT_RDMA_H */ + +#include <trace/events/rpcrdma.h> From ab03eff58eb5b4914ec96e989e4c30e320d20ad8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 20 Dec 2017 16:30:40 -0500 Subject: [PATCH 48/69] xprtrdma: Add trace points in RPC Call transmit paths Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 124 +++++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/rpc_rdma.c | 8 +-- net/sunrpc/xprtrdma/verbs.c | 16 ++--- 3 files changed, 131 insertions(+), 17 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 8536d90bd286..ffe660365193 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -11,6 +11,130 @@ #include <linux/tracepoint.h> #include <trace/events/rdma.h> +/** + ** Call events + **/ + +TRACE_DEFINE_ENUM(rpcrdma_noch); +TRACE_DEFINE_ENUM(rpcrdma_readch); +TRACE_DEFINE_ENUM(rpcrdma_areadch); +TRACE_DEFINE_ENUM(rpcrdma_writech); +TRACE_DEFINE_ENUM(rpcrdma_replych); + +#define xprtrdma_show_chunktype(x) \ + __print_symbolic(x, \ + { rpcrdma_noch, "inline" }, \ + { rpcrdma_readch, "read list" }, \ + { rpcrdma_areadch, "*read list" }, \ + { rpcrdma_writech, "write list" }, \ + { rpcrdma_replych, "reply chunk" }) + +TRACE_EVENT(xprtrdma_marshal, + TP_PROTO( + const struct rpc_rqst *rqst, + unsigned int hdrlen, + unsigned int rtype, + unsigned int wtype + ), + + TP_ARGS(rqst, hdrlen, rtype, wtype), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, xid) + __field(unsigned int, hdrlen) + __field(unsigned int, headlen) + __field(unsigned int, pagelen) + __field(unsigned int, taillen) + __field(unsigned int, rtype) + __field(unsigned int, wtype) + ), + + TP_fast_assign( + __entry->task_id = rqst->rq_task->tk_pid; + __entry->client_id =
rqst->rq_task->tk_client->cl_clid; + __entry->xid = be32_to_cpu(rqst->rq_xid); + __entry->hdrlen = hdrlen; + __entry->headlen = rqst->rq_snd_buf.head[0].iov_len; + __entry->pagelen = rqst->rq_snd_buf.page_len; + __entry->taillen = rqst->rq_snd_buf.tail[0].iov_len; + __entry->rtype = rtype; + __entry->wtype = wtype; + ), + + TP_printk("task:%u@%u xid=0x%08x: hdr=%u xdr=%u/%u/%u %s/%s", + __entry->task_id, __entry->client_id, __entry->xid, + __entry->hdrlen, + __entry->headlen, __entry->pagelen, __entry->taillen, + xprtrdma_show_chunktype(__entry->rtype), + xprtrdma_show_chunktype(__entry->wtype) + ) +); + +TRACE_EVENT(xprtrdma_post_send, + TP_PROTO( + const struct rpcrdma_req *req, + int status + ), + + TP_ARGS(req, status), + + TP_STRUCT__entry( + __field(const void *, req) + __field(int, num_sge) + __field(bool, signaled) + __field(int, status) + ), + + TP_fast_assign( + __entry->req = req; + __entry->num_sge = req->rl_sendctx->sc_wr.num_sge; + __entry->signaled = req->rl_sendctx->sc_wr.send_flags & + IB_SEND_SIGNALED; + __entry->status = status; + ), + + TP_printk("req=%p, %d SGEs%s, status=%d", + __entry->req, __entry->num_sge, + (__entry->signaled ? ", signaled" : ""), + __entry->status + ) +); + +/** + ** Completion events + **/ + +TRACE_EVENT(xprtrdma_wc_send, + TP_PROTO( + const struct rpcrdma_sendctx *sc, + const struct ib_wc *wc + ), + + TP_ARGS(sc, wc), + + TP_STRUCT__entry( + __field(const void *, req) + __field(unsigned int, unmap_count) + __field(unsigned int, status) + __field(unsigned int, vendor_err) + ), + + TP_fast_assign( + __entry->req = sc->sc_req; + __entry->unmap_count = sc->sc_unmap_count; + __entry->status = wc->status; + __entry->vendor_err = __entry->status ? wc->vendor_err : 0; + ), + + TP_printk("req=%p, unmapped %u pages: %s (%u/0x%x)", + __entry->req, __entry->unmap_count, + rdma_show_wc_status(__entry->status), + __entry->status, __entry->vendor_err + ) +); + #endif /* _TRACE_RPCRDMA_H */ #include <trace/define_trace.h> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 9601af01653f..162c0fd82673 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -524,9 +524,6 @@ rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc) struct ib_sge *sge; unsigned int count; - dprintk("RPC: %s: unmapping %u sges for sc=%p\n", - __func__, sc->sc_unmap_count, sc); - /* The first two SGEs contain the transport header and * the inline buffer. These are always left mapped so * they can be cheaply re-used.
@@ -874,10 +871,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) if (ret) goto out_err; - dprintk("RPC: %5u %s: %s/%s: hdrlen %u rpclen\n", - rqst->rq_task->tk_pid, __func__, - transfertypes[rtype], transfertypes[wtype], - xdr_stream_pos(xdr)); + trace_xprtrdma_marshal(rqst, xdr_stream_pos(xdr), rtype, wtype); ret = rpcrdma_prepare_send_sges(r_xprt, req, xdr_stream_pos(xdr), &rqst->rq_snd_buf, rtype); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 9cc8abc09e14..06d86b8c7f71 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -133,6 +133,7 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) container_of(cqe, struct rpcrdma_sendctx, sc_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ + trace_xprtrdma_wc_send(sc, wc); if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR) pr_err("rpcrdma: Send: %s (%u/0x%x)\n", ib_wc_status_msg(wc->status), @@ -1549,9 +1550,6 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, req->rl_reply = NULL; } - dprintk("RPC: %s: posting %d s/g entries\n", - __func__, send_wr->num_sge); - if (!ep->rep_send_count || test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) { send_wr->send_flags |= IB_SEND_SIGNALED; @@ -1560,14 +1558,12 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, send_wr->send_flags &= ~IB_SEND_SIGNALED; --ep->rep_send_count; } - rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail); - if (rc) - goto out_postsend_err; - return 0; -out_postsend_err: - pr_err("rpcrdma: RDMA Send ib_post_send returned %i\n", rc); - return -ENOTCONN; + rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail); + trace_xprtrdma_post_send(req, rc); + if (rc) + return -ENOTCONN; + return 0; } int From b4a7f91c1d8e14596a1eb37075d9f20f213481a8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 20 Dec 2017 16:30:48 -0500 Subject: [PATCH 49/69] xprtrdma: Add trace points in the RPC Reply handler paths Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 162 +++++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/rpc_rdma.c | 17 ++-- net/sunrpc/xprtrdma/verbs.c | 11 +-- 3 files changed, 171 insertions(+), 19 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index ffe660365193..fe8ba0b25df8 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -11,6 +11,46 @@ #include <linux/tracepoint.h> #include <trace/events/rdma.h> +/** + ** Event classes + **/ + +DECLARE_EVENT_CLASS(xprtrdma_reply_event, + TP_PROTO( + const struct rpcrdma_rep *rep + ), + + TP_ARGS(rep), + + TP_STRUCT__entry( + __field(const void *, rep) + __field(const void *, r_xprt) + __field(u32, xid) + __field(u32, version) + __field(u32, proc) + ), + + TP_fast_assign( + __entry->rep = rep; + __entry->r_xprt = rep->rr_rxprt; + __entry->xid = be32_to_cpu(rep->rr_xid); + __entry->version = be32_to_cpu(rep->rr_vers); + __entry->proc = be32_to_cpu(rep->rr_proc); + ), + + TP_printk("rxprt %p xid=0x%08x rep=%p: version %u proc %u", + __entry->r_xprt, __entry->xid, __entry->rep, + __entry->version, __entry->proc + ) +); + +#define DEFINE_REPLY_EVENT(name) \ + DEFINE_EVENT(xprtrdma_reply_event, name, \ + TP_PROTO( \ + const struct rpcrdma_rep *rep \ + ), \ + TP_ARGS(rep)) + /** ** Call events **/ @@ -102,6 +142,29 @@ TRACE_EVENT(xprtrdma_post_send, ) ); +TRACE_EVENT(xprtrdma_post_recv, + TP_PROTO( + const struct rpcrdma_rep *rep, + int status + ), + + TP_ARGS(rep, status), + + TP_STRUCT__entry( + __field(const void *, rep) + __field(int, status) + ), + + 
TP_fast_assign( + __entry->rep = rep; + __entry->status = status; + ), + + TP_printk("rep=%p status=%d", + __entry->rep, __entry->status + ) +); + /** ** Completion events **/ @@ -135,6 +198,105 @@ TRACE_EVENT(xprtrdma_wc_send, ) ); +TRACE_EVENT(xprtrdma_wc_receive, + TP_PROTO( + const struct rpcrdma_rep *rep, + const struct ib_wc *wc + ), + + TP_ARGS(rep, wc), + + TP_STRUCT__entry( + __field(const void *, rep) + __field(unsigned int, byte_len) + __field(unsigned int, status) + __field(unsigned int, vendor_err) + ), + + TP_fast_assign( + __entry->rep = rep; + __entry->byte_len = wc->byte_len; + __entry->status = wc->status; + __entry->vendor_err = __entry->status ? wc->vendor_err : 0; + ), + + TP_printk("rep=%p, %u bytes: %s (%u/0x%x)", + __entry->rep, __entry->byte_len, + rdma_show_wc_status(__entry->status), + __entry->status, __entry->vendor_err + ) +); + +/** + ** Reply events + **/ + +TRACE_EVENT(xprtrdma_reply, + TP_PROTO( + const struct rpc_task *task, + const struct rpcrdma_rep *rep, + const struct rpcrdma_req *req, + unsigned int credits + ), + + TP_ARGS(task, rep, req, credits), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(const void *, rep) + __field(const void *, req) + __field(u32, xid) + __field(unsigned int, credits) + ), + + TP_fast_assign( + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client->cl_clid; + __entry->rep = rep; + __entry->req = req; + __entry->xid = be32_to_cpu(rep->rr_xid); + __entry->credits = credits; + ), + + TP_printk("task:%u@%u xid=0x%08x, %u credits, rep=%p -> req=%p", + __entry->task_id, __entry->client_id, __entry->xid, + __entry->credits, __entry->rep, __entry->req + ) +); + +TRACE_EVENT(xprtrdma_defer_cmp, + TP_PROTO( + const struct rpcrdma_rep *rep + ), + + TP_ARGS(rep), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(const void *, rep) + __field(u32, xid) + ), + + TP_fast_assign( + __entry->task_id = rep->rr_rqst->rq_task->tk_pid; + __entry->client_id = rep->rr_rqst->rq_task->tk_client->cl_clid; + __entry->rep = rep; + __entry->xid = be32_to_cpu(rep->rr_xid); + ), + + TP_printk("task:%u@%u xid=0x%08x rep=%p", + __entry->task_id, __entry->client_id, __entry->xid, + __entry->rep + ) +); + +DEFINE_REPLY_EVENT(xprtrdma_reply_vers); +DEFINE_REPLY_EVENT(xprtrdma_reply_rqst); +DEFINE_REPLY_EVENT(xprtrdma_reply_short); +DEFINE_REPLY_EVENT(xprtrdma_reply_hdr); + #endif /* _TRACE_RPCRDMA_H */ #include <trace/define_trace.h> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 162c0fd82673..6f774f1e4516 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1278,8 +1278,7 @@ out: * being marshaled.
*/ out_badheader: - dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", - rqst->rq_task->tk_pid, __func__, be32_to_cpu(rep->rr_proc)); + trace_xprtrdma_reply_hdr(rep); r_xprt->rx_stats.bad_reply_count++; status = -EIO; goto out; @@ -1323,6 +1322,7 @@ void rpcrdma_deferred_completion(struct work_struct *work) struct rpcrdma_req *req = rpcr_to_rdmar(rep->rr_rqst); struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; + trace_xprtrdma_defer_cmp(rep); if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) r_xprt->rx_ia.ri_ops->ro_reminv(rep, &req->rl_registered); rpcrdma_release_rqst(r_xprt, req); @@ -1344,8 +1344,6 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) u32 credits; __be32 *p; - dprintk("RPC: %s: incoming rep %p\n", __func__, rep); - if (rep->rr_hdrbuf.head[0].iov_len == 0) goto out_badstatus; @@ -1389,8 +1387,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) rep->rr_rqst = rqst; clear_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); - dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n", - __func__, rep, req, be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_reply(rqst->rq_task, rep, req, credits); queue_work_on(req->rl_cpu, rpcrdma_receive_wq, &rep->rr_work); return; @@ -1404,8 +1401,7 @@ out_badstatus: return; out_badversion: - dprintk("RPC: %s: invalid version %d\n", - __func__, be32_to_cpu(rep->rr_vers)); + trace_xprtrdma_reply_vers(rep); goto repost; /* The RPC transaction has already been terminated, or the header @@ -1413,12 +1409,11 @@ out_badversion: */ out_norqst: spin_unlock(&xprt->recv_lock); - dprintk("RPC: %s: no match for incoming xid 0x%08x\n", - __func__, be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_reply_rqst(rep); goto repost; out_shortreply: - dprintk("RPC: %s: short/invalid reply\n", __func__); + trace_xprtrdma_reply_short(rep); /* If no pending RPC transaction was matched, post a replacement * receive buffer before returning. 
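Note that each DEFINE_REPLY_EVENT(name) is only shorthand: per the macro declared against the xprtrdma_reply_event class earlier in this patch, DEFINE_REPLY_EVENT(xprtrdma_reply_vers) expands to

DEFINE_EVENT(xprtrdma_reply_event, xprtrdma_reply_vers,
	TP_PROTO(
		const struct rpcrdma_rep *rep
	),
	TP_ARGS(rep));

which is what generates the trace_xprtrdma_reply_vers(), trace_xprtrdma_reply_rqst(), trace_xprtrdma_reply_short() and trace_xprtrdma_reply_hdr() call sites used in the error paths of the hunks above.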
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 06d86b8c7f71..cfa3c0328878 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -156,13 +156,11 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) rr_cqe); /* WARNING: Only wr_id and status are reliable at this point */ + trace_xprtrdma_wc_receive(rep, wc); if (wc->status != IB_WC_SUCCESS) goto out_fail; /* status == SUCCESS means all fields in wc are trustworthy */ - dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n", - __func__, rep, wc->byte_len); - rpcrdma_set_xdrlen(&rep->rr_hdrbuf, wc->byte_len); rep->rr_wc_flags = wc->wc_flags; rep->rr_inv_rkey = wc->ex.invalidate_rkey; @@ -1576,17 +1574,14 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, if (!rpcrdma_dma_map_regbuf(ia, rep->rr_rdmabuf)) goto out_map; rc = ib_post_recv(ia->ri_id->qp, &rep->rr_recv_wr, &recv_wr_fail); + trace_xprtrdma_post_recv(rep, rc); if (rc) - goto out_postrecv; + return -ENOTCONN; return 0; out_map: pr_err("rpcrdma: failed to DMA map the Receive buffer\n"); return -EIO; - -out_postrecv: - pr_err("rpcrdma: ib_post_recv returned %i\n", rc); - return -ENOTCONN; } /** From 58f10ad40dd8456c0c6b1d90e8237c67db6e0801 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 20 Dec 2017 16:30:56 -0500 Subject: [PATCH 50/69] xprtrdma: Add trace points to instrument memory registration Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 156 +++++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/frwr_ops.c | 11 +-- net/sunrpc/xprtrdma/rpc_rdma.c | 18 +--- 3 files changed, 163 insertions(+), 22 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index fe8ba0b25df8..17eb3209cd6e 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -51,10 +51,164 @@ DECLARE_EVENT_CLASS(xprtrdma_reply_event, ), \ TP_ARGS(rep)) +DECLARE_EVENT_CLASS(xprtrdma_rdch_event, + TP_PROTO( + const struct rpc_task *task, + unsigned int pos, + struct rpcrdma_mr *mr, + int nsegs + ), + + TP_ARGS(task, pos, mr, nsegs), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(const void *, mr) + __field(unsigned int, pos) + __field(int, nents) + __field(u32, handle) + __field(u32, length) + __field(u64, offset) + __field(int, nsegs) + ), + + TP_fast_assign( + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client->cl_clid; + __entry->mr = mr; + __entry->pos = pos; + __entry->nents = mr->mr_nents; + __entry->handle = mr->mr_handle; + __entry->length = mr->mr_length; + __entry->offset = mr->mr_offset; + __entry->nsegs = nsegs; + ), + + TP_printk("task:%u@%u mr=%p pos=%u %u@0x%016llx:0x%08x (%s)", + __entry->task_id, __entry->client_id, __entry->mr, + __entry->pos, __entry->length, + (unsigned long long)__entry->offset, __entry->handle, + __entry->nents < __entry->nsegs ? 
"more" : "last" + ) +); + +#define DEFINE_RDCH_EVENT(name) \ + DEFINE_EVENT(xprtrdma_rdch_event, name, \ + TP_PROTO( \ + const struct rpc_task *task, \ + unsigned int pos, \ + struct rpcrdma_mr *mr, \ + int nsegs \ + ), \ + TP_ARGS(task, pos, mr, nsegs)) + +DECLARE_EVENT_CLASS(xprtrdma_wrch_event, + TP_PROTO( + const struct rpc_task *task, + struct rpcrdma_mr *mr, + int nsegs + ), + + TP_ARGS(task, mr, nsegs), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(const void *, mr) + __field(int, nents) + __field(u32, handle) + __field(u32, length) + __field(u64, offset) + __field(int, nsegs) + ), + + TP_fast_assign( + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client->cl_clid; + __entry->mr = mr; + __entry->nents = mr->mr_nents; + __entry->handle = mr->mr_handle; + __entry->length = mr->mr_length; + __entry->offset = mr->mr_offset; + __entry->nsegs = nsegs; + ), + + TP_printk("task:%u@%u mr=%p %u@0x%016llx:0x%08x (%s)", + __entry->task_id, __entry->client_id, __entry->mr, + __entry->length, (unsigned long long)__entry->offset, + __entry->handle, + __entry->nents < __entry->nsegs ? "more" : "last" + ) +); + +#define DEFINE_WRCH_EVENT(name) \ + DEFINE_EVENT(xprtrdma_wrch_event, name, \ + TP_PROTO( \ + const struct rpc_task *task, \ + struct rpcrdma_mr *mr, \ + int nsegs \ + ), \ + TP_ARGS(task, mr, nsegs)) + +TRACE_DEFINE_ENUM(FRWR_IS_INVALID); +TRACE_DEFINE_ENUM(FRWR_IS_VALID); +TRACE_DEFINE_ENUM(FRWR_FLUSHED_FR); +TRACE_DEFINE_ENUM(FRWR_FLUSHED_LI); + +#define xprtrdma_show_frwr_state(x) \ + __print_symbolic(x, \ + { FRWR_IS_INVALID, "INVALID" }, \ + { FRWR_IS_VALID, "VALID" }, \ + { FRWR_FLUSHED_FR, "FLUSHED_FR" }, \ + { FRWR_FLUSHED_LI, "FLUSHED_LI" }) + +DECLARE_EVENT_CLASS(xprtrdma_frwr_done, + TP_PROTO( + const struct ib_wc *wc, + const struct rpcrdma_frwr *frwr + ), + + TP_ARGS(wc, frwr), + + TP_STRUCT__entry( + __field(const void *, mr) + __field(unsigned int, state) + __field(unsigned int, status) + __field(unsigned int, vendor_err) + ), + + TP_fast_assign( + __entry->mr = container_of(frwr, struct rpcrdma_mr, frwr); + __entry->state = frwr->fr_state; + __entry->status = wc->status; + __entry->vendor_err = __entry->status ? 
wc->vendor_err : 0; + ), + + TP_printk( + "mr=%p state=%s: %s (%u/0x%x)", + __entry->mr, xprtrdma_show_frwr_state(__entry->state), + rdma_show_wc_status(__entry->status), + __entry->status, __entry->vendor_err + ) +); + +#define DEFINE_FRWR_DONE_EVENT(name) \ + DEFINE_EVENT(xprtrdma_frwr_done, name, \ + TP_PROTO( \ + const struct ib_wc *wc, \ + const struct rpcrdma_frwr *frwr \ + ), \ + TP_ARGS(wc, frwr)) + /** ** Call events **/ +DEFINE_RDCH_EVENT(xprtrdma_read_chunk); +DEFINE_WRCH_EVENT(xprtrdma_write_chunk); +DEFINE_WRCH_EVENT(xprtrdma_reply_chunk); + TRACE_DEFINE_ENUM(rpcrdma_noch); TRACE_DEFINE_ENUM(rpcrdma_readch); TRACE_DEFINE_ENUM(rpcrdma_areadch); @@ -227,6 +381,8 @@ TRACE_EVENT(xprtrdma_wc_receive, ) ); +DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_fastreg); + /** ** Reply events **/ diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 35e3a54344cc..afbeb9b442bf 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -286,16 +286,16 @@ __frwr_sendcompletion_flush(struct ib_wc *wc, const char *wr) static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) { - struct rpcrdma_frwr *frwr; - struct ib_cqe *cqe; + struct ib_cqe *cqe = wc->wr_cqe; + struct rpcrdma_frwr *frwr = + container_of(cqe, struct rpcrdma_frwr, fr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ if (wc->status != IB_WC_SUCCESS) { - cqe = wc->wr_cqe; - frwr = container_of(cqe, struct rpcrdma_frwr, fr_cqe); frwr->fr_state = FRWR_FLUSHED_FR; __frwr_sendcompletion_flush(wc, "fastreg"); } + trace_xprtrdma_wc_fastreg(wc, frwr); } /** @@ -401,9 +401,6 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (unlikely(n != mr->mr_nents)) goto out_mapmr_err; - dprintk("RPC: %s: Using frwr %p to map %u segments (%llu bytes)\n", - __func__, frwr, mr->mr_nents, ibmr->length); - key = (u8)(ibmr->rkey & 0x000000FF); ib_update_fast_reg_key(ibmr, ++key); diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 6f774f1e4516..634496ca2e28 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -371,11 +371,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, if (encode_read_segment(xdr, mr, pos) < 0) return -EMSGSIZE; - dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n", - rqst->rq_task->tk_pid, __func__, pos, - mr->mr_length, (unsigned long long)mr->mr_offset, - mr->mr_handle, mr->mr_nents < nsegs ? "more" : "last"); - + trace_xprtrdma_read_chunk(rqst->rq_task, pos, mr, nsegs); r_xprt->rx_stats.read_chunk_count++; nsegs -= mr->mr_nents; } while (nsegs); @@ -433,11 +429,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; - dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n", - rqst->rq_task->tk_pid, __func__, - mr->mr_length, (unsigned long long)mr->mr_offset, - mr->mr_handle, mr->mr_nents < nsegs ? "more" : "last"); - + trace_xprtrdma_write_chunk(rqst->rq_task, mr, nsegs); r_xprt->rx_stats.write_chunk_count++; r_xprt->rx_stats.total_rdma_request += seg->mr_len; nchunks++; @@ -495,11 +487,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; - dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n", - rqst->rq_task->tk_pid, __func__, - mr->mr_length, (unsigned long long)mr->mr_offset, - mr->mr_handle, mr->mr_nents < nsegs ? "more" : "last"); - + trace_xprtrdma_reply_chunk(rqst->rq_task, mr, nsegs); r_xprt->rx_stats.reply_chunk_count++; r_xprt->rx_stats.total_rdma_request += seg->mr_len; nchunks++; From e11b7c9655d13f26c227ca70c399aac2b596033d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 20 Dec 2017 16:31:04 -0500 Subject: [PATCH 51/69] xprtrdma: Add trace points in reply decoder path This includes decoding Write and Reply chunks, and fixing up inline payloads. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 93 ++++++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/rpc_rdma.c | 29 ++++------- 2 files changed, 102 insertions(+), 20 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 17eb3209cd6e..7336cdc173ed 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -453,6 +453,99 @@ DEFINE_REPLY_EVENT(xprtrdma_reply_rqst); DEFINE_REPLY_EVENT(xprtrdma_reply_short); DEFINE_REPLY_EVENT(xprtrdma_reply_hdr); +TRACE_EVENT(xprtrdma_fixup, + TP_PROTO( + const struct rpc_rqst *rqst, + int len, + int hdrlen + ), + + TP_ARGS(rqst, len, hdrlen), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(const void *, base) + __field(int, len) + __field(int, hdrlen) + ), + + TP_fast_assign( + __entry->task_id = rqst->rq_task->tk_pid; + __entry->client_id = rqst->rq_task->tk_client->cl_clid; + __entry->base = rqst->rq_rcv_buf.head[0].iov_base; + __entry->len = len; + __entry->hdrlen = hdrlen; + ), + + TP_printk("task:%u@%u base=%p len=%d hdrlen=%d", + __entry->task_id, __entry->client_id, + __entry->base, __entry->len, __entry->hdrlen + ) +); + +TRACE_EVENT(xprtrdma_fixup_pg, + TP_PROTO( + const struct rpc_rqst *rqst, + int pageno, + const void *pos, + int len, + int curlen + ), + + TP_ARGS(rqst, pageno, pos, len, curlen), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(const void *, pos) + __field(int, pageno) + __field(int, len) + __field(int, curlen) + ), + + TP_fast_assign( + __entry->task_id = rqst->rq_task->tk_pid; + __entry->client_id = rqst->rq_task->tk_client->cl_clid; + __entry->pos = pos; + __entry->pageno = pageno; + __entry->len = len; + __entry->curlen = curlen; + ), + + TP_printk("task:%u@%u pageno=%d pos=%p len=%d curlen=%d", + __entry->task_id, __entry->client_id, + __entry->pageno, __entry->pos, __entry->len, __entry->curlen + ) +); + +TRACE_EVENT(xprtrdma_decode_seg, + TP_PROTO( + u32 handle, + u32 length, + u64 offset + ), + + TP_ARGS(handle, length, offset), + + TP_STRUCT__entry( + __field(u32, handle) + __field(u32, length) + __field(u64, offset) + ), + + TP_fast_assign( + __entry->handle = handle; + __entry->length = length; + __entry->offset = offset; + ), + + TP_printk("%u@0x%016llx:0x%08x", + __entry->length, (unsigned long long)__entry->offset, + __entry->handle + ) +); + #endif /* _TRACE_RPCRDMA_H */ #include <trace/define_trace.h> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 634496ca2e28..1ae9b41b75a1 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -914,8 +914,7 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) curlen = rqst->rq_rcv_buf.head[0].iov_len; if (curlen > copy_len) curlen = copy_len; - dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n", - __func__, srcp, copy_len, curlen); + trace_xprtrdma_fixup(rqst, copy_len, curlen); srcp += curlen; copy_len -= curlen; @@ -935,9 +934,8 @@
rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) if (curlen > pagelist_len) curlen = pagelist_len; - dprintk("RPC: %s: page %d" - " srcp 0x%p len %d curlen %d\n", - __func__, i, srcp, copy_len, curlen); + trace_xprtrdma_fixup_pg(rqst, i, srcp, + copy_len, curlen); destp = kmap_atomic(ppages[i]); memcpy(destp + page_base, srcp, curlen); flush_dcache_page(ppages[i]); @@ -1028,26 +1026,19 @@ out_short: static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length) { + u32 handle; + u64 offset; __be32 *p; p = xdr_inline_decode(xdr, 4 * sizeof(*p)); if (unlikely(!p)) return -EIO; - ifdebug(FACILITY) { - u64 offset; - u32 handle; - - handle = be32_to_cpup(p++); - *length = be32_to_cpup(p++); - xdr_decode_hyper(p, &offset); - dprintk("RPC: %s: segment %u@0x%016llx:0x%08x\n", - __func__, *length, (unsigned long long)offset, - handle); - } else { - *length = be32_to_cpup(p + 1); - } + handle = be32_to_cpup(p++); + *length = be32_to_cpup(p++); + xdr_decode_hyper(p, &offset); + trace_xprtrdma_decode_seg(handle, *length, offset); return 0; } @@ -1068,8 +1059,6 @@ static int decode_write_chunk(struct xdr_stream *xdr, u32 *length) *length += seglength; } - dprintk("RPC: %s: segcount=%u, %u bytes\n", - __func__, be32_to_cpup(p), *length); return 0; } From 2937fede11b1081dbbbe6228637423639165eec7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 20 Dec 2017 16:31:12 -0500 Subject: [PATCH 52/69] xprtrdma: Add trace points to instrument memory invalidation Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 41 ++++++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/fmr_ops.c | 4 ++-- net/sunrpc/xprtrdma/frwr_ops.c | 27 +++++++++++----------- net/sunrpc/xprtrdma/verbs.c | 1 + 4 files changed, 57 insertions(+), 16 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 7336cdc173ed..fadb4b687595 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -201,6 +201,41 @@ DECLARE_EVENT_CLASS(xprtrdma_frwr_done, ), \ TP_ARGS(wc, frwr)) +DECLARE_EVENT_CLASS(xprtrdma_mr, + TP_PROTO( + const struct rpcrdma_mr *mr + ), + + TP_ARGS(mr), + + TP_STRUCT__entry( + __field(const void *, mr) + __field(u32, handle) + __field(u32, length) + __field(u64, offset) + ), + + TP_fast_assign( + __entry->mr = mr; + __entry->handle = mr->mr_handle; + __entry->length = mr->mr_length; + __entry->offset = mr->mr_offset; + ), + + TP_printk("mr=%p %u@0x%016llx:0x%08x", + __entry->mr, __entry->length, + (unsigned long long)__entry->offset, + __entry->handle + ) +); + +#define DEFINE_MR_EVENT(name) \ + DEFINE_EVENT(xprtrdma_mr, name, \ + TP_PROTO( \ + const struct rpcrdma_mr *mr \ + ), \ + TP_ARGS(mr)) + /** ** Call events **/ @@ -382,6 +417,12 @@ TRACE_EVENT(xprtrdma_wc_receive, ); DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_fastreg); +DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li); +DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li_wake); + +DEFINE_MR_EVENT(xprtrdma_localinv); +DEFINE_MR_EVENT(xprtrdma_dma_unmap); +DEFINE_MR_EVENT(xprtrdma_remoteinv); /** ** Reply events diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 7f2f2b774076..d5f95bb39300 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -148,6 +148,7 @@ out_release: pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mr); r_xprt->rx_stats.mrs_orphaned++; + trace_xprtrdma_dma_unmap(mr); ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, mr->mr_sg, mr->mr_nents, mr->mr_dir); @@ -273,6 +274,7 @@ 
fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) list_for_each_entry(mr, mrs, mr_list) { dprintk("RPC: %s: unmapping fmr %p\n", __func__, &mr->fmr); + trace_xprtrdma_localinv(mr); list_add_tail(&mr->fmr.fm_mr->list, &unmap_list); } r_xprt->rx_stats.local_inv_needed++; @@ -285,8 +287,6 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) */ while (!list_empty(mrs)) { mr = rpcrdma_mr_pop(mrs); - dprintk("RPC: %s: DMA unmapping fmr %p\n", - __func__, &mr->fmr); list_del(&mr->fmr.fm_mr->list); rpcrdma_mr_unmap_and_put(mr); } diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index afbeb9b442bf..90f688f19783 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -182,9 +182,11 @@ frwr_op_recover_mr(struct rpcrdma_mr *mr) int rc; rc = __frwr_mr_reset(ia, mr); - if (state != FRWR_FLUSHED_LI) + if (state != FRWR_FLUSHED_LI) { + trace_xprtrdma_dma_unmap(mr); ib_dma_unmap_sg(ia->ri_device, mr->mr_sg, mr->mr_nents, mr->mr_dir); + } if (rc) goto out_release; @@ -307,16 +309,16 @@ frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) { - struct rpcrdma_frwr *frwr; - struct ib_cqe *cqe; + struct ib_cqe *cqe = wc->wr_cqe; + struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr, + fr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ if (wc->status != IB_WC_SUCCESS) { - cqe = wc->wr_cqe; - frwr = container_of(cqe, struct rpcrdma_frwr, fr_cqe); frwr->fr_state = FRWR_FLUSHED_LI; __frwr_sendcompletion_flush(wc, "localinv"); } + trace_xprtrdma_wc_li(wc, frwr); } /** @@ -329,17 +331,17 @@ frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) { - struct rpcrdma_frwr *frwr; - struct ib_cqe *cqe; + struct ib_cqe *cqe = wc->wr_cqe; + struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr, + fr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ - cqe = wc->wr_cqe; - frwr = container_of(cqe, struct rpcrdma_frwr, fr_cqe); if (wc->status != IB_WC_SUCCESS) { frwr->fr_state = FRWR_FLUSHED_LI; __frwr_sendcompletion_flush(wc, "localinv"); } complete(&frwr->fr_linv_done); + trace_xprtrdma_wc_li_wake(wc, frwr); } /* Post a REG_MR Work Request to register a memory region @@ -457,6 +459,7 @@ frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) list_for_each_entry(mr, mrs, mr_list) if (mr->mr_handle == rep->rr_inv_rkey) { list_del(&mr->mr_list); + trace_xprtrdma_remoteinv(mr); mr->frwr.fr_state = FRWR_IS_INVALID; rpcrdma_mr_unmap_and_put(mr); break; /* only one invalidated MR per RPC */ @@ -492,9 +495,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) mr->frwr.fr_state = FRWR_IS_INVALID; frwr = &mr->frwr; - - dprintk("RPC: %s: invalidating frwr %p\n", - __func__, frwr); + trace_xprtrdma_localinv(mr); frwr->fr_cqe.done = frwr_wc_localinv; last = &frwr->fr_invwr; @@ -536,8 +537,6 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) unmap: while (!list_empty(mrs)) { mr = rpcrdma_mr_pop(mrs); - dprintk("RPC: %s: DMA unmapping frwr %p\n", - __func__, &mr->frwr); rpcrdma_mr_unmap_and_put(mr); } return; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index cfa3c0328878..0e89e5529d6c 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1349,6 +1349,7 @@ rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr) { struct rpcrdma_xprt *r_xprt = mr->mr_xprt; + 
trace_xprtrdma_dma_unmap(mr); ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, mr->mr_sg, mr->mr_nents, mr->mr_dir); __rpcrdma_mr_put(&r_xprt->rx_buf, mr); From 1c443effa319a02a5b8808bef7840500c29f24b2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 20 Dec 2017 16:31:21 -0500 Subject: [PATCH 53/69] xprtrdma: Add trace points to instrument MR allocation and recovery Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 57 ++++++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/verbs.c | 6 ++-- 2 files changed, 60 insertions(+), 3 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index fadb4b687595..d296d5b084c2 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -51,6 +51,37 @@ DECLARE_EVENT_CLASS(xprtrdma_reply_event, ), \ TP_ARGS(rep)) +DECLARE_EVENT_CLASS(xprtrdma_rxprt, + TP_PROTO( + const struct rpcrdma_xprt *r_xprt + ), + + TP_ARGS(r_xprt), + + TP_STRUCT__entry( + __field(const void *, r_xprt) + __string(addr, rpcrdma_addrstr(r_xprt)) + __string(port, rpcrdma_portstr(r_xprt)) + ), + + TP_fast_assign( + __entry->r_xprt = r_xprt; + __assign_str(addr, rpcrdma_addrstr(r_xprt)); + __assign_str(port, rpcrdma_portstr(r_xprt)); + ), + + TP_printk("peer=[%s]:%s r_xprt=%p", + __get_str(addr), __get_str(port), __entry->r_xprt + ) +); + +#define DEFINE_RXPRT_EVENT(name) \ + DEFINE_EVENT(xprtrdma_rxprt, name, \ + TP_PROTO( \ + const struct rpcrdma_xprt *r_xprt \ + ), \ + TP_ARGS(r_xprt)) + DECLARE_EVENT_CLASS(xprtrdma_rdch_event, TP_PROTO( const struct rpc_task *task, @@ -240,6 +271,31 @@ DECLARE_EVENT_CLASS(xprtrdma_mr, ** Call events **/ +TRACE_EVENT(xprtrdma_createmrs, + TP_PROTO( + const struct rpcrdma_xprt *r_xprt, + unsigned int count + ), + + TP_ARGS(r_xprt, count), + + TP_STRUCT__entry( + __field(const void *, r_xprt) + __field(unsigned int, count) + ), + + TP_fast_assign( + __entry->r_xprt = r_xprt; + __entry->count = count; + ), + + TP_printk("r_xprt=%p: created %u MRs", + __entry->r_xprt, __entry->count + ) +); + +DEFINE_RXPRT_EVENT(xprtrdma_nomrs); + DEFINE_RDCH_EVENT(xprtrdma_read_chunk); DEFINE_WRCH_EVENT(xprtrdma_write_chunk); DEFINE_WRCH_EVENT(xprtrdma_reply_chunk); @@ -423,6 +479,7 @@ DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li_wake); DEFINE_MR_EVENT(xprtrdma_localinv); DEFINE_MR_EVENT(xprtrdma_dma_unmap); DEFINE_MR_EVENT(xprtrdma_remoteinv); +DEFINE_MR_EVENT(xprtrdma_recover_mr); /** ** Reply events diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 0e89e5529d6c..1addcc9477ba 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -998,7 +998,7 @@ rpcrdma_mr_recovery_worker(struct work_struct *work) mr = rpcrdma_mr_pop(&buf->rb_stale_mrs); spin_unlock(&buf->rb_recovery_lock); - dprintk("RPC: %s: recovering MR %p\n", __func__, mr); + trace_xprtrdma_recover_mr(mr); mr->mr_xprt->rx_ia.ri_ops->ro_recover_mr(mr); spin_lock(&buf->rb_recovery_lock); @@ -1054,7 +1054,7 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) r_xprt->rx_stats.mrs_allocated += count; spin_unlock(&buf->rb_mrlock); - dprintk("RPC: %s: created %u MRs\n", __func__, count); + trace_xprtrdma_createmrs(r_xprt, count); } static void @@ -1310,7 +1310,7 @@ rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) return mr; out_nomrs: - dprintk("RPC: %s: no MRs available\n", __func__); + trace_xprtrdma_nomrs(r_xprt); if (r_xprt->rx_ep.rep_connected != -ENODEV) schedule_delayed_work(&buf->rb_refresh_worker, 0); From b4744e00a39e6213d84a83a86e6d304886316f5f Mon Sep 17 00:00:00 2001 From: 
Chuck Lever Date: Wed, 20 Dec 2017 16:31:29 -0500 Subject: [PATCH 54/69] xprtrdma: Add trace points for connect events Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 75 +++++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/transport.c | 12 ++---- net/sunrpc/xprtrdma/verbs.c | 33 ++++++--------- 3 files changed, 92 insertions(+), 28 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index d296d5b084c2..023443d5a0bf 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -267,6 +267,81 @@ DECLARE_EVENT_CLASS(xprtrdma_mr, ), \ TP_ARGS(mr)) +/** + ** Connection events + **/ + +TRACE_EVENT(xprtrdma_conn_upcall, + TP_PROTO( + const struct rpcrdma_xprt *r_xprt, + struct rdma_cm_event *event + ), + + TP_ARGS(r_xprt, event), + + TP_STRUCT__entry( + __field(const void *, r_xprt) + __field(unsigned int, event) + __field(int, status) + __string(addr, rpcrdma_addrstr(r_xprt)) + __string(port, rpcrdma_portstr(r_xprt)) + ), + + TP_fast_assign( + __entry->r_xprt = r_xprt; + __entry->event = event->event; + __entry->status = event->status; + __assign_str(addr, rpcrdma_addrstr(r_xprt)); + __assign_str(port, rpcrdma_portstr(r_xprt)); + ), + + TP_printk("peer=[%s]:%s r_xprt=%p: %s (%u/%d)", + __get_str(addr), __get_str(port), + __entry->r_xprt, rdma_show_cm_event(__entry->event), + __entry->event, __entry->status + ) +); + +TRACE_EVENT(xprtrdma_disconnect, + TP_PROTO( + const struct rpcrdma_xprt *r_xprt, + int status + ), + + TP_ARGS(r_xprt, status), + + TP_STRUCT__entry( + __field(const void *, r_xprt) + __field(int, status) + __field(int, connected) + __string(addr, rpcrdma_addrstr(r_xprt)) + __string(port, rpcrdma_portstr(r_xprt)) + ), + + TP_fast_assign( + __entry->r_xprt = r_xprt; + __entry->status = status; + __entry->connected = r_xprt->rx_ep.rep_connected; + __assign_str(addr, rpcrdma_addrstr(r_xprt)); + __assign_str(port, rpcrdma_portstr(r_xprt)); + ), + + TP_printk("peer=[%s]:%s r_xprt=%p: status=%d %sconnected", + __get_str(addr), __get_str(port), + __entry->r_xprt, __entry->status, + __entry->connected == 1 ? "still " : "dis" + ) +); + +DEFINE_RXPRT_EVENT(xprtrdma_conn_start); +DEFINE_RXPRT_EVENT(xprtrdma_conn_tout); +DEFINE_RXPRT_EVENT(xprtrdma_create); +DEFINE_RXPRT_EVENT(xprtrdma_destroy); +DEFINE_RXPRT_EVENT(xprtrdma_remove); +DEFINE_RXPRT_EVENT(xprtrdma_reinsert); +DEFINE_RXPRT_EVENT(xprtrdma_reconnect); +DEFINE_RXPRT_EVENT(xprtrdma_inject_dsc); + /** ** Call events **/ diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index ddf0d87812ef..25d1160dc085 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -259,13 +259,10 @@ xprt_rdma_connect_worker(struct work_struct *work) xprt_clear_connected(xprt); - dprintk("RPC: %s: %sconnect\n", __func__, - r_xprt->rx_ep.rep_connected != 0 ? 
"re" : ""); rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); if (rc) xprt_wake_pending_tasks(xprt, rc); - dprintk("RPC: %s: exit\n", __func__); xprt_clear_connecting(xprt); } @@ -275,7 +272,7 @@ xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt, rx_xprt); - pr_info("rpcrdma: injecting transport disconnect on xprt=%p\n", xprt); + trace_xprtrdma_inject_dsc(r_xprt); rdma_disconnect(r_xprt->rx_ia.ri_id); } @@ -295,7 +292,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - dprintk("RPC: %s: called\n", __func__); + trace_xprtrdma_destroy(r_xprt); cancel_delayed_work_sync(&r_xprt->rx_connect_worker); @@ -306,11 +303,8 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) rpcrdma_ia_close(&r_xprt->rx_ia); xprt_rdma_free_addresses(xprt); - xprt_free(xprt); - dprintk("RPC: %s: returning\n", __func__); - module_put(THIS_MODULE); } @@ -430,6 +424,7 @@ xprt_setup_rdma(struct xprt_create *args) dprintk("RPC: %s: %s:%s\n", __func__, xprt->address_strings[RPC_DISPLAY_ADDR], xprt->address_strings[RPC_DISPLAY_PORT]); + trace_xprtrdma_create(new_xprt); return xprt; out4: @@ -440,6 +435,7 @@ out3: out2: rpcrdma_ia_close(&new_xprt->rx_ia); out1: + trace_xprtrdma_destroy(new_xprt); xprt_rdma_free_addresses(xprt); xprt_free(xprt); return ERR_PTR(rc); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 1addcc9477ba..7f9e9025c42f 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -220,6 +220,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) struct rpcrdma_ep *ep = &xprt->rx_ep; int connstate = 0; + trace_xprtrdma_conn_upcall(xprt, event); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: case RDMA_CM_EVENT_ROUTE_RESOLVED: @@ -228,14 +229,10 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) break; case RDMA_CM_EVENT_ADDR_ERROR: ia->ri_async_rc = -EHOSTUNREACH; - dprintk("RPC: %s: CM address resolution error, ep 0x%p\n", - __func__, ep); complete(&ia->ri_done); break; case RDMA_CM_EVENT_ROUTE_ERROR: ia->ri_async_rc = -ENETUNREACH; - dprintk("RPC: %s: CM route resolution error, ep 0x%p\n", - __func__, ep); complete(&ia->ri_done); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: @@ -299,6 +296,8 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) struct rdma_cm_id *id; int rc; + trace_xprtrdma_conn_start(xprt); + init_completion(&ia->ri_done); init_completion(&ia->ri_remove_done); @@ -322,8 +321,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) } rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); if (rc < 0) { - dprintk("RPC: %s: wait() exited: %i\n", - __func__, rc); + trace_xprtrdma_conn_tout(xprt); goto out; } @@ -340,8 +338,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) } rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); if (rc < 0) { - dprintk("RPC: %s: wait() exited: %i\n", - __func__, rc); + trace_xprtrdma_conn_tout(xprt); goto out; } rc = ia->ri_async_rc; @@ -461,6 +458,8 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) /* Allow waiters to continue */ complete(&ia->ri_remove_done); + + trace_xprtrdma_remove(r_xprt); } /** @@ -471,7 +470,6 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) void rpcrdma_ia_close(struct rpcrdma_ia *ia) { - dprintk("RPC: %s: entering\n", __func__); if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { if (ia->ri_id->qp) rdma_destroy_qp(ia->ri_id); @@ -625,9 +623,6 @@ out1: 
void rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { - dprintk("RPC: %s: entering, connected is %d\n", - __func__, ep->rep_connected); - cancel_delayed_work_sync(&ep->rep_connect_worker); if (ia->ri_id->qp) { @@ -650,7 +645,7 @@ rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, { int rc, err; - pr_info("%s: r_xprt = %p\n", __func__, r_xprt); + trace_xprtrdma_reinsert(r_xprt); rc = -EHOSTUNREACH; if (rpcrdma_ia_open(r_xprt)) @@ -688,7 +683,7 @@ rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, struct rdma_cm_id *id, *old; int err, rc; - dprintk("RPC: %s: reconnecting...\n", __func__); + trace_xprtrdma_reconnect(r_xprt); rpcrdma_ep_disconnect(ep, ia); @@ -810,16 +805,14 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) int rc; rc = rdma_disconnect(ia->ri_id); - if (!rc) { + if (!rc) /* returns without wait if not connected */ wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 1); - dprintk("RPC: %s: after wait, %sconnected\n", __func__, - (ep->rep_connected == 1) ? "still " : "dis"); - } else { - dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc); + else ep->rep_connected = rc; - } + trace_xprtrdma_disconnect(container_of(ep, struct rpcrdma_xprt, + rx_ep), rc); ib_drain_qp(ia->ri_id->qp); } From fc1eb8076fb0eb0641566b24007a40a7d4ae0116 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 20 Dec 2017 16:31:37 -0500 Subject: [PATCH 55/69] xprtrdma: Add trace points in the client-side backchannel code paths Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 68 +++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/backchannel.c | 8 ++-- net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 3 files changed, 73 insertions(+), 5 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 023443d5a0bf..95c2c0bffb65 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -267,6 +267,39 @@ DECLARE_EVENT_CLASS(xprtrdma_mr, ), \ TP_ARGS(mr)) +DECLARE_EVENT_CLASS(xprtrdma_cb_event, + TP_PROTO( + const struct rpc_rqst *rqst + ), + + TP_ARGS(rqst), + + TP_STRUCT__entry( + __field(const void *, rqst) + __field(const void *, rep) + __field(const void *, req) + __field(u32, xid) + ), + + TP_fast_assign( + __entry->rqst = rqst; + __entry->req = rpcr_to_rdmar(rqst); + __entry->rep = rpcr_to_rdmar(rqst)->rl_reply; + __entry->xid = be32_to_cpu(rqst->rq_xid); + ), + + TP_printk("xid=0x%08x, rqst=%p req=%p rep=%p", + __entry->xid, __entry->rqst, __entry->req, __entry->rep + ) +); + +#define DEFINE_CB_EVENT(name) \ + DEFINE_EVENT(xprtrdma_cb_event, name, \ + TP_PROTO( \ + const struct rpc_rqst *rqst \ + ), \ + TP_ARGS(rqst)) + /** ** Connection events **/ @@ -719,6 +752,41 @@ TRACE_EVENT(xprtrdma_decode_seg, ) ); +/** + ** Callback events + **/ + +TRACE_EVENT(xprtrdma_cb_setup, + TP_PROTO( + const struct rpcrdma_xprt *r_xprt, + unsigned int reqs + ), + + TP_ARGS(r_xprt, reqs), + + TP_STRUCT__entry( + __field(const void *, r_xprt) + __field(unsigned int, reqs) + __string(addr, rpcrdma_addrstr(r_xprt)) + __string(port, rpcrdma_portstr(r_xprt)) + ), + + TP_fast_assign( + __entry->r_xprt = r_xprt; + __entry->reqs = reqs; + __assign_str(addr, rpcrdma_addrstr(r_xprt)); + __assign_str(port, rpcrdma_portstr(r_xprt)); + ), + + TP_printk("peer=[%s]:%s r_xprt=%p: %u reqs", + __get_str(addr), __get_str(port), + __entry->r_xprt, __entry->reqs + ) +); + +DEFINE_CB_EVENT(xprtrdma_cb_call); +DEFINE_CB_EVENT(xprtrdma_cb_reply); + #endif /* _TRACE_RPCRDMA_H 
*/ #include <trace/define_trace.h> diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 3c7998a72191..ca0c5d3d493e 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -140,7 +140,7 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) buffer->rb_bc_srv_max_requests = reqs; request_module("svcrdma"); - + trace_xprtrdma_cb_setup(r_xprt, reqs); return 0; out_free: @@ -212,6 +212,8 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN, &rqst->rq_snd_buf, rpcrdma_noch)) return -EIO; + + trace_xprtrdma_cb_reply(rqst); return 0; } @@ -331,7 +333,6 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, struct rpc_rqst, rq_bc_pa_list); list_del(&rqst->rq_bc_pa_list); spin_unlock(&xprt->bc_pa_lock); - dprintk("RPC: %s: using rqst %p\n", __func__, rqst); /* Prepare rqst */ rqst->rq_reply_bytes_recvd = 0; @@ -352,9 +353,8 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, * the Upper Layer is done decoding it. */ req = rpcr_to_rdmar(rqst); - dprintk("RPC: %s: attaching rep %p to req %p\n", - __func__, rep, req); req->rl_reply = rep; + trace_xprtrdma_cb_call(rqst); /* Queue rqst for ULP's callback service */ bc_serv = xprt->bc_serv; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index a1ca1b638123..69883a960a3f 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -365,7 +365,7 @@ rpcrdma_set_xprtdata(struct rpc_rqst *rqst, struct rpcrdma_req *req) } static inline struct rpcrdma_req * -rpcr_to_rdmar(struct rpc_rqst *rqst) +rpcr_to_rdmar(const struct rpc_rqst *rqst) { return rqst->rq_xprtdata; } From 643cf3237db83e1443fa61de896449858393cb72 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 20 Dec 2017 16:31:45 -0500 Subject: [PATCH 56/69] xprtrdma: Add trace points to instrument QP and CQ access upcalls Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 31 +++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/verbs.c | 3 +++ 2 files changed, 34 insertions(+) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 95c2c0bffb65..1e5ae57a4f0d 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -375,6 +375,37 @@ DEFINE_RXPRT_EVENT(xprtrdma_reinsert); DEFINE_RXPRT_EVENT(xprtrdma_reconnect); DEFINE_RXPRT_EVENT(xprtrdma_inject_dsc); +TRACE_EVENT(xprtrdma_qp_error, + TP_PROTO( + const struct rpcrdma_xprt *r_xprt, + const struct ib_event *event + ), + + TP_ARGS(r_xprt, event), + + TP_STRUCT__entry( + __field(const void *, r_xprt) + __field(unsigned int, event) + __string(name, event->device->name) + __string(addr, rpcrdma_addrstr(r_xprt)) + __string(port, rpcrdma_portstr(r_xprt)) + ), + + TP_fast_assign( + __entry->r_xprt = r_xprt; + __entry->event = event->event; + __assign_str(name, event->device->name); + __assign_str(addr, rpcrdma_addrstr(r_xprt)); + __assign_str(port, rpcrdma_portstr(r_xprt)); + ), + + TP_printk("peer=[%s]:%s r_xprt=%p: dev %s: %s (%u)", + __get_str(addr), __get_str(port), __entry->r_xprt, + __get_str(name), rdma_show_ib_event(__entry->event), + __entry->event + ) +); + /** ** Call events **/ diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 7f9e9025c42f..fb81d3a4b0b3 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -108,7 +108,10 @@ static void rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) { struct 
rpcrdma_ep *ep = context; + struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt, + rx_ep); + trace_xprtrdma_qp_error(r_xprt, event); pr_err("rpcrdma: %s on device %s ep %p\n", ib_event_msg(event->event), event->device->name, context); From ae7246762530af00109c3fb8a30031da054c0aa0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 20 Dec 2017 16:31:53 -0500 Subject: [PATCH 57/69] xprtrdma: Instrument allocation/release of rpcrdma_req/rep objects Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 67 +++++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/transport.c | 12 +++--- net/sunrpc/xprtrdma/verbs.c | 4 +- 3 files changed, 74 insertions(+), 9 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 1e5ae57a4f0d..50ed3f8bf534 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -783,6 +783,73 @@ TRACE_EVENT(xprtrdma_decode_seg, ) ); +/** + ** Allocation/release of rpcrdma_reqs and rpcrdma_reps + **/ + +TRACE_EVENT(xprtrdma_allocate, + TP_PROTO( + const struct rpc_task *task, + const struct rpcrdma_req *req + ), + + TP_ARGS(task, req), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(const void *, req) + __field(const void *, rep) + __field(size_t, callsize) + __field(size_t, rcvsize) + ), + + TP_fast_assign( + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client->cl_clid; + __entry->req = req; + __entry->rep = req ? req->rl_reply : NULL; + __entry->callsize = task->tk_rqstp->rq_callsize; + __entry->rcvsize = task->tk_rqstp->rq_rcvsize; + ), + + TP_printk("task:%u@%u req=%p rep=%p (%zu, %zu)", + __entry->task_id, __entry->client_id, + __entry->req, __entry->rep, + __entry->callsize, __entry->rcvsize + ) +); + +TRACE_EVENT(xprtrdma_rpc_done, + TP_PROTO( + const struct rpc_task *task, + const struct rpcrdma_req *req + ), + + TP_ARGS(task, req), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(const void *, req) + __field(const void *, rep) + ), + + TP_fast_assign( + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client->cl_clid; + __entry->req = req; + __entry->rep = req->rl_reply; + ), + + TP_printk("task:%u@%u req=%p rep=%p", + __entry->task_id, __entry->client_id, + __entry->req, __entry->rep + ) +); + +DEFINE_RXPRT_EVENT(xprtrdma_noreps); + /** ** Callback events **/ diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 25d1160dc085..b90179af88bf 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -640,7 +640,7 @@ xprt_rdma_allocate(struct rpc_task *task) req = rpcrdma_buffer_get(&r_xprt->rx_buf); if (req == NULL) - return -ENOMEM; + goto out_get; flags = RPCRDMA_DEF_GFP; if (RPC_IS_SWAPPER(task)) @@ -653,19 +653,18 @@ xprt_rdma_allocate(struct rpc_task *task) if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags)) goto out_fail; - dprintk("RPC: %5u %s: send size = %zd, recv size = %zd, req = %p\n", - task->tk_pid, __func__, rqst->rq_callsize, - rqst->rq_rcvsize, req); - req->rl_cpu = smp_processor_id(); req->rl_connect_cookie = 0; /* our reserved value */ rpcrdma_set_xprtdata(rqst, req); rqst->rq_buffer = req->rl_sendbuf->rg_base; rqst->rq_rbuffer = req->rl_recvbuf->rg_base; + trace_xprtrdma_allocate(task, req); return 0; out_fail: rpcrdma_buffer_put(req); +out_get: + trace_xprtrdma_allocate(task, NULL); return -ENOMEM; } @@ -682,10 +681,9 @@ 
xprt_rdma_free(struct rpc_task *task) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); - if (test_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags)) rpcrdma_release_rqst(r_xprt, req); + trace_xprtrdma_rpc_done(task, req); rpcrdma_buffer_put(req); } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index fb81d3a4b0b3..57e1139d85bc 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1385,11 +1385,11 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) req = rpcrdma_buffer_get_req_locked(buffers); req->rl_reply = rpcrdma_buffer_get_rep(buffers); spin_unlock(&buffers->rb_lock); + return req; out_reqbuf: spin_unlock(&buffers->rb_lock); - pr_warn("RPC: %s: out of request buffers\n", __func__); return NULL; } @@ -1612,7 +1612,7 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count) out_reqbuf: spin_unlock(&buffers->rb_lock); - pr_warn("%s: no extra receive buffers\n", __func__); + trace_xprtrdma_noreps(r_xprt); return -ENOMEM; out_rc: From aae2349c49198e5dc7376519201cb4647a902e0f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 3 Jan 2018 15:38:09 -0500 Subject: [PATCH 58/69] xprtrdma: Fix "bytes registered" accounting The contents of seg->mr_len changed when ->ro_map stopped returning the full chunk length in the first segment. Count the full length of each Write chunk, not the length of the first segment (which now can only be as large as a page). Fixes: 9d6b04097882 ("xprtrdma: Place registered MWs on a ... ") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 1ae9b41b75a1..162e5dd82466 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -431,7 +431,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, trace_xprtrdma_write_chunk(rqst->rq_task, mr, nsegs); r_xprt->rx_stats.write_chunk_count++; - r_xprt->rx_stats.total_rdma_request += seg->mr_len; + r_xprt->rx_stats.total_rdma_request += mr->mr_length; nchunks++; nsegs -= mr->mr_nents; } while (nsegs); @@ -489,7 +489,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, trace_xprtrdma_reply_chunk(rqst->rq_task, mr, nsegs); r_xprt->rx_stats.reply_chunk_count++; - r_xprt->rx_stats.total_rdma_request += seg->mr_len; + r_xprt->rx_stats.total_rdma_request += mr->mr_length; nchunks++; nsegs -= mr->mr_nents; } while (nsegs); From 9ab6d89e74785378cf69285375a2a6e0c63c51b2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 3 Jan 2018 15:38:17 -0500 Subject: [PATCH 59/69] xprtrdma: Correct some documenting comments Fix kernel-doc warnings in net/sunrpc/xprtrdma/ . 
net/sunrpc/xprtrdma/verbs.c:1575: warning: No description found for parameter 'count' net/sunrpc/xprtrdma/verbs.c:1575: warning: Excess function parameter 'min_reqs' description in 'rpcrdma_ep_post_extra_recv' net/sunrpc/xprtrdma/backchannel.c:288: warning: No description found for parameter 'r_xprt' net/sunrpc/xprtrdma/backchannel.c:288: warning: Excess function parameter 'xprt' description in 'rpcrdma_bc_receive_call' Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 2 +- net/sunrpc/xprtrdma/verbs.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index ca0c5d3d493e..ed1a4a3065ee 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -294,7 +294,7 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) /** * rpcrdma_bc_receive_call - Handle a backward direction call - * @xprt: transport receiving the call + * @r_xprt: transport receiving the call * @rep: receive buffer containing the call * * Operational assumptions: diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 57e1139d85bc..f4eb63e8e689 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1584,7 +1584,7 @@ out_map: /** * rpcrdma_ep_post_extra_recv - Post buffers for incoming backchannel requests * @r_xprt: transport associated with these backchannel resources - * @min_reqs: minimum number of incoming requests expected + * @count: minimum number of incoming requests expected * * Returns zero if all requested buffers were posted, or a negative errno. */ From 82476d9f955be0dfefadad22a87be6151cd70777 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 3 Jan 2018 15:38:25 -0500 Subject: [PATCH 60/69] SUNRPC: Trace xprt_timer events Track RPC timeouts: report the XID and the server address to match the content of network capture. 
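A note for readers less familiar with the tracing macros used throughout this series: the diff below adds xprt_timer as an instance of the existing rpc_xprt_event event class, so it inherits that class's record layout and format string. The following is a minimal sketch of the pattern with an assumed, abbreviated field set; the real class in include/trace/events/sunrpc.h also records the server address.

/*
 * Sketch only: DECLARE_EVENT_CLASS defines the record layout and
 * format string once; each DEFINE_EVENT then adds a named tracepoint
 * that reuses them.
 */
DECLARE_EVENT_CLASS(rpc_xprt_event,
	TP_PROTO(struct rpc_xprt *xprt, __be32 xid, int status),
	TP_ARGS(xprt, xid, status),

	TP_STRUCT__entry(
		__field(u32, xid)
		__field(int, status)
	),

	TP_fast_assign(
		/* Store the XID in host order so the trace output can be
		 * compared directly with what capture tools display. */
		__entry->xid = be32_to_cpu(xid);
		__entry->status = status;
	),

	TP_printk("xid=0x%08x status=%d", __entry->xid, __entry->status)
);

/* One short stanza per additional event; this is all the patch below
 * needs in order to define xprt_timer. */
DEFINE_EVENT(rpc_xprt_event, xprt_timer,
	TP_PROTO(struct rpc_xprt *xprt, __be32 xid, int status),
	TP_ARGS(xprt, xid, status));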
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/sunrpc.h | 4 ++++ net/sunrpc/xprt.c | 2 +- net/sunrpc/xprtrdma/transport.c | 2 -- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 8c153f68509e..7804d857fd24 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -390,6 +390,10 @@ DECLARE_EVENT_CLASS(rpc_xprt_event, __entry->status) ); +DEFINE_EVENT(rpc_xprt_event, xprt_timer, + TP_PROTO(struct rpc_xprt *xprt, __be32 xid, int status), + TP_ARGS(xprt, xid, status)); + DEFINE_EVENT(rpc_xprt_event, xprt_lookup_rqst, TP_PROTO(struct rpc_xprt *xprt, __be32 xid, int status), TP_ARGS(xprt, xid, status)); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 33b74fd84051..2436fd1125fc 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -940,8 +940,8 @@ static void xprt_timer(struct rpc_task *task) if (task->tk_status != -ETIMEDOUT) return; - dprintk("RPC: %5u xprt_timer\n", task->tk_pid); + trace_xprt_timer(xprt, req->rq_xid, task->tk_status); if (!req->rq_reply_bytes_recvd) { if (xprt->ops->timer) xprt->ops->timer(xprt, task); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index b90179af88bf..4b1ecfe979cf 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -516,8 +516,6 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port) static void xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task) { - dprintk("RPC: %5u %s: xprt = %p\n", task->tk_pid, __func__, xprt); - xprt_force_disconnect(xprt); } From 520694496aec9c4f1f69f8400753f04a6ffdbfc5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 3 Jan 2018 15:38:33 -0500 Subject: [PATCH 61/69] sunrpc: Format RPC events consistently for display Clean up: Make it easier to use text search when browsing a trace report. Other events use "status=%d". Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/sunrpc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 7804d857fd24..0594e668b135 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -32,7 +32,7 @@ DECLARE_EVENT_CLASS(rpc_task_status, __entry->status = task->tk_status; ), - TP_printk("task:%u@%u, status %d", + TP_printk("task:%u@%u status=%d", __entry->task_id, __entry->client_id, __entry->status) ); @@ -66,7 +66,7 @@ TRACE_EVENT(rpc_connect_status, __entry->status = status; ), - TP_printk("task:%u@%u, status %d", + TP_printk("task:%u@%u status=%d", __entry->task_id, __entry->client_id, __entry->status) ); From cf08d6f2e6e1b9f177cafbe57e7ad33a76d32c38 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 3 Jan 2018 15:38:41 -0500 Subject: [PATCH 62/69] SUNRPC: task_run_action should display tk_callback This shows up in every RPC: kworker/4:1-19772 [004] 3467.373443: rpc_task_run_action: task:4711@2 flags=0e81 state=0005 status=0 action=call_status kworker/4:1-19772 [004] 3467.373444: rpc_task_run_action: task:4711@2 flags=0e81 state=0005 status=0 action=call_status What's actually going on is that the first iteration of the RPC scheduler is invoking the function in tk_callback (in this case, xprt_timer), then invoking call_status on the next iteration. Feeding do_action, rather than tk_action, to the "task_run_action" trace point will now always display the correct FSM step. 
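To make the fix concrete: after this patch (and the restructuring in the next one), the dispatch step at the top of __rpc_execute behaves as sketched below. This is an abbreviated illustration of the loop, not the verbatim kernel source.

	for (;;) {
		void (*do_action)(struct rpc_task *);

		/* A pending one-shot callback (e.g. xprt_timer) takes
		 * precedence over the FSM's next action and is consumed
		 * as it is taken. */
		do_action = task->tk_action;
		if (task->tk_callback) {
			do_action = task->tk_callback;
			task->tk_callback = NULL;
		}
		if (!do_action)
			break;	/* task was killed */

		/* Trace the function that is actually about to run,
		 * whether it came from tk_callback or tk_action. */
		trace_rpc_task_run_action(task->tk_client, task, do_action);
		do_action(task);

		/* ... queueing and sleep handling elided ... */
	}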
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index b1b49edd7c4d..c292a5e1a70c 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -770,7 +770,7 @@ static void __rpc_execute(struct rpc_task *task) if (do_action == NULL) break; } - trace_rpc_task_run_action(task->tk_client, task, task->tk_action); + trace_rpc_task_run_action(task->tk_client, task, do_action); do_action(task); /* From 21ead9ff3dc72604d89499a1da5a18cc193ec4ff Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 3 Jan 2018 15:38:49 -0500 Subject: [PATCH 63/69] SUNRPC: Micro-optimize __rpc_execute The common case: There are 13 to 14 actions per RPC, and tk_callback is non-NULL in only one of them. There's no need to store a NULL in the tk_callback field during each FSM step. This slightly improves throughput results in dbench and other multi-threaded benchmarks on my two-socket client on 56Gb InfiniBand, but will probably be inconsequential on slower systems. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/sched.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index c292a5e1a70c..896691afbb1a 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -755,21 +755,19 @@ static void __rpc_execute(struct rpc_task *task) void (*do_action)(struct rpc_task *); /* - * Execute any pending callback first. + * Perform the next FSM step or a pending callback. + * + * tk_action may be NULL if the task has been killed. + * In particular, note that rpc_killall_tasks may + * do this at any time, so beware when dereferencing. */ - do_action = task->tk_callback; - task->tk_callback = NULL; - if (do_action == NULL) { - /* - * Perform the next FSM step. - * tk_action may be NULL if the task has been killed. - * In particular, note that rpc_killall_tasks may - * do this at any time, so beware when dereferencing. - */ - do_action = task->tk_action; - if (do_action == NULL) - break; + do_action = task->tk_action; + if (task->tk_callback) { + do_action = task->tk_callback; + task->tk_callback = NULL; } + if (!do_action) + break; trace_rpc_task_run_action(task->tk_client, task, do_action); do_action(task); From 0be283f676a1e7b208db0c992283197ef8b52158 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Tue, 23 Jan 2018 09:32:35 -0500 Subject: [PATCH 64/69] SUNRPC: Fix null rpc_clnt dereference in rpc_task_queued tracepoint Backchannel tasks will not have a reference to the rpc_clnt. Return -1 for cl_clid in that case. Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- include/trace/events/sunrpc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 8c153f68509e..1357ec8973d5 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -175,7 +175,7 @@ DECLARE_EVENT_CLASS(rpc_task_queued, ), TP_fast_assign( - __entry->client_id = clnt->cl_clid; + __entry->client_id = clnt ?
clnt->cl_clid : -1; __entry->task_id = task->tk_pid; __entry->timeout = task->tk_timeout; __entry->runstate = task->tk_runstate; @@ -184,7 +184,7 @@ DECLARE_EVENT_CLASS(rpc_task_queued, __assign_str(q_name, rpc_qname(q)); ), - TP_printk("task:%u@%u flags=%4.4x state=%4.4lx status=%d timeout=%lu queue=%s", + TP_printk("task:%u@%d flags=%4.4x state=%4.4lx status=%d timeout=%lu queue=%s", __entry->task_id, __entry->client_id, __entry->flags, __entry->runstate, From 535cb8f3193f2976f39371810dcad64ae2650771 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 23 Jan 2018 19:39:04 -0500 Subject: [PATCH 65/69] lockd: Fix server refcounting The server shouldn't actually delete the struct nlm_host until it hits the garbage collector. In order to make that work correctly with the refcount API, we can bump the refcount by one, and then use refcount_dec_if_one() in the garbage collector. Signed-off-by: Trond Myklebust Acked-by: J. Bruce Fields --- fs/lockd/host.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 7d6ab72bbe65..d35cd6be0675 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -388,6 +388,8 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, ln->nrhosts++; nrhosts++; + refcount_inc(&host->h_count); + dprintk("lockd: %s created host %s (%s)\n", __func__, host->h_name, host->h_addrbuf); @@ -662,8 +664,7 @@ nlm_gc_hosts(struct net *net) for_each_host_safe(host, next, chain, nlm_server_hosts) { if (net && host->net != net) continue; - if (refcount_read(&host->h_count) || host->h_inuse - || time_before(jiffies, host->h_expires)) { + if (host->h_inuse || time_before(jiffies, host->h_expires)) { dprintk("nlm_gc_hosts skipping %s " "(cnt %d use %d exp %ld net %x)\n", host->h_name, refcount_read(&host->h_count), @@ -671,7 +672,8 @@ nlm_gc_hosts(struct net *net) host->net->ns.inum); continue; } - nlm_destroy_host_locked(host); + if (refcount_dec_if_one(&host->h_count)) + nlm_destroy_host_locked(host); } if (net) { From b39604755ce06389357b0c22d138c954f2e96451 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Thu, 25 Jan 2018 09:36:25 -0500 Subject: [PATCH 66/69] pnfs/blocklayout: pnfs_block_dev_map uses bytes, not sectors Fixup the field types to match their use. Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index efc007f00742..716bc75e9ed2 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -92,10 +92,9 @@ struct pnfs_block_volume { }; struct pnfs_block_dev_map { - sector_t start; - sector_t len; - - sector_t disk_offset; + u64 start; + u64 len; + u64 disk_offset; struct block_device *bdev; }; From f34462c3c8a2ec0f09003b526c4d7c08782d9350 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Thu, 25 Jan 2018 09:36:26 -0500 Subject: [PATCH 67/69] pnfs/blocklayout: Ensure disk address in block device map It's possible that the device map is smaller than the offset into the device for the I/O we're adding. Add a check for it and bail out, otherwise we risk botching the bio calculations that follow. 
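A worked example of the failure mode, using hypothetical numbers rather than anything taken from a real layout: suppose the cached map covers bytes [0, 1 MiB) of the volume, and the next page of the I/O lands at byte offset 4 MiB.

	/* Mirrors the offset_in_map() helper added below; map->start and
	 * map->len are byte counts (see the previous patch). The values
	 * here are made up for illustration. */
	struct pnfs_block_dev_map map = { .start = 0, .len = 1024 * 1024 };
	u64 disk_addr = 4ULL * 1024 * 1024;	/* isect << SECTOR_SHIFT */

	if (!(disk_addr >= map.start && disk_addr < map.start + map.len)) {
		/* 4 MiB falls outside [0, 1 MiB): re-run dev->map(), and
		 * if the returned map still does not contain disk_addr,
		 * return -EIO rather than deriving a bio sector from a
		 * map that does not describe this offset. */
	}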
Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index ca6cf54b54df..7cb5c38c19e4 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -137,6 +137,11 @@ bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector, return bio; } +static bool offset_in_map(u64 offset, struct pnfs_block_dev_map *map) +{ + return offset >= map->start && offset < map->start + map->len; +} + static struct bio * do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, struct page *page, struct pnfs_block_dev_map *map, @@ -156,8 +161,8 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, /* translate to physical disk offset */ disk_addr = (u64)isect << SECTOR_SHIFT; - if (disk_addr < map->start || disk_addr >= map->start + map->len) { - if (!dev->map(dev, disk_addr, map)) + if (!offset_in_map(disk_addr, map)) { + if (!dev->map(dev, disk_addr, map) || !offset_in_map(disk_addr, map)) return ERR_PTR(-EIO); bio = bl_submit_bio(bio); } From 128159f2929ef398698c28a4177eca1c1b88aa74 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 28 Jan 2018 09:34:51 -0500 Subject: [PATCH 68/69] NFS: Remove a redundant call to unmap_mapping_range() We don't need to call unmap_mapping_range() prior to calling nfs_sync_mapping(). Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index deeb7d1097d0..49fba9ea5872 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1170,7 +1170,6 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map if (mapping->nrpages != 0) { if (S_ISREG(inode->i_mode)) { - unmap_mapping_range(mapping, 0, 0, 0); ret = nfs_sync_mapping(mapping); if (ret < 0) return ret; From e231c6879cfd44e4fffd384bb6dd7d313249a523 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 28 Jan 2018 09:29:41 -0500 Subject: [PATCH 69/69] NFS: Fix a race between mmap() and O_DIRECT When locking the file in order to do O_DIRECT on it, we must unmap any mmapped ranges on the pagecache so that we can flush out the dirty data. Fixes: a5864c999de67 ("NFS: Do not serialise O_DIRECT reads and writes") Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org # v4.8+ --- fs/nfs/io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/io.c b/fs/nfs/io.c index 20fef85d2bb1..9034b4926909 100644 --- a/fs/nfs/io.c +++ b/fs/nfs/io.c @@ -99,7 +99,7 @@ static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode) { if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { set_bit(NFS_INO_ODIRECT, &nfsi->flags); - nfs_wb_all(inode); + nfs_sync_mapping(inode->i_mapping); } }
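For context on why nfs_sync_mapping() is the right call here where nfs_wb_all() was not: nfs_sync_mapping() unmaps any mmapped ranges before starting writeback, which is exactly the extra step this race requires. Abbreviated from fs/nfs/inode.c (verify against your tree):

	int nfs_sync_mapping(struct address_space *mapping)
	{
		int ret = 0;

		if (mapping->nrpages != 0) {
			unmap_mapping_range(mapping, 0, 0, 0);
			ret = nfs_wb_all(mapping->host);
		}
		return ret;
	}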