NFS: Client side changes for RDMA
Merge tag 'nfs-rdma-3.16' of git://git.linux-nfs.org/projects/anna/nfs-rdma into linux-next

Pull NFS client side changes for RDMA from Anna Schumaker:

These patches are mostly cleanups and bugfixes for using RDMA as an
over-the-wire transport. Highlights include:

- Remove obsolete memory registration modes.
- Remove BUG_ON()s to keep clients running.
- Fix deadlocks, NULL-pointer dereferences, and memory leaks.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>

* tag 'nfs-rdma-3.16' of git://git.linux-nfs.org/projects/anna/nfs-rdma: (24 commits)
  xprtrdma: Disconnect on registration failure
  xprtrdma: Remove BUG_ON() call sites
  xprtrdma: Avoid deadlock when credit window is reset
  SUNRPC: Move congestion window constants to header file
  xprtrdma: Reset connection timeout after successful reconnect
  xprtrdma: Use macros for reconnection timeout constants
  xprtrdma: Allocate missing pagelist
  xprtrdma: Remove Tavor MTU setting
  xprtrdma: Ensure ia->ri_id->qp is not NULL when reconnecting
  xprtrdma: Reduce the number of hardway buffer allocations
  xprtrdma: Limit work done by completion handler
  xprtrmda: Reduce calls to ib_poll_cq() in completion handlers
  xprtrmda: Reduce lock contention in completion handlers
  xprtrdma: Split the completion queue
  xprtrdma: Make rpcrdma_ep_destroy() return void
  xprtrdma: Simplify rpcrdma_deregister_external() synopsis
  xprtrdma: mount reports "Invalid mount option" if memreg mode not supported
  xprtrdma: Fall back to MTHCAFMR when FRMR is not supported
  xprtrdma: Remove REGISTER memory registration mode
  xprtrdma: Remove MEMWINDOWS registration modes
  ...
This commit is contained in: 7b160cfd19
@@ -24,6 +24,12 @@
 #define RPC_MAX_SLOT_TABLE_LIMIT	(65536U)
 #define RPC_MAX_SLOT_TABLE		RPC_MAX_SLOT_TABLE_LIMIT
 
+#define RPC_CWNDSHIFT		(8U)
+#define RPC_CWNDSCALE		(1U << RPC_CWNDSHIFT)
+#define RPC_INITCWND		RPC_CWNDSCALE
+#define RPC_MAXCWND(xprt)	((xprt)->max_reqs << RPC_CWNDSHIFT)
+#define RPCXPRT_CONGESTED(xprt)	((xprt)->cong >= (xprt)->cwnd)
+
 /*
  * This describes a timeout strategy
  */
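The constants hoisted above implement the RPC client's fixed-point congestion accounting: one request slot of window equals RPC_CWNDSCALE. As a quick illustration of how they combine, here is a minimal userspace sketch; the toy_xprt struct is purely hypothetical, standing in for the max_reqs, cong and cwnd fields of struct rpc_xprt.

#include <stdio.h>

#define RPC_CWNDSHIFT		(8U)
#define RPC_CWNDSCALE		(1U << RPC_CWNDSHIFT)
#define RPC_INITCWND		RPC_CWNDSCALE
#define RPC_MAXCWND(xprt)	((xprt)->max_reqs << RPC_CWNDSHIFT)
#define RPCXPRT_CONGESTED(xprt)	((xprt)->cong >= (xprt)->cwnd)

/* Illustrative stand-in for the fields the macros above reference. */
struct toy_xprt {
	unsigned long max_reqs;	/* slot table size */
	unsigned long cong;	/* congestion currently in flight */
	unsigned long cwnd;	/* congestion window, RPC_CWNDSCALE units */
};

int main(void)
{
	struct toy_xprt xprt = { .max_reqs = 32, .cong = 0, .cwnd = RPC_INITCWND };

	/* One slot of window is RPC_CWNDSCALE (256); the ceiling is
	 * max_reqs slots expressed in the same fixed-point units. */
	printf("initial cwnd = %lu, max cwnd = %lu\n",
	       xprt.cwnd, (unsigned long)RPC_MAXCWND(&xprt));

	xprt.cong = xprt.cwnd;	/* window fully consumed */
	printf("congested? %s\n", RPCXPRT_CONGESTED(&xprt) ? "yes" : "no");
	return 0;
}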
@@ -71,24 +71,6 @@ static void xprt_destroy(struct rpc_xprt *xprt);
 static DEFINE_SPINLOCK(xprt_list_lock);
 static LIST_HEAD(xprt_list);
 
-/*
- * The transport code maintains an estimate on the maximum number of out-
- * standing RPC requests, using a smoothed version of the congestion
- * avoidance implemented in 44BSD. This is basically the Van Jacobson
- * congestion algorithm: If a retransmit occurs, the congestion window is
- * halved; otherwise, it is incremented by 1/cwnd when
- *
- *	- a reply is received and
- *	- a full number of requests are outstanding and
- *	- the congestion window hasn't been updated recently.
- */
-#define RPC_CWNDSHIFT		(8U)
-#define RPC_CWNDSCALE		(1U << RPC_CWNDSHIFT)
-#define RPC_INITCWND		RPC_CWNDSCALE
-#define RPC_MAXCWND(xprt)	((xprt)->max_reqs << RPC_CWNDSHIFT)
-
-#define RPCXPRT_CONGESTED(xprt)	((xprt)->cong >= (xprt)->cwnd)
-
 /**
  * xprt_register_transport - register a transport implementation
  * @transport: transport to register
@@ -446,7 +428,15 @@ EXPORT_SYMBOL_GPL(xprt_release_rqst_cong);
  * @task: recently completed RPC request used to adjust window
  * @result: result code of completed RPC request
  *
- * We use a time-smoothed congestion estimator to avoid heavy oscillation.
+ * The transport code maintains an estimate on the maximum number of out-
+ * standing RPC requests, using a smoothed version of the congestion
+ * avoidance implemented in 44BSD. This is basically the Van Jacobson
+ * congestion algorithm: If a retransmit occurs, the congestion window is
+ * halved; otherwise, it is incremented by 1/cwnd when
+ *
+ *	- a reply is received and
+ *	- a full number of requests are outstanding and
+ *	- the congestion window hasn't been updated recently.
  */
 void xprt_adjust_cwnd(struct rpc_xprt *xprt, struct rpc_task *task, int result)
 {
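The "halve on retransmit, grow by 1/cwnd otherwise" rule in the relocated comment is easier to see with the fixed-point units spelled out. Below is a rough, illustrative userspace sketch of that policy, not the kernel's xprt_adjust_cwnd() itself; adjust_cwnd() and its parameters are invented for the example.

#include <stdio.h>

#define RPC_CWNDSHIFT	(8U)
#define RPC_CWNDSCALE	(1U << RPC_CWNDSHIFT)

/* Sketch of the policy described above: halve the window on a timeout,
 * otherwise grow it by roughly 1/cwnd of a slot per acknowledged reply,
 * never dropping below one slot or exceeding max_cwnd. */
static unsigned long adjust_cwnd(unsigned long cwnd, unsigned long max_cwnd,
				 int timed_out)
{
	if (timed_out) {
		cwnd >>= 1;
		if (cwnd < RPC_CWNDSCALE)
			cwnd = RPC_CWNDSCALE;
	} else {
		cwnd += (RPC_CWNDSCALE * RPC_CWNDSCALE + (cwnd >> 1)) / cwnd;
		if (cwnd > max_cwnd)
			cwnd = max_cwnd;
	}
	return cwnd;
}

int main(void)
{
	unsigned long cwnd = RPC_CWNDSCALE, max_cwnd = 32 * RPC_CWNDSCALE;
	int i;

	for (i = 0; i < 5; i++)
		cwnd = adjust_cwnd(cwnd, max_cwnd, 0);
	printf("after 5 replies: %lu\n", cwnd);	/* increments shrink as cwnd grows */

	cwnd = adjust_cwnd(cwnd, max_cwnd, 1);
	printf("after a timeout: %lu\n", cwnd);	/* halved */
	return 0;
}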
@@ -78,8 +78,7 @@ static const char transfertypes[][12] = {
  * elements. Segments are then coalesced when registered, if possible
  * within the selected memreg mode.
  *
- * Note, this routine is never called if the connection's memory
- * registration strategy is 0 (bounce buffers).
+ * Returns positive number of segments converted, or a negative errno.
  */
 
 static int
@@ -102,10 +101,17 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
 	page_base = xdrbuf->page_base & ~PAGE_MASK;
 	p = 0;
 	while (len && n < nsegs) {
+		if (!ppages[p]) {
+			/* alloc the pagelist for receiving buffer */
+			ppages[p] = alloc_page(GFP_ATOMIC);
+			if (!ppages[p])
+				return -ENOMEM;
+		}
 		seg[n].mr_page = ppages[p];
 		seg[n].mr_offset = (void *)(unsigned long) page_base;
 		seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
-		BUG_ON(seg[n].mr_len > PAGE_SIZE);
+		if (seg[n].mr_len > PAGE_SIZE)
+			return -EIO;
 		len -= seg[n].mr_len;
 		++n;
 		++p;
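This hunk shows the pattern used throughout the series: a sanity check that used to BUG_ON() now returns a negative errno, and callers propagate it so one bad request fails cleanly instead of crashing the client. A trivial, purely illustrative sketch of that contract (convert_segment() and marshal() are made-up names):

#include <errno.h>
#include <stdio.h>

/* Instead of BUG_ON() halting everything on a bad length, the helper
 * reports a negative errno and the caller passes it up. */
static int convert_segment(size_t seg_len, size_t page_size)
{
	if (seg_len > page_size)
		return -EIO;		/* was: BUG_ON(seg_len > page_size) */
	return 1;			/* one segment converted */
}

static int marshal(size_t seg_len)
{
	int n = convert_segment(seg_len, 4096);

	if (n < 0)
		return n;		/* propagate: the RPC fails, the client keeps running */
	return 0;
}

int main(void)
{
	printf("good: %d, bad: %d\n", marshal(1024), marshal(8192));
	return 0;
}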
@@ -114,7 +120,7 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
 
 	/* Message overflows the seg array */
 	if (len && n == nsegs)
-		return 0;
+		return -EIO;
 
 	if (xdrbuf->tail[0].iov_len) {
 		/* the rpcrdma protocol allows us to omit any trailing

@@ -123,7 +129,7 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
 			return n;
 		if (n == nsegs)
 			/* Tail remains, but we're out of segments */
-			return 0;
+			return -EIO;
 		seg[n].mr_page = NULL;
 		seg[n].mr_offset = xdrbuf->tail[0].iov_base;
 		seg[n].mr_len = xdrbuf->tail[0].iov_len;

@@ -164,15 +170,17 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
  * Reply chunk (a counted array):
  *  N elements:
  *   1 - N - HLOO - HLOO - ... - HLOO
  *
+ * Returns positive RPC/RDMA header size, or negative errno.
  */
 
-static unsigned int
+static ssize_t
 rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
 		struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
 {
 	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
-	int nsegs, nchunks = 0;
+	int n, nsegs, nchunks = 0;
 	unsigned int pos;
 	struct rpcrdma_mr_seg *seg = req->rl_segments;
 	struct rpcrdma_read_chunk *cur_rchunk = NULL;
@@ -198,12 +206,11 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
 		pos = target->head[0].iov_len;
 
 	nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
-	if (nsegs == 0)
-		return 0;
+	if (nsegs < 0)
+		return nsegs;
 
 	do {
-		/* bind/register the memory, then build chunk from result. */
-		int n = rpcrdma_register_external(seg, nsegs,
+		n = rpcrdma_register_external(seg, nsegs,
 				cur_wchunk != NULL, r_xprt);
 		if (n <= 0)
 			goto out;

@@ -248,10 +255,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
 	/* success. all failures return above */
 	req->rl_nchunks = nchunks;
 
-	BUG_ON(nchunks == 0);
-	BUG_ON((r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
-	       && (nchunks > 3));
-
 	/*
 	 * finish off header. If write, marshal discrim and nchunks.
 	 */

@@ -278,8 +281,8 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
 out:
 	for (pos = 0; nchunks--;)
 		pos += rpcrdma_deregister_external(
-				&req->rl_segments[pos], r_xprt, NULL);
-	return 0;
+				&req->rl_segments[pos], r_xprt);
+	return n;
 }
 
 /*
@@ -361,6 +364,8 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
  * [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
  * [2] -- optional padding.
  * [3] -- if padded, header only in [1] and data here.
+ *
+ * Returns zero on success, otherwise a negative errno.
  */
 
 int

@@ -370,7 +375,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
 	char *base;
-	size_t hdrlen, rpclen, padlen;
+	size_t rpclen, padlen;
+	ssize_t hdrlen;
 	enum rpcrdma_chunktype rtype, wtype;
 	struct rpcrdma_msg *headerp;
 

@@ -441,14 +447,10 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	/* The following simplification is not true forever */
 	if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
 		wtype = rpcrdma_noch;
-	BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch);
-
-	if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS &&
-	    (rtype != rpcrdma_noch || wtype != rpcrdma_noch)) {
-		/* forced to "pure inline"? */
-		dprintk("RPC: %s: too much data (%d/%d) for inline\n",
-			__func__, rqst->rq_rcv_buf.len, rqst->rq_snd_buf.len);
-		return -1;
+	if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
+		dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
+			__func__);
+		return -EIO;
 	}
 
 	hdrlen = 28; /*sizeof *headerp;*/
@@ -474,8 +476,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 		headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
 		headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
 		hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
-		BUG_ON(wtype != rpcrdma_noch);
-
+		if (wtype != rpcrdma_noch) {
+			dprintk("RPC: %s: invalid chunk list\n",
+				__func__);
+			return -EIO;
+		}
 	} else {
 		headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
 		headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;

@@ -492,8 +497,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 		 * on receive. Therefore, we request a reply chunk
 		 * for non-writes wherever feasible and efficient.
 		 */
-		if (wtype == rpcrdma_noch &&
-		    r_xprt->rx_ia.ri_memreg_strategy > RPCRDMA_REGISTER)
+		if (wtype == rpcrdma_noch)
 			wtype = rpcrdma_replych;
 	}
 }

@@ -511,9 +515,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 		hdrlen = rpcrdma_create_chunks(rqst,
 					&rqst->rq_rcv_buf, headerp, wtype);
 	}
-
-	if (hdrlen == 0)
-		return -1;
+	if (hdrlen < 0)
+		return hdrlen;
 
 	dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd"
 		" headerp 0x%p base 0x%p lkey 0x%x\n",
@@ -680,15 +683,11 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
 	rqst->rq_private_buf = rqst->rq_rcv_buf;
 }
 
-/*
- * This function is called when an async event is posted to
- * the connection which changes the connection state. All it
- * does at this point is mark the connection up/down, the rpc
- * timers do the rest.
- */
 void
-rpcrdma_conn_func(struct rpcrdma_ep *ep)
+rpcrdma_connect_worker(struct work_struct *work)
 {
+	struct rpcrdma_ep *ep =
+		container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
 	struct rpc_xprt *xprt = ep->rep_xprt;
 
 	spin_lock_bh(&xprt->transport_lock);

@@ -705,13 +704,15 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep)
 }
 
 /*
- * This function is called when memory window unbind which we are waiting
- * for completes. Just use rr_func (zeroed by upcall) to signal completion.
+ * This function is called when an async event is posted to
+ * the connection which changes the connection state. All it
+ * does at this point is mark the connection up/down, the rpc
+ * timers do the rest.
  */
-static void
-rpcrdma_unbind_func(struct rpcrdma_rep *rep)
+void
+rpcrdma_conn_func(struct rpcrdma_ep *ep)
 {
-	wake_up(&rep->rr_unbind);
+	schedule_delayed_work(&ep->rep_connect_worker, 0);
 }
 
 /*
@@ -728,7 +729,8 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 	struct rpc_xprt *xprt = rep->rr_xprt;
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 	__be32 *iptr;
-	int i, rdmalen, status;
+	int rdmalen, status;
+	unsigned long cwnd;
 
 	/* Check status. If bad, signal disconnect and return rep to pool */
 	if (rep->rr_len == ~0U) {

@@ -783,6 +785,7 @@ repost:
 
 	/* from here on, the reply is no longer an orphan */
 	req->rl_reply = rep;
+	xprt->reestablish_timeout = 0;
 
 	/* check for expected message types */
 	/* The order of some of these tests is important. */
@@ -857,26 +860,10 @@ badheader:
 		break;
 	}
 
-	/* If using mw bind, start the deregister process now. */
-	/* (Note: if mr_free(), cannot perform it here, in tasklet context) */
-	if (req->rl_nchunks) switch (r_xprt->rx_ia.ri_memreg_strategy) {
-	case RPCRDMA_MEMWINDOWS:
-		for (i = 0; req->rl_nchunks-- > 1;)
-			i += rpcrdma_deregister_external(
-				&req->rl_segments[i], r_xprt, NULL);
-		/* Optionally wait (not here) for unbinds to complete */
-		rep->rr_func = rpcrdma_unbind_func;
-		(void) rpcrdma_deregister_external(&req->rl_segments[i],
-						   r_xprt, rep);
-		break;
-	case RPCRDMA_MEMWINDOWS_ASYNC:
-		for (i = 0; req->rl_nchunks--;)
-			i += rpcrdma_deregister_external(&req->rl_segments[i],
-							 r_xprt, NULL);
-		break;
-	default:
-		break;
-	}
+	cwnd = xprt->cwnd;
+	xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT;
+	if (xprt->cwnd > cwnd)
+		xprt_release_rqst_cong(rqst->rq_task);
 
 	dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
 		__func__, xprt, rqst, status);
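The replacement code above derives the congestion window directly from the server's RPC/RDMA credit grant: each credit becomes one request slot, shifted into the fixed-point units used by the RPC core, and a larger window releases requests queued behind congestion. A minimal illustrative sketch of that conversion (credits_to_cwnd() is a made-up helper, not kernel code):

#include <stdio.h>

#define RPC_CWNDSHIFT	(8U)

/* Each credit granted by the server becomes one slot of congestion
 * window, expressed in the RPC core's fixed-point units. */
static unsigned long credits_to_cwnd(unsigned int credits)
{
	return (unsigned long)credits << RPC_CWNDSHIFT;
}

int main(void)
{
	unsigned long cwnd = credits_to_cwnd(1);	/* initial grant */
	unsigned long newer = credits_to_cwnd(32);	/* server opens the window */

	printf("cwnd: %lu -> %lu\n", cwnd, newer);
	if (newer > cwnd)
		printf("window grew: wake tasks blocked on congestion\n");
	return 0;
}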
@@ -149,6 +149,11 @@ static struct ctl_table sunrpc_table[] = {
 
 #endif
 
+#define RPCRDMA_BIND_TO		(60U * HZ)
+#define RPCRDMA_INIT_REEST_TO	(5U * HZ)
+#define RPCRDMA_MAX_REEST_TO	(30U * HZ)
+#define RPCRDMA_IDLE_DISC_TO	(5U * 60 * HZ)
+
 static struct rpc_xprt_ops xprt_rdma_procs;	/* forward reference */
 
 static void

@@ -229,7 +234,6 @@ static void
 xprt_rdma_destroy(struct rpc_xprt *xprt)
 {
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
-	int rc;
 
 	dprintk("RPC: %s: called\n", __func__);
 

@@ -238,10 +242,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
 	xprt_clear_connected(xprt);
 
 	rpcrdma_buffer_destroy(&r_xprt->rx_buf);
-	rc = rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
-	if (rc)
-		dprintk("RPC: %s: rpcrdma_ep_destroy returned %i\n",
-			__func__, rc);
+	rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
 	rpcrdma_ia_close(&r_xprt->rx_ia);
 
 	xprt_rdma_free_addresses(xprt);
@@ -289,9 +290,9 @@ xprt_setup_rdma(struct xprt_create *args)
 
 	/* 60 second timeout, no retries */
 	xprt->timeout = &xprt_rdma_default_timeout;
-	xprt->bind_timeout = (60U * HZ);
-	xprt->reestablish_timeout = (5U * HZ);
-	xprt->idle_timeout = (5U * 60 * HZ);
+	xprt->bind_timeout = RPCRDMA_BIND_TO;
+	xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
+	xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
 
 	xprt->resvport = 0;	/* privileged port not needed */
 	xprt->tsh_size = 0;	/* RPC-RDMA handles framing */

@@ -391,7 +392,7 @@ out4:
 	xprt_rdma_free_addresses(xprt);
 	rc = -EINVAL;
 out3:
-	(void) rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
+	rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
 out2:
 	rpcrdma_ia_close(&new_xprt->rx_ia);
 out1:
@@ -436,10 +437,10 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 		schedule_delayed_work(&r_xprt->rdma_connect,
 			xprt->reestablish_timeout);
 		xprt->reestablish_timeout <<= 1;
-		if (xprt->reestablish_timeout > (30 * HZ))
-			xprt->reestablish_timeout = (30 * HZ);
-		else if (xprt->reestablish_timeout < (5 * HZ))
-			xprt->reestablish_timeout = (5 * HZ);
+		if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO)
+			xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO;
+		else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
+			xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
 	} else {
 		schedule_delayed_work(&r_xprt->rdma_connect, 0);
 		if (!RPC_IS_ASYNC(task))
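The reconnect path above doubles the delay after each attempt and clamps it between the new RPCRDMA_INIT_REEST_TO and RPCRDMA_MAX_REEST_TO constants. A small illustrative sketch of that backoff (userspace only; HZ is assumed to be 1000 here purely for the printout, and next_reest_timeout() is a made-up name):

#include <stdio.h>

#define HZ			1000	/* assumed tick rate, illustration only */
#define RPCRDMA_INIT_REEST_TO	(5U * HZ)
#define RPCRDMA_MAX_REEST_TO	(30U * HZ)

/* Double the reconnect delay on each attempt, clamped to the
 * [INIT, MAX] range defined by the constants above. */
static unsigned int next_reest_timeout(unsigned int to)
{
	to <<= 1;
	if (to > RPCRDMA_MAX_REEST_TO)
		to = RPCRDMA_MAX_REEST_TO;
	else if (to < RPCRDMA_INIT_REEST_TO)
		to = RPCRDMA_INIT_REEST_TO;
	return to;
}

int main(void)
{
	unsigned int to = RPCRDMA_INIT_REEST_TO;
	int attempt;

	for (attempt = 1; attempt <= 4; attempt++) {
		to = next_reest_timeout(to);
		printf("attempt %d: wait %u ticks\n", attempt, to);
	}
	/* prints 10000, 20000, 30000, 30000 with HZ == 1000 */
	return 0;
}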
@@ -447,23 +448,6 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 	}
 }
 
-static int
-xprt_rdma_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
-{
-	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
-	int credits = atomic_read(&r_xprt->rx_buf.rb_credits);
-
-	/* == RPC_CWNDSCALE @ init, but *after* setup */
-	if (r_xprt->rx_buf.rb_cwndscale == 0UL) {
-		r_xprt->rx_buf.rb_cwndscale = xprt->cwnd;
-		dprintk("RPC: %s: cwndscale %lu\n", __func__,
-			r_xprt->rx_buf.rb_cwndscale);
-		BUG_ON(r_xprt->rx_buf.rb_cwndscale <= 0);
-	}
-	xprt->cwnd = credits * r_xprt->rx_buf.rb_cwndscale;
-	return xprt_reserve_xprt_cong(xprt, task);
-}
-
 /*
  * The RDMA allocate/free functions need the task structure as a place
  * to hide the struct rpcrdma_req, which is necessary for the actual send/recv

@@ -479,7 +463,8 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
 	struct rpcrdma_req *req, *nreq;
 
 	req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf);
-	BUG_ON(NULL == req);
+	if (req == NULL)
+		return NULL;
 
 	if (size > req->rl_size) {
 		dprintk("RPC: %s: size %zd too large for buffer[%zd]: "
@@ -503,18 +488,6 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
 	 * If the allocation or registration fails, the RPC framework
 	 * will (doggedly) retry.
 	 */
-	if (rpcx_to_rdmax(xprt)->rx_ia.ri_memreg_strategy ==
-			RPCRDMA_BOUNCEBUFFERS) {
-		/* forced to "pure inline" */
-		dprintk("RPC: %s: too much data (%zd) for inline "
-			"(r/w max %d/%d)\n", __func__, size,
-			rpcx_to_rdmad(xprt).inline_rsize,
-			rpcx_to_rdmad(xprt).inline_wsize);
-		size = req->rl_size;
-		rpc_exit(task, -EIO);	/* fail the operation */
-		rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++;
-		goto out;
-	}
 	if (task->tk_flags & RPC_TASK_SWAPPER)
 		nreq = kmalloc(sizeof *req + size, GFP_ATOMIC);
 	else

@@ -543,7 +516,6 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
 		req = nreq;
 	}
 	dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
-out:
 	req->rl_connect_cookie = 0;	/* our reserved value */
 	return req->rl_xdr_buf;
 
@@ -579,9 +551,7 @@ xprt_rdma_free(void *buffer)
 		__func__, rep, (rep && rep->rr_func) ? " (with waiter)" : "");
 
 	/*
-	 * Finish the deregistration. When using mw bind, this was
-	 * begun in rpcrdma_reply_handler(). In all other modes, we
-	 * do it here, in thread context. The process is considered
+	 * Finish the deregistration. The process is considered
 	 * complete when the rr_func vector becomes NULL - this
 	 * was put in place during rpcrdma_reply_handler() - the wait
 	 * call below will not block if the dereg is "done". If

@@ -590,12 +560,7 @@ xprt_rdma_free(void *buffer)
 	for (i = 0; req->rl_nchunks;) {
 		--req->rl_nchunks;
 		i += rpcrdma_deregister_external(
-			&req->rl_segments[i], r_xprt, NULL);
-	}
-
-	if (rep && wait_event_interruptible(rep->rr_unbind, !rep->rr_func)) {
-		rep->rr_func = NULL;	/* abandon the callback */
-		req->rl_reply = NULL;
+			&req->rl_segments[i], r_xprt);
 	}
 
 	if (req->rl_iov.length == 0) {	/* see allocate above */
@@ -630,13 +595,12 @@ xprt_rdma_send_request(struct rpc_task *task)
 	struct rpc_xprt *xprt = rqst->rq_xprt;
 	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	int rc;
 
-	/* marshal the send itself */
-	if (req->rl_niovs == 0 && rpcrdma_marshal_req(rqst) != 0) {
-		r_xprt->rx_stats.failed_marshal_count++;
-		dprintk("RPC: %s: rpcrdma_marshal_req failed\n",
-			__func__);
-		return -EIO;
+	if (req->rl_niovs == 0) {
+		rc = rpcrdma_marshal_req(rqst);
+		if (rc < 0)
+			goto failed_marshal;
 	}
 
 	if (req->rl_reply == NULL)	/* e.g. reconnection */

@@ -660,6 +624,12 @@ xprt_rdma_send_request(struct rpc_task *task)
 	rqst->rq_bytes_sent = 0;
 	return 0;
 
+failed_marshal:
+	r_xprt->rx_stats.failed_marshal_count++;
+	dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n",
+		__func__, rc);
+	if (rc == -EIO)
+		return -EIO;
 drop_connection:
 	xprt_disconnect_done(xprt);
 	return -ENOTCONN;	/* implies disconnect */
@@ -705,7 +675,7 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
  */
 
 static struct rpc_xprt_ops xprt_rdma_procs = {
-	.reserve_xprt		= xprt_rdma_reserve_xprt,
+	.reserve_xprt		= xprt_reserve_xprt_cong,
 	.release_xprt		= xprt_release_xprt_cong, /* sunrpc/xprt.c */
 	.alloc_slot		= xprt_alloc_slot,
 	.release_request	= xprt_release_rqst_cong, /* ditto */
(File diff suppressed because it is too large.)
@@ -43,6 +43,7 @@
 #include <linux/wait.h>		/* wait_queue_head_t, etc */
 #include <linux/spinlock.h>		/* spinlock_t, etc */
 #include <linux/atomic.h>		/* atomic_t, etc */
+#include <linux/workqueue.h>		/* struct work_struct */
 
 #include <rdma/rdma_cm.h>		/* RDMA connection api */
 #include <rdma/ib_verbs.h>		/* RDMA verbs api */

@@ -66,18 +67,21 @@ struct rpcrdma_ia {
 	struct completion	ri_done;
 	int			ri_async_rc;
 	enum rpcrdma_memreg	ri_memreg_strategy;
+	unsigned int		ri_max_frmr_depth;
 };
 
 /*
  * RDMA Endpoint -- one per transport instance
  */
 
+#define RPCRDMA_WC_BUDGET	(128)
+#define RPCRDMA_POLLSIZE	(16)
+
 struct rpcrdma_ep {
 	atomic_t		rep_cqcount;
 	int			rep_cqinit;
 	int			rep_connected;
 	struct rpcrdma_ia	*rep_ia;
-	struct ib_cq		*rep_cq;
 	struct ib_qp_init_attr	rep_attr;
 	wait_queue_head_t	rep_connect_wait;
 	struct ib_sge		rep_pad;	/* holds zeroed pad */
@@ -86,6 +90,9 @@ struct rpcrdma_ep {
 	struct rpc_xprt		*rep_xprt;	/* for rep_func */
 	struct rdma_conn_param	rep_remote_cma;
 	struct sockaddr_storage	rep_remote_addr;
+	struct delayed_work	rep_connect_worker;
+	struct ib_wc		rep_send_wcs[RPCRDMA_POLLSIZE];
+	struct ib_wc		rep_recv_wcs[RPCRDMA_POLLSIZE];
 };
 
 #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)

@@ -124,7 +131,6 @@ struct rpcrdma_rep {
 	struct rpc_xprt	*rr_xprt;	/* needed for request/reply matching */
 	void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */
 	struct list_head rr_list;	/* tasklet list */
-	wait_queue_head_t rr_unbind;	/* optional unbind wait */
 	struct ib_sge	rr_iov;		/* for posting */
 	struct ib_mr	*rr_handle;	/* handle for mem in rr_iov */
 	char	rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */

@@ -159,7 +165,6 @@ struct rpcrdma_mr_seg { /* chunk descriptors */
 	struct ib_mr	*rl_mr;		/* if registered directly */
 	struct rpcrdma_mw {		/* if registered from region */
 		union {
-			struct ib_mw	*mw;
 			struct ib_fmr	*fmr;
 			struct {
 				struct ib_fast_reg_page_list *fr_pgl;
@@ -207,7 +212,6 @@ struct rpcrdma_req {
 struct rpcrdma_buffer {
 	spinlock_t	rb_lock;	/* protects indexes */
 	atomic_t	rb_credits;	/* most recent server credits */
-	unsigned long	rb_cwndscale;	/* cached framework rpc_cwndscale */
 	int		rb_max_requests;/* client max requests */
 	struct list_head rb_mws;	/* optional memory windows/fmrs/frmrs */
 	int		rb_send_index;

@@ -300,7 +304,7 @@ void rpcrdma_ia_close(struct rpcrdma_ia *);
  */
 int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
 				struct rpcrdma_create_data_internal *);
-int rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
+void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
 int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
 int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
 
@@ -330,11 +334,12 @@ int rpcrdma_deregister_internal(struct rpcrdma_ia *,
 int rpcrdma_register_external(struct rpcrdma_mr_seg *,
 				int, int, struct rpcrdma_xprt *);
 int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
-				struct rpcrdma_xprt *, void *);
+				struct rpcrdma_xprt *);
 
 /*
  * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
 */
+void rpcrdma_connect_worker(struct work_struct *);
 void rpcrdma_conn_func(struct rpcrdma_ep *);
 void rpcrdma_reply_handler(struct rpcrdma_rep *);
 