From be819f7b66031c4a21fdc8edc47a3ecd4cac635d Mon Sep 17 00:00:00 2001
From: Elena Reshetova
Date: Wed, 29 Nov 2017 13:15:44 +0200
Subject: [PATCH 01/13] lockd: convert nsm_handle.sm_count from atomic_t to refcount_t

atomic_t variables are currently used to implement reference counters
with the following properties:
 - counter is initialized to 1 using atomic_set()
 - a resource is freed upon counter reaching zero
 - once counter reaches zero, its further increments aren't allowed
 - counter schema uses basic atomic operations (set, inc, inc_not_zero,
   dec_and_test, etc.)

Such atomic variables should be converted to the newly provided
refcount_t type and API that prevents accidental counter overflows and
underflows. This is important since overflows and underflows can lead
to use-after-free situations and be exploitable.

The variable nsm_handle.sm_count is used as a pure reference counter.
Convert it to refcount_t and fix up the operations.

**Important note for maintainers:

Some functions from the refcount_t API defined in lib/refcount.c have
different memory ordering guarantees than their atomic counterparts.
The full comparison can be seen in https://lkml.org/lkml/2017/11/15/57
and will hopefully soon be in a state to be merged into the
documentation tree. Normally the differences should not matter since
refcount_t provides enough guarantees to satisfy the refcounting use
cases, but in some rare cases it might matter. Please double check
that you don't have some undocumented memory guarantees for this
variable usage.

For nsm_handle.sm_count it might make a difference in the following
places:
 - nsm_release(): decrement in refcount_dec_and_lock() only provides
   RELEASE ordering and control dependency on success, and holds a
   spin lock on success, vs. the fully ordered atomic counterpart. No
   change for the spin lock guarantees.

Suggested-by: Kees Cook
Reviewed-by: David Windsor
Reviewed-by: Hans Liljestrand
Signed-off-by: Elena Reshetova
Signed-off-by: J. Bruce Fields
---
 fs/lockd/host.c             |  2 +-
 fs/lockd/mon.c              | 14 +++++++-------
 include/linux/lockd/lockd.h |  2 +-
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 826a89184f90..063095ee39ec 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -114,7 +114,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
 	unsigned long now = jiffies;
 
 	if (nsm != NULL)
-		atomic_inc(&nsm->sm_count);
+		refcount_inc(&nsm->sm_count);
 	else {
 		host = NULL;
 		nsm = nsm_get_handle(ni->net, ni->sap, ni->salen,
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 96cfb2967ac7..654594ef4f94 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -191,7 +191,7 @@ void nsm_unmonitor(const struct nlm_host *host)
 	struct nsm_res	res;
 	int status;
 
-	if (atomic_read(&nsm->sm_count) == 1
+	if (refcount_read(&nsm->sm_count) == 1
 	 && nsm->sm_monitored && !nsm->sm_sticky) {
 		dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
 
@@ -279,7 +279,7 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
 	if (unlikely(new == NULL))
 		return NULL;
 
-	atomic_set(&new->sm_count, 1);
+	refcount_set(&new->sm_count, 1);
 	new->sm_name = (char *)(new + 1);
 	memcpy(nsm_addr(new), sap, salen);
 	new->sm_addrlen = salen;
@@ -337,13 +337,13 @@ retry:
 		cached = nsm_lookup_addr(&ln->nsm_handles, sap);
 
 	if (cached != NULL) {
-		atomic_inc(&cached->sm_count);
+		refcount_inc(&cached->sm_count);
 		spin_unlock(&nsm_lock);
 		kfree(new);
 		dprintk("lockd: found nsm_handle for %s (%s), "
 				"cnt %d\n", cached->sm_name,
 				cached->sm_addrbuf,
-				atomic_read(&cached->sm_count));
+				refcount_read(&cached->sm_count));
 		return cached;
 	}
@@ -388,12 +388,12 @@ struct nsm_handle *nsm_reboot_lookup(const struct net *net,
 		return cached;
 	}
 
-	atomic_inc(&cached->sm_count);
+	refcount_inc(&cached->sm_count);
 	spin_unlock(&nsm_lock);
 
 	dprintk("lockd: host %s (%s) rebooted, cnt %d\n",
 			cached->sm_name, cached->sm_addrbuf,
-			atomic_read(&cached->sm_count));
+			refcount_read(&cached->sm_count));
 	return cached;
 }
@@ -404,7 +404,7 @@ struct nsm_handle *nsm_reboot_lookup(const struct net *net,
  */
 void nsm_release(struct nsm_handle *nsm)
 {
-	if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) {
+	if (refcount_dec_and_lock(&nsm->sm_count, &nsm_lock)) {
 		list_del(&nsm->sm_link);
 		spin_unlock(&nsm_lock);
 		dprintk("lockd: destroyed nsm_handle for %s (%s)\n",
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index d7d313fb9cd4..809619553e9b 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -83,7 +83,7 @@ struct nlm_host {
 
 struct nsm_handle {
 	struct list_head	sm_link;
-	atomic_t		sm_count;
+	refcount_t		sm_count;
 	char			*sm_mon_name;
 	char			*sm_name;
 	struct sockaddr_storage	sm_addr;
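For readers new to the API, the conversion recipe is mechanical; a
minimal sketch of the before/after shape (the demo_handle type and its
helpers are hypothetical, not code from this series):

    #include <linux/refcount.h>
    #include <linux/slab.h>
    #include <linux/spinlock.h>

    struct demo_handle {
            refcount_t      count;          /* was: atomic_t */
            spinlock_t      *list_lock;     /* protects the lookup list */
    };

    static struct demo_handle *demo_get(struct demo_handle *h)
    {
            refcount_inc(&h->count);        /* was: atomic_inc(); WARNs and
                                             * saturates on overflow instead
                                             * of silently wrapping */
            return h;
    }

    static void demo_put(struct demo_handle *h)
    {
            /* was: atomic_dec_and_lock(); returns true with the lock
             * held only on the final put, and refuses to underflow
             */
            if (refcount_dec_and_lock(&h->count, h->list_lock)) {
                    /* unlink from the lookup list here */
                    spin_unlock(h->list_lock);
                    kfree(h);
            }
    }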
From 8bb3ea77933e9796f8f15a5492481a96af8302d6 Mon Sep 17 00:00:00 2001
From: Elena Reshetova
Date: Wed, 29 Nov 2017 13:15:45 +0200
Subject: [PATCH 02/13] lockd: convert nlm_lockowner.count from atomic_t to refcount_t

atomic_t variables are currently used to implement reference counters
with the following properties:
 - counter is initialized to 1 using atomic_set()
 - a resource is freed upon counter reaching zero
 - once counter reaches zero, its further increments aren't allowed
 - counter schema uses basic atomic operations (set, inc, inc_not_zero,
   dec_and_test, etc.)

Such atomic variables should be converted to the newly provided
refcount_t type and API that prevents accidental counter overflows and
underflows. This is important since overflows and underflows can lead
to use-after-free situations and be exploitable.

The variable nlm_lockowner.count is used as a pure reference counter.
Convert it to refcount_t and fix up the operations.

**Important note for maintainers:

Some functions from the refcount_t API defined in lib/refcount.c have
different memory ordering guarantees than their atomic counterparts.
The full comparison can be seen in https://lkml.org/lkml/2017/11/15/57
and will hopefully soon be in a state to be merged into the
documentation tree. Normally the differences should not matter since
refcount_t provides enough guarantees to satisfy the refcounting use
cases, but in some rare cases it might matter. Please double check
that you don't have some undocumented memory guarantees for this
variable usage.

For nlm_lockowner.count it might make a difference in the following
places:
 - nlm_put_lockowner(): decrement in refcount_dec_and_lock() only
   provides RELEASE ordering and control dependency on success, and
   holds a spin lock on success, vs. the fully ordered atomic
   counterpart. No changes in spin lock guarantees.

Suggested-by: Kees Cook
Reviewed-by: David Windsor
Reviewed-by: Hans Liljestrand
Signed-off-by: Elena Reshetova
Signed-off-by: J. Bruce Fields
---
 fs/lockd/clntproc.c         | 6 +++---
 include/linux/lockd/lockd.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 066ac313ae5c..112173dbea76 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -48,13 +48,13 @@ void nlmclnt_next_cookie(struct nlm_cookie *c)
 
 static struct nlm_lockowner *nlm_get_lockowner(struct nlm_lockowner *lockowner)
 {
-	atomic_inc(&lockowner->count);
+	refcount_inc(&lockowner->count);
 	return lockowner;
 }
 
 static void nlm_put_lockowner(struct nlm_lockowner *lockowner)
 {
-	if (!atomic_dec_and_lock(&lockowner->count, &lockowner->host->h_lock))
+	if (!refcount_dec_and_lock(&lockowner->count, &lockowner->host->h_lock))
 		return;
 	list_del(&lockowner->list);
 	spin_unlock(&lockowner->host->h_lock);
@@ -105,7 +105,7 @@ static struct nlm_lockowner *nlm_find_lockowner(struct nlm_host *host, fl_owner_
 	res = __nlm_find_lockowner(host, owner);
 	if (res == NULL && new != NULL) {
 		res = new;
-		atomic_set(&new->count, 1);
+		refcount_set(&new->count, 1);
 		new->owner = owner;
 		new->pid = __nlm_alloc_pid(host);
 		new->host = nlm_get_host(host);
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 809619553e9b..345fced8339e 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -122,7 +122,7 @@ static inline struct sockaddr *nlm_srcaddr(const struct nlm_host *host)
  */
 struct nlm_lockowner {
 	struct list_head list;
-	atomic_t count;
+	refcount_t count;
 
 	struct nlm_host *host;
 	fl_owner_t owner;
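Note in the nlm_find_lockowner() hunk above how lockd allocates the
candidate object before taking the lock and only then publishes it;
refcount_set(..., 1) is safe there precisely because the object is not
yet visible to anyone else. A condensed sketch of that idiom (all
demo_* names hypothetical):

    static struct demo_owner *demo_find_owner(struct demo_host *host, void *key)
    {
            struct demo_owner *res, *new;

            new = kmalloc(sizeof(*new), GFP_KERNEL);  /* may sleep: do it unlocked */
            spin_lock(&host->lock);
            res = __demo_lookup(host, key);           /* takes a reference if found */
            if (res == NULL && new != NULL) {
                    res = new;
                    refcount_set(&new->count, 1);     /* unpublished: plain set is safe */
                    new->key = key;
                    list_add(&new->list, &host->owners); /* publish under the lock */
                    new = NULL;
            }
            spin_unlock(&host->lock);
            kfree(new);                               /* NULL, or the race loser */
            return res;
    }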
From d9226ec9ef01832b1edc1781241920614c6407db Mon Sep 17 00:00:00 2001
From: Elena Reshetova
Date: Wed, 29 Nov 2017 13:15:46 +0200
Subject: [PATCH 03/13] lockd: convert nlm_rqst.a_count from atomic_t to refcount_t

atomic_t variables are currently used to implement reference counters
with the following properties:
 - counter is initialized to 1 using atomic_set()
 - a resource is freed upon counter reaching zero
 - once counter reaches zero, its further increments aren't allowed
 - counter schema uses basic atomic operations (set, inc, inc_not_zero,
   dec_and_test, etc.)

Such atomic variables should be converted to the newly provided
refcount_t type and API that prevents accidental counter overflows and
underflows. This is important since overflows and underflows can lead
to use-after-free situations and be exploitable.

The variable nlm_rqst.a_count is used as a pure reference counter.
Convert it to refcount_t and fix up the operations.

**Important note for maintainers:

Some functions from the refcount_t API defined in lib/refcount.c have
different memory ordering guarantees than their atomic counterparts.
The full comparison can be seen in https://lkml.org/lkml/2017/11/15/57
and will hopefully soon be in a state to be merged into the
documentation tree. Normally the differences should not matter since
refcount_t provides enough guarantees to satisfy the refcounting use
cases, but in some rare cases it might matter. Please double check
that you don't have some undocumented memory guarantees for this
variable usage.

For nlm_rqst.a_count it might make a difference in the following
places:
 - nlmclnt_release_call() and nlmsvc_release_call(): decrement in
   refcount_dec_and_test() only provides RELEASE ordering and control
   dependency on success vs. the fully ordered atomic counterpart

Suggested-by: Kees Cook
Reviewed-by: David Windsor
Reviewed-by: Hans Liljestrand
Signed-off-by: Elena Reshetova
Signed-off-by: J. Bruce Fields
---
 fs/lockd/clntproc.c         | 8 ++++----
 fs/lockd/svcproc.c          | 2 +-
 include/linux/lockd/lockd.h | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 112173dbea76..a2c0dfc6fdc0 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -204,7 +204,7 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
 	for(;;) {
 		call = kzalloc(sizeof(*call), GFP_KERNEL);
 		if (call != NULL) {
-			atomic_set(&call->a_count, 1);
+			refcount_set(&call->a_count, 1);
 			locks_init_lock(&call->a_args.lock.fl);
 			locks_init_lock(&call->a_res.lock.fl);
 			call->a_host = nlm_get_host(host);
@@ -222,7 +222,7 @@ void nlmclnt_release_call(struct nlm_rqst *call)
 {
 	const struct nlmclnt_operations *nlmclnt_ops = call->a_host->h_nlmclnt_ops;
 
-	if (!atomic_dec_and_test(&call->a_count))
+	if (!refcount_dec_and_test(&call->a_count))
 		return;
 	if (nlmclnt_ops && nlmclnt_ops->nlmclnt_release_call)
 		nlmclnt_ops->nlmclnt_release_call(call->a_callback_data);
@@ -678,7 +678,7 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 		goto out;
 	}
 
-	atomic_inc(&req->a_count);
+	refcount_inc(&req->a_count);
 	status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req,
 			NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
 	if (status < 0)
@@ -769,7 +769,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl
 	nlmclnt_setlockargs(req, fl);
 	req->a_args.block = block;
 
-	atomic_inc(&req->a_count);
+	refcount_inc(&req->a_count);
 	status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req,
 			NLMPROC_CANCEL, &nlmclnt_cancel_ops);
 	if (status == 0 && req->a_res.status == nlm_lck_denied)
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 0d670c5c378f..ea77c66d3cc3 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -295,7 +295,7 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
 
 void nlmsvc_release_call(struct nlm_rqst *call)
 {
-	if (!atomic_dec_and_test(&call->a_count))
+	if (!refcount_dec_and_test(&call->a_count))
 		return;
 	nlmsvc_release_host(call->a_host);
 	kfree(call);
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 345fced8339e..94c9ff58fc91 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -136,7 +136,7 @@ struct nlm_wait;
  */
 #define NLMCLNT_OHSIZE		((__NEW_UTS_LEN) + 10u)
 struct nlm_rqst {
-	atomic_t		a_count;
+	refcount_t		a_count;
 	unsigned int		a_flags;	/* initial RPC task flags */
 	struct nlm_host *	a_host;		/* host handle */
 	struct nlm_args		a_args;		/* arguments */

From 66282ec1cf004c09083c29cb5e49019037937bbd Mon Sep 17 00:00:00 2001
From: Benjamin Coddington
Date: Tue, 19 Dec 2017 09:35:25 -0500
Subject: [PATCH 04/13] nfsd4: permit layoutget of executable-only files

Clients must be able to read a file in order to execute it, and for
pNFS that means the client needs to be able to perform a LAYOUTGET on
the file. This behavior for executable-only files was added for OPEN
in commit a043226bc140 "nfsd4: permit read opens of executable-only
files".

This fixes up xfstests generic/126 on block/scsi layouts.

Signed-off-by: Benjamin Coddington
Signed-off-by: J. Bruce Fields
---
 fs/nfsd/nfs4proc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 008ea0b627d0..effeeb4f556f 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1363,14 +1363,14 @@ nfsd4_layoutget(struct svc_rqst *rqstp,
 	const struct nfsd4_layout_ops *ops;
 	struct nfs4_layout_stateid *ls;
 	__be32 nfserr;
-	int accmode;
+	int accmode = NFSD_MAY_READ_IF_EXEC;
 
 	switch (lgp->lg_seg.iomode) {
 	case IOMODE_READ:
-		accmode = NFSD_MAY_READ;
+		accmode |= NFSD_MAY_READ;
 		break;
 	case IOMODE_RW:
-		accmode = NFSD_MAY_READ | NFSD_MAY_WRITE;
+		accmode |= NFSD_MAY_READ | NFSD_MAY_WRITE;
 		break;
 	default:
 		dprintk("%s: invalid iomode %d\n",
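The semantics behind NFSD_MAY_READ_IF_EXEC, sketched as a hypothetical
stand-alone helper (the real check lives in nfsd's permission code):
when the read-permission check fails on a regular file, it is retried
as an execute check, so a mode 0111 binary can still be read for the
purpose of running it:

    static int may_read_if_exec(struct inode *inode)
    {
            int err = inode_permission(inode, MAY_READ);

            /* Readable is not required if the file is executable. */
            if (err == -EACCES && S_ISREG(inode->i_mode))
                    err = inode_permission(inode, MAY_EXEC);
            return err;
    }

Initializing accmode to NFSD_MAY_READ_IF_EXEC and OR-ing in the
per-iomode bits keeps that fallback in effect for both the IOMODE_READ
and IOMODE_RW arms of the switch.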
From 482725027ff32bc857f5527fb17feda5361265fe Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Wed, 3 Jan 2018 15:42:18 -0500
Subject: [PATCH 05/13] svcrdma: Post Receives in the Receive completion handler

This change improves Receive efficiency by posting Receives only on
the same CPU that handles Receive completion. Improved latency and
throughput have been noted with this change.

Signed-off-by: Chuck Lever
Signed-off-by: J. Bruce Fields
---
 include/linux/sunrpc/svc_rdma.h            |  2 --
 net/sunrpc/xprtrdma/svc_rdma_backchannel.c |  5 -----
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c    |  9 +-------
 net/sunrpc/xprtrdma/svc_rdma_sendto.c      |  6 ------
 net/sunrpc/xprtrdma/svc_rdma_transport.c   | 25 ++++++----------------
 5 files changed, 8 insertions(+), 39 deletions(-)

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 995c6fe9ee90..4b731b046bcd 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -185,8 +185,6 @@ extern void svc_rdma_wc_reg(struct ib_cq *, struct ib_wc *);
 extern void svc_rdma_wc_read(struct ib_cq *, struct ib_wc *);
 extern void svc_rdma_wc_inv(struct ib_cq *, struct ib_wc *);
 extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *);
-extern int svc_rdma_post_recv(struct svcxprt_rdma *, gfp_t);
-extern int svc_rdma_repost_recv(struct svcxprt_rdma *, gfp_t);
 extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *);
 extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *);
 extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index af7893501e40..a73632ca9048 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -95,7 +95,6 @@ out_shortreply:
 
 out_notfound:
 	dprintk("svcrdma: unrecognized bc reply: xprt=%p, xid=%08x\n",
 		xprt, be32_to_cpu(xid));
-
 	goto out_unlock;
 }
@@ -129,10 +128,6 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
 	if (ret < 0)
 		goto out_err;
 
-	ret = svc_rdma_repost_recv(rdma, GFP_NOIO);
-	if (ret)
-		goto out_err;
-
 	/* Bump page refcnt so Send completion doesn't release
 	 * the rq_buffer before all retransmits are complete.
 	 */
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index ad4bd62eebf1..19e9c6b33042 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -400,10 +400,6 @@ static void svc_rdma_send_error(struct svcxprt_rdma *xprt,
 	struct page *page;
 	int ret;
 
-	ret = svc_rdma_repost_recv(xprt, GFP_KERNEL);
-	if (ret)
-		return;
-
 	page = alloc_page(GFP_KERNEL);
 	if (!page)
 		return;
@@ -554,8 +550,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 		ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, p,
 					       &rqstp->rq_arg);
 		svc_rdma_put_context(ctxt, 0);
-		if (ret)
-			goto repost;
 		return ret;
 	}
@@ -590,6 +584,5 @@ out_postfail:
 
 out_drop:
 	svc_rdma_put_context(ctxt, 1);
-repost:
-	return svc_rdma_repost_recv(rdma_xprt, GFP_KERNEL);
+	return 0;
 }
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 7c3a211e0e9a..649441d5087d 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -674,9 +674,6 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 		svc_rdma_xdr_encode_reply_chunk(rdma_resp, rp_ch, ret);
 	}
 
-	ret = svc_rdma_post_recv(rdma, GFP_KERNEL);
-	if (ret)
-		goto err1;
 	ret = svc_rdma_send_reply_msg(rdma, rdma_argp, rdma_resp, rqstp,
 				      wr_lst, rp_ch);
 	if (ret < 0)
@@ -687,9 +684,6 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 	if (ret != -E2BIG && ret != -EINVAL)
 		goto err1;
 
-	ret = svc_rdma_post_recv(rdma, GFP_KERNEL);
-	if (ret)
-		goto err1;
 	ret = svc_rdma_send_error_msg(rdma, rdma_resp, rqstp);
 	if (ret < 0)
 		goto err0;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 46ec069150d5..9ad12a215b51 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -58,6 +58,7 @@
 
 #define RPCDBG_FACILITY	RPCDBG_SVCXPRT
 
+static int svc_rdma_post_recv(struct svcxprt_rdma *xprt);
 static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *, int);
 static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
 					struct net *net,
@@ -320,6 +321,8 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 	list_add_tail(&ctxt->list, &xprt->sc_rq_dto_q);
 	spin_unlock(&xprt->sc_rq_dto_lock);
 
+	svc_rdma_post_recv(xprt);
+
 	set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
 	if (test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags))
 		goto out;
@@ -404,7 +407,8 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
 	return cma_xprt;
 }
 
-int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags)
+static int
+svc_rdma_post_recv(struct svcxprt_rdma *xprt)
 {
 	struct ib_recv_wr recv_wr, *bad_recv_wr;
 	struct svc_rdma_op_ctxt *ctxt;
@@ -423,7 +427,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags)
 			pr_err("svcrdma: Too many sges (%d)\n", sge_no);
 			goto err_put_ctxt;
 		}
-		page = alloc_page(flags);
+		page = alloc_page(GFP_KERNEL);
 		if (!page)
 			goto err_put_ctxt;
 		ctxt->pages[sge_no] = page;
@@ -459,21 +463,6 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags)
 	return -ENOMEM;
 }
 
-int svc_rdma_repost_recv(struct svcxprt_rdma *xprt, gfp_t flags)
-{
-	int ret = 0;
-
-	ret = svc_rdma_post_recv(xprt, flags);
-	if (ret) {
-		pr_err("svcrdma: could not post a receive buffer, err=%d.\n",
-		       ret);
-		pr_err("svcrdma: closing transport %p.\n", xprt);
-		set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
-		ret = -ENOTCONN;
-	}
-	return ret;
-}
-
 static void
 svc_rdma_parse_connect_private(struct svcxprt_rdma *newxprt,
 			       struct rdma_conn_param *param)
@@ -833,7 +822,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 
 	/* Post receive buffers */
 	for (i = 0; i < newxprt->sc_max_requests; i++) {
-		ret = svc_rdma_post_recv(newxprt, GFP_KERNEL);
+		ret = svc_rdma_post_recv(newxprt);
 		if (ret) {
 			dprintk("svcrdma: failure posting receive buffers\n");
 			goto errout;
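The shape of the change, heavily simplified (demo_* names are
hypothetical; compare svc_rdma_wc_receive() above): the consumed
Receive is replaced directly in the completion handler, so the
allocation and posting happen on the CPU that already owns the receive
queue's cachelines, rather than later in the reply path:

    static void demo_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
    {
            struct demo_xprt *xprt = cq->cq_context;

            if (wc->status != IB_WC_SUCCESS)
                    return;                         /* flushing: transport is closing */

            demo_enqueue_for_svc_threads(xprt, wc); /* hand the message off */
            demo_post_recv(xprt);                   /* replenish immediately, here */
    }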
From d0945caafcccac9d8f4d78f99e0eac9e60386381 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 8 Jan 2018 12:21:04 +0100
Subject: [PATCH 06/13] sunrpc: remove dead code in svc_sock_setbufsize

Setting values in struct sock directly is the usual method. Remove the
long dead code using set_fs() and the related comment.

Signed-off-by: Christoph Hellwig
Signed-off-by: J. Bruce Fields
---
 net/sunrpc/svcsock.c | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index ff8e06cd067e..5884583f93a4 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -384,25 +384,11 @@ static int svc_partial_recvfrom(struct svc_rqst *rqstp,
 static void svc_sock_setbufsize(struct socket *sock, unsigned int snd,
 				unsigned int rcv)
 {
-#if 0
-	mm_segment_t	oldfs;
-	oldfs = get_fs(); set_fs(KERNEL_DS);
-	sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
-			(char*)&snd, sizeof(snd));
-	sock_setsockopt(sock, SOL_SOCKET, SO_RCVBUF,
-			(char*)&rcv, sizeof(rcv));
-#else
-	/* sock_setsockopt limits use to sysctl_?mem_max,
-	 * which isn't acceptable.  Until that is made conditional
-	 * on not having CAP_SYS_RESOURCE or similar, we go direct...
-	 * DaveM said I could!
-	 */
 	lock_sock(sock->sk);
 	sock->sk->sk_sndbuf = snd * 2;
 	sock->sk->sk_rcvbuf = rcv * 2;
 	sock->sk->sk_write_space(sock->sk);
 	release_sock(sock->sk);
-#endif
 }
 
 static int svc_sock_secure_port(struct svc_rqst *rqstp)

From 4f1764172a0aa7395d12b96cae640ca1438c5085 Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Fri, 12 Jan 2018 17:42:30 -0500
Subject: [PATCH 07/13] nfsd: Detect unhashed stids in nfsd4_verify_open_stid()

The state of the stid is guaranteed by 2 locks:
 - The nfs4_client 'cl_lock' spinlock
 - The nfs4_ol_stateid 'st_mutex' mutex

so it is quite possible for the stid to be unhashed after lookup, but
before calling nfsd4_lock_ol_stateid(). So we do need to check for a
zero value for 'sc_type' in nfsd4_verify_open_stid().

Signed-off-by: Trond Myklebust
Tested-by: Chuck Lever
Cc: stable@vger.kernel.org
Fixes: 659aefb68eca "nfsd: Ensure we don't recognise lock stateids..."
Signed-off-by: J. Bruce Fields
---
 fs/nfsd/nfs4state.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b29b5a185a2c..5a75135f5f53 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3590,6 +3590,7 @@ nfsd4_verify_open_stid(struct nfs4_stid *s)
 	switch (s->sc_type) {
 	default:
 		break;
+	case 0:
 	case NFS4_CLOSED_STID:
 	case NFS4_CLOSED_DELEG_STID:
 		ret = nfserr_bad_stateid;
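The pattern this enforces, condensed into a sketch (a paraphrase, not
the exact nfs4state.c code): state looked up under cl_lock must be
re-validated once st_mutex is finally acquired, because the stid may
have been unhashed, i.e. had sc_type zeroed, in the window between the
two locks:

    static __be32 demo_lock_stateid(struct nfs4_ol_stateid *stp)
    {
            __be32 ret;

            mutex_lock(&stp->st_mutex);     /* cl_lock was dropped earlier */
            ret = nfsd4_verify_open_stid(&stp->st_stid);
            if (ret != nfs_ok)              /* now also catches sc_type == 0 */
                    mutex_unlock(&stp->st_mutex);
            return ret;
    }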
From 2502072058b35e2297f4ad7b211a45ad95a6a3d5 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields"
Date: Wed, 17 Jan 2018 16:25:59 -0500
Subject: [PATCH 08/13] nfsd4: don't set lock stateid's sc_type to CLOSED

There's no point I can see to

	stp->st_stid.sc_type = NFS4_CLOSED_STID;

given release_lock_stateid immediately sets sc_type to 0. That set of
sc_type to 0 should be enough to prevent it being used where we don't
want it to be; NFS4_CLOSED_STID should only be needed for actual open
stateids that are actually closed.

Signed-off-by: J. Bruce Fields
---
 fs/nfsd/nfs4state.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 5a75135f5f53..150521c9671b 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -5183,7 +5183,6 @@ nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s)
 					lockowner(stp->st_stateowner)))
 		goto out;
 
-	stp->st_stid.sc_type = NFS4_CLOSED_STID;
 	release_lock_stateid(stp);
 	ret = nfs_ok;
 
@@ -6079,10 +6078,8 @@ out:
 	 * If this is a new, never-before-used stateid, and we are
 	 * returning an error, then just go ahead and release it.
 	 */
-	if (status && new) {
-		lock_stp->st_stid.sc_type = NFS4_CLOSED_STID;
+	if (status && new)
 		release_lock_stateid(lock_stp);
-	}
 
 	mutex_unlock(&lock_stp->st_mutex);

From 0078117c6d9160031b866cfa1853514d4f6865d2 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields"
Date: Wed, 15 Nov 2017 12:30:27 -0500
Subject: [PATCH 09/13] nfsd: return RESOURCE not GARBAGE_ARGS on too many ops

A client that sends more than a hundred ops in a single compound
currently gets an rpc-level GARBAGE_ARGS error. It would be more
helpful to return NFS4ERR_RESOURCE, since that gives the client a
better idea how to recover (for example by splitting up the compound
into smaller compounds).

This is all a bit academic since we've never actually seen a reason
for clients to send such long compounds, but we may as well fix it.

While we're there, just use NFSD_MAX_OPS_PER_COMPOUND == 16, the
constant we already use in the 4.1 case, instead of hard-coding 100.
Chances anyone actually uses even 16 ops per compound are small enough
that I think there's a negligible risk of any regression.

This fixes pynfs test COMP6.

Reported-by: "Lu, Xinyu"
Signed-off-by: J. Bruce Fields
---
 fs/nfsd/nfs4proc.c | 3 +++
 fs/nfsd/nfs4xdr.c  | 9 +++++++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index effeeb4f556f..a0bed2b2004d 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1703,6 +1703,9 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
 	status = nfserr_minor_vers_mismatch;
 	if (nfsd_minorversion(args->minorversion, NFSD_TEST) <= 0)
 		goto out;
+	status = nfserr_resource;
+	if (args->opcnt > NFSD_MAX_OPS_PER_COMPOUND)
+		goto out;
 
 	status = nfs41_check_op_ordering(args);
 	if (status) {
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 2c61c6b8ae09..5dcd7cb45b2d 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1918,8 +1918,13 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 
 	if (argp->taglen > NFSD4_MAX_TAGLEN)
 		goto xdr_error;
-	if (argp->opcnt > 100)
-		goto xdr_error;
+	/*
+	 * NFS4ERR_RESOURCE is a more helpful error than GARBAGE_ARGS
+	 * here, so we return success at the xdr level so that
+	 * nfsd4_proc can handle this as an NFS-level error.
+	 */
+	if (argp->opcnt > NFSD_MAX_OPS_PER_COMPOUND)
+		return 0;
 
 	if (argp->opcnt > ARRAY_SIZE(argp->iops)) {
 		argp->ops = kzalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL);
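The division of labor, condensed (demo_* names are hypothetical): the
XDR layer deliberately reports decode success for an over-long
compound so that the error is generated at the NFS level rather than
the RPC level:

    #define DEMO_MAX_OPS    16      /* mirrors NFSD_MAX_OPS_PER_COMPOUND */

    /* XDR layer: accept the compound, but don't decode past the cap. */
    static int demo_decode_compound(struct demo_args *argp)
    {
            if (argp->opcnt > DEMO_MAX_OPS)
                    return 0;       /* "success": defer the error */
            /* ... decode all argp->opcnt operations ... */
            return 0;
    }

    /* NFS layer: turn the over-long compound into a recoverable error. */
    static __be32 demo_proc_compound(struct demo_args *argp)
    {
            if (argp->opcnt > DEMO_MAX_OPS)
                    return nfserr_resource; /* client can split and retry */
            /* ... process the operations ... */
            return nfs_ok;
    }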
From 76c479480b9afff4c585a17e19a1efe3457a2d9b Mon Sep 17 00:00:00 2001
From: Amir Goldstein
Date: Wed, 3 Jan 2018 17:14:34 +0200
Subject: [PATCH 10/13] nfsd: encode stat->mtime for getattr instead of inode->i_mtime

The values of stat->mtime and inode->i_mtime may differ for overlayfs,
and stat->mtime is the correct value to use when encoding getattr.
This is also consistent with the fact that the other attr times are
also encoded from stat values.

Both callers of lease_get_mtime() already have the value of
stat->mtime, so the only needed change is that lease_get_mtime() will
not overwrite this value with inode->i_mtime in case the inode does
not have an exclusive lease.

Signed-off-by: Amir Goldstein
Reviewed-by: Jeff Layton
Signed-off-by: J. Bruce Fields
---
 fs/locks.c       | 6 ++----
 fs/nfsd/nfsxdr.c | 1 +
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/locks.c b/fs/locks.c
index 1bd71c4d663a..db374a025811 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1554,9 +1554,9 @@ out:
 EXPORT_SYMBOL(__break_lease);
 
 /**
- * lease_get_mtime - get the last modified time of an inode
+ * lease_get_mtime - update modified time of an inode with exclusive lease
  * @inode: the inode
- * @time: pointer to a timespec which will contain the last modified time
+ * @time: pointer to a timespec which contains the last modified time
  *
  * This is to force NFS clients to flush their caches for files with
  * exclusive leases. The justification is that if someone has an
@@ -1580,8 +1580,6 @@ void lease_get_mtime(struct inode *inode, struct timespec *time)
 
 	if (has_lease)
 		*time = current_time(inode);
-	else
-		*time = inode->i_mtime;
 }
 EXPORT_SYMBOL(lease_get_mtime);
 
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 644a0342f0e0..79b6064f8977 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -188,6 +188,7 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
 	*p++ = htonl((u32) stat->ino);
 	*p++ = htonl((u32) stat->atime.tv_sec);
 	*p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0);
+	time = stat->mtime;
 	lease_get_mtime(d_inode(dentry), &time);
 	*p++ = htonl((u32) time.tv_sec);
 	*p++ = htonl(time.tv_nsec ? time.tv_nsec / 1000 : 0);
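The revised calling convention, as a minimal sketch (hypothetical
helper): the caller seeds *time with the value it wants reported,
stat->mtime, which is the overlayfs-correct one, and lease_get_mtime()
now overwrites it only when an exclusive lease exists:

    static void demo_encode_mtime(struct dentry *dentry, struct kstat *stat,
                                  struct timespec *time)
    {
            *time = stat->mtime;                    /* default: the stat value */
            lease_get_mtime(d_inode(dentry), time); /* bumps to current_time()
                                                     * only under an exclusive
                                                     * lease */
    }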
From 39ca1bf624b6b82cc895b0217889eaaf572a7913 Mon Sep 17 00:00:00 2001
From: Amir Goldstein
Date: Wed, 3 Jan 2018 17:14:35 +0200
Subject: [PATCH 11/13] nfsd: store stat times in fill_pre_wcc() instead of inode times

The time values in stat and inode may differ for overlayfs, and the
stat time values are the correct ones to use. This is also consistent
with the fact that fill_post_wcc() also stores stat time values.

This means introducing a stat call that could fail, where previously we
were just copying values out of the inode. To be conservative about
changing behavior, we fall back to copying values out of the inode in
the error case. It might be better just to clear fh_pre_saved (though
note the BUG_ON in set_change_info).

Signed-off-by: Amir Goldstein
Signed-off-by: J. Bruce Fields
---
 fs/nfsd/nfs3xdr.c | 31 ++++++++++++++++++++++++++++++-
 fs/nfsd/nfs4xdr.c |  2 +-
 fs/nfsd/nfsfh.h   | 28 ++++++----------------------
 3 files changed, 37 insertions(+), 24 deletions(-)

diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 2758480555fa..1a70581e1cb2 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -250,6 +250,34 @@ encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
 	return encode_post_op_attr(rqstp, p, fhp);
 }
 
+/*
+ * Fill in the pre_op attr for the wcc data
+ */
+void fill_pre_wcc(struct svc_fh *fhp)
+{
+	struct inode *inode;
+	struct kstat stat;
+	__be32 err;
+
+	if (fhp->fh_pre_saved)
+		return;
+
+	inode = d_inode(fhp->fh_dentry);
+	err = fh_getattr(fhp, &stat);
+	if (err) {
+		/* Grab the times from inode anyway */
+		stat.mtime = inode->i_mtime;
+		stat.ctime = inode->i_ctime;
+		stat.size  = inode->i_size;
+	}
+
+	fhp->fh_pre_mtime = stat.mtime;
+	fhp->fh_pre_ctime = stat.ctime;
+	fhp->fh_pre_size  = stat.size;
+	fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
+	fhp->fh_pre_saved = true;
+}
+
 /*
  * Fill in the post_op attr for the wcc data
  */
@@ -261,7 +289,8 @@ void fill_post_wcc(struct svc_fh *fhp)
 		printk("nfsd: inode locked twice during operation.\n");
 
 	err = fh_getattr(fhp, &fhp->fh_post_attr);
-	fhp->fh_post_change = nfsd4_change_attribute(d_inode(fhp->fh_dentry));
+	fhp->fh_post_change = nfsd4_change_attribute(&fhp->fh_post_attr,
+						     d_inode(fhp->fh_dentry));
 	if (err) {
 		fhp->fh_post_saved = false;
 		/* Grab the ctime anyway - set_change_info might use it */
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 5dcd7cb45b2d..e4395abd0f2b 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1996,7 +1996,7 @@ static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
 		*p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
 		*p++ = 0;
 	} else if (IS_I_VERSION(inode)) {
-		p = xdr_encode_hyper(p, nfsd4_change_attribute(inode));
+		p = xdr_encode_hyper(p, nfsd4_change_attribute(stat, inode));
 	} else {
 		*p++ = cpu_to_be32(stat->ctime.tv_sec);
 		*p++ = cpu_to_be32(stat->ctime.tv_nsec);
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 43f31cf49bae..99be87b50ebe 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -252,36 +252,20 @@ fh_clear_wcc(struct svc_fh *fhp)
  * By using both ctime and the i_version counter we guarantee that as
 * long as time doesn't go backwards we never reuse an old value.
 */
-static inline u64 nfsd4_change_attribute(struct inode *inode)
+static inline u64 nfsd4_change_attribute(struct kstat *stat,
+					 struct inode *inode)
 {
 	u64 chattr;
 
-	chattr = inode->i_ctime.tv_sec;
+	chattr = stat->ctime.tv_sec;
 	chattr <<= 30;
-	chattr += inode->i_ctime.tv_nsec;
+	chattr += stat->ctime.tv_nsec;
 	chattr += inode->i_version;
 	return chattr;
 }
 
-/*
- * Fill in the pre_op attr for the wcc data
- */
-static inline void
-fill_pre_wcc(struct svc_fh *fhp)
-{
-	struct inode *inode;
-
-	inode = d_inode(fhp->fh_dentry);
-	if (!fhp->fh_pre_saved) {
-		fhp->fh_pre_mtime = inode->i_mtime;
-		fhp->fh_pre_ctime = inode->i_ctime;
-		fhp->fh_pre_size  = inode->i_size;
-		fhp->fh_pre_change = nfsd4_change_attribute(inode);
-		fhp->fh_pre_saved = true;
-	}
-}
-
-extern void fill_post_wcc(struct svc_fh *);
+extern void fill_pre_wcc(struct svc_fh *fhp);
+extern void fill_post_wcc(struct svc_fh *fhp);
 #else
 #define fh_clear_wcc(ignored)
 #define fill_pre_wcc(ignored)
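The packing itself is unchanged by this patch; only its input moves
from the inode to the kstat. Worked through with invented numbers:
ctime = 1516550000.000000123 and i_version = 7 encode as
(1516550000 << 30) + 123 + 7. The nanoseconds always fit in the low
30 bits (tv_nsec < 10^9 < 2^30), which is why the value stays
monotonic as long as ctime never goes backwards:

    static u64 demo_change_attribute(const struct kstat *stat, u64 i_version)
    {
            u64 chattr = stat->ctime.tv_sec;

            chattr <<= 30;                  /* room for tv_nsec (< 2^30) */
            chattr += stat->ctime.tv_nsec;
            return chattr + i_version;
    }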
From 2285ae760de2cbe4b91a55c0a8033ec332fb8049 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Mon, 22 Jan 2018 22:09:12 +0100
Subject: [PATCH 12/13] NFSD: hide unused svcxdr_dupstr()

There is now only one caller left for svcxdr_dupstr() and this is
inside of an #ifdef, so we can get a warning when the option is
disabled:

fs/nfsd/nfs4xdr.c:241:1: error: 'svcxdr_dupstr' defined but not used [-Werror=unused-function]

This changes the remaining caller to use a nicer IS_ENABLED() check,
which lets the compiler drop the unused code silently.

Fixes: e40d99e6183e ("NFSD: Clean up symlink argument XDR decoders")
Suggested-by: Rasmus Villemoes
Signed-off-by: Arnd Bergmann
Signed-off-by: J. Bruce Fields
---
 fs/nfsd/nfs4xdr.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index e4395abd0f2b..e502fd16246b 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -455,8 +455,8 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
 	}
 	label->len = 0;
-#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
-	if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) {
+	if (IS_ENABLED(CONFIG_NFSD_V4_SECURITY_LABEL) &&
+	    bmval[2] & FATTR4_WORD2_SECURITY_LABEL) {
 		READ_BUF(4);
 		len += 4;
 		dummy32 = be32_to_cpup(p++); /* lfs: we don't use it */
@@ -476,7 +476,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
 		if (!label->data)
 			return nfserr_jukebox;
 	}
-#endif
 	if (bmval[2] & FATTR4_WORD2_MODE_UMASK) {
 		if (!umask)
 			goto xdr_error;
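The idiom in general form (CONFIG_DEMO_FEATURE and the helper are
hypothetical): unlike #ifdef, IS_ENABLED() keeps the guarded code
visible to the compiler in every configuration, so helpers referenced
only inside the guard never become "defined but not used", while
constant folding still discards the disabled branch entirely:

    static void demo_decode(unsigned int flags)
    {
            if (IS_ENABLED(CONFIG_DEMO_FEATURE) && (flags & DEMO_FLAG_LABEL)) {
                    /* Compiled and type-checked even when the option is
                     * off, then eliminated as dead code by the optimizer.
                     */
                    demo_handle_label();
            }
    }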
From 175e03101d36c3034f3c80038d4c28838351a7f2 Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Fri, 2 Feb 2018 14:28:59 -0500
Subject: [PATCH 13/13] svcrdma: Fix Read chunk round-up

A single NFSv4 WRITE compound can often have three operations:
PUTFH, WRITE, then GETATTR.

When the WRITE payload is sent in a Read chunk, the client places the
GETATTR in the inline part of the RPC/RDMA message, just after the
WRITE operation (sans payload). The position value in the Read chunk
enables the receiver to insert the Read chunk at the correct place in
the received XDR stream; that is, between the WRITE and GETATTR.

According to RFC 8166, an NFS/RDMA client does not have to add XDR
round-up to the Read chunk that carries the WRITE payload. The
receiver adds XDR round-up padding if it is absent and the receiver's
XDR decoder requires it to be present.

Commit 193bcb7b3719 ("svcrdma: Populate tail iovec when receiving")
attempted to add support for receiving such a compound so that just
the WRITE payload appears in rq_arg's page list, and the trailing
GETATTR is placed in rq_arg's tail iovec. (TCP just strings the whole
compound into the head iovec and page list, without regard to the
alignment of the WRITE payload).

The server transport logic also had to accommodate the optional XDR
round-up of the Read chunk, which it did simply by lengthening the
tail iovec when round-up was needed. This approach is adequate for the
NFSv2 and NFSv3 WRITE decoders.

Unfortunately it is not sufficient for nfsd4_decode_write. When the
Read chunk length is a couple of bytes less than PAGE_SIZE, the
computation at the end of nfsd4_decode_write allows argp->pagelen to
go negative, which breaks the logic in read_buf that looks for the
tail iovec.

The result is that a WRITE operation whose payload length is just less
than a multiple of a page succeeds, but the subsequent GETATTR in the
same compound fails with NFS4ERR_OP_ILLEGAL because the XDR decoder
can't find it. Clients ignore the error, but they must update their
attribute cache via a separate round trip.

As nfsd4_decode_write appears to expect the payload itself to always
have appropriate XDR round-up, have svc_rdma_build_normal_read_chunk
add the Read chunk XDR round-up to the page_len rather than
lengthening the tail iovec.

Reported-by: Olga Kornievskaia
Fixes: 193bcb7b3719 ("svcrdma: Populate tail iovec when receiving")
Signed-off-by: Chuck Lever
Tested-by: Olga Kornievskaia
Signed-off-by: J. Bruce Fields
---
 net/sunrpc/xprtrdma/svc_rdma_rw.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index 9bd04549a1ad..12b9a7e0b6d2 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -727,12 +727,16 @@ static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp,
 		head->arg.head[0].iov_len - info->ri_position;
 	head->arg.head[0].iov_len = info->ri_position;
 
-	/* Read chunk may need XDR roundup (see RFC 5666, s. 3.7).
+	/* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2).
 	 *
-	 * NFSv2/3 write decoders need the length of the tail to
-	 * contain the size of the roundup padding.
+	 * If the client already rounded up the chunk length, the
+	 * length does not change. Otherwise, the length of the page
+	 * list is increased to include XDR round-up.
+	 *
+	 * Currently these chunks always start at page offset 0,
+	 * thus the rounded-up length never crosses a page boundary.
 	 */
-	head->arg.tail[0].iov_len += 4 - (info->ri_chunklen & 3);
+	info->ri_chunklen = XDR_QUADLEN(info->ri_chunklen) << 2;
 
 	head->arg.page_len = info->ri_chunklen;
 	head->arg.len += info->ri_chunklen;
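The round-up arithmetic, worked through (demo_xdr_roundup is a sketch;
XDR_QUADLEN is the real macro from include/linux/sunrpc/xdr.h,
(len + 3) >> 2): an 8190-byte chunk becomes XDR_QUADLEN(8190) = 2048
quads, i.e. 8192 bytes, while an already-aligned 8192-byte chunk maps
to itself. Folding the padding into page_len this way keeps
nfsd4_decode_write's argp->pagelen arithmetic from going negative:

    #include <linux/sunrpc/xdr.h>

    static u32 demo_xdr_roundup(u32 chunklen)
    {
            return XDR_QUADLEN(chunklen) << 2;  /* 8190 -> 8192, 8192 -> 8192 */
    }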