From 1c3da4452d185e4be663e76a1b9842184d8f9c4c Mon Sep 17 00:00:00 2001
From: Dan Carpenter
Date: Fri, 4 Jan 2019 09:47:03 +0300
Subject: [PATCH 01/12] nfsd: fix an IS_ERR() vs NULL check

The get_backchannel_cred() function used to return error pointers on
error but now it returns NULL pointers.

Fixes: 97f68c6b02e0 ("SUNRPC: add 'struct cred *' to auth_cred and rpc_cred")
Signed-off-by: Dan Carpenter
Signed-off-by: J. Bruce Fields
---
 fs/nfsd/nfs4callback.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index c74e4538d0ebe1..9b38dab1c21ba3 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -913,9 +913,9 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
                 return PTR_ERR(client);
         }
         cred = get_backchannel_cred(clp, client, ses);
-        if (IS_ERR(cred)) {
+        if (!cred) {
                 rpc_shutdown_client(client);
-                return PTR_ERR(cred);
+                return -ENOMEM;
         }
         clp->cl_cb_client = client;
         clp->cl_cb_cred = cred;

From 1602a7b7d33741d0490682e620bb29e96ad684d7 Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Thu, 3 Jan 2019 09:17:12 -0500
Subject: [PATCH 02/12] SUNRPC: Don't allow compiler optimisation of svc_xprt_release_slot()

Use READ_ONCE() to tell the compiler to not optimise away the read of
xprt->xpt_flags in svc_xprt_release_slot().

Signed-off-by: Trond Myklebust
Signed-off-by: J. Bruce Fields
---
 net/sunrpc/svc_xprt.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 4eb8fbf2508dc7..c3af1aaef551ad 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -363,9 +363,13 @@ static void svc_xprt_release_slot(struct svc_rqst *rqstp)
 
 static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt)
 {
-        if (xprt->xpt_flags & ((1<<XPT_CONN)|(1<<XPT_CLOSE)))
+        unsigned long xpt_flags;
+
+        xpt_flags = READ_ONCE(xprt->xpt_flags);
+
+        if (xpt_flags & (BIT(XPT_CONN) | BIT(XPT_CLOSE)))
                 return true;
-        if (xprt->xpt_flags & ((1<<XPT_DATA)|(1<<XPT_DEFERRED))) {
+        if (xpt_flags & (BIT(XPT_DATA) | BIT(XPT_DEFERRED))) {
                 if (xprt->xpt_ops->xpo_has_wspace(xprt) &&
                     svc_xprt_slots_in_range(xprt))
                         return true;

From 66c898caefd346a88fbef242eb7892fd959308f6 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields"
Date: Fri, 11 Jan 2019 15:57:09 -0500
Subject: [PATCH 03/12] svcrpc: svc_xprt_has_something_to_do seems a little long

The long name seemed cute till I wanted to refer to it somewhere else.

Signed-off-by: J. Bruce Fields
---
 net/sunrpc/svc_xprt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index c3af1aaef551ad..a2435d3811a996 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -361,7 +361,7 @@ static void svc_xprt_release_slot(struct svc_rqst *rqstp)
         }
 }
 
-static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt)
+static bool svc_xprt_ready(struct svc_xprt *xprt)
 {
         unsigned long xpt_flags;
 
@@ -385,7 +385,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
         struct svc_rqst *rqstp = NULL;
         int cpu;
 
-        if (!svc_xprt_has_something_to_do(xprt))
+        if (!svc_xprt_ready(xprt))
                 return;
 
         /* Mark transport as busy. It will remain in this state until
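
An aside on the READ_ONCE() change in patch 02 above: without it, the
compiler may reload (or cache) xprt->xpt_flags between the two tests, so
the checks can act on different values. The sketch below is illustrative
only and not part of the series; MY_READ_ONCE and the bit values are
invented stand-ins for the kernel's READ_ONCE() and XPT_* flags.

    /* One snapshot of the flag word; both tests then agree on what they saw. */
    #define MY_READ_ONCE(x)  (*(const volatile __typeof__(x) *)&(x))

    #define XPT_CONN_BIT   (1UL << 0)   /* made-up bit assignments */
    #define XPT_CLOSE_BIT  (1UL << 1)
    #define XPT_DATA_BIT   (1UL << 2)

    static unsigned long xpt_flags;     /* updated by other threads */

    static int xprt_ready(void)
    {
            unsigned long flags = MY_READ_ONCE(xpt_flags);

            if (flags & (XPT_CONN_BIT | XPT_CLOSE_BIT))
                    return 1;
            return (flags & XPT_DATA_BIT) != 0;
    }
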
From 95503d295ad6af20f09efff193e085481a962fd2 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields"
Date: Fri, 11 Jan 2019 15:36:40 -0500
Subject: [PATCH 04/12] svcrpc: fix unlikely races preventing queueing of sockets

In the rpc server, when something happens that might be reason to wake
up a thread to do something, what we do is

        - modify xpt_flags, sk_sock->flags, xpt_reserved, or
          xpt_nr_rqsts to indicate the new situation
        - call svc_xprt_enqueue() to decide whether to wake up a thread.

svc_xprt_enqueue may require multiple conditions to be true before
queueing up a thread to handle the xprt. In the SMP case, one of the
other CPUs may have set another required condition, and in that case,
although both CPUs run svc_xprt_enqueue(), it's possible that neither
call sees the writes done by the other CPU in time, and neither one
recognizes that all the required conditions have been set. A socket
could therefore be ignored indefinitely.

Add memory barriers to ensure that any svc_xprt_enqueue() call will
always see the conditions changed by other CPUs before deciding to
ignore a socket.

I've never seen this race reported. In the unlikely event it happens,
another event will usually come along and the problem will fix itself.
So I don't think this is worth backporting to stable.

Chuck tried this patch and said "I don't see any performance
regressions, but my server has only a single last-level CPU cache."

Tested-by: Chuck Lever
Signed-off-by: J. Bruce Fields
---
 net/sunrpc/svc_xprt.c                   | 12 +++++++++++-
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c |  3 ++-
 net/sunrpc/xprtrdma/svc_rdma_rw.c       |  3 ++-
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index a2435d3811a996..61530b1b7754e5 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -357,6 +357,7 @@ static void svc_xprt_release_slot(struct svc_rqst *rqstp)
         struct svc_xprt *xprt = rqstp->rq_xprt;
         if (test_and_clear_bit(RQ_DATA, &rqstp->rq_flags)) {
                 atomic_dec(&xprt->xpt_nr_rqsts);
+                smp_wmb(); /* See smp_rmb() in svc_xprt_ready() */
                 svc_xprt_enqueue(xprt);
         }
 }
@@ -365,6 +366,15 @@ static bool svc_xprt_ready(struct svc_xprt *xprt)
 {
         unsigned long xpt_flags;
 
+        /*
+         * If another cpu has recently updated xpt_flags,
+         * sk_sock->flags, xpt_reserved, or xpt_nr_rqsts, we need to
+         * know about it; otherwise it's possible that both that cpu and
+         * this one could call svc_xprt_enqueue() without either
+         * svc_xprt_enqueue() recognizing that the conditions below
+         * are satisfied, and we could stall indefinitely:
+         */
+        smp_rmb();
         xpt_flags = READ_ONCE(xprt->xpt_flags);
 
         if (xpt_flags & (BIT(XPT_CONN) | BIT(XPT_CLOSE)))
@@ -479,7 +489,7 @@ void svc_reserve(struct svc_rqst *rqstp, int space)
         if (xprt && space < rqstp->rq_reserved) {
                 atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved);
                 rqstp->rq_reserved = space;
-
+                smp_wmb(); /* See smp_rmb() in svc_xprt_ready() */
                 svc_xprt_enqueue(xprt);
         }
 }
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 828b149eaaefe0..3ebb158c2279af 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -314,8 +314,9 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 
         spin_lock(&rdma->sc_rq_dto_lock);
         list_add_tail(&ctxt->rc_list, &rdma->sc_rq_dto_q);
-        spin_unlock(&rdma->sc_rq_dto_lock);
+        /* Note the unlock pairs with the smp_rmb in svc_xprt_ready: */
         set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
+        spin_unlock(&rdma->sc_rq_dto_lock);
         if (!test_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags))
                 svc_xprt_enqueue(&rdma->sc_xprt);
         goto out;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index dc1951759a8eff..c35753691960c0 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -287,9 +287,10 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc)
                 spin_lock(&rdma->sc_rq_dto_lock);
                 list_add_tail(&info->ri_readctxt->rc_list,
                               &rdma->sc_read_complete_q);
+                /* Note the unlock pairs with the smp_rmb in svc_xprt_ready: */
+                set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
                 spin_unlock(&rdma->sc_rq_dto_lock);
 
-                set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
                 svc_xprt_enqueue(&rdma->sc_xprt);
         }
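
For readers who want to see the smp_wmb()/smp_rmb() pairing in isolation,
here is a minimal user-space message-passing sketch. It is illustrative
only, not the exact svc_xprt race, and the fence macros are stand-ins for
the kernel barriers.

    #define smp_wmb()  __atomic_thread_fence(__ATOMIC_RELEASE)  /* stand-in */
    #define smp_rmb()  __atomic_thread_fence(__ATOMIC_ACQUIRE)  /* stand-in */

    static int payload;   /* plain accesses kept simple for the example */
    static int flag;

    static void writer(void)        /* runs on one CPU */
    {
            payload = 42;
            smp_wmb();              /* order payload store before flag store */
            flag = 1;
    }

    static int reader(void)         /* runs on another CPU */
    {
            if (!flag)
                    return -1;      /* nothing published yet */
            smp_rmb();              /* order flag load before payload load */
            return payload;         /* sees 42 once flag is observed set */
    }

The patch above applies the same idea: every writer publishes its change
to a wake-up condition before svc_xprt_enqueue() runs, and
svc_xprt_ready() issues smp_rmb() before re-reading those conditions.
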
From 14cfbd94998a3ad6aaa67da46d997eea9e31897e Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva"
Date: Tue, 15 Jan 2019 15:11:40 -0600
Subject: [PATCH 05/12] svcrdma: Use struct_size() in kmalloc()

One of the more common cases of allocation size calculations is finding
the size of a structure that has a zero-sized array at the end, along
with memory for some number of elements for that array. For example:

        struct foo {
                int stuff;
                struct boo entry[];
        };

        instance = kmalloc(sizeof(struct foo) + count * sizeof(struct boo), GFP_KERNEL);

Instead of leaving these open-coded and prone to type mistakes, we can
now use the new struct_size() helper:

        instance = kmalloc(struct_size(instance, entry, count), GFP_KERNEL);

This code was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva
Reviewed-by: Chuck Lever
Signed-off-by: J. Bruce Fields
---
 net/sunrpc/xprtrdma/svc_rdma_rw.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index c35753691960c0..65ee6fd6016270 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -64,8 +64,7 @@ svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
                 spin_unlock(&rdma->sc_rw_ctxt_lock);
         } else {
                 spin_unlock(&rdma->sc_rw_ctxt_lock);
-                ctxt = kmalloc(sizeof(*ctxt) +
-                               SG_CHUNK_SIZE * sizeof(struct scatterlist),
+                ctxt = kmalloc(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE),
                                GFP_KERNEL);
                 if (!ctxt)
                         goto out;
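
To see what struct_size() buys over the open-coded form, here is a small
user-space approximation. It is a sketch under the assumption that the
helper saturates to SIZE_MAX on overflow so that the allocation fails;
my_struct_size below is an invented stand-in, not the real kernel macro.

    #include <stdint.h>
    #include <stdlib.h>

    struct foo {
            int stuff;
            struct boo { int x; } entry[];   /* flexible array member */
    };

    static size_t my_struct_size(size_t count)
    {
            size_t bytes;

            /* checked count * sizeof(entry[0]) + sizeof(struct foo) */
            if (__builtin_mul_overflow(count, sizeof(struct boo), &bytes) ||
                __builtin_add_overflow(bytes, sizeof(struct foo), &bytes))
                    return SIZE_MAX;         /* saturate: allocation will fail */
            return bytes;
    }

    int main(void)
    {
            struct foo *instance = malloc(my_struct_size(1000));

            free(instance);
            return 0;
    }
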
From c7920f06ae755e2b2f3f14895a5fa148e81db7e2 Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Wed, 6 Feb 2019 12:00:52 -0500
Subject: [PATCH 06/12] svcrdma: Squelch compiler warning when SUNRPC_DEBUG is disabled
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

  CC [M]  net/sunrpc/xprtrdma/svc_rdma_transport.o
linux/net/sunrpc/xprtrdma/svc_rdma_transport.c: In function ‘svc_rdma_accept’:
linux/net/sunrpc/xprtrdma/svc_rdma_transport.c:452:19: warning: variable ‘sap’ set but not used [-Wunused-but-set-variable]
  struct sockaddr *sap;
                   ^

Signed-off-by: Chuck Lever
Signed-off-by: J. Bruce Fields
---
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 57f86c63a46369..ef6afcf5e9c431 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -390,8 +390,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
         struct ib_qp_init_attr qp_attr;
         unsigned int ctxts, rq_depth;
         struct ib_device *dev;
-        struct sockaddr *sap;
         int ret = 0;
+        RPC_IFDEBUG(struct sockaddr *sap);
 
         listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
         clear_bit(XPT_CONN, &xprt->xpt_flags);
@@ -525,6 +525,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
         if (ret)
                 goto errout;
 
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
         dprintk("svcrdma: new connection %p accepted:\n", newxprt);
         sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
         dprintk("    local address   : %pIS:%u\n", sap, rpc_get_port(sap));
@@ -535,6 +536,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
         dprintk("    sq_depth        : %d\n", newxprt->sc_sq_depth);
         dprintk("    rdma_rw_ctxs    : %d\n", ctxts);
         dprintk("    max_requests    : %d\n", newxprt->sc_max_requests);
         dprintk("    ord             : %d\n", conn_param.initiator_depth);
+#endif
         trace_svcrdma_xprt_accept(&newxprt->sc_xprt);
         return &newxprt->sc_xprt;

From 8820bcaa5bd73db2e28caae98f080a04cb6e2abb Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Wed, 6 Feb 2019 12:00:57 -0500
Subject: [PATCH 07/12] svcrdma: Remove syslog warnings in work completion handlers

These can result in a lot of log noise, and are able to be triggered by
client misbehavior. Since there are trace points in these handlers now,
there's no need to spam the log.

Signed-off-by: Chuck Lever
Signed-off-by: J. Bruce Fields
---
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c  |  9 +--------
 net/sunrpc/xprtrdma/svc_rdma_rw.c        | 11 +----------
 net/sunrpc/xprtrdma/svc_rdma_sendto.c    |  4 ----
 net/sunrpc/xprtrdma/svc_rdma_transport.c |  5 -----
 4 files changed, 2 insertions(+), 27 deletions(-)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 3ebb158c2279af..65e2fb9aac656f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -272,11 +272,8 @@ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma)
                         return false;
                 ctxt->rc_temp = true;
                 ret = __svc_rdma_post_recv(rdma, ctxt);
-                if (ret) {
-                        pr_err("svcrdma: failure posting recv buffers: %d\n",
-                               ret);
+                if (ret)
                         return false;
-                }
         }
         return true;
 }
@@ -322,10 +319,6 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
         goto out;
 
 flushed:
-        if (wc->status != IB_WC_WR_FLUSH_ERR)
-                pr_err("svcrdma: Recv: %s (%u/0x%x)\n",
-                       ib_wc_status_msg(wc->status),
-                       wc->status, wc->vendor_err);
 post_err:
         svc_rdma_recv_ctxt_put(rdma, ctxt);
         set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index 65ee6fd6016270..2121c9b4d27567 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -212,13 +212,8 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
         atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
         wake_up(&rdma->sc_send_wait);
 
-        if (unlikely(wc->status != IB_WC_SUCCESS)) {
+        if (unlikely(wc->status != IB_WC_SUCCESS))
                 set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
-                if (wc->status != IB_WC_WR_FLUSH_ERR)
-                        pr_err("svcrdma: write ctx: %s (%u/0x%x)\n",
-                               ib_wc_status_msg(wc->status),
-                               wc->status, wc->vendor_err);
-        }
 
         svc_rdma_write_info_free(info);
 }
@@ -277,10 +272,6 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc)
 
         if (unlikely(wc->status != IB_WC_SUCCESS)) {
                 set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
-                if (wc->status != IB_WC_WR_FLUSH_ERR)
-                        pr_err("svcrdma: read ctx: %s (%u/0x%x)\n",
-                               ib_wc_status_msg(wc->status),
-                               wc->status, wc->vendor_err);
                 svc_rdma_recv_ctxt_put(rdma, info->ri_readctxt);
         } else {
                 spin_lock(&rdma->sc_rq_dto_lock);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 1f200119268ccd..6fdba72f89f478 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -272,10 +272,6 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
         if (unlikely(wc->status != IB_WC_SUCCESS)) {
                 set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
                 svc_xprt_enqueue(&rdma->sc_xprt);
-                if (wc->status != IB_WC_WR_FLUSH_ERR)
-                        pr_err("svcrdma: Send: %s (%u/0x%x)\n",
-                               ib_wc_status_msg(wc->status),
-                               wc->status, wc->vendor_err);
         }
 
         svc_xprt_put(&rdma->sc_xprt);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index ef6afcf5e9c431..027a3b07d32990 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -590,11 +590,6 @@ static void __svc_rdma_free(struct work_struct *work)
         if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
                 ib_drain_qp(rdma->sc_qp);
 
-        /* We should only be called from kref_put */
-        if (kref_read(&xprt->xpt_ref) != 0)
-                pr_err("svcrdma: sc_xprt still in use? (%d)\n",
-                       kref_read(&xprt->xpt_ref));
-
         svc_rdma_flush_recv_queues(rdma);
 
         /* Final put of backchannel client transport */
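
The RPC_IFDEBUG() trick in patch 06 above is the usual way to keep a
debug-only variable out of non-debug builds. A generic sketch follows;
MY_DEBUG and MY_IFDEBUG are invented names, and the sunrpc macro merely
works along these lines.

    #include <stdio.h>

    #ifdef MY_DEBUG
    # define MY_IFDEBUG(x)  x
    #else
    # define MY_IFDEBUG(x)
    #endif

    static int accept_connection(int fd)
    {
            MY_IFDEBUG(const char *label);  /* compiled out unless MY_DEBUG */

    #ifdef MY_DEBUG
            label = "new connection";
            fprintf(stderr, "%s: fd %d\n", label, fd);
    #endif
            return fd;
    }
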
From b7e5034cbecf5a65b7bfdc2b20a8378039577706 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields"
Date: Wed, 20 Feb 2019 12:54:50 -0500
Subject: [PATCH 08/12] svcrpc: fix UDP on servers with lots of threads

James Pearson found that an NFS server stopped responding to UDP
requests if started with more than 1017 threads.

sv_max_mesg is about 2^20, so that is probably where the calculation
performed by

        svc_sock_setbufsize(svsk->sk_sock,
                            (serv->sv_nrthreads+3) * serv->sv_max_mesg,
                            (serv->sv_nrthreads+3) * serv->sv_max_mesg);

starts to overflow an int.

Reported-by: James Pearson
Tested-by: James Pearson
Cc: stable@vger.kernel.org
Signed-off-by: J. Bruce Fields
---
 net/sunrpc/svcsock.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index a6a060925e5d18..43590a968b734b 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -349,12 +349,16 @@ static ssize_t svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov,
 /*
  * Set socket snd and rcv buffer lengths
  */
-static void svc_sock_setbufsize(struct socket *sock, unsigned int snd,
-                                unsigned int rcv)
+static void svc_sock_setbufsize(struct svc_sock *svsk, unsigned int nreqs)
 {
+        unsigned int max_mesg = svsk->sk_xprt.xpt_server->sv_max_mesg;
+        struct socket *sock = svsk->sk_sock;
+
+        nreqs = min(nreqs, INT_MAX / 2 / max_mesg);
+
         lock_sock(sock->sk);
-        sock->sk->sk_sndbuf = snd * 2;
-        sock->sk->sk_rcvbuf = rcv * 2;
+        sock->sk->sk_sndbuf = nreqs * max_mesg * 2;
+        sock->sk->sk_rcvbuf = nreqs * max_mesg * 2;
         sock->sk->sk_write_space(sock->sk);
         release_sock(sock->sk);
 }
@@ -516,9 +520,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
             * provides an upper bound on the number of threads
             * which will access the socket.
             */
-            svc_sock_setbufsize(svsk->sk_sock,
-                                (serv->sv_nrthreads+3) * serv->sv_max_mesg,
-                                (serv->sv_nrthreads+3) * serv->sv_max_mesg);
+            svc_sock_setbufsize(svsk, serv->sv_nrthreads + 3);
 
         clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
         skb = NULL;
@@ -681,9 +683,7 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
          * receive and respond to one request.
          * svc_udp_recvfrom will re-adjust if necessary
          */
-        svc_sock_setbufsize(svsk->sk_sock,
-                            3 * svsk->sk_xprt.xpt_server->sv_max_mesg,
-                            3 * svsk->sk_xprt.xpt_server->sv_max_mesg);
+        svc_sock_setbufsize(svsk, 3);
 
         /* data might have come in before data_ready set up */
         set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
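
A back-of-the-envelope check of the overflow and of the new clamp. The
numbers are assumptions for illustration: the message size is taken as
exactly 1 MB, while the real sv_max_mesg is slightly larger, which is
why the reported cutoff was 1017 threads rather than roughly 1020.

    #include <limits.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int max_mesg = 1 << 20;        /* stand-in for sv_max_mesg */
            unsigned int nreqs = 2048 + 3;          /* a server with many threads */
            unsigned int cap = INT_MAX / 2 / max_mesg;

            printf("sndbuf in 64-bit math : %lld\n",
                   (long long)nreqs * max_mesg * 2);        /* well past INT_MAX */
            printf("clamped request count : %u\n", cap);    /* 1023 here */
            printf("sndbuf after clamping : %lld\n",
                   (long long)(nreqs < cap ? nreqs : cap) * max_mesg * 2);
            return 0;
    }
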
From c54f24e338ed2a35218f117a4a1afb5f9e2b4e64 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields"
Date: Thu, 21 Feb 2019 10:47:00 -0500
Subject: [PATCH 09/12] nfsd: fix performance-limiting session calculation

We're unintentionally limiting the number of slots per nfsv4.1 session
to 10. Often more than 10 simultaneous RPCs are needed for the best
performance.

This calculation was meant to prevent any one client from using up more
than a third of the limit we set for total memory use across all
clients and sessions. Instead, it's limiting the client to a third of
the maximum for a single session.

Fix this.

Reported-by: Chris Tracy
Cc: stable@vger.kernel.org
Fixes: de766e570413 "nfsd: give out fewer session slots as limit approaches"
Signed-off-by: J. Bruce Fields
---
 fs/nfsd/nfs4state.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index fb3c9844c82ae2..6a45fb00c5fcdc 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1544,16 +1544,16 @@ static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca)
 {
         u32 slotsize = slot_bytes(ca);
         u32 num = ca->maxreqs;
-        int avail;
+        unsigned long avail, total_avail;
 
         spin_lock(&nfsd_drc_lock);
-        avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION,
-                    nfsd_drc_max_mem - nfsd_drc_mem_used);
+        total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used;
+        avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, total_avail);
         /*
          * Never use more than a third of the remaining memory,
          * unless it's the only way to give this client a slot:
          */
-        avail = clamp_t(int, avail, slotsize, avail/3);
+        avail = clamp_t(int, avail, slotsize, total_avail/3);
         num = min_t(int, num, avail / slotsize);
         nfsd_drc_mem_used += num * slotsize;
         spin_unlock(&nfsd_drc_lock);
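
Worked numbers for the change above, under the assumption that
NFSD_MAX_MEM_PER_SESSION is 32 slots of 2048 bytes and that plenty of
DRC memory is free (the real constants live in fs/nfsd/state.h):

    #include <stdio.h>

    int main(void)
    {
            unsigned long max_per_session = 32 * 2048;      /* 65536 */
            unsigned long total_avail = 16UL << 20;         /* lots of memory free */
            unsigned long slotsize = 2048;
            unsigned long avail;

            /* old code: the clamp's upper bound came from the per-session limit */
            avail = max_per_session < total_avail ? max_per_session : total_avail;
            if (avail > avail / 3)
                    avail = avail / 3;                      /* 21845 */
            printf("old slots: %lu\n", avail / slotsize);   /* 10 */

            /* fixed code: clamp against a third of the *total* free memory */
            avail = max_per_session < total_avail ? max_per_session : total_avail;
            if (avail > total_avail / 3)
                    avail = total_avail / 3;                /* no-op here */
            printf("new slots: %lu\n", avail / slotsize);   /* 32 */
            return 0;
    }
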
From b602345da6cbb135ba68cf042df8ec9a73da7981 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 4 Mar 2019 14:08:22 +1100
Subject: [PATCH 10/12] nfsd: fix memory corruption caused by readdir

If an NFSv3 readdir{,plus} request results in the "offset" on one entry
having to be split across 2 pages, and the reply is sized so that the
next directory entry doesn't fit in the requested size, then memory
corruption can happen.

When encode_entry() is called after encoding the last entry that fits,
it notices that ->offset and ->offset1 are set, and so stores the
offset value in the two pages as required. It clears ->offset1 but
*does not* clear ->offset.

Normally this omission doesn't matter as encode_entry_baggage() will be
called, and will set ->offset to a suitable value (not on a page
boundary). But in the case where cd->buflen < elen and nfserr_toosmall
is returned, ->offset is not reset.

This means that nfsd3_proc_readdirplus will see ->offset with a value 4
bytes before the end of a page, and ->offset1 set to NULL. It will try
to write 8 bytes to ->offset.

If we are lucky, the next page will be read-only, and the system will

  BUG: unable to handle kernel paging request at...

If we are unlucky, some innocent page will have the first 4 bytes
corrupted.

nfsd3_proc_readdir() doesn't even check for ->offset1, it just blindly
writes 8 bytes to the offset wherever it is.

Fix this by clearing ->offset after it is used, and copying the
->offset handling code from nfsd3_proc_readdirplus into
nfsd3_proc_readdir.

(Note that the commit hash in the Fixes tag is from the 'history' tree -
this bug predates git.)

Fixes: 0b1d57cf7654 ("[PATCH] kNFSd: Fix nfs3 dentry encoding")
Fixes-URL: https://git.kernel.org/pub/scm/linux/kernel/git/history/history.git/commit/?id=0b1d57cf7654
Cc: stable@vger.kernel.org (v2.6.12+)
Signed-off-by: NeilBrown
Signed-off-by: J. Bruce Fields
---
 fs/nfsd/nfs3proc.c | 16 ++++++++++++++--
 fs/nfsd/nfs3xdr.c  |  1 +
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 9eb8086ea841e6..c9cf46e0c0405d 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -463,8 +463,19 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp)
                                         &resp->common, nfs3svc_encode_entry);
         memcpy(resp->verf, argp->verf, 8);
         resp->count = resp->buffer - argp->buffer;
-        if (resp->offset)
-                xdr_encode_hyper(resp->offset, argp->cookie);
+        if (resp->offset) {
+                loff_t offset = argp->cookie;
+
+                if (unlikely(resp->offset1)) {
+                        /* we ended up with offset on a page boundary */
+                        *resp->offset = htonl(offset >> 32);
+                        *resp->offset1 = htonl(offset & 0xffffffff);
+                        resp->offset1 = NULL;
+                } else {
+                        xdr_encode_hyper(resp->offset, offset);
+                }
+                resp->offset = NULL;
+        }
 
         RETURN_STATUS(nfserr);
 }
@@ -533,6 +544,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp)
                 } else {
                         xdr_encode_hyper(resp->offset, offset);
                 }
+                resp->offset = NULL;
         }
 
         RETURN_STATUS(nfserr);
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 9b973f4f7d01c5..83919116d5cbe4 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -921,6 +921,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
                 } else {
                         xdr_encode_hyper(cd->offset, offset64);
                 }
+                cd->offset = NULL;
         }
 
         /*
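
The heart of the fix above is how the 64-bit directory cookie gets
written back: in one piece when the 8-byte slot is contiguous, as two
32-bit halves when it straddles a page boundary, and in either case the
saved pointer must then be forgotten so a later error path cannot write
through it again. A user-space sketch follows; the struct and the field
names are invented stand-ins for the nfsd response structure.

    #include <arpa/inet.h>
    #include <stddef.h>
    #include <stdint.h>

    struct rd_resp {                 /* stand-in for the readdir response */
            uint32_t *offset;        /* first 4 bytes of the cookie slot */
            uint32_t *offset1;       /* second 4 bytes, when on the next page */
    };

    static void encode_cookie(struct rd_resp *resp, uint64_t cookie)
    {
            if (resp->offset1) {
                    /* slot straddles a page boundary: two separate writes */
                    *resp->offset  = htonl(cookie >> 32);
                    *resp->offset1 = htonl(cookie & 0xffffffff);
                    resp->offset1 = NULL;
            } else {
                    /* contiguous slot: both words live in the same page */
                    resp->offset[0] = htonl(cookie >> 32);
                    resp->offset[1] = htonl(cookie & 0xffffffff);
            }
            resp->offset = NULL;     /* the fix: never write here twice */
    }
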
From dd838821f0a29781b185cd8fb8e48d5c177bd838 Mon Sep 17 00:00:00 2001
From: Yihao Wu
Date: Wed, 6 Mar 2019 21:03:50 +0800
Subject: [PATCH 11/12] nfsd: fix wrong check in write_v4_end_grace()

Commit 62a063b8e7d1 "nfsd4: fix crash on writing v4_end_grace before
nfsd startup" is trying to fix a NULL dereference issue, but the check
it added is inverted: it rejects the write when the nfsd server is
already started rather than when it has not been started yet. Fix the
check.

Fixes: 62a063b8e7d1 "nfsd4: fix crash on writing v4_end_grace before nfsd startup"
Cc: stable@vger.kernel.org
Reviewed-by: Joseph Qi
Signed-off-by: Yihao Wu
Signed-off-by: J. Bruce Fields
---
 fs/nfsd/nfsctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index b33f9785b756ed..ff14c29d01e9d9 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1126,7 +1126,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
         case 'Y':
         case 'y':
         case '1':
-                if (nn->nfsd_serv)
+                if (!nn->nfsd_serv)
                         return -EBUSY;
                 nfsd4_end_grace(nn);
                 break;

From f875a792abe933d0b4553ab6e29c624b58932e41 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Thu, 7 Mar 2019 09:49:46 +1100
Subject: [PATCH 12/12] nfsd: allow nfsv3 readdir request to be larger.

nfsd currently reports the NFSv3 dtpref FSINFO parameter to be
PAGE_SIZE, so NFS clients will typically ask for one page of directory
entries at a time. This is needlessly restrictive as nfsd can handle
larger replies easily.

Also, a READDIR request (but not a READDIRPLUS request) has the count
size clipped to PAGE_SIZE, which is again unnecessary.

This patch lifts these limits so that larger readdir requests can be
used.

Signed-off-by: NeilBrown
Signed-off-by: J. Bruce Fields
---
 fs/nfsd/nfs3proc.c | 2 +-
 fs/nfsd/nfs3xdr.c  | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index c9cf46e0c0405d..8f933e84cec182 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -588,7 +588,7 @@ nfsd3_proc_fsinfo(struct svc_rqst *rqstp)
         resp->f_wtmax  = max_blocksize;
         resp->f_wtpref = max_blocksize;
         resp->f_wtmult = PAGE_SIZE;
-        resp->f_dtpref = PAGE_SIZE;
+        resp->f_dtpref = max_blocksize;
         resp->f_maxfilesize = ~(u32) 0;
         resp->f_properties = NFS3_FSF_DEFAULT;
 
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 83919116d5cbe4..93fea246f676eb 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -573,6 +573,8 @@ int
 nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p)
 {
         struct nfsd3_readdirargs *args = rqstp->rq_argp;
+        u32 max_blocksize = svc_max_payload(rqstp);
+
         p = decode_fh(p, &args->fh);
         if (!p)
                 return 0;
@@ -580,7 +582,7 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p)
         args->verf = p; p += 2;
         args->dircount = ~0;
         args->count  = ntohl(*p++);
-        args->count  = min_t(u32, args->count, PAGE_SIZE);
+        args->count  = min_t(u32, args->count, max_blocksize);
         args->buffer = page_address(*(rqstp->rq_next_page++));
 
         return xdr_argsize_check(rqstp, p);
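
With the two hunks above in place, the READDIR reply size is bounded by
the transport's maximum payload rather than one page. A quick
illustration with made-up numbers (a client asking for 64 KB on a server
whose svc_max_payload() works out to 1 MB):

    #include <stdio.h>

    static unsigned int min_u32(unsigned int a, unsigned int b)
    {
            return a < b ? a : b;
    }

    int main(void)
    {
            unsigned int client_count = 64 * 1024;  /* READDIR count argument */
            unsigned int page_size = 4096;
            unsigned int max_payload = 1 << 20;     /* assumed payload limit */

            printf("old clamp: %u bytes\n", min_u32(client_count, page_size));
            printf("new clamp: %u bytes\n", min_u32(client_count, max_payload));
            return 0;
    }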