Merge tag 'nfsd-6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 26 Jun 2023 17:48:57 +0000 (10:48 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 26 Jun 2023 17:48:57 +0000 (10:48 -0700)
Pull nfsd updates from Chuck Lever:

 - Clean-ups in the READ path in anticipation of MSG_SPLICE_PAGES

 - Better NUMA awareness when allocating pages and other objects

 - A number of minor clean-ups to XDR encoding

 - Elimination of a race when accepting a TCP socket

 - Numerous observability enhancements

* tag 'nfsd-6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux: (46 commits)
  nfsd: remove redundant assignments to variable len
  svcrdma: Fix stale comment
  NFSD: Distinguish per-net namespace initialization
  nfsd: move init of percpu reply_cache_stats counters back to nfsd_init_net
  SUNRPC: Address RCU warning in net/sunrpc/svc.c
  SUNRPC: Use sysfs_emit in place of strlcpy/sprintf
  SUNRPC: Remove transport class dprintk call sites
  SUNRPC: Fix comments for transport class registration
  svcrdma: Remove an unused argument from __svc_rdma_put_rw_ctxt()
  svcrdma: trace cc_release calls
  svcrdma: Convert "might sleep" comment into a code annotation
  NFSD: Add an nfsd4_encode_nfstime4() helper
  SUNRPC: Move initialization of rq_stime
  SUNRPC: Optimize page release in svc_rdma_sendto()
  svcrdma: Prevent page release when nothing was received
  svcrdma: Revert 2a1e4f21d841 ("svcrdma: Normalize Send page handling")
  SUNRPC: Revert 579900670ac7 ("svcrdma: Remove unused sc_pages field")
  SUNRPC: Revert cc93ce9529a6 ("svcrdma: Retain the page backing rq_res.head[0].iov_base")
  NFSD: add encoding of op_recall flag for write delegation
  NFSD: Add "official" reviewers for this subsystem
  ...

31 files changed:
.mailmap
MAINTAINERS
fs/lockd/svc.c
fs/nfsd/cache.h
fs/nfsd/export.c
fs/nfsd/nfs3proc.c
fs/nfsd/nfs3xdr.c
fs/nfsd/nfs4xdr.c
fs/nfsd/nfscache.c
fs/nfsd/nfsctl.c
fs/nfsd/nfsfh.c
fs/nfsd/nfsproc.c
fs/nfsd/nfssvc.c
fs/nfsd/nfsxdr.c
fs/nfsd/trace.h
fs/nfsd/vfs.c
fs/nfsd/vfs.h
include/linux/sunrpc/svc.h
include/linux/sunrpc/svc_rdma.h
include/linux/sunrpc/xdr.h
include/trace/events/rpcrdma.h
include/trace/events/sunrpc.h
net/sunrpc/svc.c
net/sunrpc/svc_xprt.c
net/sunrpc/svcsock.c
net/sunrpc/xdr.c
net/sunrpc/xprtrdma/svc_rdma_backchannel.c
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
net/sunrpc/xprtrdma/svc_rdma_rw.c
net/sunrpc/xprtrdma/svc_rdma_sendto.c
net/sunrpc/xprtrdma/svc_rdma_transport.c

index c94da2a..4d71480 100644 (file)
--- a/.mailmap
+++ b/.mailmap
@@ -183,6 +183,8 @@ Henrik Rydberg <rydberg@bitmath.org>
 Herbert Xu <herbert@gondor.apana.org.au>
 Huacai Chen <chenhuacai@kernel.org> <chenhc@lemote.com>
 Huacai Chen <chenhuacai@kernel.org> <chenhuacai@loongson.cn>
+J. Bruce Fields <bfields@fieldses.org> <bfields@redhat.com>
+J. Bruce Fields <bfields@fieldses.org> <bfields@citi.umich.edu>
 Jacob Shin <Jacob.Shin@amd.com>
 Jaegeuk Kim <jaegeuk@kernel.org> <jaegeuk@google.com>
 Jaegeuk Kim <jaegeuk@kernel.org> <jaegeuk.kim@samsung.com>
index 35e1959..233b9a3 100644 (file)
@@ -11275,6 +11275,10 @@ W:     http://kernelnewbies.org/KernelJanitors
 KERNEL NFSD, SUNRPC, AND LOCKD SERVERS
 M:     Chuck Lever <chuck.lever@oracle.com>
 M:     Jeff Layton <jlayton@kernel.org>
+R:     Neil Brown <neilb@suse.de>
+R:     Olga Kornievskaia <kolga@netapp.com>
+R:     Dai Ngo <Dai.Ngo@oracle.com>
+R:     Tom Talpey <tom@talpey.com>
 L:     linux-nfs@vger.kernel.org
 S:     Supported
 W:     http://nfs.sourceforge.net/
index 04ba95b..22d3ff3 100644 (file)
@@ -355,7 +355,6 @@ static int lockd_get(void)
        int error;
 
        if (nlmsvc_serv) {
-               svc_get(nlmsvc_serv);
                nlmsvc_users++;
                return 0;
        }
index f21259e..4c9b878 100644 (file)
@@ -80,6 +80,8 @@ enum {
 
 int    nfsd_drc_slab_create(void);
 void   nfsd_drc_slab_free(void);
+int    nfsd_net_reply_cache_init(struct nfsd_net *nn);
+void   nfsd_net_reply_cache_destroy(struct nfsd_net *nn);
 int    nfsd_reply_cache_init(struct nfsd_net *);
 void   nfsd_reply_cache_shutdown(struct nfsd_net *);
 int    nfsd_cache_lookup(struct svc_rqst *);
index ae85257..11a0eaa 100644 (file)
@@ -97,7 +97,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
                goto out;
 
        err = -EINVAL;
-       if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
+       if (qword_get(&mesg, buf, PAGE_SIZE) <= 0)
                goto out;
 
        err = -ENOENT;
@@ -107,7 +107,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
        dprintk("found domain %s\n", buf);
 
        err = -EINVAL;
-       if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
+       if (qword_get(&mesg, buf, PAGE_SIZE) <= 0)
                goto out;
        fsidtype = simple_strtoul(buf, &ep, 10);
        if (*ep)
@@ -593,7 +593,6 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 {
        /* client path expiry [flags anonuid anongid fsid] */
        char *buf;
-       int len;
        int err;
        struct auth_domain *dom = NULL;
        struct svc_export exp = {}, *expp;
@@ -609,8 +608,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 
        /* client */
        err = -EINVAL;
-       len = qword_get(&mesg, buf, PAGE_SIZE);
-       if (len <= 0)
+       if (qword_get(&mesg, buf, PAGE_SIZE) <= 0)
                goto out;
 
        err = -ENOENT;
@@ -620,7 +618,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 
        /* path */
        err = -EINVAL;
-       if ((len = qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
+       if (qword_get(&mesg, buf, PAGE_SIZE) <= 0)
                goto out1;
 
        err = kern_path(buf, 0, &exp.ex_path);
@@ -665,7 +663,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
                        goto out3;
                exp.ex_fsid = an_int;
 
-               while ((len = qword_get(&mesg, buf, PAGE_SIZE)) > 0) {
+               while (qword_get(&mesg, buf, PAGE_SIZE) > 0) {
                        if (strcmp(buf, "fsloc") == 0)
                                err = fsloc_parse(&mesg, buf, &exp.ex_fslocs);
                        else if (strcmp(buf, "uuid") == 0)
index e6bb8ee..fc8d5b7 100644 (file)
@@ -151,8 +151,6 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
 {
        struct nfsd3_readargs *argp = rqstp->rq_argp;
        struct nfsd3_readres *resp = rqstp->rq_resp;
-       unsigned int len;
-       int v;
 
        dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n",
                                SVCFH_fmt(&argp->fh),
@@ -166,17 +164,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
        if (argp->offset + argp->count > (u64)OFFSET_MAX)
                argp->count = (u64)OFFSET_MAX - argp->offset;
 
-       v = 0;
-       len = argp->count;
        resp->pages = rqstp->rq_next_page;
-       while (len > 0) {
-               struct page *page = *(rqstp->rq_next_page++);
-
-               rqstp->rq_vec[v].iov_base = page_address(page);
-               rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
-               len -= rqstp->rq_vec[v].iov_len;
-               v++;
-       }
 
        /* Obtain buffer pointer for payload.
         * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof)
@@ -187,7 +175,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
 
        fh_copy(&resp->fh, &argp->fh);
        resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
-                                rqstp->rq_vec, v, &resp->count, &resp->eof);
+                                &resp->count, &resp->eof);
        return rpc_success;
 }
 
index 3308dd6..f321289 100644 (file)
@@ -828,7 +828,8 @@ nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
                        return false;
                if (xdr_stream_encode_u32(xdr, resp->len) < 0)
                        return false;
-               xdr_write_pages(xdr, resp->pages, 0, resp->len);
+               svcxdr_encode_opaque_pages(rqstp, xdr, resp->pages, 0,
+                                          resp->len);
                if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0)
                        return false;
                break;
@@ -859,8 +860,9 @@ nfs3svc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
                        return false;
                if (xdr_stream_encode_u32(xdr, resp->count) < 0)
                        return false;
-               xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base,
-                               resp->count);
+               svcxdr_encode_opaque_pages(rqstp, xdr, resp->pages,
+                                          rqstp->rq_res.page_base,
+                                          resp->count);
                if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0)
                        return false;
                break;
@@ -961,7 +963,8 @@ nfs3svc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
                        return false;
                if (!svcxdr_encode_cookieverf3(xdr, resp->verf))
                        return false;
-               xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len);
+               svcxdr_encode_opaque_pages(rqstp, xdr, dirlist->pages, 0,
+                                          dirlist->len);
                /* no more entries */
                if (xdr_stream_encode_item_absent(xdr) < 0)
                        return false;
index 76db2fe..26b1343 100644 (file)
@@ -2541,6 +2541,20 @@ static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
        return p;
 }
 
+static __be32 nfsd4_encode_nfstime4(struct xdr_stream *xdr,
+                                   struct timespec64 *tv)
+{
+       __be32 *p;
+
+       p = xdr_reserve_space(xdr, XDR_UNIT * 3);
+       if (!p)
+               return nfserr_resource;
+
+       p = xdr_encode_hyper(p, (s64)tv->tv_sec);
+       *p = cpu_to_be32(tv->tv_nsec);
+       return nfs_ok;
+}
+
 /*
  * ctime (in NFSv4, time_metadata) is not writeable, and the client
  * doesn't really care what resolution could theoretically be stored by
@@ -2566,12 +2580,16 @@ static __be32 *encode_time_delta(__be32 *p, struct inode *inode)
        return p;
 }
 
-static __be32 *encode_cinfo(__be32 *p, struct nfsd4_change_info *c)
+static __be32
+nfsd4_encode_change_info4(struct xdr_stream *xdr, struct nfsd4_change_info *c)
 {
-       *p++ = cpu_to_be32(c->atomic);
-       p = xdr_encode_hyper(p, c->before_change);
-       p = xdr_encode_hyper(p, c->after_change);
-       return p;
+       if (xdr_stream_encode_bool(xdr, c->atomic) < 0)
+               return nfserr_resource;
+       if (xdr_stream_encode_u64(xdr, c->before_change) < 0)
+               return nfserr_resource;
+       if (xdr_stream_encode_u64(xdr, c->after_change) < 0)
+               return nfserr_resource;
+       return nfs_ok;
 }
 
 /* Encode as an array of strings the string given with components
@@ -3348,11 +3366,9 @@ out_acl:
                p = xdr_encode_hyper(p, dummy64);
        }
        if (bmval1 & FATTR4_WORD1_TIME_ACCESS) {
-               p = xdr_reserve_space(xdr, 12);
-               if (!p)
-                       goto out_resource;
-               p = xdr_encode_hyper(p, (s64)stat.atime.tv_sec);
-               *p++ = cpu_to_be32(stat.atime.tv_nsec);
+               status = nfsd4_encode_nfstime4(xdr, &stat.atime);
+               if (status)
+                       goto out;
        }
        if (bmval1 & FATTR4_WORD1_TIME_DELTA) {
                p = xdr_reserve_space(xdr, 12);
@@ -3361,25 +3377,19 @@ out_acl:
                p = encode_time_delta(p, d_inode(dentry));
        }
        if (bmval1 & FATTR4_WORD1_TIME_METADATA) {
-               p = xdr_reserve_space(xdr, 12);
-               if (!p)
-                       goto out_resource;
-               p = xdr_encode_hyper(p, (s64)stat.ctime.tv_sec);
-               *p++ = cpu_to_be32(stat.ctime.tv_nsec);
+               status = nfsd4_encode_nfstime4(xdr, &stat.ctime);
+               if (status)
+                       goto out;
        }
        if (bmval1 & FATTR4_WORD1_TIME_MODIFY) {
-               p = xdr_reserve_space(xdr, 12);
-               if (!p)
-                       goto out_resource;
-               p = xdr_encode_hyper(p, (s64)stat.mtime.tv_sec);
-               *p++ = cpu_to_be32(stat.mtime.tv_nsec);
+               status = nfsd4_encode_nfstime4(xdr, &stat.mtime);
+               if (status)
+                       goto out;
        }
        if (bmval1 & FATTR4_WORD1_TIME_CREATE) {
-               p = xdr_reserve_space(xdr, 12);
-               if (!p)
-                       goto out_resource;
-               p = xdr_encode_hyper(p, (s64)stat.btime.tv_sec);
-               *p++ = cpu_to_be32(stat.btime.tv_nsec);
+               status = nfsd4_encode_nfstime4(xdr, &stat.btime);
+               if (status)
+                       goto out;
        }
        if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
                u64 ino = stat.ino;
@@ -3689,6 +3699,30 @@ fail:
 }
 
 static __be32
+nfsd4_encode_verifier4(struct xdr_stream *xdr, const nfs4_verifier *verf)
+{
+       __be32 *p;
+
+       p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
+       if (!p)
+               return nfserr_resource;
+       memcpy(p, verf->data, sizeof(verf->data));
+       return nfs_ok;
+}
+
+static __be32
+nfsd4_encode_clientid4(struct xdr_stream *xdr, const clientid_t *clientid)
+{
+       __be32 *p;
+
+       p = xdr_reserve_space(xdr, sizeof(__be64));
+       if (!p)
+               return nfserr_resource;
+       memcpy(p, clientid, sizeof(*clientid));
+       return nfs_ok;
+}
+
+static __be32
 nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid)
 {
        __be32 *p;
@@ -3752,15 +3786,8 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr,
                    union nfsd4_op_u *u)
 {
        struct nfsd4_commit *commit = &u->commit;
-       struct xdr_stream *xdr = resp->xdr;
-       __be32 *p;
 
-       p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
-       if (!p)
-               return nfserr_resource;
-       p = xdr_encode_opaque_fixed(p, commit->co_verf.data,
-                                               NFS4_VERIFIER_SIZE);
-       return 0;
+       return nfsd4_encode_verifier4(resp->xdr, &commit->co_verf);
 }
 
 static __be32
@@ -3769,12 +3796,10 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr,
 {
        struct nfsd4_create *create = &u->create;
        struct xdr_stream *xdr = resp->xdr;
-       __be32 *p;
 
-       p = xdr_reserve_space(xdr, 20);
-       if (!p)
-               return nfserr_resource;
-       encode_cinfo(p, &create->cr_cinfo);
+       nfserr = nfsd4_encode_change_info4(xdr, &create->cr_cinfo);
+       if (nfserr)
+               return nfserr;
        return nfsd4_encode_bitmap(xdr, create->cr_bmval[0],
                        create->cr_bmval[1], create->cr_bmval[2]);
 }
@@ -3892,13 +3917,8 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr,
 {
        struct nfsd4_link *link = &u->link;
        struct xdr_stream *xdr = resp->xdr;
-       __be32 *p;
 
-       p = xdr_reserve_space(xdr, 20);
-       if (!p)
-               return nfserr_resource;
-       p = encode_cinfo(p, &link->li_cinfo);
-       return 0;
+       return nfsd4_encode_change_info4(xdr, &link->li_cinfo);
 }
 
 
@@ -3913,11 +3933,11 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr,
        nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid);
        if (nfserr)
                return nfserr;
-       p = xdr_reserve_space(xdr, 24);
-       if (!p)
+       nfserr = nfsd4_encode_change_info4(xdr, &open->op_cinfo);
+       if (nfserr)
+               return nfserr;
+       if (xdr_stream_encode_u32(xdr, open->op_rflags) < 0)
                return nfserr_resource;
-       p = encode_cinfo(p, &open->op_cinfo);
-       *p++ = cpu_to_be32(open->op_rflags);
 
        nfserr = nfsd4_encode_bitmap(xdr, open->op_bmval[0], open->op_bmval[1],
                                        open->op_bmval[2]);
@@ -3956,7 +3976,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr,
                p = xdr_reserve_space(xdr, 32);
                if (!p)
                        return nfserr_resource;
-               *p++ = cpu_to_be32(0);
+               *p++ = cpu_to_be32(open->op_recall);
 
                /*
                 * TODO: space_limit's in delegations
@@ -4018,6 +4038,11 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr,
        return nfsd4_encode_stateid(xdr, &od->od_stateid);
 }
 
+/*
+ * The operation of this function assumes that this is the only
+ * READ operation in the COMPOUND. If there are multiple READs,
+ * we use nfsd4_encode_readv().
+ */
 static __be32 nfsd4_encode_splice_read(
                                struct nfsd4_compoundres *resp,
                                struct nfsd4_read *read,
@@ -4028,8 +4053,12 @@ static __be32 nfsd4_encode_splice_read(
        int status, space_left;
        __be32 nfserr;
 
-       /* Make sure there will be room for padding if needed */
-       if (xdr->end - xdr->p < 1)
+       /*
+        * Make sure there is room at the end of buf->head for
+        * svcxdr_encode_opaque_pages() to create a tail buffer
+        * to XDR-pad the payload.
+        */
+       if (xdr->iov != xdr->buf->head || xdr->end - xdr->p < 1)
                return nfserr_resource;
 
        nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp,
@@ -4038,6 +4067,8 @@ static __be32 nfsd4_encode_splice_read(
        read->rd_length = maxcount;
        if (nfserr)
                goto out_err;
+       svcxdr_encode_opaque_pages(read->rd_rqstp, xdr, buf->pages,
+                                  buf->page_base, maxcount);
        status = svc_encode_result_payload(read->rd_rqstp,
                                           buf->head[0].iov_len, maxcount);
        if (status) {
@@ -4045,31 +4076,19 @@ static __be32 nfsd4_encode_splice_read(
                goto out_err;
        }
 
-       buf->page_len = maxcount;
-       buf->len += maxcount;
-       xdr->page_ptr += (buf->page_base + maxcount + PAGE_SIZE - 1)
-                                                       / PAGE_SIZE;
-
-       /* Use rest of head for padding and remaining ops: */
-       buf->tail[0].iov_base = xdr->p;
-       buf->tail[0].iov_len = 0;
-       xdr->iov = buf->tail;
-       if (maxcount&3) {
-               int pad = 4 - (maxcount&3);
-
-               *(xdr->p++) = 0;
-
-               buf->tail[0].iov_base += maxcount&3;
-               buf->tail[0].iov_len = pad;
-               buf->len += pad;
-       }
-
+       /*
+        * Prepare to encode subsequent operations.
+        *
+        * xdr_truncate_encode() is not safe to use after a successful
+        * splice read has been done, so the following stream
+        * manipulations are open-coded.
+        */
        space_left = min_t(int, (void *)xdr->end - (void *)xdr->p,
                                buf->buflen - buf->len);
        buf->buflen = buf->len + space_left;
        xdr->end = (__be32 *)((void *)xdr->end + space_left);
 
-       return 0;
+       return nfs_ok;
 
 out_err:
        /*
@@ -4090,13 +4109,13 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
        __be32 zero = xdr_zero;
        __be32 nfserr;
 
-       read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, maxcount);
-       if (read->rd_vlen < 0)
+       if (xdr_reserve_space_vec(xdr, maxcount) < 0)
                return nfserr_resource;
 
-       nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
-                           resp->rqstp->rq_vec, read->rd_vlen, &maxcount,
-                           &read->rd_eof);
+       nfserr = nfsd_iter_read(resp->rqstp, read->rd_fhp, file,
+                               read->rd_offset, &maxcount,
+                               xdr->buf->page_len & ~PAGE_MASK,
+                               &read->rd_eof);
        read->rd_length = maxcount;
        if (nfserr)
                return nfserr;
@@ -4213,15 +4232,9 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr,
        int starting_len = xdr->buf->len;
        __be32 *p;
 
-       p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
-       if (!p)
-               return nfserr_resource;
-
-       /* XXX: Following NFSv3, we ignore the READDIR verifier for now. */
-       *p++ = cpu_to_be32(0);
-       *p++ = cpu_to_be32(0);
-       xdr->buf->head[0].iov_len = (char *)xdr->p -
-                                   (char *)xdr->buf->head[0].iov_base;
+       nfserr = nfsd4_encode_verifier4(xdr, &readdir->rd_verf);
+       if (nfserr != nfs_ok)
+               return nfserr;
 
        /*
         * Number of bytes left for directory entries allowing for the
@@ -4299,13 +4312,8 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr,
 {
        struct nfsd4_remove *remove = &u->remove;
        struct xdr_stream *xdr = resp->xdr;
-       __be32 *p;
 
-       p = xdr_reserve_space(xdr, 20);
-       if (!p)
-               return nfserr_resource;
-       p = encode_cinfo(p, &remove->rm_cinfo);
-       return 0;
+       return nfsd4_encode_change_info4(xdr, &remove->rm_cinfo);
 }
 
 static __be32
@@ -4314,14 +4322,11 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr,
 {
        struct nfsd4_rename *rename = &u->rename;
        struct xdr_stream *xdr = resp->xdr;
-       __be32 *p;
 
-       p = xdr_reserve_space(xdr, 40);
-       if (!p)
-               return nfserr_resource;
-       p = encode_cinfo(p, &rename->rn_sinfo);
-       p = encode_cinfo(p, &rename->rn_tinfo);
-       return 0;
+       nfserr = nfsd4_encode_change_info4(xdr, &rename->rn_sinfo);
+       if (nfserr)
+               return nfserr;
+       return nfsd4_encode_change_info4(xdr, &rename->rn_tinfo);
 }
 
 static __be32
@@ -4448,23 +4453,25 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr,
 {
        struct nfsd4_setclientid *scd = &u->setclientid;
        struct xdr_stream *xdr = resp->xdr;
-       __be32 *p;
 
        if (!nfserr) {
-               p = xdr_reserve_space(xdr, 8 + NFS4_VERIFIER_SIZE);
-               if (!p)
-                       return nfserr_resource;
-               p = xdr_encode_opaque_fixed(p, &scd->se_clientid, 8);
-               p = xdr_encode_opaque_fixed(p, &scd->se_confirm,
-                                               NFS4_VERIFIER_SIZE);
-       }
-       else if (nfserr == nfserr_clid_inuse) {
-               p = xdr_reserve_space(xdr, 8);
-               if (!p)
-                       return nfserr_resource;
-               *p++ = cpu_to_be32(0);
-               *p++ = cpu_to_be32(0);
+               nfserr = nfsd4_encode_clientid4(xdr, &scd->se_clientid);
+               if (nfserr != nfs_ok)
+                       goto out;
+               nfserr = nfsd4_encode_verifier4(xdr, &scd->se_confirm);
+       } else if (nfserr == nfserr_clid_inuse) {
+               /* empty network id */
+               if (xdr_stream_encode_u32(xdr, 0) < 0) {
+                       nfserr = nfserr_resource;
+                       goto out;
+               }
+               /* empty universal address */
+               if (xdr_stream_encode_u32(xdr, 0) < 0) {
+                       nfserr = nfserr_resource;
+                       goto out;
+               }
        }
+out:
        return nfserr;
 }
 
@@ -4473,17 +4480,12 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr,
                   union nfsd4_op_u *u)
 {
        struct nfsd4_write *write = &u->write;
-       struct xdr_stream *xdr = resp->xdr;
-       __be32 *p;
 
-       p = xdr_reserve_space(xdr, 16);
-       if (!p)
+       if (xdr_stream_encode_u32(resp->xdr, write->wr_bytes_written) < 0)
                return nfserr_resource;
-       *p++ = cpu_to_be32(write->wr_bytes_written);
-       *p++ = cpu_to_be32(write->wr_how_written);
-       p = xdr_encode_opaque_fixed(p, write->wr_verifier.data,
-                                               NFS4_VERIFIER_SIZE);
-       return 0;
+       if (xdr_stream_encode_u32(resp->xdr, write->wr_how_written) < 0)
+               return nfserr_resource;
+       return nfsd4_encode_verifier4(resp->xdr, &write->wr_verifier);
 }
 
 static __be32
@@ -4505,20 +4507,15 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
        server_scope = nn->nfsd_name;
        server_scope_sz = strlen(nn->nfsd_name);
 
-       p = xdr_reserve_space(xdr,
-               8 /* eir_clientid */ +
-               4 /* eir_sequenceid */ +
-               4 /* eir_flags */ +
-               4 /* spr_how */);
-       if (!p)
+       if (nfsd4_encode_clientid4(xdr, &exid->clientid) != nfs_ok)
+               return nfserr_resource;
+       if (xdr_stream_encode_u32(xdr, exid->seqid) < 0)
+               return nfserr_resource;
+       if (xdr_stream_encode_u32(xdr, exid->flags) < 0)
                return nfserr_resource;
 
-       p = xdr_encode_opaque_fixed(p, &exid->clientid, 8);
-       *p++ = cpu_to_be32(exid->seqid);
-       *p++ = cpu_to_be32(exid->flags);
-
-       *p++ = cpu_to_be32(exid->spa_how);
-
+       if (xdr_stream_encode_u32(xdr, exid->spa_how) < 0)
+               return nfserr_resource;
        switch (exid->spa_how) {
        case SP4_NONE:
                break;
@@ -5099,15 +5096,8 @@ nfsd4_encode_setxattr(struct nfsd4_compoundres *resp, __be32 nfserr,
 {
        struct nfsd4_setxattr *setxattr = &u->setxattr;
        struct xdr_stream *xdr = resp->xdr;
-       __be32 *p;
 
-       p = xdr_reserve_space(xdr, 20);
-       if (!p)
-               return nfserr_resource;
-
-       encode_cinfo(p, &setxattr->setxa_cinfo);
-
-       return 0;
+       return nfsd4_encode_change_info4(xdr, &setxattr->setxa_cinfo);
 }
 
 /*
@@ -5253,14 +5243,8 @@ nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr,
 {
        struct nfsd4_removexattr *removexattr = &u->removexattr;
        struct xdr_stream *xdr = resp->xdr;
-       __be32 *p;
 
-       p = xdr_reserve_space(xdr, 20);
-       if (!p)
-               return nfserr_resource;
-
-       p = encode_cinfo(p, &removexattr->rmxa_cinfo);
-       return 0;
+       return nfsd4_encode_change_info4(xdr, &removexattr->rmxa_cinfo);
 }
 
 typedef __be32(*nfsd4_enc)(struct nfsd4_compoundres *, __be32, union nfsd4_op_u *u);
@@ -5460,6 +5444,12 @@ status:
 release:
        if (opdesc && opdesc->op_release)
                opdesc->op_release(&op->u);
+
+       /*
+        * Account for pages consumed while encoding this operation.
+        * The xdr_stream primitives don't manage rq_next_page.
+        */
+       rqstp->rq_next_page = xdr->page_ptr + 1;
 }
 
 /* 
@@ -5528,9 +5518,6 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
        p = resp->statusp;
 
        *p++ = resp->cstate.status;
-
-       rqstp->rq_next_page = xdr->page_ptr + 1;
-
        *p++ = htonl(resp->taglen);
        memcpy(p, resp->tag, resp->taglen);
        p += XDR_QUADLEN(resp->taglen);
index 041faa1..a8eda1c 100644 (file)
@@ -148,12 +148,23 @@ void nfsd_drc_slab_free(void)
        kmem_cache_destroy(drc_slab);
 }
 
-static int nfsd_reply_cache_stats_init(struct nfsd_net *nn)
+/**
+ * nfsd_net_reply_cache_init - per net namespace reply cache set-up
+ * @nn: nfsd_net being initialized
+ *
+ * Returns zero on success; otherwise a negative errno is returned.
+ */
+int nfsd_net_reply_cache_init(struct nfsd_net *nn)
 {
        return nfsd_percpu_counters_init(nn->counter, NFSD_NET_COUNTERS_NUM);
 }
 
-static void nfsd_reply_cache_stats_destroy(struct nfsd_net *nn)
+/**
+ * nfsd_net_reply_cache_destroy - per net namespace reply cache tear-down
+ * @nn: nfsd_net being freed
+ *
+ */
+void nfsd_net_reply_cache_destroy(struct nfsd_net *nn)
 {
        nfsd_percpu_counters_destroy(nn->counter, NFSD_NET_COUNTERS_NUM);
 }
@@ -169,17 +180,13 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
        hashsize = nfsd_hashsize(nn->max_drc_entries);
        nn->maskbits = ilog2(hashsize);
 
-       status = nfsd_reply_cache_stats_init(nn);
-       if (status)
-               goto out_nomem;
-
        nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan;
        nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count;
        nn->nfsd_reply_cache_shrinker.seeks = 1;
        status = register_shrinker(&nn->nfsd_reply_cache_shrinker,
                                   "nfsd-reply:%s", nn->nfsd_name);
        if (status)
-               goto out_stats_destroy;
+               return status;
 
        nn->drc_hashtbl = kvzalloc(array_size(hashsize,
                                sizeof(*nn->drc_hashtbl)), GFP_KERNEL);
@@ -195,9 +202,6 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
        return 0;
 out_shrinker:
        unregister_shrinker(&nn->nfsd_reply_cache_shrinker);
-out_stats_destroy:
-       nfsd_reply_cache_stats_destroy(nn);
-out_nomem:
        printk(KERN_ERR "nfsd: failed to allocate reply cache\n");
        return -ENOMEM;
 }
@@ -217,7 +221,6 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn)
                                                                        rp, nn);
                }
        }
-       nfsd_reply_cache_stats_destroy(nn);
 
        kvfree(nn->drc_hashtbl);
        nn->drc_hashtbl = NULL;
index b4fd7a7..1b8b1aa 100644 (file)
@@ -25,6 +25,7 @@
 #include "netns.h"
 #include "pnfs.h"
 #include "filecache.h"
+#include "trace.h"
 
 /*
  *     We have a single directory with several nodes in it.
@@ -109,12 +110,12 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu
        if (IS_ERR(data))
                return PTR_ERR(data);
 
-       rv =  write_op[ino](file, data, size);
-       if (rv >= 0) {
-               simple_transaction_set(file, rv);
-               rv = size;
-       }
-       return rv;
+       rv = write_op[ino](file, data, size);
+       if (rv < 0)
+               return rv;
+
+       simple_transaction_set(file, rv);
+       return size;
 }
 
 static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
@@ -230,6 +231,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
        if (rpc_pton(net, fo_path, size, sap, salen) == 0)
                return -EINVAL;
 
+       trace_nfsd_ctl_unlock_ip(net, buf);
        return nlmsvc_unlock_all_by_ip(sap);
 }
 
@@ -263,7 +265,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
        fo_path = buf;
        if (qword_get(&buf, fo_path, size) < 0)
                return -EINVAL;
-
+       trace_nfsd_ctl_unlock_fs(netns(file), fo_path);
        error = kern_path(fo_path, 0, &path);
        if (error)
                return error;
@@ -324,7 +326,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
        len = qword_get(&mesg, dname, size);
        if (len <= 0)
                return -EINVAL;
-       
+
        path = dname+len+1;
        len = qword_get(&mesg, path, size);
        if (len <= 0)
@@ -338,15 +340,17 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
                return -EINVAL;
        maxsize = min(maxsize, NFS3_FHSIZE);
 
-       if (qword_get(&mesg, mesg, size)>0)
+       if (qword_get(&mesg, mesg, size) > 0)
                return -EINVAL;
 
+       trace_nfsd_ctl_filehandle(netns(file), dname, path, maxsize);
+
        /* we have all the words, they are in buf.. */
        dom = unix_domain_find(dname);
        if (!dom)
                return -ENOMEM;
 
-       len = exp_rootfh(netns(file), dom, path, &fh,  maxsize);
+       len = exp_rootfh(netns(file), dom, path, &fh, maxsize);
        auth_domain_put(dom);
        if (len)
                return len;
@@ -399,6 +403,7 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
                        return rv;
                if (newthreads < 0)
                        return -EINVAL;
+               trace_nfsd_ctl_threads(net, newthreads);
                rv = nfsd_svc(newthreads, net, file->f_cred);
                if (rv < 0)
                        return rv;
@@ -418,8 +423,8 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
  * OR
  *
  * Input:
- *                     buf:            C string containing whitespace-
- *                                     separated unsigned integer values
+ *                     buf:            C string containing whitespace-
+ *                                     separated unsigned integer values
  *                                     representing the number of NFSD
  *                                     threads to start in each pool
  *                     size:           non-zero length of C string in @buf
@@ -471,6 +476,7 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
                        rv = -EINVAL;
                        if (nthreads[i] < 0)
                                goto out_free;
+                       trace_nfsd_ctl_pool_threads(net, i, nthreads[i]);
                }
                rv = nfsd_set_nrthreads(i, nthreads, net);
                if (rv)
@@ -526,7 +532,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
        char *sep;
        struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
 
-       if (size>0) {
+       if (size > 0) {
                if (nn->nfsd_serv)
                        /* Cannot change versions without updating
                         * nn->nfsd_serv->sv_xdrsize, and reallocing
@@ -536,6 +542,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
                if (buf[size-1] != '\n')
                        return -EINVAL;
                buf[size-1] = 0;
+               trace_nfsd_ctl_version(netns(file), buf);
 
                vers = mesg;
                len = qword_get(&mesg, vers, size);
@@ -637,11 +644,11 @@ out:
  * OR
  *
  * Input:
- *                     buf:            C string containing whitespace-
- *                                     separated positive or negative
- *                                     integer values representing NFS
- *                                     protocol versions to enable ("+n")
- *                                     or disable ("-n")
+ *                     buf:            C string containing whitespace-
+ *                                     separated positive or negative
+ *                                     integer values representing NFS
+ *                                     protocol versions to enable ("+n")
+ *                                     or disable ("-n")
  *                     size:           non-zero length of C string in @buf
  * Output:
  *     On success:     status of zero or more protocol versions has
@@ -689,6 +696,7 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred
        err = get_int(&mesg, &fd);
        if (err != 0 || fd < 0)
                return -EINVAL;
+       trace_nfsd_ctl_ports_addfd(net, fd);
 
        err = nfsd_create_serv(net);
        if (err != 0)
@@ -705,7 +713,7 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred
 }
 
 /*
- * A transport listener is added by writing it's transport name and
+ * A transport listener is added by writing its transport name and
  * a port number.
  */
 static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cred *cred)
@@ -720,6 +728,7 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr
 
        if (port < 1 || port > USHRT_MAX)
                return -EINVAL;
+       trace_nfsd_ctl_ports_addxprt(net, transport, port);
 
        err = nfsd_create_serv(net);
        if (err != 0)
@@ -832,9 +841,9 @@ int nfsd_max_blksize;
  * OR
  *
  * Input:
- *                     buf:            C string containing an unsigned
- *                                     integer value representing the new
- *                                     NFS blksize
+ *                     buf:            C string containing an unsigned
+ *                                     integer value representing the new
+ *                                     NFS blksize
  *                     size:           non-zero length of C string in @buf
  * Output:
  *     On success:     passed-in buffer filled with '\n'-terminated C string
@@ -853,6 +862,8 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
                int rv = get_int(&mesg, &bsize);
                if (rv)
                        return rv;
+               trace_nfsd_ctl_maxblksize(netns(file), bsize);
+
                /* force bsize into allowed range and
                 * required alignment.
                 */
@@ -881,9 +892,9 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
  * OR
  *
  * Input:
- *                     buf:            C string containing an unsigned
- *                                     integer value representing the new
- *                                     number of max connections
+ *                     buf:            C string containing an unsigned
+ *                                     integer value representing the new
+ *                                     number of max connections
  *                     size:           non-zero length of C string in @buf
  * Output:
  *     On success:     passed-in buffer filled with '\n'-terminated C string
@@ -903,6 +914,7 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size)
 
                if (rv)
                        return rv;
+               trace_nfsd_ctl_maxconn(netns(file), maxconn);
                nn->max_connections = maxconn;
        }
 
@@ -913,6 +925,7 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size)
 static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size,
                                  time64_t *time, struct nfsd_net *nn)
 {
+       struct dentry *dentry = file_dentry(file);
        char *mesg = buf;
        int rv, i;
 
@@ -922,6 +935,9 @@ static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size,
                rv = get_int(&mesg, &i);
                if (rv)
                        return rv;
+               trace_nfsd_ctl_time(netns(file), dentry->d_name.name,
+                                   dentry->d_name.len, i);
+
                /*
                 * Some sanity checking.  We don't have a reason for
                 * these particular numbers, but problems with the
@@ -1014,6 +1030,7 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size,
                len = qword_get(&mesg, recdir, size);
                if (len <= 0)
                        return -EINVAL;
+               trace_nfsd_ctl_recoverydir(netns(file), recdir);
 
                status = nfs4_reset_recoverydir(recdir);
                if (status)
@@ -1065,7 +1082,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
  * OR
  *
  * Input:
- *                     buf:            any value
+ *                     buf:            any value
  *                     size:           non-zero length of C string in @buf
  * Output:
  *                     passed-in buffer filled with "Y" or "N" with a newline
@@ -1087,7 +1104,8 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
                 case '1':
                         if (!nn->nfsd_serv)
                                 return -EBUSY;
+                       trace_nfsd_end_grace(netns(file));
                         nfsd4_end_grace(nn);
                         break;
                 default:
                         return -EINVAL;
@@ -1192,8 +1209,8 @@ static int __nfsd_symlink(struct inode *dir, struct dentry *dentry,
  * @content is assumed to be a NUL-terminated string that lives
  * longer than the symlink itself.
  */
-static void nfsd_symlink(struct dentry *parent, const char *name,
-                        const char *content)
+static void _nfsd_symlink(struct dentry *parent, const char *name,
+                         const char *content)
 {
        struct inode *dir = parent->d_inode;
        struct dentry *dentry;
@@ -1210,8 +1227,8 @@ out:
        inode_unlock(dir);
 }
 #else
-static inline void nfsd_symlink(struct dentry *parent, const char *name,
-                               const char *content)
+static inline void _nfsd_symlink(struct dentry *parent, const char *name,
+                                const char *content)
 {
 }
 
@@ -1389,8 +1406,8 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
        ret = simple_fill_super(sb, 0x6e667364, nfsd_files);
        if (ret)
                return ret;
-       nfsd_symlink(sb->s_root, "supported_krb5_enctypes",
-                    "/proc/net/rpc/gss_krb5_enctypes");
+       _nfsd_symlink(sb->s_root, "supported_krb5_enctypes",
+                     "/proc/net/rpc/gss_krb5_enctypes");
        dentry = nfsd_mkdir(sb->s_root, NULL, "clients");
        if (IS_ERR(dentry))
                return PTR_ERR(dentry);
@@ -1477,7 +1494,17 @@ static int create_proc_exports_entry(void)
 
 unsigned int nfsd_net_id;
 
-static __net_init int nfsd_init_net(struct net *net)
+/**
+ * nfsd_net_init - Prepare the nfsd_net portion of a new net namespace
+ * @net: a freshly-created network namespace
+ *
+ * This information stays around as long as the network namespace is
+ * alive whether or not there is an NFSD instance running in the
+ * namespace.
+ *
+ * Returns zero on success, or a negative errno otherwise.
+ */
+static __net_init int nfsd_net_init(struct net *net)
 {
        int retval;
        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -1488,6 +1515,9 @@ static __net_init int nfsd_init_net(struct net *net)
        retval = nfsd_idmap_init(net);
        if (retval)
                goto out_idmap_error;
+       retval = nfsd_net_reply_cache_init(nn);
+       if (retval)
+               goto out_repcache_error;
        nn->nfsd_versions = NULL;
        nn->nfsd4_minorversions = NULL;
        nfsd4_init_leases_net(nn);
@@ -1496,22 +1526,32 @@ static __net_init int nfsd_init_net(struct net *net)
 
        return 0;
 
+out_repcache_error:
+       nfsd_idmap_shutdown(net);
 out_idmap_error:
        nfsd_export_shutdown(net);
 out_export_error:
        return retval;
 }
 
-static __net_exit void nfsd_exit_net(struct net *net)
+/**
+ * nfsd_net_exit - Release the nfsd_net portion of a net namespace
+ * @net: a network namespace that is about to be destroyed
+ *
+ */
+static __net_exit void nfsd_net_exit(struct net *net)
 {
+       struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+       nfsd_net_reply_cache_destroy(nn);
        nfsd_idmap_shutdown(net);
        nfsd_export_shutdown(net);
-       nfsd_netns_free_versions(net_generic(net, nfsd_net_id));
+       nfsd_netns_free_versions(nn);
 }
 
 static struct pernet_operations nfsd_net_ops = {
-       .init = nfsd_init_net,
-       .exit = nfsd_exit_net,
+       .init = nfsd_net_init,
+       .exit = nfsd_net_exit,
        .id   = &nfsd_net_id,
        .size = sizeof(struct nfsd_net),
 };
index ccd8485..e8e13ae 100644 (file)
@@ -623,16 +623,9 @@ void fh_fill_pre_attrs(struct svc_fh *fhp)
 
        inode = d_inode(fhp->fh_dentry);
        err = fh_getattr(fhp, &stat);
-       if (err) {
-               /* Grab the times from inode anyway */
-               stat.mtime = inode->i_mtime;
-               stat.ctime = inode->i_ctime;
-               stat.size  = inode->i_size;
-               if (v4 && IS_I_VERSION(inode)) {
-                       stat.change_cookie = inode_query_iversion(inode);
-                       stat.result_mask |= STATX_CHANGE_COOKIE;
-               }
-       }
+       if (err)
+               return;
+
        if (v4)
                fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
 
@@ -660,15 +653,10 @@ void fh_fill_post_attrs(struct svc_fh *fhp)
                printk("nfsd: inode locked twice during operation.\n");
 
        err = fh_getattr(fhp, &fhp->fh_post_attr);
-       if (err) {
-               fhp->fh_post_saved = false;
-               fhp->fh_post_attr.ctime = inode->i_ctime;
-               if (v4 && IS_I_VERSION(inode)) {
-                       fhp->fh_post_attr.change_cookie = inode_query_iversion(inode);
-                       fhp->fh_post_attr.result_mask |= STATX_CHANGE_COOKIE;
-               }
-       } else
-               fhp->fh_post_saved = true;
+       if (err)
+               return;
+
+       fhp->fh_post_saved = true;
        if (v4)
                fhp->fh_post_change =
                        nfsd4_change_attribute(&fhp->fh_post_attr, inode);
index c371955..a731592 100644 (file)
@@ -176,9 +176,7 @@ nfsd_proc_read(struct svc_rqst *rqstp)
 {
        struct nfsd_readargs *argp = rqstp->rq_argp;
        struct nfsd_readres *resp = rqstp->rq_resp;
-       unsigned int len;
        u32 eof;
-       int v;
 
        dprintk("nfsd: READ    %s %d bytes at %d\n",
                SVCFH_fmt(&argp->fh),
@@ -187,17 +185,7 @@ nfsd_proc_read(struct svc_rqst *rqstp)
        argp->count = min_t(u32, argp->count, NFSSVC_MAXBLKSIZE_V2);
        argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen);
 
-       v = 0;
-       len = argp->count;
        resp->pages = rqstp->rq_next_page;
-       while (len > 0) {
-               struct page *page = *(rqstp->rq_next_page++);
-
-               rqstp->rq_vec[v].iov_base = page_address(page);
-               rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
-               len -= rqstp->rq_vec[v].iov_len;
-               v++;
-       }
 
        /* Obtain buffer pointer for payload. 19 is 1 word for
         * status, 17 words for fattr, and 1 word for the byte count.
@@ -207,7 +195,7 @@ nfsd_proc_read(struct svc_rqst *rqstp)
        resp->count = argp->count;
        fh_copy(&resp->fh, &argp->fh);
        resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
-                                rqstp->rq_vec, v, &resp->count, &eof);
+                                &resp->count, &eof);
        if (resp->status == nfs_ok)
                resp->status = fh_getattr(&resp->fh, &resp->stat);
        else if (resp->status == nfserr_jukebox)
index 9c7b1ef..2154fa6 100644 (file)
@@ -402,6 +402,11 @@ void nfsd_reset_write_verifier(struct nfsd_net *nn)
        write_sequnlock(&nn->writeverf_lock);
 }
 
+/*
+ * Crank up a set of per-namespace resources for a new NFSD instance,
+ * including lockd, a duplicate reply cache, an open file cache
+ * instance, and a cache of NFSv4 state objects.
+ */
 static int nfsd_startup_net(struct net *net, const struct cred *cred)
 {
        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
index caf6355..5777f40 100644 (file)
@@ -468,7 +468,8 @@ nfssvc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
        case nfs_ok:
                if (xdr_stream_encode_u32(xdr, resp->len) < 0)
                        return false;
-               xdr_write_pages(xdr, &resp->page, 0, resp->len);
+               svcxdr_encode_opaque_pages(rqstp, xdr, &resp->page, 0,
+                                          resp->len);
                if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0)
                        return false;
                break;
@@ -491,8 +492,9 @@ nfssvc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
                        return false;
                if (xdr_stream_encode_u32(xdr, resp->count) < 0)
                        return false;
-               xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base,
-                               resp->count);
+               svcxdr_encode_opaque_pages(rqstp, xdr, resp->pages,
+                                          rqstp->rq_res.page_base,
+                                          resp->count);
                if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0)
                        return false;
                break;
@@ -511,7 +513,8 @@ nfssvc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
                return false;
        switch (resp->status) {
        case nfs_ok:
-               xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len);
+               svcxdr_encode_opaque_pages(rqstp, xdr, dirlist->pages, 0,
+                                          dirlist->len);
                /* no more entries */
                if (xdr_stream_encode_item_absent(xdr) < 0)
                        return false;
index 72a906a..2af7498 100644 (file)
@@ -1581,6 +1581,265 @@ TRACE_EVENT(nfsd_cb_recall_any_done,
        )
 );
 
+TRACE_EVENT(nfsd_ctl_unlock_ip,
+       TP_PROTO(
+               const struct net *net,
+               const char *address
+       ),
+       TP_ARGS(net, address),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __string(address, address)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+               __assign_str(address, address);
+       ),
+       TP_printk("address=%s",
+               __get_str(address)
+       )
+);
+
+TRACE_EVENT(nfsd_ctl_unlock_fs,
+       TP_PROTO(
+               const struct net *net,
+               const char *path
+       ),
+       TP_ARGS(net, path),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __string(path, path)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+               __assign_str(path, path);
+       ),
+       TP_printk("path=%s",
+               __get_str(path)
+       )
+);
+
+TRACE_EVENT(nfsd_ctl_filehandle,
+       TP_PROTO(
+               const struct net *net,
+               const char *domain,
+               const char *path,
+               int maxsize
+       ),
+       TP_ARGS(net, domain, path, maxsize),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __field(int, maxsize)
+               __string(domain, domain)
+               __string(path, path)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+               __entry->maxsize = maxsize;
+               __assign_str(domain, domain);
+               __assign_str(path, path);
+       ),
+       TP_printk("domain=%s path=%s maxsize=%d",
+               __get_str(domain), __get_str(path), __entry->maxsize
+       )
+);
+
+TRACE_EVENT(nfsd_ctl_threads,
+       TP_PROTO(
+               const struct net *net,
+               int newthreads
+       ),
+       TP_ARGS(net, newthreads),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __field(int, newthreads)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+               __entry->newthreads = newthreads;
+       ),
+       TP_printk("newthreads=%d",
+               __entry->newthreads
+       )
+);
+
+TRACE_EVENT(nfsd_ctl_pool_threads,
+       TP_PROTO(
+               const struct net *net,
+               int pool,
+               int nrthreads
+       ),
+       TP_ARGS(net, pool, nrthreads),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __field(int, pool)
+               __field(int, nrthreads)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+               __entry->pool = pool;
+               __entry->nrthreads = nrthreads;
+       ),
+       TP_printk("pool=%d nrthreads=%d",
+               __entry->pool, __entry->nrthreads
+       )
+);
+
+TRACE_EVENT(nfsd_ctl_version,
+       TP_PROTO(
+               const struct net *net,
+               const char *mesg
+       ),
+       TP_ARGS(net, mesg),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __string(mesg, mesg)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+               __assign_str(mesg, mesg);
+       ),
+       TP_printk("%s",
+               __get_str(mesg)
+       )
+);
+
+TRACE_EVENT(nfsd_ctl_ports_addfd,
+       TP_PROTO(
+               const struct net *net,
+               int fd
+       ),
+       TP_ARGS(net, fd),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __field(int, fd)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+               __entry->fd = fd;
+       ),
+       TP_printk("fd=%d",
+               __entry->fd
+       )
+);
+
+TRACE_EVENT(nfsd_ctl_ports_addxprt,
+       TP_PROTO(
+               const struct net *net,
+               const char *transport,
+               int port
+       ),
+       TP_ARGS(net, transport, port),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __field(int, port)
+               __string(transport, transport)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+               __entry->port = port;
+               __assign_str(transport, transport);
+       ),
+       TP_printk("transport=%s port=%d",
+               __get_str(transport), __entry->port
+       )
+);
+
+TRACE_EVENT(nfsd_ctl_maxblksize,
+       TP_PROTO(
+               const struct net *net,
+               int bsize
+       ),
+       TP_ARGS(net, bsize),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __field(int, bsize)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+               __entry->bsize = bsize;
+       ),
+       TP_printk("bsize=%d",
+               __entry->bsize
+       )
+);
+
+TRACE_EVENT(nfsd_ctl_maxconn,
+       TP_PROTO(
+               const struct net *net,
+               int maxconn
+       ),
+       TP_ARGS(net, maxconn),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __field(int, maxconn)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+               __entry->maxconn = maxconn;
+       ),
+       TP_printk("maxconn=%d",
+               __entry->maxconn
+       )
+);
+
+TRACE_EVENT(nfsd_ctl_time,
+       TP_PROTO(
+               const struct net *net,
+               const char *name,
+               size_t namelen,
+               int time
+       ),
+       TP_ARGS(net, name, namelen, time),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __field(int, time)
+               __string_len(name, name, namelen)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+               __entry->time = time;
+               __assign_str_len(name, name, namelen);
+       ),
+       TP_printk("file=%s time=%d", /* no "\n": trace output ends the line */
+               __get_str(name), __entry->time
+       )
+);
+
+TRACE_EVENT(nfsd_ctl_recoverydir,
+       TP_PROTO(
+               const struct net *net,
+               const char *recdir
+       ),
+       TP_ARGS(net, recdir),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __string(recdir, recdir)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+               __assign_str(recdir, recdir);
+       ),
+       TP_printk("recdir=%s",
+               __get_str(recdir)
+       )
+);
+
+TRACE_EVENT(nfsd_end_grace,
+       TP_PROTO(
+               const struct net *net
+       ),
+       TP_ARGS(net),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+       ),
+       TP_printk("nn=%u", __entry->netns_ino /* unsigned field: %u, not %d */
+       )
+);
+
 #endif /* _NFSD_TRACE_H */
 
 #undef TRACE_INCLUDE_PATH
index db67f8e..59b7d60 100644 (file)
@@ -388,7 +388,9 @@ nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap)
                                iap->ia_mode &= ~S_ISGID;
                } else {
                        /* set ATTR_KILL_* bits and let VFS handle it */
-                       iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID);
+                       iap->ia_valid |= ATTR_KILL_SUID;
+                       iap->ia_valid |=
+                               setattr_should_drop_sgid(&nop_mnt_idmap, inode);
                }
        }
 }
@@ -1001,6 +1003,18 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
        }
 }
 
+/**
+ * nfsd_splice_read - Perform a VFS read using a splice pipe
+ * @rqstp: RPC transaction context
+ * @fhp: file handle of file to be read
+ * @file: opened struct file of file to be read
+ * @offset: starting byte offset
+ * @count: IN: requested number of bytes; OUT: number of bytes read
+ * @eof: OUT: set non-zero if operation reached the end of the file
+ *
+ * Returns nfs_ok on success, otherwise an nfserr stat value is
+ * returned.
+ */
 __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
                        struct file *file, loff_t offset, unsigned long *count,
                        u32 *eof)
@@ -1014,22 +1028,50 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
        ssize_t host_err;
 
        trace_nfsd_read_splice(rqstp, fhp, offset, *count);
-       rqstp->rq_next_page = rqstp->rq_respages + 1;
        host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
        return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
 }
 
-__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp,
-                 struct file *file, loff_t offset,
-                 struct kvec *vec, int vlen, unsigned long *count,
-                 u32 *eof)
+/**
+ * nfsd_iter_read - Perform a VFS read using an iterator
+ * @rqstp: RPC transaction context
+ * @fhp: file handle of file to be read
+ * @file: opened struct file of file to be read
+ * @offset: starting byte offset
+ * @count: IN: requested number of bytes; OUT: number of bytes read
+ * @base: offset in first page of read buffer
+ * @eof: OUT: set non-zero if operation reached the end of the file
+ *
+ * Some filesystems or situations cannot use nfsd_splice_read. This
+ * function is the slightly less-performant fallback for those cases.
+ *
+ * Returns nfs_ok on success, otherwise an nfserr stat value is
+ * returned.
+ */
+__be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+                     struct file *file, loff_t offset, unsigned long *count,
+                     unsigned int base, u32 *eof)
 {
+       unsigned long v, total;
        struct iov_iter iter;
        loff_t ppos = offset;
+       struct page *page;
        ssize_t host_err;
 
+       v = 0;
+       total = *count;
+       while (total) {
+               page = *(rqstp->rq_next_page++);
+               rqstp->rq_vec[v].iov_base = page_address(page) + base;
+               rqstp->rq_vec[v].iov_len = min_t(size_t, total, PAGE_SIZE - base);
+               total -= rqstp->rq_vec[v].iov_len;
+               ++v;
+               base = 0;
+       }
+       WARN_ON_ONCE(v > ARRAY_SIZE(rqstp->rq_vec));
+
        trace_nfsd_read_vector(rqstp, fhp, offset, *count);
-       iov_iter_kvec(&iter, ITER_DEST, vec, vlen, *count);
+       iov_iter_kvec(&iter, ITER_DEST, rqstp->rq_vec, v, *count);
        host_err = vfs_iter_read(file, &iter, &ppos, 0);
        return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
 }
@@ -1159,14 +1201,24 @@ out_nfserr:
        return nfserr;
 }
 
-/*
- * Read data from a file. count must contain the requested read count
- * on entry. On return, *count contains the number of bytes actually read.
+/**
+ * nfsd_read - Read data from a file
+ * @rqstp: RPC transaction context
+ * @fhp: file handle of file to be read
+ * @offset: starting byte offset
+ * @count: IN: requested number of bytes; OUT: number of bytes read
+ * @eof: OUT: set non-zero if operation reached the end of the file
+ *
+ * The caller must verify that there is enough space in @rqstp.rq_res
+ * to perform this operation.
+ *
  * N.B. After this call fhp needs an fh_put
+ *
+ * Returns nfs_ok on success, otherwise an nfserr stat value is
+ * returned.
  */
 __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
-       loff_t offset, struct kvec *vec, int vlen, unsigned long *count,
-       u32 *eof)
+                loff_t offset, unsigned long *count, u32 *eof)
 {
        struct nfsd_file        *nf;
        struct file *file;
@@ -1181,12 +1233,10 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
        if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags))
                err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof);
        else
-               err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count, eof);
+               err = nfsd_iter_read(rqstp, fhp, file, offset, count, 0, eof);
 
        nfsd_file_put(nf);
-
        trace_nfsd_read_done(rqstp, fhp, offset, *count);
-
        return err;
 }
 
index 43fb57a..a6890ea 100644 (file)
@@ -110,13 +110,12 @@ __be32            nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
                                struct file *file, loff_t offset,
                                unsigned long *count,
                                u32 *eof);
-__be32         nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp,
+__be32         nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
                                struct file *file, loff_t offset,
-                               struct kvec *vec, int vlen,
-                               unsigned long *count,
+                               unsigned long *count, unsigned int base,
                                u32 *eof);
-__be32                 nfsd_read(struct svc_rqst *, struct svc_fh *,
-                               loff_t, struct kvec *, int, unsigned long *,
+__be32         nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+                               loff_t offset, unsigned long *count,
                                u32 *eof);
 __be32                 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t,
                                struct kvec *, int, unsigned long *,
index 762d723..3b10636 100644 (file)
@@ -509,6 +509,27 @@ static inline void svcxdr_init_encode(struct svc_rqst *rqstp)
 }
 
 /**
+ * svcxdr_encode_opaque_pages - Insert pages into an xdr_stream
+ * @rqstp: RPC transaction context; supplies rq_next_page for xdr->page_ptr
+ * @xdr: xdr_stream to be updated
+ * @pages: array of pages to insert
+ * @base: starting offset of first data byte in @pages
+ * @len: number of data bytes in @pages to insert
+ *
+ * After the @pages are added, the tail iovec is instantiated pointing to end
+ * of the head buffer, and the stream is set up to encode into the tail.
+ */
+static inline void svcxdr_encode_opaque_pages(struct svc_rqst *rqstp,
+                                             struct xdr_stream *xdr,
+                                             struct page **pages,
+                                             unsigned int base,
+                                             unsigned int len)
+{
+       xdr_write_pages(xdr, pages, base, len);
+       xdr->page_ptr = rqstp->rq_next_page - 1;
+}
+
+/**
  * svcxdr_set_auth_slack -
  * @rqstp: RPC transaction
  * @slack: buffer space to reserve for the transaction's security flavor
index fbc4bd4..a5ee0af 100644 (file)
@@ -135,7 +135,6 @@ struct svc_rdma_recv_ctxt {
        struct ib_sge           rc_recv_sge;
        void                    *rc_recv_buf;
        struct xdr_stream       rc_stream;
-       bool                    rc_temp;
        u32                     rc_byte_len;
        unsigned int            rc_page_count;
        u32                     rc_inv_rkey;
@@ -155,12 +154,12 @@ struct svc_rdma_send_ctxt {
 
        struct ib_send_wr       sc_send_wr;
        struct ib_cqe           sc_cqe;
-       struct completion       sc_done;
        struct xdr_buf          sc_hdrbuf;
        struct xdr_stream       sc_stream;
        void                    *sc_xprt_buf;
+       int                     sc_page_count;
        int                     sc_cur_sge_no;
-
+       struct page             *sc_pages[RPCSVC_MAXPAGES];
        struct ib_sge           sc_sges[];
 };
 
index 72014c9..f89ec4b 100644 (file)
@@ -242,8 +242,7 @@ extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf,
 extern void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf,
                           struct page **pages, struct rpc_rqst *rqst);
 extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes);
-extern int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec,
-               size_t nbytes);
+extern int xdr_reserve_space_vec(struct xdr_stream *xdr, size_t nbytes);
 extern void __xdr_commit_encode(struct xdr_stream *xdr);
 extern void xdr_truncate_encode(struct xdr_stream *xdr, size_t len);
 extern void xdr_truncate_decode(struct xdr_stream *xdr, size_t len);
index 8f461e0..f8069ef 100644 (file)
@@ -2112,6 +2112,14 @@ DEFINE_POST_CHUNK_EVENT(read);
 DEFINE_POST_CHUNK_EVENT(write);
 DEFINE_POST_CHUNK_EVENT(reply);
 
+DEFINE_EVENT(svcrdma_post_chunk_class, svcrdma_cc_release,
+       TP_PROTO(
+               const struct rpc_rdma_cid *cid,
+               int sqecount
+       ),
+       TP_ARGS(cid, sqecount)
+);
+
 TRACE_EVENT(svcrdma_wc_read,
        TP_PROTO(
                const struct ib_wc *wc,
index 31bc702..69e42ef 100644 (file)
@@ -2104,31 +2104,46 @@ DEFINE_SVC_DEFERRED_EVENT(drop);
 DEFINE_SVC_DEFERRED_EVENT(queue);
 DEFINE_SVC_DEFERRED_EVENT(recv);
 
-TRACE_EVENT(svcsock_new_socket,
+DECLARE_EVENT_CLASS(svcsock_lifetime_class,
        TP_PROTO(
+               const void *svsk,
                const struct socket *socket
        ),
-
-       TP_ARGS(socket),
-
+       TP_ARGS(svsk, socket),
        TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __field(const void *, svsk)
+               __field(const void *, sk)
                __field(unsigned long, type)
                __field(unsigned long, family)
-               __field(bool, listener)
+               __field(unsigned long, state)
        ),
-
        TP_fast_assign(
+               struct sock *sk = socket->sk;
+
+               __entry->netns_ino = sock_net(sk)->ns.inum;
+               __entry->svsk = svsk;
+               __entry->sk = sk;
                __entry->type = socket->type;
-               __entry->family = socket->sk->sk_family;
-               __entry->listener = (socket->sk->sk_state == TCP_LISTEN);
+               __entry->family = sk->sk_family;
+               __entry->state = sk->sk_state;
        ),
-
-       TP_printk("type=%s family=%s%s",
-               show_socket_type(__entry->type),
+       TP_printk("svsk=%p type=%s family=%s%s",
+               __entry->svsk, show_socket_type(__entry->type),
                rpc_show_address_family(__entry->family),
-               __entry->listener ? " (listener)" : ""
+               __entry->state == TCP_LISTEN ? " (listener)" : ""
        )
 );
+#define DEFINE_SVCSOCK_LIFETIME_EVENT(name) \
+       DEFINE_EVENT(svcsock_lifetime_class, name, \
+               TP_PROTO( \
+                       const void *svsk, \
+                       const struct socket *socket \
+               ), \
+               TP_ARGS(svsk, socket))
+
+DEFINE_SVCSOCK_LIFETIME_EVENT(svcsock_new);
+DEFINE_SVCSOCK_LIFETIME_EVENT(svcsock_free);
 
 TRACE_EVENT(svcsock_marker,
        TP_PROTO(
index 79967b6..e7c1012 100644 (file)
@@ -109,15 +109,15 @@ param_get_pool_mode(char *buf, const struct kernel_param *kp)
        switch (*ip)
        {
        case SVC_POOL_AUTO:
-               return strlcpy(buf, "auto\n", 20);
+               return sysfs_emit(buf, "auto\n");
        case SVC_POOL_GLOBAL:
-               return strlcpy(buf, "global\n", 20);
+               return sysfs_emit(buf, "global\n");
        case SVC_POOL_PERCPU:
-               return strlcpy(buf, "percpu\n", 20);
+               return sysfs_emit(buf, "percpu\n");
        case SVC_POOL_PERNODE:
-               return strlcpy(buf, "pernode\n", 20);
+               return sysfs_emit(buf, "pernode\n");
        default:
-               return sprintf(buf, "%d\n", *ip);
+               return sysfs_emit(buf, "%d\n", *ip);
        }
 }
 
@@ -597,34 +597,25 @@ svc_destroy(struct kref *ref)
 }
 EXPORT_SYMBOL_GPL(svc_destroy);
 
-/*
- * Allocate an RPC server's buffer space.
- * We allocate pages and place them in rq_pages.
- */
-static int
+static bool
 svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node)
 {
-       unsigned int pages, arghi;
+       unsigned long pages, ret;
 
        /* bc_xprt uses fore channel allocated buffers */
        if (svc_is_backchannel(rqstp))
-               return 1;
+               return true;
 
        pages = size / PAGE_SIZE + 1; /* extra page as we hold both request and reply.
                                       * We assume one is at most one page
                                       */
-       arghi = 0;
        WARN_ON_ONCE(pages > RPCSVC_MAXPAGES);
        if (pages > RPCSVC_MAXPAGES)
                pages = RPCSVC_MAXPAGES;
-       while (pages) {
-               struct page *p = alloc_pages_node(node, GFP_KERNEL, 0);
-               if (!p)
-                       break;
-               rqstp->rq_pages[arghi++] = p;
-               pages--;
-       }
-       return pages == 0;
+
+       ret = alloc_pages_bulk_array_node(GFP_KERNEL, node, pages,
+                                         rqstp->rq_pages);
+       return ret == pages;
 }
 
 /*
@@ -1173,6 +1164,7 @@ static void __svc_unregister(struct net *net, const u32 program, const u32 versi
  */
 static void svc_unregister(const struct svc_serv *serv, struct net *net)
 {
+       struct sighand_struct *sighand;
        struct svc_program *progp;
        unsigned long flags;
        unsigned int i;
@@ -1189,9 +1181,12 @@ static void svc_unregister(const struct svc_serv *serv, struct net *net)
                }
        }
 
-       spin_lock_irqsave(&current->sighand->siglock, flags);
+       rcu_read_lock();
+       sighand = rcu_dereference(current->sighand);
+       spin_lock_irqsave(&sighand->siglock, flags);
        recalc_sigpending();
-       spin_unlock_irqrestore(&current->sighand->siglock, flags);
+       spin_unlock_irqrestore(&sighand->siglock, flags);
+       rcu_read_unlock();
 }
 
 /*
index 13a1489..62c7919 100644 (file)
@@ -74,13 +74,18 @@ static LIST_HEAD(svc_xprt_class_list);
  *               that no other thread will be using the transport or will
  *               try to set XPT_DEAD.
  */
+
+/**
+ * svc_reg_xprt_class - Register a server-side RPC transport class
+ * @xcl: New transport class to be registered
+ *
+ * Returns zero on success; otherwise a negative errno is returned.
+ */
 int svc_reg_xprt_class(struct svc_xprt_class *xcl)
 {
        struct svc_xprt_class *cl;
        int res = -EEXIST;
 
-       dprintk("svc: Adding svc transport class '%s'\n", xcl->xcl_name);
-
        INIT_LIST_HEAD(&xcl->xcl_list);
        spin_lock(&svc_xprt_class_lock);
        /* Make sure there isn't already a class with the same name */
@@ -96,9 +101,13 @@ out:
 }
 EXPORT_SYMBOL_GPL(svc_reg_xprt_class);
 
+/**
+ * svc_unreg_xprt_class - Unregister a server-side RPC transport class
+ * @xcl: Transport class to be unregistered
+ *
+ */
 void svc_unreg_xprt_class(struct svc_xprt_class *xcl)
 {
-       dprintk("svc: Removing svc transport class '%s'\n", xcl->xcl_name);
        spin_lock(&svc_xprt_class_lock);
        list_del_init(&xcl->xcl_list);
        spin_unlock(&svc_xprt_class_lock);
@@ -685,8 +694,9 @@ static int svc_alloc_arg(struct svc_rqst *rqstp)
        }
 
        for (filled = 0; filled < pages; filled = ret) {
-               ret = alloc_pages_bulk_array(GFP_KERNEL, pages,
-                                            rqstp->rq_pages);
+               ret = alloc_pages_bulk_array_node(GFP_KERNEL,
+                                                 rqstp->rq_pool->sp_id,
+                                                 pages, rqstp->rq_pages);
                if (ret > filled)
                        /* Made progress, don't sleep yet */
                        continue;
@@ -843,15 +853,11 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
                svc_xprt_received(xprt);
        } else if (svc_xprt_reserve_slot(rqstp, xprt)) {
                /* XPT_DATA|XPT_DEFERRED case: */
-               dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
-                       rqstp, rqstp->rq_pool->sp_id, xprt,
-                       kref_read(&xprt->xpt_ref));
                rqstp->rq_deferred = svc_deferred_dequeue(xprt);
                if (rqstp->rq_deferred)
                        len = svc_deferred_recv(rqstp);
                else
                        len = xprt->xpt_ops->xpo_recvfrom(rqstp);
-               rqstp->rq_stime = ktime_get();
                rqstp->rq_reserved = serv->sv_max_mesg;
                atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
        } else
@@ -894,6 +900,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
        err = -EAGAIN;
        if (len <= 0)
                goto out_release;
+
        trace_svc_xdr_recvfrom(&rqstp->rq_arg);
 
        clear_bit(XPT_OLD, &xprt->xpt_flags);
@@ -902,6 +909,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
 
        if (serv->sv_stats)
                serv->sv_stats->netcnt++;
+       rqstp->rq_stime = ktime_get();
        return len;
 out_release:
        rqstp->rq_res.len = 0;
index f77cebe..5f519fc 100644 (file)
@@ -826,12 +826,6 @@ static void svc_tcp_listen_data_ready(struct sock *sk)
 
        trace_sk_data_ready(sk);
 
-       if (svsk) {
-               /* Refer to svc_setup_socket() for details. */
-               rmb();
-               svsk->sk_odata(sk);
-       }
-
        /*
         * This callback may be called twice when a new connection
         * is established as a child socket inherits everything
@@ -840,13 +834,18 @@ static void svc_tcp_listen_data_ready(struct sock *sk)
         *    when one of child sockets become ESTABLISHED.
         * 2) data_ready method of the child socket may be called
         *    when it receives data before the socket is accepted.
-        * In case of 2, we should ignore it silently.
+        * In case of 2, we should ignore it silently and DO NOT
+        * dereference svsk.
         */
-       if (sk->sk_state == TCP_LISTEN) {
-               if (svsk) {
-                       set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
-                       svc_xprt_enqueue(&svsk->sk_xprt);
-               }
+       if (sk->sk_state != TCP_LISTEN)
+               return;
+
+       if (svsk) {
+               /* Refer to svc_setup_socket() for details. */
+               rmb();
+               svsk->sk_odata(sk);
+               set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
+               svc_xprt_enqueue(&svsk->sk_xprt);
        }
 }
 
@@ -887,13 +886,8 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
        clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
        err = kernel_accept(sock, &newsock, O_NONBLOCK);
        if (err < 0) {
-               if (err == -ENOMEM)
-                       printk(KERN_WARNING "%s: no more sockets!\n",
-                              serv->sv_name);
-               else if (err != -EAGAIN)
-                       net_warn_ratelimited("%s: accept failed (err %d)!\n",
-                                            serv->sv_name, -err);
-               trace_svcsock_accept_err(xprt, serv->sv_name, err);
+               if (err != -EAGAIN)
+                       trace_svcsock_accept_err(xprt, serv->sv_name, err);
                return NULL;
        }
        if (IS_ERR(sock_alloc_file(newsock, O_NONBLOCK, NULL)))
@@ -1464,7 +1458,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
        svsk->sk_owspace = inet->sk_write_space;
        /*
         * This barrier is necessary in order to prevent race condition
-        * with svc_data_ready(), svc_listen_data_ready() and others
+        * with svc_data_ready(), svc_tcp_listen_data_ready(), and others
         * when calling callbacks above.
         */
        wmb();
@@ -1476,7 +1470,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
        else
                svc_tcp_init(svsk, serv);
 
-       trace_svcsock_new_socket(sock);
+       trace_svcsock_new(svsk, sock);
        return svsk;
 }
 
@@ -1657,6 +1651,8 @@ static void svc_sock_free(struct svc_xprt *xprt)
        struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
        struct socket *sock = svsk->sk_sock;
 
+       trace_svcsock_free(svsk, sock);
+
        tls_handshake_cancel(sock->sk);
        if (sock->file)
                sockfd_put(sock);
index 36835b2..2a22e78 100644 (file)
@@ -1070,22 +1070,22 @@ __be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes)
 }
 EXPORT_SYMBOL_GPL(xdr_reserve_space);
 
-
 /**
  * xdr_reserve_space_vec - Reserves a large amount of buffer space for sending
  * @xdr: pointer to xdr_stream
- * @vec: pointer to a kvec array
  * @nbytes: number of bytes to reserve
  *
- * Reserves enough buffer space to encode 'nbytes' of data and stores the
- * pointers in 'vec'. The size argument passed to xdr_reserve_space() is
- * determined based on the number of bytes remaining in the current page to
- * avoid invalidating iov_base pointers when xdr_commit_encode() is called.
+ * The size argument passed to xdr_reserve_space() is determined based
+ * on the number of bytes remaining in the current page to avoid
+ * invalidating iov_base pointers when xdr_commit_encode() is called.
+ *
+ * Return values:
+ *   %0: success
+ *   %-EMSGSIZE: not enough space is available in @xdr
  */
-int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec, size_t nbytes)
+int xdr_reserve_space_vec(struct xdr_stream *xdr, size_t nbytes)
 {
-       int thislen;
-       int v = 0;
+       size_t thislen;
        __be32 *p;
 
        /*
@@ -1097,21 +1097,19 @@ int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec, size_t nbyte
                xdr->end = xdr->p;
        }
 
+       /* XXX: Let's find a way to make this more efficient */
        while (nbytes) {
                thislen = xdr->buf->page_len % PAGE_SIZE;
                thislen = min_t(size_t, nbytes, PAGE_SIZE - thislen);
 
                p = xdr_reserve_space(xdr, thislen);
                if (!p)
-                       return -EIO;
+                       return -EMSGSIZE;
 
-               vec[v].iov_base = p;
-               vec[v].iov_len = thislen;
-               v++;
                nbytes -= thislen;
        }
 
-       return v;
+       return 0;
 }
 EXPORT_SYMBOL_GPL(xdr_reserve_space_vec);
 
index aa2227a..7420a2c 100644 (file)
@@ -93,13 +93,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
         */
        get_page(virt_to_page(rqst->rq_buffer));
        sctxt->sc_send_wr.opcode = IB_WR_SEND;
-       ret = svc_rdma_send(rdma, sctxt);
-       if (ret < 0)
-               return ret;
-
-       ret = wait_for_completion_killable(&sctxt->sc_done);
-       svc_rdma_send_ctxt_put(rdma, sctxt);
-       return ret;
+       return svc_rdma_send(rdma, sctxt);
 }
 
 /* Server-side transport endpoint wants a whole page for its send
index a22fe75..85c8bca 100644 (file)
@@ -125,14 +125,15 @@ static void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma,
 static struct svc_rdma_recv_ctxt *
 svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
 {
+       int node = ibdev_to_node(rdma->sc_cm_id->device);
        struct svc_rdma_recv_ctxt *ctxt;
        dma_addr_t addr;
        void *buffer;
 
-       ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
+       ctxt = kmalloc_node(sizeof(*ctxt), GFP_KERNEL, node);
        if (!ctxt)
                goto fail0;
-       buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL);
+       buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node);
        if (!buffer)
                goto fail1;
        addr = ib_dma_map_single(rdma->sc_pd->device, buffer,
@@ -155,7 +156,6 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
        ctxt->rc_recv_sge.length = rdma->sc_max_req_size;
        ctxt->rc_recv_sge.lkey = rdma->sc_pd->local_dma_lkey;
        ctxt->rc_recv_buf = buffer;
-       ctxt->rc_temp = false;
        return ctxt;
 
 fail2:
@@ -232,10 +232,7 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma,
        pcl_free(&ctxt->rc_write_pcl);
        pcl_free(&ctxt->rc_reply_pcl);
 
-       if (!ctxt->rc_temp)
-               llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts);
-       else
-               svc_rdma_recv_ctxt_destroy(rdma, ctxt);
+       llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts);
 }
 
 /**
@@ -258,7 +255,7 @@ void svc_rdma_release_ctxt(struct svc_xprt *xprt, void *vctxt)
 }
 
 static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
-                                  unsigned int wanted, bool temp)
+                                  unsigned int wanted)
 {
        const struct ib_recv_wr *bad_wr = NULL;
        struct svc_rdma_recv_ctxt *ctxt;
@@ -275,7 +272,6 @@ static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
                        break;
 
                trace_svcrdma_post_recv(ctxt);
-               ctxt->rc_temp = temp;
                ctxt->rc_recv_wr.next = recv_chain;
                recv_chain = &ctxt->rc_recv_wr;
                rdma->sc_pending_recvs++;
@@ -309,7 +305,7 @@ err_free:
  */
 bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma)
 {
-       return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests, true);
+       return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests);
 }
 
 /**
@@ -343,7 +339,7 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
         * client reconnects.
         */
        if (rdma->sc_pending_recvs < rdma->sc_max_requests)
-               if (!svc_rdma_refresh_recvs(rdma, rdma->sc_recv_batch, false))
+               if (!svc_rdma_refresh_recvs(rdma, rdma->sc_recv_batch))
                        goto dropped;
 
        /* All wc fields are now known to be valid */
@@ -775,9 +771,6 @@ static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt,
  *
  * The next ctxt is removed from the "receive" lists.
  *
- * - If the ctxt completes a Read, then finish assembling the Call
- *   message and return the number of bytes in the message.
- *
  * - If the ctxt completes a Receive, then construct the Call
  *   message from the contents of the Receive buffer.
  *
@@ -786,7 +779,8 @@ static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt,
  *     in the message.
  *
  *   - If there are Read chunks in this message, post Read WRs to
- *     pull that payload and return 0.
+ *     pull that payload. When the Read WRs complete, build the
+ *     full message and return the number of bytes in it.
  */
 int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 {
@@ -796,6 +790,12 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
        struct svc_rdma_recv_ctxt *ctxt;
        int ret;
 
+       /* Prevent svc_xprt_release() from releasing pages in rq_pages
+        * when returning 0 or an error.
+        */
+       rqstp->rq_respages = rqstp->rq_pages;
+       rqstp->rq_next_page = rqstp->rq_respages;
+
        rqstp->rq_xprt_ctxt = NULL;
 
        ctxt = NULL;
@@ -819,12 +819,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
                                   DMA_FROM_DEVICE);
        svc_rdma_build_arg_xdr(rqstp, ctxt);
 
-       /* Prevent svc_xprt_release from releasing pages in rq_pages
-        * if we return 0 or an error.
-        */
-       rqstp->rq_respages = rqstp->rq_pages;
-       rqstp->rq_next_page = rqstp->rq_respages;
-
        ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg, ctxt);
        if (ret < 0)
                goto out_err;
index 11cf7c6..e460e25 100644 (file)
@@ -62,8 +62,8 @@ svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
        if (node) {
                ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node);
        } else {
-               ctxt = kmalloc(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE),
-                              GFP_KERNEL);
+               ctxt = kmalloc_node(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE),
+                                   GFP_KERNEL, ibdev_to_node(rdma->sc_cm_id->device));
                if (!ctxt)
                        goto out_noctx;
 
@@ -84,8 +84,7 @@ out_noctx:
        return NULL;
 }
 
-static void __svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
-                                  struct svc_rdma_rw_ctxt *ctxt,
+static void __svc_rdma_put_rw_ctxt(struct svc_rdma_rw_ctxt *ctxt,
                                   struct llist_head *list)
 {
        sg_free_table_chained(&ctxt->rw_sg_table, SG_CHUNK_SIZE);
@@ -95,7 +94,7 @@ static void __svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
 static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
                                 struct svc_rdma_rw_ctxt *ctxt)
 {
-       __svc_rdma_put_rw_ctxt(rdma, ctxt, &rdma->sc_rw_ctxts);
+       __svc_rdma_put_rw_ctxt(ctxt, &rdma->sc_rw_ctxts);
 }
 
 /**
@@ -191,6 +190,8 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc,
        struct svc_rdma_rw_ctxt *ctxt;
        LLIST_HEAD(free);
 
+       trace_svcrdma_cc_release(&cc->cc_cid, cc->cc_sqecount);
+
        first = last = NULL;
        while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) {
                list_del(&ctxt->rw_list);
@@ -198,7 +199,7 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc,
                rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp,
                                    rdma->sc_port_num, ctxt->rw_sg_table.sgl,
                                    ctxt->rw_nents, dir);
-               __svc_rdma_put_rw_ctxt(rdma, ctxt, &free);
+               __svc_rdma_put_rw_ctxt(ctxt, &free);
 
                ctxt->rw_node.next = first;
                first = &ctxt->rw_node;
@@ -234,7 +235,8 @@ svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma,
 {
        struct svc_rdma_write_info *info;
 
-       info = kmalloc(sizeof(*info), GFP_KERNEL);
+       info = kmalloc_node(sizeof(*info), GFP_KERNEL,
+                           ibdev_to_node(rdma->sc_cm_id->device));
        if (!info)
                return info;
 
@@ -304,7 +306,8 @@ svc_rdma_read_info_alloc(struct svcxprt_rdma *rdma)
 {
        struct svc_rdma_read_info *info;
 
-       info = kmalloc(sizeof(*info), GFP_KERNEL);
+       info = kmalloc_node(sizeof(*info), GFP_KERNEL,
+                           ibdev_to_node(rdma->sc_cm_id->device));
        if (!info)
                return info;
 
@@ -351,8 +354,7 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc)
        return;
 }
 
-/* This function sleeps when the transport's Send Queue is congested.
- *
+/*
  * Assumptions:
  * - If ib_post_send() succeeds, only one completion is expected,
  *   even if one or more WRs are flushed. This is true when posting
@@ -367,6 +369,8 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)
        struct ib_cqe *cqe;
        int ret;
 
+       might_sleep();
+
        if (cc->cc_sqecount > rdma->sc_sq_depth)
                return -EINVAL;
 
index 22a871e..c6644cc 100644 (file)
@@ -123,18 +123,17 @@ static void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma,
 static struct svc_rdma_send_ctxt *
 svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
 {
+       int node = ibdev_to_node(rdma->sc_cm_id->device);
        struct svc_rdma_send_ctxt *ctxt;
        dma_addr_t addr;
        void *buffer;
-       size_t size;
        int i;
 
-       size = sizeof(*ctxt);
-       size += rdma->sc_max_send_sges * sizeof(struct ib_sge);
-       ctxt = kmalloc(size, GFP_KERNEL);
+       ctxt = kmalloc_node(struct_size(ctxt, sc_sges, rdma->sc_max_send_sges),
+                           GFP_KERNEL, node);
        if (!ctxt)
                goto fail0;
-       buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL);
+       buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node);
        if (!buffer)
                goto fail1;
        addr = ib_dma_map_single(rdma->sc_pd->device, buffer,
@@ -148,7 +147,6 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
        ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe;
        ctxt->sc_send_wr.sg_list = ctxt->sc_sges;
        ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED;
-       init_completion(&ctxt->sc_done);
        ctxt->sc_cqe.done = svc_rdma_wc_send;
        ctxt->sc_xprt_buf = buffer;
        xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf,
@@ -214,6 +212,7 @@ out:
 
        ctxt->sc_send_wr.num_sge = 0;
        ctxt->sc_cur_sge_no = 0;
+       ctxt->sc_page_count = 0;
        return ctxt;
 
 out_empty:
@@ -228,6 +227,8 @@ out_empty:
  * svc_rdma_send_ctxt_put - Return send_ctxt to free list
  * @rdma: controlling svcxprt_rdma
  * @ctxt: object to return to the free list
+ *
+ * Pages left in sc_pages are DMA unmapped and released.
  */
 void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
                            struct svc_rdma_send_ctxt *ctxt)
@@ -235,6 +236,9 @@ void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
        struct ib_device *device = rdma->sc_cm_id->device;
        unsigned int i;
 
+       if (ctxt->sc_page_count)
+               release_pages(ctxt->sc_pages, ctxt->sc_page_count);
+
        /* The first SGE contains the transport header, which
         * remains mapped until @ctxt is destroyed.
         */
@@ -281,12 +285,12 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
                container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe);
 
        svc_rdma_wake_send_waiters(rdma, 1);
-       complete(&ctxt->sc_done);
 
        if (unlikely(wc->status != IB_WC_SUCCESS))
                goto flushed;
 
        trace_svcrdma_wc_send(wc, &ctxt->sc_cid);
+       svc_rdma_send_ctxt_put(rdma, ctxt);
        return;
 
 flushed:
@@ -294,6 +298,7 @@ flushed:
                trace_svcrdma_wc_send_err(wc, &ctxt->sc_cid);
        else
                trace_svcrdma_wc_send_flush(wc, &ctxt->sc_cid);
+       svc_rdma_send_ctxt_put(rdma, ctxt);
        svc_xprt_deferred_close(&rdma->sc_xprt);
 }
 
@@ -310,7 +315,7 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt)
        struct ib_send_wr *wr = &ctxt->sc_send_wr;
        int ret;
 
-       reinit_completion(&ctxt->sc_done);
+       might_sleep();
 
        /* Sync the transport header buffer */
        ib_dma_sync_single_for_device(rdma->sc_pd->device,
@@ -799,6 +804,25 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
                                       svc_rdma_xb_dma_map, &args);
 }
 
+/* The svc_rqst and all resources it owns are released as soon as
+ * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt
+ * so they are released by the Send completion handler.
+ */
+static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
+                                  struct svc_rdma_send_ctxt *ctxt)
+{
+       int i, pages = rqstp->rq_next_page - rqstp->rq_respages;
+
+       ctxt->sc_page_count += pages;
+       for (i = 0; i < pages; i++) {
+               ctxt->sc_pages[i] = rqstp->rq_respages[i];
+               rqstp->rq_respages[i] = NULL;
+       }
+
+       /* Prevent svc_xprt_release from releasing pages in rq_pages */
+       rqstp->rq_next_page = rqstp->rq_respages;
+}
+
 /* Prepare the portion of the RPC Reply that will be transmitted
  * via RDMA Send. The RPC-over-RDMA transport header is prepared
  * in sc_sges[0], and the RPC xdr_buf is prepared in following sges.
@@ -828,6 +852,8 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
        if (ret < 0)
                return ret;
 
+       svc_rdma_save_io_pages(rqstp, sctxt);
+
        if (rctxt->rc_inv_rkey) {
                sctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV;
                sctxt->sc_send_wr.ex.invalidate_rkey = rctxt->rc_inv_rkey;
@@ -835,13 +861,7 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
                sctxt->sc_send_wr.opcode = IB_WR_SEND;
        }
 
-       ret = svc_rdma_send(rdma, sctxt);
-       if (ret < 0)
-               return ret;
-
-       ret = wait_for_completion_killable(&sctxt->sc_done);
-       svc_rdma_send_ctxt_put(rdma, sctxt);
-       return ret;
+       return svc_rdma_send(rdma, sctxt);
 }
 
 /**
@@ -907,8 +927,7 @@ void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
        sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len;
        if (svc_rdma_send(rdma, sctxt))
                goto put_ctxt;
-
-       wait_for_completion_killable(&sctxt->sc_done);
+       return;
 
 put_ctxt:
        svc_rdma_send_ctxt_put(rdma, sctxt);
@@ -976,17 +995,16 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
        ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp);
        if (ret < 0)
                goto put_ctxt;
-
-       /* Prevent svc_xprt_release() from releasing the page backing
-        * rq_res.head[0].iov_base. It's no longer being accessed by
-        * the I/O device. */
-       rqstp->rq_respages++;
        return 0;
 
 reply_chunk:
        if (ret != -E2BIG && ret != -EINVAL)
                goto put_ctxt;
 
+       /* Send completion releases payload pages that were part
+        * of previously posted RDMA Writes.
+        */
+       svc_rdma_save_io_pages(rqstp, sctxt);
        svc_rdma_send_error_msg(rdma, sctxt, rctxt, ret);
        return 0;
 
index ca04f7a..2abd895 100644 (file)
@@ -64,7 +64,7 @@
 #define RPCDBG_FACILITY        RPCDBG_SVCXPRT
 
 static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
-                                                struct net *net);
+                                                struct net *net, int node);
 static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
                                        struct net *net,
                                        struct sockaddr *sa, int salen,
@@ -123,14 +123,14 @@ static void qp_event_handler(struct ib_event *event, void *context)
 }
 
 static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
-                                                struct net *net)
+                                                struct net *net, int node)
 {
-       struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL);
+       struct svcxprt_rdma *cma_xprt;
 
-       if (!cma_xprt) {
-               dprintk("svcrdma: failed to create new transport\n");
+       cma_xprt = kzalloc_node(sizeof(*cma_xprt), GFP_KERNEL, node);
+       if (!cma_xprt)
                return NULL;
-       }
+
        svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
        INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
        INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
@@ -193,9 +193,9 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id,
        struct svcxprt_rdma *newxprt;
        struct sockaddr *sa;
 
-       /* Create a new transport */
        newxprt = svc_rdma_create_xprt(listen_xprt->sc_xprt.xpt_server,
-                                      listen_xprt->sc_xprt.xpt_net);
+                                      listen_xprt->sc_xprt.xpt_net,
+                                      ibdev_to_node(new_cma_id->device));
        if (!newxprt)
                return;
        newxprt->sc_cm_id = new_cma_id;
@@ -304,7 +304,7 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
 
        if (sa->sa_family != AF_INET && sa->sa_family != AF_INET6)
                return ERR_PTR(-EAFNOSUPPORT);
-       cma_xprt = svc_rdma_create_xprt(serv, net);
+       cma_xprt = svc_rdma_create_xprt(serv, net, NUMA_NO_NODE);
        if (!cma_xprt)
                return ERR_PTR(-ENOMEM);
        set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);