Merge tag 'nfsd-6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux

author Linus Torvalds <torvalds@linux-foundation.org>

Mon, 26 Jun 2023 17:48:57 +0000 (10:48 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Mon, 26 Jun 2023 17:48:57 +0000 (10:48 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 26 Jun 2023 17:48:57 +0000 (10:48 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 26 Jun 2023 17:48:57 +0000 (10:48 -0700)
diff --git a/.mailmap b/.mailmap

index c94da2a..4d71480 100644 (file)
--- a/.mailmap
+++ b/.mailmap
@@ -183,6 +183,8 @@ Henrik Rydberg <rydberg@bitmath.org>
  Herbert Xu <herbert@gondor.apana.org.au>
  Huacai Chen <chenhuacai@kernel.org> <chenhc@lemote.com>
  Huacai Chen <chenhuacai@kernel.org> <chenhuacai@loongson.cn>
+J. Bruce Fields <bfields@fieldses.org> <bfields@redhat.com>
+J. Bruce Fields <bfields@fieldses.org> <bfields@citi.umich.edu>
  Jacob Shin <Jacob.Shin@amd.com>
  Jaegeuk Kim <jaegeuk@kernel.org> <jaegeuk@google.com>
  Jaegeuk Kim <jaegeuk@kernel.org> <jaegeuk.kim@samsung.com>
diff --git a/MAINTAINERS b/MAINTAINERS

index 35e1959..233b9a3 100644 (file)
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11275,6 +11275,10 @@ W:     http://kernelnewbies.org/KernelJanitors
  KERNEL NFSD, SUNRPC, AND LOCKD SERVERS
  M:     Chuck Lever <chuck.lever@oracle.com>
  M:     Jeff Layton <jlayton@kernel.org>
+R:     Neil Brown <neilb@suse.de>
+R:     Olga Kornievskaia <kolga@netapp.com>
+R:     Dai Ngo <Dai.Ngo@oracle.com>
+R:     Tom Talpey <tom@talpey.com>
  L:     linux-nfs@vger.kernel.org
  S:     Supported
  W:     http://nfs.sourceforge.net/
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c

index 04ba95b..22d3ff3 100644 (file)
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -355,7 +355,6 @@ static int lockd_get(void)
         int error;
  
         if (nlmsvc_serv) {
-               svc_get(nlmsvc_serv);
                 nlmsvc_users++;
                 return 0;
         }
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h

index f21259e..4c9b878 100644 (file)
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -80,6 +80,8 @@ enum {
  
  int    nfsd_drc_slab_create(void);
  void   nfsd_drc_slab_free(void);
+int    nfsd_net_reply_cache_init(struct nfsd_net *nn);
+void   nfsd_net_reply_cache_destroy(struct nfsd_net *nn);
  int    nfsd_reply_cache_init(struct nfsd_net *);
  void   nfsd_reply_cache_shutdown(struct nfsd_net *);
  int    nfsd_cache_lookup(struct svc_rqst *);
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c

index ae85257..11a0eaa 100644 (file)
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -97,7 +97,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
                 goto out;
  
         err = -EINVAL;
-       if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
+       if (qword_get(&mesg, buf, PAGE_SIZE) <= 0)
                 goto out;
  
         err = -ENOENT;
@@ -107,7 +107,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
         dprintk("found domain %s\n", buf);
  
         err = -EINVAL;
-       if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
+       if (qword_get(&mesg, buf, PAGE_SIZE) <= 0)
                 goto out;
         fsidtype = simple_strtoul(buf, &ep, 10);
         if (*ep)
@@ -593,7 +593,6 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
  {
         /* client path expiry [flags anonuid anongid fsid] */
         char *buf;
-       int len;
         int err;
         struct auth_domain *dom = NULL;
         struct svc_export exp = {}, *expp;
@@ -609,8 +608,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
  
         /* client */
         err = -EINVAL;
-       len = qword_get(&mesg, buf, PAGE_SIZE);
-       if (len <= 0)
+       if (qword_get(&mesg, buf, PAGE_SIZE) <= 0)
                 goto out;
  
         err = -ENOENT;
@@ -620,7 +618,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
  
         /* path */
         err = -EINVAL;
-       if ((len = qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
+       if (qword_get(&mesg, buf, PAGE_SIZE) <= 0)
                 goto out1;
  
         err = kern_path(buf, 0, &exp.ex_path);
@@ -665,7 +663,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
                         goto out3;
                 exp.ex_fsid = an_int;
  
-               while ((len = qword_get(&mesg, buf, PAGE_SIZE)) > 0) {
+               while (qword_get(&mesg, buf, PAGE_SIZE) > 0) {
                         if (strcmp(buf, "fsloc") == 0)
                                 err = fsloc_parse(&mesg, buf, &exp.ex_fslocs);
                         else if (strcmp(buf, "uuid") == 0)
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c

index e6bb8ee..fc8d5b7 100644 (file)
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -151,8 +151,6 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
  {
         struct nfsd3_readargs *argp = rqstp->rq_argp;
         struct nfsd3_readres *resp = rqstp->rq_resp;
-       unsigned int len;
-       int v;
  
         dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n",
                                 SVCFH_fmt(&argp->fh),
@@ -166,17 +164,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
         if (argp->offset + argp->count > (u64)OFFSET_MAX)
                 argp->count = (u64)OFFSET_MAX - argp->offset;
  
-       v = 0;
-       len = argp->count;
         resp->pages = rqstp->rq_next_page;
-       while (len > 0) {
-               struct page *page = *(rqstp->rq_next_page++);
-
-               rqstp->rq_vec[v].iov_base = page_address(page);
-               rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
-               len -= rqstp->rq_vec[v].iov_len;
-               v++;
-       }
  
         /* Obtain buffer pointer for payload.
          * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof)
@@ -187,7 +175,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
  
         fh_copy(&resp->fh, &argp->fh);
         resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
-                                rqstp->rq_vec, v, &resp->count, &resp->eof);
+                                &resp->count, &resp->eof);
         return rpc_success;
  }
  
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c

index 3308dd6..f321289 100644 (file)
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -828,7 +828,8 @@ nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
                         return false;
                 if (xdr_stream_encode_u32(xdr, resp->len) < 0)
                         return false;
-               xdr_write_pages(xdr, resp->pages, 0, resp->len);
+               svcxdr_encode_opaque_pages(rqstp, xdr, resp->pages, 0,
+                                          resp->len);
                 if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0)
                         return false;
                 break;
@@ -859,8 +860,9 @@ nfs3svc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
                         return false;
                 if (xdr_stream_encode_u32(xdr, resp->count) < 0)
                         return false;
-               xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base,
-                               resp->count);
+               svcxdr_encode_opaque_pages(rqstp, xdr, resp->pages,
+                                          rqstp->rq_res.page_base,
+                                          resp->count);
                 if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0)
                         return false;
                 break;
@@ -961,7 +963,8 @@ nfs3svc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
                         return false;
                 if (!svcxdr_encode_cookieverf3(xdr, resp->verf))
                         return false;
-               xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len);
+               svcxdr_encode_opaque_pages(rqstp, xdr, dirlist->pages, 0,
+                                          dirlist->len);
                 /* no more entries */
                 if (xdr_stream_encode_item_absent(xdr) < 0)
                         return false;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c

index 76db2fe..26b1343 100644 (file)
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2541,6 +2541,20 @@ static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
         return p;
  }
  
+static __be32 nfsd4_encode_nfstime4(struct xdr_stream *xdr,
+                                   struct timespec64 *tv)
+{
+       __be32 *p;
+
+       p = xdr_reserve_space(xdr, XDR_UNIT * 3);
+       if (!p)
+               return nfserr_resource;
+
+       p = xdr_encode_hyper(p, (s64)tv->tv_sec);
+       *p = cpu_to_be32(tv->tv_nsec);
+       return nfs_ok;
+}
+
  /*
   * ctime (in NFSv4, time_metadata) is not writeable, and the client
   * doesn't really care what resolution could theoretically be stored by
@@ -2566,12 +2580,16 @@ static __be32 *encode_time_delta(__be32 *p, struct inode *inode)
         return p;
  }
  
-static __be32 *encode_cinfo(__be32 *p, struct nfsd4_change_info *c)
+static __be32
+nfsd4_encode_change_info4(struct xdr_stream *xdr, struct nfsd4_change_info *c)
  {
-       *p++ = cpu_to_be32(c->atomic);
-       p = xdr_encode_hyper(p, c->before_change);
-       p = xdr_encode_hyper(p, c->after_change);
-       return p;
+       if (xdr_stream_encode_bool(xdr, c->atomic) < 0)
+               return nfserr_resource;
+       if (xdr_stream_encode_u64(xdr, c->before_change) < 0)
+               return nfserr_resource;
+       if (xdr_stream_encode_u64(xdr, c->after_change) < 0)
+               return nfserr_resource;
+       return nfs_ok;
  }
  
  /* Encode as an array of strings the string given with components
@@ -3348,11 +3366,9 @@ out_acl:
                 p = xdr_encode_hyper(p, dummy64);
         }
         if (bmval1 & FATTR4_WORD1_TIME_ACCESS) {
-               p = xdr_reserve_space(xdr, 12);
-               if (!p)
-                       goto out_resource;
-               p = xdr_encode_hyper(p, (s64)stat.atime.tv_sec);
-               *p++ = cpu_to_be32(stat.atime.tv_nsec);
+               status = nfsd4_encode_nfstime4(xdr, &stat.atime);
+               if (status)
+                       goto out;
         }
         if (bmval1 & FATTR4_WORD1_TIME_DELTA) {
                 p = xdr_reserve_space(xdr, 12);
@@ -3361,25 +3377,19 @@ out_acl:
                 p = encode_time_delta(p, d_inode(dentry));
         }
         if (bmval1 & FATTR4_WORD1_TIME_METADATA) {
-               p = xdr_reserve_space(xdr, 12);
-               if (!p)
-                       goto out_resource;
-               p = xdr_encode_hyper(p, (s64)stat.ctime.tv_sec);
-               *p++ = cpu_to_be32(stat.ctime.tv_nsec);
+               status = nfsd4_encode_nfstime4(xdr, &stat.ctime);
+               if (status)
+                       goto out;
         }
         if (bmval1 & FATTR4_WORD1_TIME_MODIFY) {
-               p = xdr_reserve_space(xdr, 12);
-               if (!p)
-                       goto out_resource;
-               p = xdr_encode_hyper(p, (s64)stat.mtime.tv_sec);
-               *p++ = cpu_to_be32(stat.mtime.tv_nsec);
+               status = nfsd4_encode_nfstime4(xdr, &stat.mtime);
+               if (status)
+                       goto out;
         }
         if (bmval1 & FATTR4_WORD1_TIME_CREATE) {
-               p = xdr_reserve_space(xdr, 12);
-               if (!p)
-                       goto out_resource;
-               p = xdr_encode_hyper(p, (s64)stat.btime.tv_sec);
-               *p++ = cpu_to_be32(stat.btime.tv_nsec);
+               status = nfsd4_encode_nfstime4(xdr, &stat.btime);
+               if (status)
+                       goto out;
         }
         if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
                 u64 ino = stat.ino;
@@ -3689,6 +3699,30 @@ fail:
  }
  
  static __be32
+nfsd4_encode_verifier4(struct xdr_stream *xdr, const nfs4_verifier *verf)
+{
+       __be32 *p;
+
+       p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
+       if (!p)
+               return nfserr_resource;
+       memcpy(p, verf->data, sizeof(verf->data));
+       return nfs_ok;
+}
+
+static __be32
+nfsd4_encode_clientid4(struct xdr_stream *xdr, const clientid_t *clientid)
+{
+       __be32 *p;
+
+       p = xdr_reserve_space(xdr, sizeof(__be64));
+       if (!p)
+               return nfserr_resource;
+       memcpy(p, clientid, sizeof(*clientid));
+       return nfs_ok;
+}
+
+static __be32
  nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid)
  {
         __be32 *p;
@@ -3752,15 +3786,8 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr,
                     union nfsd4_op_u *u)
  {
         struct nfsd4_commit *commit = &u->commit;
-       struct xdr_stream *xdr = resp->xdr;
-       __be32 *p;
  
-       p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
-       if (!p)
-               return nfserr_resource;
-       p = xdr_encode_opaque_fixed(p, commit->co_verf.data,
-                                               NFS4_VERIFIER_SIZE);
-       return 0;
+       return nfsd4_encode_verifier4(resp->xdr, &commit->co_verf);
  }
  
  static __be32
@@ -3769,12 +3796,10 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr,
  {
         struct nfsd4_create *create = &u->create;
         struct xdr_stream *xdr = resp->xdr;
-       __be32 *p;
  
-       p = xdr_reserve_space(xdr, 20);
-       if (!p)
-               return nfserr_resource;
-       encode_cinfo(p, &create->cr_cinfo);
+       nfserr = nfsd4_encode_change_info4(xdr, &create->cr_cinfo);
+       if (nfserr)
+               return nfserr;
         return nfsd4_encode_bitmap(xdr, create->cr_bmval[0],
                         create->cr_bmval[1], create->cr_bmval[2]);
  }
@@ -3892,13 +3917,8 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr,
  {
         struct nfsd4_link *link = &u->link;
         struct xdr_stream *xdr = resp->xdr;
-       __be32 *p;
  
-       p = xdr_reserve_space(xdr, 20);
-       if (!p)
-               return nfserr_resource;
-       p = encode_cinfo(p, &link->li_cinfo);
-       return 0;
+       return nfsd4_encode_change_info4(xdr, &link->li_cinfo);
  }
  
  
@@ -3913,11 +3933,11 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr,
         nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid);
         if (nfserr)
                 return nfserr;
-       p = xdr_reserve_space(xdr, 24);
-       if (!p)
+       nfserr = nfsd4_encode_change_info4(xdr, &open->op_cinfo);
+       if (nfserr)
+               return nfserr;
+       if (xdr_stream_encode_u32(xdr, open->op_rflags) < 0)
                 return nfserr_resource;
-       p = encode_cinfo(p, &open->op_cinfo);
-       *p++ = cpu_to_be32(open->op_rflags);
  
         nfserr = nfsd4_encode_bitmap(xdr, open->op_bmval[0], open->op_bmval[1],
                                         open->op_bmval[2]);
@@ -3956,7 +3976,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr,
                 p = xdr_reserve_space(xdr, 32);
                 if (!p)
                         return nfserr_resource;
-               *p++ = cpu_to_be32(0);
+               *p++ = cpu_to_be32(open->op_recall);
  
                 /*
                  * TODO: space_limit's in delegations
@@ -4018,6 +4038,11 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr,
         return nfsd4_encode_stateid(xdr, &od->od_stateid);
  }
  
+/*
+ * The operation of this function assumes that this is the only
+ * READ operation in the COMPOUND. If there are multiple READs,
+ * we use nfsd4_encode_readv().
+ */
  static __be32 nfsd4_encode_splice_read(
                                 struct nfsd4_compoundres *resp,
                                 struct nfsd4_read *read,
@@ -4028,8 +4053,12 @@ static __be32 nfsd4_encode_splice_read(
         int status, space_left;
         __be32 nfserr;
  
-       /* Make sure there will be room for padding if needed */
-       if (xdr->end - xdr->p < 1)
+       /*
+        * Make sure there is room at the end of buf->head for
+        * svcxdr_encode_opaque_pages() to create a tail buffer
+        * to XDR-pad the payload.
+        */
+       if (xdr->iov != xdr->buf->head || xdr->end - xdr->p < 1)
                 return nfserr_resource;
  
         nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp,
@@ -4038,6 +4067,8 @@ static __be32 nfsd4_encode_splice_read(
         read->rd_length = maxcount;
         if (nfserr)
                 goto out_err;
+       svcxdr_encode_opaque_pages(read->rd_rqstp, xdr, buf->pages,
+                                  buf->page_base, maxcount);
         status = svc_encode_result_payload(read->rd_rqstp,
                                            buf->head[0].iov_len, maxcount);
         if (status) {
@@ -4045,31 +4076,19 @@ static __be32 nfsd4_encode_splice_read(
                 goto out_err;
         }
  
-       buf->page_len = maxcount;
-       buf->len += maxcount;
-       xdr->page_ptr += (buf->page_base + maxcount + PAGE_SIZE - 1)
-                                                       / PAGE_SIZE;
-
-       /* Use rest of head for padding and remaining ops: */
-       buf->tail[0].iov_base = xdr->p;
-       buf->tail[0].iov_len = 0;
-       xdr->iov = buf->tail;
-       if (maxcount&3) {
-               int pad = 4 - (maxcount&3);
-
-               *(xdr->p++) = 0;
-
-               buf->tail[0].iov_base += maxcount&3;
-               buf->tail[0].iov_len = pad;
-               buf->len += pad;
-       }
-
+       /*
+        * Prepare to encode subsequent operations.
+        *
+        * xdr_truncate_encode() is not safe to use after a successful
+        * splice read has been done, so the following stream
+        * manipulations are open-coded.
+        */
         space_left = min_t(int, (void *)xdr->end - (void *)xdr->p,
                                 buf->buflen - buf->len);
         buf->buflen = buf->len + space_left;
         xdr->end = (__be32 *)((void *)xdr->end + space_left);
  
-       return 0;
+       return nfs_ok;
  
  out_err:
         /*
@@ -4090,13 +4109,13 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
         __be32 zero = xdr_zero;
         __be32 nfserr;
  
-       read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, maxcount);
-       if (read->rd_vlen < 0)
+       if (xdr_reserve_space_vec(xdr, maxcount) < 0)
                 return nfserr_resource;
  
-       nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
-                           resp->rqstp->rq_vec, read->rd_vlen, &maxcount,
-                           &read->rd_eof);
+       nfserr = nfsd_iter_read(resp->rqstp, read->rd_fhp, file,
+                               read->rd_offset, &maxcount,
+                               xdr->buf->page_len & ~PAGE_MASK,
+                               &read->rd_eof);
         read->rd_length = maxcount;
         if (nfserr)
                 return nfserr;
@@ -4213,15 +4232,9 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr,
         int starting_len = xdr->buf->len;
         __be32 *p;
  
-       p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
-       if (!p)
-               return nfserr_resource;
-
-       /* XXX: Following NFSv3, we ignore the READDIR verifier for now. */
-       *p++ = cpu_to_be32(0);
-       *p++ = cpu_to_be32(0);
-       xdr->buf->head[0].iov_len = (char *)xdr->p -
-                                   (char *)xdr->buf->head[0].iov_base;
+       nfserr = nfsd4_encode_verifier4(xdr, &readdir->rd_verf);
+       if (nfserr != nfs_ok)
+               return nfserr;
  
         /*
          * Number of bytes left for directory entries allowing for the
@@ -4299,13 +4312,8 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr,
  {
         struct nfsd4_remove *remove = &u->remove;
         struct xdr_stream *xdr = resp->xdr;
-       __be32 *p;
  
-       p = xdr_reserve_space(xdr, 20);
-       if (!p)
-               return nfserr_resource;
-       p = encode_cinfo(p, &remove->rm_cinfo);
-       return 0;
+       return nfsd4_encode_change_info4(xdr, &remove->rm_cinfo);
  }
  
  static __be32
@@ -4314,14 +4322,11 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr,
  {
         struct nfsd4_rename *rename = &u->rename;
         struct xdr_stream *xdr = resp->xdr;
-       __be32 *p;
  
-       p = xdr_reserve_space(xdr, 40);
-       if (!p)
-               return nfserr_resource;
-       p = encode_cinfo(p, &rename->rn_sinfo);
-       p = encode_cinfo(p, &rename->rn_tinfo);
-       return 0;
+       nfserr = nfsd4_encode_change_info4(xdr, &rename->rn_sinfo);
+       if (nfserr)
+               return nfserr;
+       return nfsd4_encode_change_info4(xdr, &rename->rn_tinfo);
  }
  
  static __be32
@@ -4448,23 +4453,25 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr,
  {
         struct nfsd4_setclientid *scd = &u->setclientid;
         struct xdr_stream *xdr = resp->xdr;
-       __be32 *p;
  
         if (!nfserr) {
-               p = xdr_reserve_space(xdr, 8 + NFS4_VERIFIER_SIZE);
-               if (!p)
-                       return nfserr_resource;
-               p = xdr_encode_opaque_fixed(p, &scd->se_clientid, 8);
-               p = xdr_encode_opaque_fixed(p, &scd->se_confirm,
-                                               NFS4_VERIFIER_SIZE);
-       }
-       else if (nfserr == nfserr_clid_inuse) {
-               p = xdr_reserve_space(xdr, 8);
-               if (!p)
-                       return nfserr_resource;
-               *p++ = cpu_to_be32(0);
-               *p++ = cpu_to_be32(0);
+               nfserr = nfsd4_encode_clientid4(xdr, &scd->se_clientid);
+               if (nfserr != nfs_ok)
+                       goto out;
+               nfserr = nfsd4_encode_verifier4(xdr, &scd->se_confirm);
+       } else if (nfserr == nfserr_clid_inuse) {
+               /* empty network id */
+               if (xdr_stream_encode_u32(xdr, 0) < 0) {
+                       nfserr = nfserr_resource;
+                       goto out;
+               }
+               /* empty universal address */
+               if (xdr_stream_encode_u32(xdr, 0) < 0) {
+                       nfserr = nfserr_resource;
+                       goto out;
+               }
         }
+out:
         return nfserr;
  }
  
@@ -4473,17 +4480,12 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr,
                    union nfsd4_op_u *u)
  {
         struct nfsd4_write *write = &u->write;
-       struct xdr_stream *xdr = resp->xdr;
-       __be32 *p;
  
-       p = xdr_reserve_space(xdr, 16);
-       if (!p)
+       if (xdr_stream_encode_u32(resp->xdr, write->wr_bytes_written) < 0)
                 return nfserr_resource;
-       *p++ = cpu_to_be32(write->wr_bytes_written);
-       *p++ = cpu_to_be32(write->wr_how_written);
-       p = xdr_encode_opaque_fixed(p, write->wr_verifier.data,
-                                               NFS4_VERIFIER_SIZE);
-       return 0;
+       if (xdr_stream_encode_u32(resp->xdr, write->wr_how_written) < 0)
+               return nfserr_resource;
+       return nfsd4_encode_verifier4(resp->xdr, &write->wr_verifier);
  }
  
  static __be32
@@ -4505,20 +4507,15 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
         server_scope = nn->nfsd_name;
         server_scope_sz = strlen(nn->nfsd_name);
  
-       p = xdr_reserve_space(xdr,
-               8 /* eir_clientid */ +
-               4 /* eir_sequenceid */ +
-               4 /* eir_flags */ +
-               4 /* spr_how */);
-       if (!p)
+       if (nfsd4_encode_clientid4(xdr, &exid->clientid) != nfs_ok)
+               return nfserr_resource;
+       if (xdr_stream_encode_u32(xdr, exid->seqid) < 0)
+               return nfserr_resource;
+       if (xdr_stream_encode_u32(xdr, exid->flags) < 0)
                 return nfserr_resource;
  
-       p = xdr_encode_opaque_fixed(p, &exid->clientid, 8);
-       *p++ = cpu_to_be32(exid->seqid);
-       *p++ = cpu_to_be32(exid->flags);
-
-       *p++ = cpu_to_be32(exid->spa_how);
-
+       if (xdr_stream_encode_u32(xdr, exid->spa_how) < 0)
+               return nfserr_resource;
         switch (exid->spa_how) {
         case SP4_NONE:
                 break;
@@ -5099,15 +5096,8 @@ nfsd4_encode_setxattr(struct nfsd4_compoundres *resp, __be32 nfserr,
  {
         struct nfsd4_setxattr *setxattr = &u->setxattr;
         struct xdr_stream *xdr = resp->xdr;
-       __be32 *p;
  
-       p = xdr_reserve_space(xdr, 20);
-       if (!p)
-               return nfserr_resource;
-
-       encode_cinfo(p, &setxattr->setxa_cinfo);
-
-       return 0;
+       return nfsd4_encode_change_info4(xdr, &setxattr->setxa_cinfo);
  }
  
  /*
@@ -5253,14 +5243,8 @@ nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr,
  {
         struct nfsd4_removexattr *removexattr = &u->removexattr;
         struct xdr_stream *xdr = resp->xdr;
-       __be32 *p;
  
-       p = xdr_reserve_space(xdr, 20);
-       if (!p)
-               return nfserr_resource;
-
-       p = encode_cinfo(p, &removexattr->rmxa_cinfo);
-       return 0;
+       return nfsd4_encode_change_info4(xdr, &removexattr->rmxa_cinfo);
  }
  
  typedef __be32(*nfsd4_enc)(struct nfsd4_compoundres *, __be32, union nfsd4_op_u *u);
@@ -5460,6 +5444,12 @@ status:
  release:
         if (opdesc && opdesc->op_release)
                 opdesc->op_release(&op->u);
+
+       /*
+        * Account for pages consumed while encoding this operation.
+        * The xdr_stream primitives don't manage rq_next_page.
+        */
+       rqstp->rq_next_page = xdr->page_ptr + 1;
  }
  
  /* 
@@ -5528,9 +5518,6 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
         p = resp->statusp;
  
         *p++ = resp->cstate.status;
-
-       rqstp->rq_next_page = xdr->page_ptr + 1;
-
         *p++ = htonl(resp->taglen);
         memcpy(p, resp->tag, resp->taglen);
         p += XDR_QUADLEN(resp->taglen);
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c

index 041faa1..a8eda1c 100644 (file)
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -148,12 +148,23 @@ void nfsd_drc_slab_free(void)
         kmem_cache_destroy(drc_slab);
  }
  
-static int nfsd_reply_cache_stats_init(struct nfsd_net *nn)
+/**
+ * nfsd_net_reply_cache_init - per net namespace reply cache set-up
+ * @nn: nfsd_net being initialized
+ *
+ * Returns zero on success; otherwise a negative errno is returned.
+ */
+int nfsd_net_reply_cache_init(struct nfsd_net *nn)
  {
         return nfsd_percpu_counters_init(nn->counter, NFSD_NET_COUNTERS_NUM);
  }
  
-static void nfsd_reply_cache_stats_destroy(struct nfsd_net *nn)
+/**
+ * nfsd_net_reply_cache_destroy - per net namespace reply cache tear-down
+ * @nn: nfsd_net being freed
+ *
+ */
+void nfsd_net_reply_cache_destroy(struct nfsd_net *nn)
  {
         nfsd_percpu_counters_destroy(nn->counter, NFSD_NET_COUNTERS_NUM);
  }
@@ -169,17 +180,13 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
         hashsize = nfsd_hashsize(nn->max_drc_entries);
         nn->maskbits = ilog2(hashsize);
  
-       status = nfsd_reply_cache_stats_init(nn);
-       if (status)
-               goto out_nomem;
-
         nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan;
         nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count;
         nn->nfsd_reply_cache_shrinker.seeks = 1;
         status = register_shrinker(&nn->nfsd_reply_cache_shrinker,
                                    "nfsd-reply:%s", nn->nfsd_name);
         if (status)
-               goto out_stats_destroy;
+               return status;
  
         nn->drc_hashtbl = kvzalloc(array_size(hashsize,
                                 sizeof(*nn->drc_hashtbl)), GFP_KERNEL);
@@ -195,9 +202,6 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
         return 0;
  out_shrinker:
         unregister_shrinker(&nn->nfsd_reply_cache_shrinker);
-out_stats_destroy:
-       nfsd_reply_cache_stats_destroy(nn);
-out_nomem:
         printk(KERN_ERR "nfsd: failed to allocate reply cache\n");
         return -ENOMEM;
  }
@@ -217,7 +221,6 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn)
                                                                         rp, nn);
                 }
         }
-       nfsd_reply_cache_stats_destroy(nn);
  
         kvfree(nn->drc_hashtbl);
         nn->drc_hashtbl = NULL;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c

index b4fd7a7..1b8b1aa 100644 (file)
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -25,6 +25,7 @@
  #include "netns.h"
  #include "pnfs.h"
  #include "filecache.h"
+#include "trace.h"
  
  /*
   *     We have a single directory with several nodes in it.
@@ -109,12 +110,12 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu
         if (IS_ERR(data))
                 return PTR_ERR(data);
  
-       rv =  write_op[ino](file, data, size);
-       if (rv >= 0) {
-               simple_transaction_set(file, rv);
-               rv = size;
-       }
-       return rv;
+       rv = write_op[ino](file, data, size);
+       if (rv < 0)
+               return rv;
+
+       simple_transaction_set(file, rv);
+       return size;
  }
  
  static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
@@ -230,6 +231,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
         if (rpc_pton(net, fo_path, size, sap, salen) == 0)
                 return -EINVAL;
  
+       trace_nfsd_ctl_unlock_ip(net, buf);
         return nlmsvc_unlock_all_by_ip(sap);
  }
  
@@ -263,7 +265,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
         fo_path = buf;
         if (qword_get(&buf, fo_path, size) < 0)
                 return -EINVAL;
-
+       trace_nfsd_ctl_unlock_fs(netns(file), fo_path);
         error = kern_path(fo_path, 0, &path);
         if (error)
                 return error;
@@ -324,7 +326,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
         len = qword_get(&mesg, dname, size);
         if (len <= 0)
                 return -EINVAL;
-       
+
         path = dname+len+1;
         len = qword_get(&mesg, path, size);
         if (len <= 0)
@@ -338,15 +340,17 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
                 return -EINVAL;
         maxsize = min(maxsize, NFS3_FHSIZE);
  
-       if (qword_get(&mesg, mesg, size)>0)
+       if (qword_get(&mesg, mesg, size) > 0)
                 return -EINVAL;
  
+       trace_nfsd_ctl_filehandle(netns(file), dname, path, maxsize);
+
         /* we have all the words, they are in buf.. */
         dom = unix_domain_find(dname);
         if (!dom)
                 return -ENOMEM;
  
-       len = exp_rootfh(netns(file), dom, path, &fh,  maxsize);
+       len = exp_rootfh(netns(file), dom, path, &fh, maxsize);
         auth_domain_put(dom);
         if (len)
                 return len;
@@ -399,6 +403,7 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
                         return rv;
                 if (newthreads < 0)
                         return -EINVAL;
+               trace_nfsd_ctl_threads(net, newthreads);
                 rv = nfsd_svc(newthreads, net, file->f_cred);
                 if (rv < 0)
                         return rv;
@@ -418,8 +423,8 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
   * OR
   *
   * Input:
- *                     buf:            C string containing whitespace-
- *                                     separated unsigned integer values
+ *                     buf:            C string containing whitespace-
+ *                                     separated unsigned integer values
   *                                     representing the number of NFSD
   *                                     threads to start in each pool
   *                     size:           non-zero length of C string in @buf
@@ -471,6 +476,7 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
                         rv = -EINVAL;
                         if (nthreads[i] < 0)
                                 goto out_free;
+                       trace_nfsd_ctl_pool_threads(net, i, nthreads[i]);
                 }
                 rv = nfsd_set_nrthreads(i, nthreads, net);
                 if (rv)
@@ -526,7 +532,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
         char *sep;
         struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
  
-       if (size>0) {
+       if (size > 0) {
                 if (nn->nfsd_serv)
                         /* Cannot change versions without updating
                          * nn->nfsd_serv->sv_xdrsize, and reallocing
@@ -536,6 +542,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
                 if (buf[size-1] != '\n')
                         return -EINVAL;
                 buf[size-1] = 0;
+               trace_nfsd_ctl_version(netns(file), buf);
  
                 vers = mesg;
                 len = qword_get(&mesg, vers, size);
@@ -637,11 +644,11 @@ out:
   * OR
   *
   * Input:
- *                     buf:            C string containing whitespace-
- *                                     separated positive or negative
- *                                     integer values representing NFS
- *                                     protocol versions to enable ("+n")
- *                                     or disable ("-n")
+ *                     buf:            C string containing whitespace-
+ *                                     separated positive or negative
+ *                                     integer values representing NFS
+ *                                     protocol versions to enable ("+n")
+ *                                     or disable ("-n")
   *                     size:           non-zero length of C string in @buf
   * Output:
   *     On success:     status of zero or more protocol versions has
@@ -689,6 +696,7 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred
         err = get_int(&mesg, &fd);
         if (err != 0 || fd < 0)
                 return -EINVAL;
+       trace_nfsd_ctl_ports_addfd(net, fd);
  
         err = nfsd_create_serv(net);
         if (err != 0)
@@ -705,7 +713,7 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred
  }
  
  /*
- * A transport listener is added by writing it's transport name and
+ * A transport listener is added by writing its transport name and
   * a port number.
   */
  static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cred *cred)
@@ -720,6 +728,7 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr
  
         if (port < 1 || port > USHRT_MAX)
                 return -EINVAL;
+       trace_nfsd_ctl_ports_addxprt(net, transport, port);
  
         err = nfsd_create_serv(net);
         if (err != 0)
@@ -832,9 +841,9 @@ int nfsd_max_blksize;
   * OR
   *
   * Input:
- *                     buf:            C string containing an unsigned
- *                                     integer value representing the new
- *                                     NFS blksize
+ *                     buf:            C string containing an unsigned
+ *                                     integer value representing the new
+ *                                     NFS blksize
   *                     size:           non-zero length of C string in @buf
   * Output:
   *     On success:     passed-in buffer filled with '\n'-terminated C string
@@ -853,6 +862,8 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
                 int rv = get_int(&mesg, &bsize);
                 if (rv)
                         return rv;
+               trace_nfsd_ctl_maxblksize(netns(file), bsize);
+
                 /* force bsize into allowed range and
                  * required alignment.
                  */
@@ -881,9 +892,9 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
   * OR
   *
   * Input:
- *                     buf:            C string containing an unsigned
- *                                     integer value representing the new
- *                                     number of max connections
+ *                     buf:            C string containing an unsigned
+ *                                     integer value representing the new
+ *                                     number of max connections
   *                     size:           non-zero length of C string in @buf
   * Output:
   *     On success:     passed-in buffer filled with '\n'-terminated C string
@@ -903,6 +914,7 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size)
  
                 if (rv)
                         return rv;
+               trace_nfsd_ctl_maxconn(netns(file), maxconn);
                 nn->max_connections = maxconn;
         }
  
@@ -913,6 +925,7 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size)
  static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size,
                                   time64_t *time, struct nfsd_net *nn)
  {
+       struct dentry *dentry = file_dentry(file);
         char *mesg = buf;
         int rv, i;
  
@@ -922,6 +935,9 @@ static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size,
                 rv = get_int(&mesg, &i);
                 if (rv)
                         return rv;
+               trace_nfsd_ctl_time(netns(file), dentry->d_name.name,
+                                   dentry->d_name.len, i);
+
                 /*
                  * Some sanity checking.  We don't have a reason for
                  * these particular numbers, but problems with the
@@ -1014,6 +1030,7 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size,
                 len = qword_get(&mesg, recdir, size);
                 if (len <= 0)
                         return -EINVAL;
+               trace_nfsd_ctl_recoverydir(netns(file), recdir);
  
                 status = nfs4_reset_recoverydir(recdir);
                 if (status)
@@ -1065,7 +1082,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
   * OR
   *
   * Input:
- *                     buf:            any value
+ *                     buf:            any value
   *                     size:           non-zero length of C string in @buf
   * Output:
   *                     passed-in buffer filled with "Y" or "N" with a newline
@@ -1087,7 +1104,8 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
                 case '1':
                         if (!nn->nfsd_serv)
                                 return -EBUSY;
+                       trace_nfsd_end_grace(netns(file));
                         nfsd4_end_grace(nn);
                         break;
                 default:
                         return -EINVAL;
@@ -1192,8 +1209,8 @@ static int __nfsd_symlink(struct inode *dir, struct dentry *dentry,
   * @content is assumed to be a NUL-terminated string that lives
   * longer than the symlink itself.
   */
-static void nfsd_symlink(struct dentry *parent, const char *name,
-                        const char *content)
+static void _nfsd_symlink(struct dentry *parent, const char *name,
+                         const char *content)
  {
         struct inode *dir = parent->d_inode;
         struct dentry *dentry;
@@ -1210,8 +1227,8 @@ out:
         inode_unlock(dir);
  }
  #else
-static inline void nfsd_symlink(struct dentry *parent, const char *name,
-                               const char *content)
+static inline void _nfsd_symlink(struct dentry *parent, const char *name,
+                                const char *content)
  {
  }
  
@@ -1389,8 +1406,8 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
         ret = simple_fill_super(sb, 0x6e667364, nfsd_files);
         if (ret)
                 return ret;
-       nfsd_symlink(sb->s_root, "supported_krb5_enctypes",
-                    "/proc/net/rpc/gss_krb5_enctypes");
+       _nfsd_symlink(sb->s_root, "supported_krb5_enctypes",
+                     "/proc/net/rpc/gss_krb5_enctypes");
         dentry = nfsd_mkdir(sb->s_root, NULL, "clients");
         if (IS_ERR(dentry))
                 return PTR_ERR(dentry);
@@ -1477,7 +1494,17 @@ static int create_proc_exports_entry(void)
  
  unsigned int nfsd_net_id;
  
-static __net_init int nfsd_init_net(struct net *net)
+/**
+ * nfsd_net_init - Prepare the nfsd_net portion of a new net namespace
+ * @net: a freshly-created network namespace
+ *
+ * This information stays around as long as the network namespace is
+ * alive whether or not there is an NFSD instance running in the
+ * namespace.
+ *
+ * Returns zero on success, or a negative errno otherwise.
+ */
+static __net_init int nfsd_net_init(struct net *net)
  {
         int retval;
         struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -1488,6 +1515,9 @@ static __net_init int nfsd_init_net(struct net *net)
         retval = nfsd_idmap_init(net);
         if (retval)
                 goto out_idmap_error;
+       retval = nfsd_net_reply_cache_init(nn);
+       if (retval)
+               goto out_repcache_error;
         nn->nfsd_versions = NULL;
         nn->nfsd4_minorversions = NULL;
         nfsd4_init_leases_net(nn);
@@ -1496,22 +1526,32 @@ static __net_init int nfsd_init_net(struct net *net)
  
         return 0;
  
+out_repcache_error:
+       nfsd_idmap_shutdown(net);
  out_idmap_error:
         nfsd_export_shutdown(net);
  out_export_error:
         return retval;
  }
  
-static __net_exit void nfsd_exit_net(struct net *net)
+/**
+ * nfsd_net_exit - Release the nfsd_net portion of a net namespace
+ * @net: a network namespace that is about to be destroyed
+ *
+ */
+static __net_exit void nfsd_net_exit(struct net *net)
  {
+       struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+       nfsd_net_reply_cache_destroy(nn);
         nfsd_idmap_shutdown(net);
         nfsd_export_shutdown(net);
-       nfsd_netns_free_versions(net_generic(net, nfsd_net_id));
+       nfsd_netns_free_versions(nn);
  }
  
  static struct pernet_operations nfsd_net_ops = {
-       .init = nfsd_init_net,
-       .exit = nfsd_exit_net,
+       .init = nfsd_net_init,
+       .exit = nfsd_net_exit,
         .id   = &nfsd_net_id,
         .size = sizeof(struct nfsd_net),
  };
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c

index ccd8485..e8e13ae 100644 (file)
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -623,16 +623,9 @@ void fh_fill_pre_attrs(struct svc_fh *fhp)
  
         inode = d_inode(fhp->fh_dentry);
         err = fh_getattr(fhp, &stat);
-       if (err) {
-               /* Grab the times from inode anyway */
-               stat.mtime = inode->i_mtime;
-               stat.ctime = inode->i_ctime;
-               stat.size  = inode->i_size;
-               if (v4 && IS_I_VERSION(inode)) {
-                       stat.change_cookie = inode_query_iversion(inode);
-                       stat.result_mask |= STATX_CHANGE_COOKIE;
-               }
-       }
+       if (err)
+               return;
+
         if (v4)
                 fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
  
@@ -660,15 +653,10 @@ void fh_fill_post_attrs(struct svc_fh *fhp)
                 printk("nfsd: inode locked twice during operation.\n");
  
         err = fh_getattr(fhp, &fhp->fh_post_attr);
-       if (err) {
-               fhp->fh_post_saved = false;
-               fhp->fh_post_attr.ctime = inode->i_ctime;
-               if (v4 && IS_I_VERSION(inode)) {
-                       fhp->fh_post_attr.change_cookie = inode_query_iversion(inode);
-                       fhp->fh_post_attr.result_mask |= STATX_CHANGE_COOKIE;
-               }
-       } else
-               fhp->fh_post_saved = true;
+       if (err)
+               return;
+
+       fhp->fh_post_saved = true;
         if (v4)
                 fhp->fh_post_change =
                         nfsd4_change_attribute(&fhp->fh_post_attr, inode);
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c

index c371955..a731592 100644 (file)
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -176,9 +176,7 @@ nfsd_proc_read(struct svc_rqst *rqstp)
  {
         struct nfsd_readargs *argp = rqstp->rq_argp;
         struct nfsd_readres *resp = rqstp->rq_resp;
-       unsigned int len;
         u32 eof;
-       int v;
  
         dprintk("nfsd: READ    %s %d bytes at %d\n",
                 SVCFH_fmt(&argp->fh),
@@ -187,17 +185,7 @@ nfsd_proc_read(struct svc_rqst *rqstp)
         argp->count = min_t(u32, argp->count, NFSSVC_MAXBLKSIZE_V2);
         argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen);
  
-       v = 0;
-       len = argp->count;
         resp->pages = rqstp->rq_next_page;
-       while (len > 0) {
-               struct page *page = *(rqstp->rq_next_page++);
-
-               rqstp->rq_vec[v].iov_base = page_address(page);
-               rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
-               len -= rqstp->rq_vec[v].iov_len;
-               v++;
-       }
  
         /* Obtain buffer pointer for payload. 19 is 1 word for
          * status, 17 words for fattr, and 1 word for the byte count.
@@ -207,7 +195,7 @@ nfsd_proc_read(struct svc_rqst *rqstp)
         resp->count = argp->count;
         fh_copy(&resp->fh, &argp->fh);
         resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
-                                rqstp->rq_vec, v, &resp->count, &eof);
+                                &resp->count, &eof);
         if (resp->status == nfs_ok)
                 resp->status = fh_getattr(&resp->fh, &resp->stat);
         else if (resp->status == nfserr_jukebox)
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c

index 9c7b1ef..2154fa6 100644 (file)
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -402,6 +402,11 @@ void nfsd_reset_write_verifier(struct nfsd_net *nn)
         write_sequnlock(&nn->writeverf_lock);
  }
  
+/*
+ * Crank up a set of per-namespace resources for a new NFSD instance,
+ * including lockd, a duplicate reply cache, an open file cache
+ * instance, and a cache of NFSv4 state objects.
+ */
  static int nfsd_startup_net(struct net *net, const struct cred *cred)
  {
         struct nfsd_net *nn = net_generic(net, nfsd_net_id);
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c

index caf6355..5777f40 100644 (file)
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -468,7 +468,8 @@ nfssvc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
         case nfs_ok:
                 if (xdr_stream_encode_u32(xdr, resp->len) < 0)
                         return false;
-               xdr_write_pages(xdr, &resp->page, 0, resp->len);
+               svcxdr_encode_opaque_pages(rqstp, xdr, &resp->page, 0,
+                                          resp->len);
                 if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0)
                         return false;
                 break;
@@ -491,8 +492,9 @@ nfssvc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
                         return false;
                 if (xdr_stream_encode_u32(xdr, resp->count) < 0)
                         return false;
-               xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base,
-                               resp->count);
+               svcxdr_encode_opaque_pages(rqstp, xdr, resp->pages,
+                                          rqstp->rq_res.page_base,
+                                          resp->count);
                 if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0)
                         return false;
                 break;
@@ -511,7 +513,8 @@ nfssvc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
                 return false;
         switch (resp->status) {
         case nfs_ok:
-               xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len);
+               svcxdr_encode_opaque_pages(rqstp, xdr, dirlist->pages, 0,
+                                          dirlist->len);
                 /* no more entries */
                 if (xdr_stream_encode_item_absent(xdr) < 0)
                         return false;
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h

index 72a906a..2af7498 100644 (file)
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -1581,6 +1581,265 @@ TRACE_EVENT(nfsd_cb_recall_any_done,
         )
  );
  
+TRACE_EVENT(nfsd_ctl_unlock_ip,
+       TP_PROTO(
+               const struct net *net,
+               const char *address
+       ),
+       TP_ARGS(net, address),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __string(address, address)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+               __assign_str(address, address);
+       ),
+       TP_printk("address=%s",
+               __get_str(address)
+       )
+);
+
+TRACE_EVENT(nfsd_ctl_unlock_fs,
+       TP_PROTO(
+               const struct net *net,
+               const char *path
+       ),
+       TP_ARGS(net, path),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __string(path, path)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+               __assign_str(path, path);
+       ),
+       TP_printk("path=%s",
+               __get_str(path)
+       )
+);
+
+TRACE_EVENT(nfsd_ctl_filehandle,
+       TP_PROTO(
+               const struct net *net,
+               const char *domain,
+               const char *path,
+               int maxsize
+       ),
+       TP_ARGS(net, domain, path, maxsize),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __field(int, maxsize)
+               __string(domain, domain)
+               __string(path, path)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+               __entry->maxsize = maxsize;
+               __assign_str(domain, domain);
+               __assign_str(path, path);
+       ),
+       TP_printk("domain=%s path=%s maxsize=%d",
+               __get_str(domain), __get_str(path), __entry->maxsize
+       )
+);
+
+TRACE_EVENT(nfsd_ctl_threads,
+       TP_PROTO(
+               const struct net *net,
+               int newthreads
+       ),
+       TP_ARGS(net, newthreads),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __field(int, newthreads)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+               __entry->newthreads = newthreads;
+       ),
+       TP_printk("newthreads=%d",
+               __entry->newthreads
+       )
+);
+
+TRACE_EVENT(nfsd_ctl_pool_threads,
+       TP_PROTO(
+               const struct net *net,
+               int pool,
+               int nrthreads
+       ),
+       TP_ARGS(net, pool, nrthreads),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __field(int, pool)
+               __field(int, nrthreads)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+               __entry->pool = pool;
+               __entry->nrthreads = nrthreads;
+       ),
+       TP_printk("pool=%d nrthreads=%d",
+               __entry->pool, __entry->nrthreads
+       )
+);
+
+TRACE_EVENT(nfsd_ctl_version,
+       TP_PROTO(
+               const struct net *net,
+               const char *mesg
+       ),
+       TP_ARGS(net, mesg),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __string(mesg, mesg)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+               __assign_str(mesg, mesg);
+       ),
+       TP_printk("%s",
+               __get_str(mesg)
+       )
+);
+
+TRACE_EVENT(nfsd_ctl_ports_addfd,
+       TP_PROTO(
+               const struct net *net,
+               int fd
+       ),
+       TP_ARGS(net, fd),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __field(int, fd)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+               __entry->fd = fd;
+       ),
+       TP_printk("fd=%d",
+               __entry->fd
+       )
+);
+
+TRACE_EVENT(nfsd_ctl_ports_addxprt,
+       TP_PROTO(
+               const struct net *net,
+               const char *transport,
+               int port
+       ),
+       TP_ARGS(net, transport, port),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __field(int, port)
+               __string(transport, transport)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+               __entry->port = port;
+               __assign_str(transport, transport);
+       ),
+       TP_printk("transport=%s port=%d",
+               __get_str(transport), __entry->port
+       )
+);
+
+TRACE_EVENT(nfsd_ctl_maxblksize,
+       TP_PROTO(
+               const struct net *net,
+               int bsize
+       ),
+       TP_ARGS(net, bsize),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __field(int, bsize)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+               __entry->bsize = bsize;
+       ),
+       TP_printk("bsize=%d",
+               __entry->bsize
+       )
+);
+
+TRACE_EVENT(nfsd_ctl_maxconn,
+       TP_PROTO(
+               const struct net *net,
+               int maxconn
+       ),
+       TP_ARGS(net, maxconn),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __field(int, maxconn)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+               __entry->maxconn = maxconn;
+       ),
+       TP_printk("maxconn=%d",
+               __entry->maxconn
+       )
+);
+
+TRACE_EVENT(nfsd_ctl_time,
+       TP_PROTO(
+               const struct net *net,
+               const char *name,
+               size_t namelen,
+               int time
+       ),
+       TP_ARGS(net, name, namelen, time),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __field(int, time)
+               __string_len(name, name, namelen)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+               __entry->time = time;
+               __assign_str_len(name, name, namelen);
+       ),
+       TP_printk("file=%s time=%d",
+               __get_str(name), __entry->time
+       )
+);
+
+TRACE_EVENT(nfsd_ctl_recoverydir,
+       TP_PROTO(
+               const struct net *net,
+               const char *recdir
+       ),
+       TP_ARGS(net, recdir),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __string(recdir, recdir)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+               __assign_str(recdir, recdir);
+       ),
+       TP_printk("recdir=%s",
+               __get_str(recdir)
+       )
+);
+
+TRACE_EVENT(nfsd_end_grace,
+       TP_PROTO(
+               const struct net *net
+       ),
+       TP_ARGS(net),
+       TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+       ),
+       TP_fast_assign(
+               __entry->netns_ino = net->ns.inum;
+       ),
+       TP_printk("nn=%u", __entry->netns_ino
+       )
+);
+
  #endif /* _NFSD_TRACE_H */
  
  #undef TRACE_INCLUDE_PATH
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c

index db67f8e..59b7d60 100644 (file)
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -388,7 +388,9 @@ nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap)
                                 iap->ia_mode &= ~S_ISGID;
                 } else {
                         /* set ATTR_KILL_* bits and let VFS handle it */
-                       iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID);
+                       iap->ia_valid |= ATTR_KILL_SUID;
+                       iap->ia_valid |=
+                               setattr_should_drop_sgid(&nop_mnt_idmap, inode);
                 }
         }
  }
@@ -1001,6 +1003,18 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
         }
  }
  
+/**
+ * nfsd_splice_read - Perform a VFS read using a splice pipe
+ * @rqstp: RPC transaction context
+ * @fhp: file handle of file to be read
+ * @file: opened struct file of file to be read
+ * @offset: starting byte offset
+ * @count: IN: requested number of bytes; OUT: number of bytes read
+ * @eof: OUT: set non-zero if operation reached the end of the file
+ *
+ * Returns nfs_ok on success, otherwise an nfserr stat value is
+ * returned.
+ */
  __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
                         struct file *file, loff_t offset, unsigned long *count,
                         u32 *eof)
@@ -1014,22 +1028,50 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
         ssize_t host_err;
  
         trace_nfsd_read_splice(rqstp, fhp, offset, *count);
-       rqstp->rq_next_page = rqstp->rq_respages + 1;
         host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
         return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
  }
  
-__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp,
-                 struct file *file, loff_t offset,
-                 struct kvec *vec, int vlen, unsigned long *count,
-                 u32 *eof)
+/**
+ * nfsd_iter_read - Perform a VFS read using an iterator
+ * @rqstp: RPC transaction context
+ * @fhp: file handle of file to be read
+ * @file: opened struct file of file to be read
+ * @offset: starting byte offset
+ * @count: IN: requested number of bytes; OUT: number of bytes read
+ * @base: offset in first page of read buffer
+ * @eof: OUT: set non-zero if operation reached the end of the file
+ *
+ * Some filesystems or situations cannot use nfsd_splice_read. This
+ * function is the slightly less-performant fallback for those cases.
+ *
+ * Returns nfs_ok on success, otherwise an nfserr stat value is
+ * returned.
+ */
+__be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+                     struct file *file, loff_t offset, unsigned long *count,
+                     unsigned int base, u32 *eof)
  {
+       unsigned long v, total;
         struct iov_iter iter;
         loff_t ppos = offset;
+       struct page *page;
         ssize_t host_err;
  
+       v = 0;
+       total = *count;
+       while (total) {
+               page = *(rqstp->rq_next_page++);
+               rqstp->rq_vec[v].iov_base = page_address(page) + base;
+               rqstp->rq_vec[v].iov_len = min_t(size_t, total, PAGE_SIZE - base);
+               total -= rqstp->rq_vec[v].iov_len;
+               ++v;
+               base = 0;
+       }
+       WARN_ON_ONCE(v > ARRAY_SIZE(rqstp->rq_vec));
+
         trace_nfsd_read_vector(rqstp, fhp, offset, *count);
-       iov_iter_kvec(&iter, ITER_DEST, vec, vlen, *count);
+       iov_iter_kvec(&iter, ITER_DEST, rqstp->rq_vec, v, *count);
         host_err = vfs_iter_read(file, &iter, &ppos, 0);
         return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
  }
@@ -1159,14 +1201,24 @@ out_nfserr:
         return nfserr;
  }
  
-/*
- * Read data from a file. count must contain the requested read count
- * on entry. On return, *count contains the number of bytes actually read.
+/**
+ * nfsd_read - Read data from a file
+ * @rqstp: RPC transaction context
+ * @fhp: file handle of file to be read
+ * @offset: starting byte offset
+ * @count: IN: requested number of bytes; OUT: number of bytes read
+ * @eof: OUT: set non-zero if operation reached the end of the file
+ *
+ * The caller must verify that there is enough space in @rqstp.rq_res
+ * to perform this operation.
+ *
   * N.B. After this call fhp needs an fh_put
+ *
+ * Returns nfs_ok on success, otherwise an nfserr stat value is
+ * returned.
   */
  __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
-       loff_t offset, struct kvec *vec, int vlen, unsigned long *count,
-       u32 *eof)
+                loff_t offset, unsigned long *count, u32 *eof)
  {
         struct nfsd_file        *nf;
         struct file *file;
@@ -1181,12 +1233,10 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
         if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags))
                 err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof);
         else
-               err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count, eof);
+               err = nfsd_iter_read(rqstp, fhp, file, offset, count, 0, eof);
  
         nfsd_file_put(nf);
-
         trace_nfsd_read_done(rqstp, fhp, offset, *count);
-
         return err;
  }
  
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h

index 43fb57a..a6890ea 100644 (file)
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -110,13 +110,12 @@ __be32            nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
                                 struct file *file, loff_t offset,
                                 unsigned long *count,
                                 u32 *eof);
-__be32         nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp,
+__be32         nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
                                 struct file *file, loff_t offset,
-                               struct kvec *vec, int vlen,
-                               unsigned long *count,
+                               unsigned long *count, unsigned int base,
                                 u32 *eof);
-__be32                 nfsd_read(struct svc_rqst *, struct svc_fh *,
-                               loff_t, struct kvec *, int, unsigned long *,
+__be32         nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+                               loff_t offset, unsigned long *count,
                                 u32 *eof);
  __be32                 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t,
                                 struct kvec *, int, unsigned long *,
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h

index 762d723..3b10636 100644 (file)
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -509,6 +509,28 @@ static inline void svcxdr_init_encode(struct svc_rqst *rqstp)
  }
  
  /**
+ * svcxdr_encode_opaque_pages - Insert pages into an xdr_stream
+ * @rqstp: RPC transaction context; its rq_next_page positions @xdr's page_ptr
+ * @xdr: xdr_stream to be updated
+ * @pages: array of pages to insert
+ * @base: starting offset of first data byte in @pages
+ * @len: number of data bytes in @pages to insert
+ *
+ * After the @pages are added, the tail iovec is instantiated pointing
+ * to end of the head buffer, and the stream is set up to encode
+ * subsequent items into the tail.
+ */
+static inline void svcxdr_encode_opaque_pages(struct svc_rqst *rqstp,
+                                             struct xdr_stream *xdr,
+                                             struct page **pages,
+                                             unsigned int base,
+                                             unsigned int len)
+{
+       xdr_write_pages(xdr, pages, base, len);
+       xdr->page_ptr = rqstp->rq_next_page - 1;
+}
+
+/**
   * svcxdr_set_auth_slack -
   * @rqstp: RPC transaction
   * @slack: buffer space to reserve for the transaction's security flavor
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h

index fbc4bd4..a5ee0af 100644 (file)
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -135,7 +135,6 @@ struct svc_rdma_recv_ctxt {
         struct ib_sge           rc_recv_sge;
         void                    *rc_recv_buf;
         struct xdr_stream       rc_stream;
-       bool                    rc_temp;
         u32                     rc_byte_len;
         unsigned int            rc_page_count;
         u32                     rc_inv_rkey;
@@ -155,12 +154,12 @@ struct svc_rdma_send_ctxt {
  
         struct ib_send_wr       sc_send_wr;
         struct ib_cqe           sc_cqe;
-       struct completion       sc_done;
         struct xdr_buf          sc_hdrbuf;
         struct xdr_stream       sc_stream;
         void                    *sc_xprt_buf;
+       int                     sc_page_count;
         int                     sc_cur_sge_no;
-
+       struct page             *sc_pages[RPCSVC_MAXPAGES];
         struct ib_sge           sc_sges[];
  };
  
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h

index 72014c9..f89ec4b 100644 (file)
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -242,8 +242,7 @@ extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf,
  extern void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf,
                            struct page **pages, struct rpc_rqst *rqst);
  extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes);
-extern int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec,
-               size_t nbytes);
+extern int xdr_reserve_space_vec(struct xdr_stream *xdr, size_t nbytes);
  extern void __xdr_commit_encode(struct xdr_stream *xdr);
  extern void xdr_truncate_encode(struct xdr_stream *xdr, size_t len);
  extern void xdr_truncate_decode(struct xdr_stream *xdr, size_t len);
diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h

index 8f461e0..f8069ef 100644 (file)
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -2112,6 +2112,14 @@ DEFINE_POST_CHUNK_EVENT(read);
  DEFINE_POST_CHUNK_EVENT(write);
  DEFINE_POST_CHUNK_EVENT(reply);
  
+DEFINE_EVENT(svcrdma_post_chunk_class, svcrdma_cc_release,
+       TP_PROTO(
+               const struct rpc_rdma_cid *cid,
+               int sqecount
+       ),
+       TP_ARGS(cid, sqecount)
+);
+
  TRACE_EVENT(svcrdma_wc_read,
         TP_PROTO(
                 const struct ib_wc *wc,
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h

index 31bc702..69e42ef 100644 (file)
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -2104,31 +2104,46 @@ DEFINE_SVC_DEFERRED_EVENT(drop);
  DEFINE_SVC_DEFERRED_EVENT(queue);
  DEFINE_SVC_DEFERRED_EVENT(recv);
  
-TRACE_EVENT(svcsock_new_socket,
+DECLARE_EVENT_CLASS(svcsock_lifetime_class,
         TP_PROTO(
+               const void *svsk,
                 const struct socket *socket
         ),
-
-       TP_ARGS(socket),
-
+       TP_ARGS(svsk, socket),
         TP_STRUCT__entry(
+               __field(unsigned int, netns_ino)
+               __field(const void *, svsk)
+               __field(const void *, sk)
                 __field(unsigned long, type)
                 __field(unsigned long, family)
-               __field(bool, listener)
+               __field(unsigned long, state)
         ),
-
         TP_fast_assign(
+               struct sock *sk = socket->sk;
+
+               __entry->netns_ino = sock_net(sk)->ns.inum;
+               __entry->svsk = svsk;
+               __entry->sk = sk;
                 __entry->type = socket->type;
-               __entry->family = socket->sk->sk_family;
-               __entry->listener = (socket->sk->sk_state == TCP_LISTEN);
+               __entry->family = sk->sk_family;
+               __entry->state = sk->sk_state;
         ),
-
-       TP_printk("type=%s family=%s%s",
-               show_socket_type(__entry->type),
+       TP_printk("svsk=%p type=%s family=%s%s",
+               __entry->svsk, show_socket_type(__entry->type),
                 rpc_show_address_family(__entry->family),
-               __entry->listener ? " (listener)" : ""
+               __entry->state == TCP_LISTEN ? " (listener)" : ""
         )
  );
+#define DEFINE_SVCSOCK_LIFETIME_EVENT(name) \
+       DEFINE_EVENT(svcsock_lifetime_class, name, \
+               TP_PROTO( \
+                       const void *svsk, \
+                       const struct socket *socket \
+               ), \
+               TP_ARGS(svsk, socket))
+
+DEFINE_SVCSOCK_LIFETIME_EVENT(svcsock_new);
+DEFINE_SVCSOCK_LIFETIME_EVENT(svcsock_free);
  
  TRACE_EVENT(svcsock_marker,
         TP_PROTO(
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c

index 79967b6..e7c1012 100644 (file)
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -109,15 +109,15 @@ param_get_pool_mode(char *buf, const struct kernel_param *kp)
         switch (*ip)
         {
         case SVC_POOL_AUTO:
-               return strlcpy(buf, "auto\n", 20);
+               return sysfs_emit(buf, "auto\n");
         case SVC_POOL_GLOBAL:
-               return strlcpy(buf, "global\n", 20);
+               return sysfs_emit(buf, "global\n");
         case SVC_POOL_PERCPU:
-               return strlcpy(buf, "percpu\n", 20);
+               return sysfs_emit(buf, "percpu\n");
         case SVC_POOL_PERNODE:
-               return strlcpy(buf, "pernode\n", 20);
+               return sysfs_emit(buf, "pernode\n");
         default:
-               return sprintf(buf, "%d\n", *ip);
+               return sysfs_emit(buf, "%d\n", *ip);
         }
  }
  
@@ -597,34 +597,25 @@ svc_destroy(struct kref *ref)
  }
  EXPORT_SYMBOL_GPL(svc_destroy);
  
-/*
- * Allocate an RPC server's buffer space.
- * We allocate pages and place them in rq_pages.
- */
-static int
+static bool
  svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node)
  {
-       unsigned int pages, arghi;
+       unsigned long pages, ret;
  
         /* bc_xprt uses fore channel allocated buffers */
         if (svc_is_backchannel(rqstp))
-               return 1;
+               return true;
  
         pages = size / PAGE_SIZE + 1; /* extra page as we hold both request and reply.
                                        * We assume one is at most one page
                                        */
-       arghi = 0;
         WARN_ON_ONCE(pages > RPCSVC_MAXPAGES);
         if (pages > RPCSVC_MAXPAGES)
                 pages = RPCSVC_MAXPAGES;
-       while (pages) {
-               struct page *p = alloc_pages_node(node, GFP_KERNEL, 0);
-               if (!p)
-                       break;
-               rqstp->rq_pages[arghi++] = p;
-               pages--;
-       }
-       return pages == 0;
+
+       ret = alloc_pages_bulk_array_node(GFP_KERNEL, node, pages,
+                                         rqstp->rq_pages);
+       return ret == pages;
  }
  
  /*
@@ -1173,6 +1164,7 @@ static void __svc_unregister(struct net *net, const u32 program, const u32 versi
   */
  static void svc_unregister(const struct svc_serv *serv, struct net *net)
  {
+       struct sighand_struct *sighand;
         struct svc_program *progp;
         unsigned long flags;
         unsigned int i;
@@ -1189,9 +1181,12 @@ static void svc_unregister(const struct svc_serv *serv, struct net *net)
                 }
         }
  
-       spin_lock_irqsave(&current->sighand->siglock, flags);
+       rcu_read_lock();
+       sighand = rcu_dereference(current->sighand);
+       spin_lock_irqsave(&sighand->siglock, flags);
         recalc_sigpending();
-       spin_unlock_irqrestore(&current->sighand->siglock, flags);
+       spin_unlock_irqrestore(&sighand->siglock, flags);
+       rcu_read_unlock();
  }
  
  /*
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c

index 13a1489..62c7919 100644 (file)
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -74,13 +74,18 @@ static LIST_HEAD(svc_xprt_class_list);
   *               that no other thread will be using the transport or will
   *               try to set XPT_DEAD.
   */
+
+/**
+ * svc_reg_xprt_class - Register a server-side RPC transport class
+ * @xcl: New transport class to be registered
+ *
+ * Returns zero on success; otherwise a negative errno is returned.
+ */
  int svc_reg_xprt_class(struct svc_xprt_class *xcl)
  {
         struct svc_xprt_class *cl;
         int res = -EEXIST;
  
-       dprintk("svc: Adding svc transport class '%s'\n", xcl->xcl_name);
-
         INIT_LIST_HEAD(&xcl->xcl_list);
         spin_lock(&svc_xprt_class_lock);
         /* Make sure there isn't already a class with the same name */
@@ -96,9 +101,13 @@ out:
  }
  EXPORT_SYMBOL_GPL(svc_reg_xprt_class);
  
+/**
+ * svc_unreg_xprt_class - Unregister a server-side RPC transport class
+ * @xcl: Transport class to be unregistered
+ *
+ */
  void svc_unreg_xprt_class(struct svc_xprt_class *xcl)
  {
-       dprintk("svc: Removing svc transport class '%s'\n", xcl->xcl_name);
         spin_lock(&svc_xprt_class_lock);
         list_del_init(&xcl->xcl_list);
         spin_unlock(&svc_xprt_class_lock);
@@ -685,8 +694,9 @@ static int svc_alloc_arg(struct svc_rqst *rqstp)
         }
  
         for (filled = 0; filled < pages; filled = ret) {
-               ret = alloc_pages_bulk_array(GFP_KERNEL, pages,
-                                            rqstp->rq_pages);
+               ret = alloc_pages_bulk_array_node(GFP_KERNEL,
+                                                 rqstp->rq_pool->sp_id,
+                                                 pages, rqstp->rq_pages);
                 if (ret > filled)
                         /* Made progress, don't sleep yet */
                         continue;
@@ -843,15 +853,11 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
                 svc_xprt_received(xprt);
         } else if (svc_xprt_reserve_slot(rqstp, xprt)) {
                 /* XPT_DATA|XPT_DEFERRED case: */
-               dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
-                       rqstp, rqstp->rq_pool->sp_id, xprt,
-                       kref_read(&xprt->xpt_ref));
                 rqstp->rq_deferred = svc_deferred_dequeue(xprt);
                 if (rqstp->rq_deferred)
                         len = svc_deferred_recv(rqstp);
                 else
                         len = xprt->xpt_ops->xpo_recvfrom(rqstp);
-               rqstp->rq_stime = ktime_get();
                 rqstp->rq_reserved = serv->sv_max_mesg;
                 atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
         } else
@@ -894,6 +900,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
         err = -EAGAIN;
         if (len <= 0)
                 goto out_release;
+
         trace_svc_xdr_recvfrom(&rqstp->rq_arg);
  
         clear_bit(XPT_OLD, &xprt->xpt_flags);
@@ -902,6 +909,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
  
         if (serv->sv_stats)
                 serv->sv_stats->netcnt++;
+       rqstp->rq_stime = ktime_get();
         return len;
  out_release:
         rqstp->rq_res.len = 0;
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c

index f77cebe..5f519fc 100644 (file)
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -826,12 +826,6 @@ static void svc_tcp_listen_data_ready(struct sock *sk)
  
         trace_sk_data_ready(sk);
  
-       if (svsk) {
-               /* Refer to svc_setup_socket() for details. */
-               rmb();
-               svsk->sk_odata(sk);
-       }
-
         /*
          * This callback may called twice when a new connection
          * is established as a child socket inherits everything
@@ -840,13 +834,18 @@ static void svc_tcp_listen_data_ready(struct sock *sk)
          *    when one of child sockets become ESTABLISHED.
          * 2) data_ready method of the child socket may be called
          *    when it receives data before the socket is accepted.
-        * In case of 2, we should ignore it silently.
+        * In case of 2, we should ignore it silently and MUST NOT
+        * dereference svsk.
          */
-       if (sk->sk_state == TCP_LISTEN) {
-               if (svsk) {
-                       set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
-                       svc_xprt_enqueue(&svsk->sk_xprt);
-               }
+       if (sk->sk_state != TCP_LISTEN)
+               return;
+
+       if (svsk) {
+               /* Refer to svc_setup_socket() for details. */
+               rmb();
+               svsk->sk_odata(sk);
+               set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
+               svc_xprt_enqueue(&svsk->sk_xprt);
         }
  }
  
@@ -887,13 +886,8 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
         clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
         err = kernel_accept(sock, &newsock, O_NONBLOCK);
         if (err < 0) {
-               if (err == -ENOMEM)
-                       printk(KERN_WARNING "%s: no more sockets!\n",
-                              serv->sv_name);
-               else if (err != -EAGAIN)
-                       net_warn_ratelimited("%s: accept failed (err %d)!\n",
-                                            serv->sv_name, -err);
-               trace_svcsock_accept_err(xprt, serv->sv_name, err);
+               if (err != -EAGAIN)
+                       trace_svcsock_accept_err(xprt, serv->sv_name, err);
                 return NULL;
         }
         if (IS_ERR(sock_alloc_file(newsock, O_NONBLOCK, NULL)))
@@ -1464,7 +1458,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
         svsk->sk_owspace = inet->sk_write_space;
         /*
          * This barrier is necessary in order to prevent race condition
-        * with svc_data_ready(), svc_listen_data_ready() and others
+        * with svc_data_ready(), svc_tcp_listen_data_ready(), and others
          * when calling callbacks above.
          */
         wmb();
@@ -1476,7 +1470,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
         else
                 svc_tcp_init(svsk, serv);
  
-       trace_svcsock_new_socket(sock);
+       trace_svcsock_new(svsk, sock);
         return svsk;
  }
  
@@ -1657,6 +1651,8 @@ static void svc_sock_free(struct svc_xprt *xprt)
         struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
         struct socket *sock = svsk->sk_sock;
  
+       trace_svcsock_free(svsk, sock);
+
         tls_handshake_cancel(sock->sk);
         if (sock->file)
                 sockfd_put(sock);
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c

index 36835b2..2a22e78 100644 (file)
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1070,22 +1070,22 @@ __be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes)
  }
  EXPORT_SYMBOL_GPL(xdr_reserve_space);
  
-
  /**
   * xdr_reserve_space_vec - Reserves a large amount of buffer space for sending
   * @xdr: pointer to xdr_stream
- * @vec: pointer to a kvec array
   * @nbytes: number of bytes to reserve
   *
- * Reserves enough buffer space to encode 'nbytes' of data and stores the
- * pointers in 'vec'. The size argument passed to xdr_reserve_space() is
- * determined based on the number of bytes remaining in the current page to
- * avoid invalidating iov_base pointers when xdr_commit_encode() is called.
+ * The size argument passed to xdr_reserve_space() is determined based
+ * on the number of bytes remaining in the current page to avoid
+ * invalidating iov_base pointers when xdr_commit_encode() is called.
+ *
+ * Return values:
+ *   %0: success
+ *   %-EMSGSIZE: not enough space is available in @xdr
   */
-int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec, size_t nbytes)
+int xdr_reserve_space_vec(struct xdr_stream *xdr, size_t nbytes)
  {
-       int thislen;
-       int v = 0;
+       size_t thislen;
         __be32 *p;
  
         /*
@@ -1097,21 +1097,19 @@ int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec, size_t nbyte
                 xdr->end = xdr->p;
         }
  
+       /* XXX: Let's find a way to make this more efficient */
         while (nbytes) {
                 thislen = xdr->buf->page_len % PAGE_SIZE;
                 thislen = min_t(size_t, nbytes, PAGE_SIZE - thislen);
  
                 p = xdr_reserve_space(xdr, thislen);
                 if (!p)
-                       return -EIO;
+                       return -EMSGSIZE;
  
-               vec[v].iov_base = p;
-               vec[v].iov_len = thislen;
-               v++;
                 nbytes -= thislen;
         }
  
-       return v;
+       return 0;
  }
  EXPORT_SYMBOL_GPL(xdr_reserve_space_vec);
  
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c

index aa2227a..7420a2c 100644 (file)
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -93,13 +93,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
          */
         get_page(virt_to_page(rqst->rq_buffer));
         sctxt->sc_send_wr.opcode = IB_WR_SEND;
-       ret = svc_rdma_send(rdma, sctxt);
-       if (ret < 0)
-               return ret;
-
-       ret = wait_for_completion_killable(&sctxt->sc_done);
-       svc_rdma_send_ctxt_put(rdma, sctxt);
-       return ret;
+       return svc_rdma_send(rdma, sctxt);
  }
  
  /* Server-side transport endpoint wants a whole page for its send
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c

index a22fe75..85c8bca 100644 (file)
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -125,14 +125,15 @@ static void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma,
  static struct svc_rdma_recv_ctxt *
  svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
  {
+       int node = ibdev_to_node(rdma->sc_cm_id->device);
         struct svc_rdma_recv_ctxt *ctxt;
         dma_addr_t addr;
         void *buffer;
  
-       ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
+       ctxt = kmalloc_node(sizeof(*ctxt), GFP_KERNEL, node);
         if (!ctxt)
                 goto fail0;
-       buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL);
+       buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node);
         if (!buffer)
                 goto fail1;
         addr = ib_dma_map_single(rdma->sc_pd->device, buffer,
@@ -155,7 +156,6 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
         ctxt->rc_recv_sge.length = rdma->sc_max_req_size;
         ctxt->rc_recv_sge.lkey = rdma->sc_pd->local_dma_lkey;
         ctxt->rc_recv_buf = buffer;
-       ctxt->rc_temp = false;
         return ctxt;
  
  fail2:
@@ -232,10 +232,7 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma,
         pcl_free(&ctxt->rc_write_pcl);
         pcl_free(&ctxt->rc_reply_pcl);
  
-       if (!ctxt->rc_temp)
-               llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts);
-       else
-               svc_rdma_recv_ctxt_destroy(rdma, ctxt);
+       llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts);
  }
  
  /**
@@ -258,7 +255,7 @@ void svc_rdma_release_ctxt(struct svc_xprt *xprt, void *vctxt)
  }
  
  static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
-                                  unsigned int wanted, bool temp)
+                                  unsigned int wanted)
  {
         const struct ib_recv_wr *bad_wr = NULL;
         struct svc_rdma_recv_ctxt *ctxt;
@@ -275,7 +272,6 @@ static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
                         break;
  
                 trace_svcrdma_post_recv(ctxt);
-               ctxt->rc_temp = temp;
                 ctxt->rc_recv_wr.next = recv_chain;
                 recv_chain = &ctxt->rc_recv_wr;
                 rdma->sc_pending_recvs++;
@@ -309,7 +305,7 @@ err_free:
   */
  bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma)
  {
-       return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests, true);
+       return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests);
  }
  
  /**
@@ -343,7 +339,7 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
          * client reconnects.
          */
         if (rdma->sc_pending_recvs < rdma->sc_max_requests)
-               if (!svc_rdma_refresh_recvs(rdma, rdma->sc_recv_batch, false))
+               if (!svc_rdma_refresh_recvs(rdma, rdma->sc_recv_batch))
                         goto dropped;
  
         /* All wc fields are now known to be valid */
@@ -775,9 +771,6 @@ static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt,
   *
   * The next ctxt is removed from the "receive" lists.
   *
- * - If the ctxt completes a Read, then finish assembling the Call
- *   message and return the number of bytes in the message.
- *
   * - If the ctxt completes a Receive, then construct the Call
   *   message from the contents of the Receive buffer.
   *
@@ -786,7 +779,8 @@ static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt,
   *     in the message.
   *
   *   - If there are Read chunks in this message, post Read WRs to
- *     pull that payload and return 0.
+ *     pull that payload. When the Read WRs complete, build the
+ *     full message and return the number of bytes in it.
   */
  int svc_rdma_recvfrom(struct svc_rqst *rqstp)
  {
@@ -796,6 +790,12 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
         struct svc_rdma_recv_ctxt *ctxt;
         int ret;
  
+       /* Prevent svc_xprt_release() from releasing pages in rq_pages
+        * when returning 0 or an error.
+        */
+       rqstp->rq_respages = rqstp->rq_pages;
+       rqstp->rq_next_page = rqstp->rq_respages;
+
         rqstp->rq_xprt_ctxt = NULL;
  
         ctxt = NULL;
@@ -819,12 +819,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
                                    DMA_FROM_DEVICE);
         svc_rdma_build_arg_xdr(rqstp, ctxt);
  
-       /* Prevent svc_xprt_release from releasing pages in rq_pages
-        * if we return 0 or an error.
-        */
-       rqstp->rq_respages = rqstp->rq_pages;
-       rqstp->rq_next_page = rqstp->rq_respages;
-
         ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg, ctxt);
         if (ret < 0)
                 goto out_err;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c

index 11cf7c6..e460e25 100644 (file)
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -62,8 +62,8 @@ svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
         if (node) {
                 ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node);
         } else {
-               ctxt = kmalloc(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE),
-                              GFP_KERNEL);
+               ctxt = kmalloc_node(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE),
+                                   GFP_KERNEL, ibdev_to_node(rdma->sc_cm_id->device));
                 if (!ctxt)
                         goto out_noctx;
  
@@ -84,8 +84,7 @@ out_noctx:
         return NULL;
  }
  
-static void __svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
-                                  struct svc_rdma_rw_ctxt *ctxt,
+static void __svc_rdma_put_rw_ctxt(struct svc_rdma_rw_ctxt *ctxt,
                                    struct llist_head *list)
  {
         sg_free_table_chained(&ctxt->rw_sg_table, SG_CHUNK_SIZE);
@@ -95,7 +94,7 @@ static void __svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
  static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
                                  struct svc_rdma_rw_ctxt *ctxt)
  {
-       __svc_rdma_put_rw_ctxt(rdma, ctxt, &rdma->sc_rw_ctxts);
+       __svc_rdma_put_rw_ctxt(ctxt, &rdma->sc_rw_ctxts);
  }
  
  /**
@@ -191,6 +190,8 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc,
         struct svc_rdma_rw_ctxt *ctxt;
         LLIST_HEAD(free);
  
+       trace_svcrdma_cc_release(&cc->cc_cid, cc->cc_sqecount);
+
         first = last = NULL;
         while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) {
                 list_del(&ctxt->rw_list);
@@ -198,7 +199,7 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc,
                 rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp,
                                     rdma->sc_port_num, ctxt->rw_sg_table.sgl,
                                     ctxt->rw_nents, dir);
-               __svc_rdma_put_rw_ctxt(rdma, ctxt, &free);
+               __svc_rdma_put_rw_ctxt(ctxt, &free);
  
                 ctxt->rw_node.next = first;
                 first = &ctxt->rw_node;
@@ -234,7 +235,8 @@ svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma,
  {
         struct svc_rdma_write_info *info;
  
-       info = kmalloc(sizeof(*info), GFP_KERNEL);
+       info = kmalloc_node(sizeof(*info), GFP_KERNEL,
+                           ibdev_to_node(rdma->sc_cm_id->device));
         if (!info)
                 return info;
  
@@ -304,7 +306,8 @@ svc_rdma_read_info_alloc(struct svcxprt_rdma *rdma)
  {
         struct svc_rdma_read_info *info;
  
-       info = kmalloc(sizeof(*info), GFP_KERNEL);
+       info = kmalloc_node(sizeof(*info), GFP_KERNEL,
+                           ibdev_to_node(rdma->sc_cm_id->device));
         if (!info)
                 return info;
  
@@ -351,8 +354,7 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc)
         return;
  }
  
-/* This function sleeps when the transport's Send Queue is congested.
- *
+/*
   * Assumptions:
   * - If ib_post_send() succeeds, only one completion is expected,
   *   even if one or more WRs are flushed. This is true when posting
@@ -367,6 +369,8 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)
         struct ib_cqe *cqe;
         int ret;
  
+       might_sleep();
+
         if (cc->cc_sqecount > rdma->sc_sq_depth)
                 return -EINVAL;
  
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c

index 22a871e..c6644cc 100644 (file)
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -123,18 +123,17 @@ static void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma,
  static struct svc_rdma_send_ctxt *
  svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
  {
+       int node = ibdev_to_node(rdma->sc_cm_id->device);
         struct svc_rdma_send_ctxt *ctxt;
         dma_addr_t addr;
         void *buffer;
-       size_t size;
         int i;
  
-       size = sizeof(*ctxt);
-       size += rdma->sc_max_send_sges * sizeof(struct ib_sge);
-       ctxt = kmalloc(size, GFP_KERNEL);
+       ctxt = kmalloc_node(struct_size(ctxt, sc_sges, rdma->sc_max_send_sges),
+                           GFP_KERNEL, node);
         if (!ctxt)
                 goto fail0;
-       buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL);
+       buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node);
         if (!buffer)
                 goto fail1;
         addr = ib_dma_map_single(rdma->sc_pd->device, buffer,
@@ -148,7 +147,6 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
         ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe;
         ctxt->sc_send_wr.sg_list = ctxt->sc_sges;
         ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED;
-       init_completion(&ctxt->sc_done);
         ctxt->sc_cqe.done = svc_rdma_wc_send;
         ctxt->sc_xprt_buf = buffer;
         xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf,
@@ -214,6 +212,7 @@ out:
  
         ctxt->sc_send_wr.num_sge = 0;
         ctxt->sc_cur_sge_no = 0;
+       ctxt->sc_page_count = 0;
         return ctxt;
  
  out_empty:
@@ -228,6 +227,8 @@ out_empty:
   * svc_rdma_send_ctxt_put - Return send_ctxt to free list
   * @rdma: controlling svcxprt_rdma
   * @ctxt: object to return to the free list
+ *
+ * Pages left in sc_pages are DMA unmapped and released.
   */
  void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
                             struct svc_rdma_send_ctxt *ctxt)
@@ -235,6 +236,9 @@ void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
         struct ib_device *device = rdma->sc_cm_id->device;
         unsigned int i;
  
+       if (ctxt->sc_page_count)
+               release_pages(ctxt->sc_pages, ctxt->sc_page_count);
+
         /* The first SGE contains the transport header, which
          * remains mapped until @ctxt is destroyed.
          */
@@ -281,12 +285,12 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
                 container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe);
  
         svc_rdma_wake_send_waiters(rdma, 1);
-       complete(&ctxt->sc_done);
  
         if (unlikely(wc->status != IB_WC_SUCCESS))
                 goto flushed;
  
         trace_svcrdma_wc_send(wc, &ctxt->sc_cid);
+       svc_rdma_send_ctxt_put(rdma, ctxt);
         return;
  
  flushed:
@@ -294,6 +298,7 @@ flushed:
                 trace_svcrdma_wc_send_err(wc, &ctxt->sc_cid);
         else
                 trace_svcrdma_wc_send_flush(wc, &ctxt->sc_cid);
+       svc_rdma_send_ctxt_put(rdma, ctxt);
         svc_xprt_deferred_close(&rdma->sc_xprt);
  }
  
@@ -310,7 +315,7 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt)
         struct ib_send_wr *wr = &ctxt->sc_send_wr;
         int ret;
  
-       reinit_completion(&ctxt->sc_done);
+       might_sleep();
  
         /* Sync the transport header buffer */
         ib_dma_sync_single_for_device(rdma->sc_pd->device,
@@ -799,6 +804,25 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
                                        svc_rdma_xb_dma_map, &args);
  }
  
+/* The svc_rqst and all resources it owns are released as soon as
+ * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt
+ * so they are released by the Send completion handler.
+ */
+static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
+                                  struct svc_rdma_send_ctxt *ctxt)
+{
+       int i, pages = rqstp->rq_next_page - rqstp->rq_respages;
+
+       ctxt->sc_page_count += pages;
+       for (i = 0; i < pages; i++) {
+               ctxt->sc_pages[i] = rqstp->rq_respages[i];
+               rqstp->rq_respages[i] = NULL;
+       }
+
+       /* Prevent svc_xprt_release from releasing pages in rq_pages */
+       rqstp->rq_next_page = rqstp->rq_respages;
+}
+
  /* Prepare the portion of the RPC Reply that will be transmitted
   * via RDMA Send. The RPC-over-RDMA transport header is prepared
   * in sc_sges[0], and the RPC xdr_buf is prepared in following sges.
@@ -828,6 +852,8 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
         if (ret < 0)
                 return ret;
  
+       svc_rdma_save_io_pages(rqstp, sctxt);
+
         if (rctxt->rc_inv_rkey) {
                 sctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV;
                 sctxt->sc_send_wr.ex.invalidate_rkey = rctxt->rc_inv_rkey;
@@ -835,13 +861,7 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
                 sctxt->sc_send_wr.opcode = IB_WR_SEND;
         }
  
-       ret = svc_rdma_send(rdma, sctxt);
-       if (ret < 0)
-               return ret;
-
-       ret = wait_for_completion_killable(&sctxt->sc_done);
-       svc_rdma_send_ctxt_put(rdma, sctxt);
-       return ret;
+       return svc_rdma_send(rdma, sctxt);
  }
  
  /**
@@ -907,8 +927,7 @@ void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
         sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len;
         if (svc_rdma_send(rdma, sctxt))
                 goto put_ctxt;
-
-       wait_for_completion_killable(&sctxt->sc_done);
+       return;
  
  put_ctxt:
         svc_rdma_send_ctxt_put(rdma, sctxt);
@@ -976,17 +995,16 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
         ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp);
         if (ret < 0)
                 goto put_ctxt;
-
-       /* Prevent svc_xprt_release() from releasing the page backing
-        * rq_res.head[0].iov_base. It's no longer being accessed by
-        * the I/O device. */
-       rqstp->rq_respages++;
         return 0;
  
  reply_chunk:
         if (ret != -E2BIG && ret != -EINVAL)
                 goto put_ctxt;
  
+       /* Send completion releases payload pages that were part
+        * of previously posted RDMA Writes.
+        */
+       svc_rdma_save_io_pages(rqstp, sctxt);
         svc_rdma_send_error_msg(rdma, sctxt, rctxt, ret);
         return 0;
  
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c

index ca04f7a..2abd895 100644 (file)
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -64,7 +64,7 @@
  #define RPCDBG_FACILITY        RPCDBG_SVCXPRT
  
  static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
-                                                struct net *net);
+                                                struct net *net, int node);
  static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
                                         struct net *net,
                                         struct sockaddr *sa, int salen,
@@ -123,14 +123,14 @@ static void qp_event_handler(struct ib_event *event, void *context)
  }
  
  static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
-                                                struct net *net)
+                                                struct net *net, int node)
  {
-       struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL);
+       struct svcxprt_rdma *cma_xprt;
  
-       if (!cma_xprt) {
-               dprintk("svcrdma: failed to create new transport\n");
+       cma_xprt = kzalloc_node(sizeof(*cma_xprt), GFP_KERNEL, node);
+       if (!cma_xprt)
                 return NULL;
-       }
+
         svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
         INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
         INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
@@ -193,9 +193,9 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id,
         struct svcxprt_rdma *newxprt;
         struct sockaddr *sa;
  
-       /* Create a new transport */
         newxprt = svc_rdma_create_xprt(listen_xprt->sc_xprt.xpt_server,
-                                      listen_xprt->sc_xprt.xpt_net);
+                                      listen_xprt->sc_xprt.xpt_net,
+                                      ibdev_to_node(new_cma_id->device));
         if (!newxprt)
                 return;
         newxprt->sc_cm_id = new_cma_id;
@@ -304,7 +304,7 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
  
         if (sa->sa_family != AF_INET && sa->sa_family != AF_INET6)
                 return ERR_PTR(-EAFNOSUPPORT);
-       cma_xprt = svc_rdma_create_xprt(serv, net);
+       cma_xprt = svc_rdma_create_xprt(serv, net, NUMA_NO_NODE);
         if (!cma_xprt)
                 return ERR_PTR(-ENOMEM);
         set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
author	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 26 Jun 2023 17:48:57 +0000 (10:48 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 26 Jun 2023 17:48:57 +0000 (10:48 -0700)
.mailmap		patch \| blob \| history
MAINTAINERS		patch \| blob \| history
fs/lockd/svc.c		patch \| blob \| history
fs/nfsd/cache.h		patch \| blob \| history
fs/nfsd/export.c		patch \| blob \| history
fs/nfsd/nfs3proc.c		patch \| blob \| history
fs/nfsd/nfs3xdr.c		patch \| blob \| history
fs/nfsd/nfs4xdr.c		patch \| blob \| history
fs/nfsd/nfscache.c		patch \| blob \| history
fs/nfsd/nfsctl.c		patch \| blob \| history
fs/nfsd/nfsfh.c		patch \| blob \| history
fs/nfsd/nfsproc.c		patch \| blob \| history
fs/nfsd/nfssvc.c		patch \| blob \| history
fs/nfsd/nfsxdr.c		patch \| blob \| history
fs/nfsd/trace.h		patch \| blob \| history
fs/nfsd/vfs.c		patch \| blob \| history
fs/nfsd/vfs.h		patch \| blob \| history
include/linux/sunrpc/svc.h		patch \| blob \| history
include/linux/sunrpc/svc_rdma.h		patch \| blob \| history
include/linux/sunrpc/xdr.h		patch \| blob \| history
include/trace/events/rpcrdma.h		patch \| blob \| history
include/trace/events/sunrpc.h		patch \| blob \| history
net/sunrpc/svc.c		patch \| blob \| history
net/sunrpc/svc_xprt.c		patch \| blob \| history
net/sunrpc/svcsock.c		patch \| blob \| history
net/sunrpc/xdr.c		patch \| blob \| history
net/sunrpc/xprtrdma/svc_rdma_backchannel.c		patch \| blob \| history
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c		patch \| blob \| history
net/sunrpc/xprtrdma/svc_rdma_rw.c		patch \| blob \| history
net/sunrpc/xprtrdma/svc_rdma_sendto.c		patch \| blob \| history
net/sunrpc/xprtrdma/svc_rdma_transport.c		patch \| blob \| history