Merge tag 'nfs-for-4.9-1' of git://git.linux-nfs.org/projects/anna/linux-nfs
authorLinus Torvalds <torvalds@linux-foundation.org>
Fri, 14 Oct 2016 04:28:20 +0000 (21:28 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Fri, 14 Oct 2016 04:28:20 +0000 (21:28 -0700)
Pull NFS client updates from Anna Schumaker:
 "Highlights include:

  Stable bugfixes:
   - sunrpc: fix writ espace race causing stalls
   - NFS: Fix inode corruption in nfs_prime_dcache()
   - NFSv4: Don't report revoked delegations as valid in nfs_have_delegation()
   - NFSv4: nfs4_copy_delegation_stateid() must fail if the delegation is invalid
   - NFSv4: Open state recovery must account for file permission changes
   - NFSv4.2: Fix a reference leak in nfs42_proc_layoutstats_generic

  Features:
   - Add support for tracking multiple layout types with an ordered list
   - Add support for using multiple backchannel threads on the client
   - Add support for pNFS file layout session trunking
   - Delay xprtrdma use of DMA API (for device driver removal)
   - Add support for xprtrdma remote invalidation
   - Add support for larger xprtrdma inline thresholds
   - Use a scatter/gather list for sending xprtrdma RPC calls
   - Add support for the CB_NOTIFY_LOCK callback
   - Improve hashing sunrpc auth_creds by using both uid and gid

  Bugfixes:
   - Fix xprtrdma use of DMA API
   - Validate filenames before adding to the dcache
   - Fix corruption of xdr->nwords in xdr_copy_to_scratch
   - Fix setting buffer length in xdr_set_next_buffer()
   - Don't deadlock the state manager on the SEQUENCE status flags
   - Various delegation and stateid related fixes
   - Retry operations if an interrupted slot receives EREMOTEIO
   - Make nfs boot time y2038 safe"

* tag 'nfs-for-4.9-1' of git://git.linux-nfs.org/projects/anna/linux-nfs: (100 commits)
  NFSv4.2: Fix a reference leak in nfs42_proc_layoutstats_generic
  fs: nfs: Make nfs boot time y2038 safe
  sunrpc: replace generic auth_cred hash with auth-specific function
  sunrpc: add RPCSEC_GSS hash_cred() function
  sunrpc: add auth_unix hash_cred() function
  sunrpc: add generic_auth hash_cred() function
  sunrpc: add hash_cred() function to rpc_authops struct
  Retry operation on EREMOTEIO on an interrupted slot
  pNFS: Fix atime updates on pNFS clients
  sunrpc: queue work on system_power_efficient_wq
  NFSv4.1: Even if the stateid is OK, we may need to recover the open modes
  NFSv4: If recovery failed for a specific open stateid, then don't retry
  NFSv4: Fix retry issues with nfs41_test/free_stateid
  NFSv4: Open state recovery must account for file permission changes
  NFSv4: Mark the lock and open stateids as invalid after freeing them
  NFSv4: Don't test open_stateid unless it is set
  NFSv4: nfs4_do_handle_exception() handle revoke/expiry of a single stateid
  NFS: Always call nfs_inode_find_state_and_recover() when revoking a delegation
  NFSv4: Fix a race when updating an open_stateid
  NFSv4: Fix a race in nfs_inode_reclaim_delegation()
  ...

58 files changed:
Documentation/kernel-parameters.txt
fs/nfs/cache_lib.c
fs/nfs/callback.c
fs/nfs/callback.h
fs/nfs/callback_proc.c
fs/nfs/callback_xdr.c
fs/nfs/client.c
fs/nfs/delegation.c
fs/nfs/delegation.h
fs/nfs/dir.c
fs/nfs/direct.c
fs/nfs/file.c
fs/nfs/flexfilelayout/flexfilelayout.c
fs/nfs/internal.h
fs/nfs/netns.h
fs/nfs/nfs42proc.c
fs/nfs/nfs4_fs.h
fs/nfs/nfs4client.c
fs/nfs/nfs4proc.c
fs/nfs/nfs4session.h
fs/nfs/nfs4state.c
fs/nfs/nfs4xdr.c
fs/nfs/pnfs.c
fs/nfs/pnfs.h
fs/nfs/pnfs_nfs.c
fs/nfs/super.c
include/linux/nfs4.h
include/linux/nfs_fs_sb.h
include/linux/nfs_xdr.h
include/linux/sunrpc/auth.h
include/linux/sunrpc/clnt.h
include/linux/sunrpc/rpc_rdma.h
include/linux/sunrpc/sched.h
include/linux/sunrpc/xdr.h
include/linux/sunrpc/xprt.h
include/linux/sunrpc/xprtmultipath.h
include/linux/sunrpc/xprtrdma.h
net/sunrpc/auth.c
net/sunrpc/auth_generic.c
net/sunrpc/auth_gss/auth_gss.c
net/sunrpc/auth_unix.c
net/sunrpc/backchannel_rqst.c
net/sunrpc/cache.c
net/sunrpc/clnt.c
net/sunrpc/sched.c
net/sunrpc/svc.c
net/sunrpc/xdr.c
net/sunrpc/xprt.c
net/sunrpc/xprtmultipath.c
net/sunrpc/xprtrdma/backchannel.c
net/sunrpc/xprtrdma/fmr_ops.c
net/sunrpc/xprtrdma/frwr_ops.c
net/sunrpc/xprtrdma/rpc_rdma.c
net/sunrpc/xprtrdma/svc_rdma_backchannel.c
net/sunrpc/xprtrdma/transport.c
net/sunrpc/xprtrdma/verbs.c
net/sunrpc/xprtrdma/xprt_rdma.h
net/sunrpc/xprtsock.c

index a1489e1..58f3c10 100644 (file)
@@ -2470,6 +2470,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
        nfsrootdebug    [NFS] enable nfsroot debugging messages.
                        See Documentation/filesystems/nfs/nfsroot.txt.
 
+       nfs.callback_nr_threads=
+                       [NFSv4] set the total number of threads that the
+                       NFS client will assign to service NFSv4 callback
+                       requests.
+
        nfs.callback_tcpport=
                        [NFS] set the TCP port on which the NFSv4 callback
                        channel should listen.
@@ -2493,6 +2498,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        of returning the full 64-bit number.
                        The default is to return 64-bit inode numbers.
 
+       nfs.max_session_cb_slots=
+                       [NFSv4.1] Sets the maximum number of session
+                       slots the client will assign to the callback
+                       channel. This determines the maximum number of
+                       callbacks the client will process in parallel for
+                       a particular server.
+
        nfs.max_session_slots=
                        [NFSv4.1] Sets the maximum number of session slots
                        the client will attempt to negotiate with the server.
index 5f7b053..6de1570 100644 (file)
@@ -76,7 +76,7 @@ static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany)
 
        dreq = container_of(d, struct nfs_cache_defer_req, deferred_req);
 
-       complete_all(&dreq->completion);
+       complete(&dreq->completion);
        nfs_cache_defer_req_put(dreq);
 }
 
index 52a2831..532d8e2 100644 (file)
@@ -31,8 +31,6 @@
 struct nfs_callback_data {
        unsigned int users;
        struct svc_serv *serv;
-       struct svc_rqst *rqst;
-       struct task_struct *task;
 };
 
 static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1];
@@ -89,15 +87,6 @@ nfs4_callback_svc(void *vrqstp)
        return 0;
 }
 
-/*
- * Prepare to bring up the NFSv4 callback service
- */
-static struct svc_rqst *
-nfs4_callback_up(struct svc_serv *serv)
-{
-       return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
-}
-
 #if defined(CONFIG_NFS_V4_1)
 /*
  * The callback service for NFSv4.1 callbacks
@@ -139,29 +128,6 @@ nfs41_callback_svc(void *vrqstp)
        return 0;
 }
 
-/*
- * Bring up the NFSv4.1 callback service
- */
-static struct svc_rqst *
-nfs41_callback_up(struct svc_serv *serv)
-{
-       struct svc_rqst *rqstp;
-
-       INIT_LIST_HEAD(&serv->sv_cb_list);
-       spin_lock_init(&serv->sv_cb_lock);
-       init_waitqueue_head(&serv->sv_cb_waitq);
-       rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
-       dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp));
-       return rqstp;
-}
-
-static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv,
-               struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
-{
-       *rqstpp = nfs41_callback_up(serv);
-       *callback_svc = nfs41_callback_svc;
-}
-
 static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
                struct svc_serv *serv)
 {
@@ -173,13 +139,6 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
                xprt->bc_serv = serv;
 }
 #else
-static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv,
-               struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
-{
-       *rqstpp = ERR_PTR(-ENOTSUPP);
-       *callback_svc = ERR_PTR(-ENOTSUPP);
-}
-
 static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
                struct svc_serv *serv)
 {
@@ -189,45 +148,22 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
 static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
                                  struct svc_serv *serv)
 {
-       struct svc_rqst *rqstp;
-       int (*callback_svc)(void *vrqstp);
-       struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+       int nrservs = nfs_callback_nr_threads;
        int ret;
 
        nfs_callback_bc_serv(minorversion, xprt, serv);
 
-       if (cb_info->task)
-               return 0;
+       if (nrservs < NFS4_MIN_NR_CALLBACK_THREADS)
+               nrservs = NFS4_MIN_NR_CALLBACK_THREADS;
 
-       switch (minorversion) {
-       case 0:
-               /* v4.0 callback setup */
-               rqstp = nfs4_callback_up(serv);
-               callback_svc = nfs4_callback_svc;
-               break;
-       default:
-               nfs_minorversion_callback_svc_setup(serv,
-                               &rqstp, &callback_svc);
-       }
-
-       if (IS_ERR(rqstp))
-               return PTR_ERR(rqstp);
-
-       svc_sock_update_bufs(serv);
+       if (serv->sv_nrthreads-1 == nrservs)
+               return 0;
 
-       cb_info->serv = serv;
-       cb_info->rqst = rqstp;
-       cb_info->task = kthread_create(callback_svc, cb_info->rqst,
-                                   "nfsv4.%u-svc", minorversion);
-       if (IS_ERR(cb_info->task)) {
-               ret = PTR_ERR(cb_info->task);
-               svc_exit_thread(cb_info->rqst);
-               cb_info->rqst = NULL;
-               cb_info->task = NULL;
+       ret = serv->sv_ops->svo_setup(serv, NULL, nrservs);
+       if (ret) {
+               serv->sv_ops->svo_setup(serv, NULL, 0);
                return ret;
        }
-       rqstp->rq_task = cb_info->task;
-       wake_up_process(cb_info->task);
        dprintk("nfs_callback_up: service started\n");
        return 0;
 }
@@ -281,19 +217,41 @@ err_bind:
        return ret;
 }
 
-static struct svc_serv_ops nfs_cb_sv_ops = {
+static struct svc_serv_ops nfs40_cb_sv_ops = {
+       .svo_function           = nfs4_callback_svc,
        .svo_enqueue_xprt       = svc_xprt_do_enqueue,
+       .svo_setup              = svc_set_num_threads,
+       .svo_module             = THIS_MODULE,
+};
+#if defined(CONFIG_NFS_V4_1)
+static struct svc_serv_ops nfs41_cb_sv_ops = {
+       .svo_function           = nfs41_callback_svc,
+       .svo_enqueue_xprt       = svc_xprt_do_enqueue,
+       .svo_setup              = svc_set_num_threads,
+       .svo_module             = THIS_MODULE,
+};
+
+struct svc_serv_ops *nfs4_cb_sv_ops[] = {
+       [0] = &nfs40_cb_sv_ops,
+       [1] = &nfs41_cb_sv_ops,
+};
+#else
+struct svc_serv_ops *nfs4_cb_sv_ops[] = {
+       [0] = &nfs40_cb_sv_ops,
+       [1] = NULL,
 };
+#endif
 
 static struct svc_serv *nfs_callback_create_svc(int minorversion)
 {
        struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
        struct svc_serv *serv;
+       struct svc_serv_ops *sv_ops;
 
        /*
         * Check whether we're already up and running.
         */
-       if (cb_info->task) {
+       if (cb_info->serv) {
                /*
                 * Note: increase service usage, because later in case of error
                 * svc_destroy() will be called.
@@ -302,6 +260,17 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
                return cb_info->serv;
        }
 
+       switch (minorversion) {
+       case 0:
+               sv_ops = nfs4_cb_sv_ops[0];
+               break;
+       default:
+               sv_ops = nfs4_cb_sv_ops[1];
+       }
+
+       if (sv_ops == NULL)
+               return ERR_PTR(-ENOTSUPP);
+
        /*
         * Sanity check: if there's no task,
         * we should be the first user ...
@@ -310,11 +279,12 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
                printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n",
                        cb_info->users);
 
-       serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, &nfs_cb_sv_ops);
+       serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops);
        if (!serv) {
                printk(KERN_ERR "nfs_callback_create_svc: create service failed\n");
                return ERR_PTR(-ENOMEM);
        }
+       cb_info->serv = serv;
        /* As there is only one thread we need to over-ride the
         * default maximum of 80 connections
         */
@@ -357,6 +327,8 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
         * thread exits.
         */
 err_net:
+       if (!cb_info->users)
+               cb_info->serv = NULL;
        svc_destroy(serv);
 err_create:
        mutex_unlock(&nfs_callback_mutex);
@@ -374,18 +346,18 @@ err_start:
 void nfs_callback_down(int minorversion, struct net *net)
 {
        struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+       struct svc_serv *serv;
 
        mutex_lock(&nfs_callback_mutex);
-       nfs_callback_down_net(minorversion, cb_info->serv, net);
+       serv = cb_info->serv;
+       nfs_callback_down_net(minorversion, serv, net);
        cb_info->users--;
-       if (cb_info->users == 0 && cb_info->task != NULL) {
-               kthread_stop(cb_info->task);
-               dprintk("nfs_callback_down: service stopped\n");
-               svc_exit_thread(cb_info->rqst);
+       if (cb_info->users == 0) {
+               svc_get(serv);
+               serv->sv_ops->svo_setup(serv, NULL, 0);
+               svc_destroy(serv);
                dprintk("nfs_callback_down: service destroyed\n");
                cb_info->serv = NULL;
-               cb_info->rqst = NULL;
-               cb_info->task = NULL;
        }
        mutex_unlock(&nfs_callback_mutex);
 }
index 5fe1cec..c701c30 100644 (file)
@@ -179,6 +179,15 @@ extern __be32 nfs4_callback_devicenotify(
        struct cb_devicenotifyargs *args,
        void *dummy, struct cb_process_state *cps);
 
+struct cb_notify_lock_args {
+       struct nfs_fh                   cbnl_fh;
+       struct nfs_lowner               cbnl_owner;
+       bool                            cbnl_valid;
+};
+
+extern __be32 nfs4_callback_notify_lock(struct cb_notify_lock_args *args,
+                                        void *dummy,
+                                        struct cb_process_state *cps);
 #endif /* CONFIG_NFS_V4_1 */
 extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
 extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
@@ -198,6 +207,9 @@ extern void nfs_callback_down(int minorversion, struct net *net);
 #define NFS41_BC_MIN_CALLBACKS 1
 #define NFS41_BC_MAX_CALLBACKS 1
 
+#define NFS4_MIN_NR_CALLBACK_THREADS 1
+
 extern unsigned int nfs_callback_set_tcpport;
+extern unsigned short nfs_callback_nr_threads;
 
 #endif /* __LINUX_FS_NFS_CALLBACK_H */
index f953ef6..e9aa235 100644 (file)
@@ -628,4 +628,20 @@ out:
        dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
        return status;
 }
+
+__be32 nfs4_callback_notify_lock(struct cb_notify_lock_args *args, void *dummy,
+                                struct cb_process_state *cps)
+{
+       if (!cps->clp) /* set in cb_sequence */
+               return htonl(NFS4ERR_OP_NOT_IN_SESSION);
+
+       dprintk_rcu("NFS: CB_NOTIFY_LOCK request from %s\n",
+               rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+
+       /* Don't wake anybody if the string looked bogus */
+       if (args->cbnl_valid)
+               __wake_up(&cps->clp->cl_lock_waitq, TASK_NORMAL, 0, args);
+
+       return htonl(NFS4_OK);
+}
 #endif /* CONFIG_NFS_V4_1 */
index 656f68f..eb094c6 100644 (file)
@@ -35,6 +35,7 @@
                                         (1 + 3) * 4) // seqid, 3 slotids
 #define CB_OP_RECALLANY_RES_MAXSZ      (CB_OP_HDR_RES_MAXSZ)
 #define CB_OP_RECALLSLOT_RES_MAXSZ     (CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_NOTIFY_LOCK_RES_MAXSZ    (CB_OP_HDR_RES_MAXSZ)
 #endif /* CONFIG_NFS_V4_1 */
 
 #define NFSDBG_FACILITY NFSDBG_CALLBACK
@@ -72,7 +73,7 @@ static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
        return xdr_ressize_check(rqstp, p);
 }
 
-static __be32 *read_buf(struct xdr_stream *xdr, int nbytes)
+static __be32 *read_buf(struct xdr_stream *xdr, size_t nbytes)
 {
        __be32 *p;
 
@@ -534,6 +535,49 @@ static __be32 decode_recallslot_args(struct svc_rqst *rqstp,
        return 0;
 }
 
+static __be32 decode_lockowner(struct xdr_stream *xdr, struct cb_notify_lock_args *args)
+{
+       __be32          *p;
+       unsigned int    len;
+
+       p = read_buf(xdr, 12);
+       if (unlikely(p == NULL))
+               return htonl(NFS4ERR_BADXDR);
+
+       p = xdr_decode_hyper(p, &args->cbnl_owner.clientid);
+       len = be32_to_cpu(*p);
+
+       p = read_buf(xdr, len);
+       if (unlikely(p == NULL))
+               return htonl(NFS4ERR_BADXDR);
+
+       /* Only try to decode if the length is right */
+       if (len == 20) {
+               p += 2; /* skip "lock id:" */
+               args->cbnl_owner.s_dev = be32_to_cpu(*p++);
+               xdr_decode_hyper(p, &args->cbnl_owner.id);
+               args->cbnl_valid = true;
+       } else {
+               args->cbnl_owner.s_dev = 0;
+               args->cbnl_owner.id = 0;
+               args->cbnl_valid = false;
+       }
+       return 0;
+}
+
+static __be32 decode_notify_lock_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_notify_lock_args *args)
+{
+       __be32 status;
+
+       status = decode_fh(xdr, &args->cbnl_fh);
+       if (unlikely(status != 0))
+               goto out;
+       status = decode_lockowner(xdr, args);
+out:
+       dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+       return status;
+}
+
 #endif /* CONFIG_NFS_V4_1 */
 
 static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
@@ -746,6 +790,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
        case OP_CB_RECALL_SLOT:
        case OP_CB_LAYOUTRECALL:
        case OP_CB_NOTIFY_DEVICEID:
+       case OP_CB_NOTIFY_LOCK:
                *op = &callback_ops[op_nr];
                break;
 
@@ -753,7 +798,6 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
        case OP_CB_PUSH_DELEG:
        case OP_CB_RECALLABLE_OBJ_AVAIL:
        case OP_CB_WANTS_CANCELLED:
-       case OP_CB_NOTIFY_LOCK:
                return htonl(NFS4ERR_NOTSUPP);
 
        default:
@@ -1006,6 +1050,11 @@ static struct callback_op callback_ops[] = {
                .decode_args = (callback_decode_arg_t)decode_recallslot_args,
                .res_maxsize = CB_OP_RECALLSLOT_RES_MAXSZ,
        },
+       [OP_CB_NOTIFY_LOCK] = {
+               .process_op = (callback_process_op_t)nfs4_callback_notify_lock,
+               .decode_args = (callback_decode_arg_t)decode_notify_lock_args,
+               .res_maxsize = CB_OP_NOTIFY_LOCK_RES_MAXSZ,
+       },
 #endif /* CONFIG_NFS_V4_1 */
 };
 
index 1e10678..7555ba8 100644 (file)
@@ -313,7 +313,10 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
                        continue;
                /* Match the full socket address */
                if (!rpc_cmp_addr_port(sap, clap))
-                       continue;
+                       /* Match all xprt_switch full socket addresses */
+                       if (!rpc_clnt_xprt_switch_has_addr(clp->cl_rpcclient,
+                                                          sap))
+                               continue;
 
                atomic_inc(&clp->cl_count);
                return clp;
@@ -785,7 +788,8 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs
        }
 
        fsinfo.fattr = fattr;
-       fsinfo.layouttype = 0;
+       fsinfo.nlayouttypes = 0;
+       memset(fsinfo.layouttype, 0, sizeof(fsinfo.layouttype));
        error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
        if (error < 0)
                goto out_error;
@@ -1078,7 +1082,7 @@ void nfs_clients_init(struct net *net)
        idr_init(&nn->cb_ident_idr);
 #endif
        spin_lock_init(&nn->nfs_client_lock);
-       nn->boot_time = CURRENT_TIME;
+       nn->boot_time = ktime_get_real();
 }
 
 #ifdef CONFIG_PROC_FS
index 322c258..dff600a 100644 (file)
@@ -41,6 +41,17 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
        set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
 }
 
+static bool
+nfs4_is_valid_delegation(const struct nfs_delegation *delegation,
+               fmode_t flags)
+{
+       if (delegation != NULL && (delegation->type & flags) == flags &&
+           !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) &&
+           !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
+               return true;
+       return false;
+}
+
 static int
 nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
 {
@@ -50,8 +61,7 @@ nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
        flags &= FMODE_READ|FMODE_WRITE;
        rcu_read_lock();
        delegation = rcu_dereference(NFS_I(inode)->delegation);
-       if (delegation != NULL && (delegation->type & flags) == flags &&
-           !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
+       if (nfs4_is_valid_delegation(delegation, flags)) {
                if (mark)
                        nfs_mark_delegation_referenced(delegation);
                ret = 1;
@@ -185,15 +195,13 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
                        rcu_read_unlock();
                        put_rpccred(oldcred);
                        trace_nfs4_reclaim_delegation(inode, res->delegation_type);
-               } else {
-                       /* We appear to have raced with a delegation return. */
-                       spin_unlock(&delegation->lock);
-                       rcu_read_unlock();
-                       nfs_inode_set_delegation(inode, cred, res);
+                       return;
                }
-       } else {
-               rcu_read_unlock();
+               /* We appear to have raced with a delegation return. */
+               spin_unlock(&delegation->lock);
        }
+       rcu_read_unlock();
+       nfs_inode_set_delegation(inode, cred, res);
 }
 
 static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
@@ -642,28 +650,49 @@ static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *cl
        rcu_read_unlock();
 }
 
-static void nfs_revoke_delegation(struct inode *inode)
+static void nfs_mark_delegation_revoked(struct nfs_server *server,
+               struct nfs_delegation *delegation)
+{
+       set_bit(NFS_DELEGATION_REVOKED, &delegation->flags);
+       delegation->stateid.type = NFS4_INVALID_STATEID_TYPE;
+       nfs_mark_return_delegation(server, delegation);
+}
+
+static bool nfs_revoke_delegation(struct inode *inode,
+               const nfs4_stateid *stateid)
 {
        struct nfs_delegation *delegation;
+       nfs4_stateid tmp;
+       bool ret = false;
+
        rcu_read_lock();
        delegation = rcu_dereference(NFS_I(inode)->delegation);
-       if (delegation != NULL) {
-               set_bit(NFS_DELEGATION_REVOKED, &delegation->flags);
-               nfs_mark_return_delegation(NFS_SERVER(inode), delegation);
-       }
+       if (delegation == NULL)
+               goto out;
+       if (stateid == NULL) {
+               nfs4_stateid_copy(&tmp, &delegation->stateid);
+               stateid = &tmp;
+       } else if (!nfs4_stateid_match(stateid, &delegation->stateid))
+               goto out;
+       nfs_mark_delegation_revoked(NFS_SERVER(inode), delegation);
+       ret = true;
+out:
        rcu_read_unlock();
+       if (ret)
+               nfs_inode_find_state_and_recover(inode, stateid);
+       return ret;
 }
 
-void nfs_remove_bad_delegation(struct inode *inode)
+void nfs_remove_bad_delegation(struct inode *inode,
+               const nfs4_stateid *stateid)
 {
        struct nfs_delegation *delegation;
 
-       nfs_revoke_delegation(inode);
+       if (!nfs_revoke_delegation(inode, stateid))
+               return;
        delegation = nfs_inode_detach_delegation(inode);
-       if (delegation) {
-               nfs_inode_find_state_and_recover(inode, &delegation->stateid);
+       if (delegation)
                nfs_free_delegation(delegation);
-       }
 }
 EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation);
 
@@ -786,8 +815,15 @@ static void nfs_delegation_mark_reclaim_server(struct nfs_server *server)
 {
        struct nfs_delegation *delegation;
 
-       list_for_each_entry_rcu(delegation, &server->delegations, super_list)
+       list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+               /*
+                * If the delegation may have been admin revoked, then we
+                * cannot reclaim it.
+                */
+               if (test_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags))
+                       continue;
                set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+       }
 }
 
 /**
@@ -851,6 +887,141 @@ restart:
        rcu_read_unlock();
 }
 
+static inline bool nfs4_server_rebooted(const struct nfs_client *clp)
+{
+       return (clp->cl_state & (BIT(NFS4CLNT_CHECK_LEASE) |
+                               BIT(NFS4CLNT_LEASE_EXPIRED) |
+                               BIT(NFS4CLNT_SESSION_RESET))) != 0;
+}
+
+static void nfs_mark_test_expired_delegation(struct nfs_server *server,
+           struct nfs_delegation *delegation)
+{
+       if (delegation->stateid.type == NFS4_INVALID_STATEID_TYPE)
+               return;
+       clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+       set_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
+       set_bit(NFS4CLNT_DELEGATION_EXPIRED, &server->nfs_client->cl_state);
+}
+
+static void nfs_inode_mark_test_expired_delegation(struct nfs_server *server,
+               struct inode *inode)
+{
+       struct nfs_delegation *delegation;
+
+       rcu_read_lock();
+       delegation = rcu_dereference(NFS_I(inode)->delegation);
+       if (delegation)
+               nfs_mark_test_expired_delegation(server, delegation);
+       rcu_read_unlock();
+
+}
+
+static void nfs_delegation_mark_test_expired_server(struct nfs_server *server)
+{
+       struct nfs_delegation *delegation;
+
+       list_for_each_entry_rcu(delegation, &server->delegations, super_list)
+               nfs_mark_test_expired_delegation(server, delegation);
+}
+
+/**
+ * nfs_mark_test_expired_all_delegations - mark all delegations for testing
+ * @clp: nfs_client to process
+ *
+ * Iterates through all the delegations associated with this server and
+ * marks them as needing to be checked for validity.
+ */
+void nfs_mark_test_expired_all_delegations(struct nfs_client *clp)
+{
+       struct nfs_server *server;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+               nfs_delegation_mark_test_expired_server(server);
+       rcu_read_unlock();
+}
+
+/**
+ * nfs_reap_expired_delegations - reap expired delegations
+ * @clp: nfs_client to process
+ *
+ * Iterates through all the delegations associated with this server and
+ * checks if they have may have been revoked. This function is usually
+ * expected to be called in cases where the server may have lost its
+ * lease.
+ */
+void nfs_reap_expired_delegations(struct nfs_client *clp)
+{
+       const struct nfs4_minor_version_ops *ops = clp->cl_mvops;
+       struct nfs_delegation *delegation;
+       struct nfs_server *server;
+       struct inode *inode;
+       struct rpc_cred *cred;
+       nfs4_stateid stateid;
+
+restart:
+       rcu_read_lock();
+       list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+               list_for_each_entry_rcu(delegation, &server->delegations,
+                                                               super_list) {
+                       if (test_bit(NFS_DELEGATION_RETURNING,
+                                               &delegation->flags))
+                               continue;
+                       if (test_bit(NFS_DELEGATION_TEST_EXPIRED,
+                                               &delegation->flags) == 0)
+                               continue;
+                       if (!nfs_sb_active(server->super))
+                               continue;
+                       inode = nfs_delegation_grab_inode(delegation);
+                       if (inode == NULL) {
+                               rcu_read_unlock();
+                               nfs_sb_deactive(server->super);
+                               goto restart;
+                       }
+                       cred = get_rpccred_rcu(delegation->cred);
+                       nfs4_stateid_copy(&stateid, &delegation->stateid);
+                       clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
+                       rcu_read_unlock();
+                       if (cred != NULL &&
+                           ops->test_and_free_expired(server, &stateid, cred) < 0) {
+                               nfs_revoke_delegation(inode, &stateid);
+                               nfs_inode_find_state_and_recover(inode, &stateid);
+                       }
+                       put_rpccred(cred);
+                       if (nfs4_server_rebooted(clp)) {
+                               nfs_inode_mark_test_expired_delegation(server,inode);
+                               iput(inode);
+                               nfs_sb_deactive(server->super);
+                               return;
+                       }
+                       iput(inode);
+                       nfs_sb_deactive(server->super);
+                       goto restart;
+               }
+       }
+       rcu_read_unlock();
+}
+
+void nfs_inode_find_delegation_state_and_recover(struct inode *inode,
+               const nfs4_stateid *stateid)
+{
+       struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+       struct nfs_delegation *delegation;
+       bool found = false;
+
+       rcu_read_lock();
+       delegation = rcu_dereference(NFS_I(inode)->delegation);
+       if (delegation &&
+           nfs4_stateid_match_other(&delegation->stateid, stateid)) {
+               nfs_mark_test_expired_delegation(NFS_SERVER(inode), delegation);
+               found = true;
+       }
+       rcu_read_unlock();
+       if (found)
+               nfs4_schedule_state_manager(clp);
+}
+
 /**
  * nfs_delegations_present - check for existence of delegations
  * @clp: client state handle
@@ -893,7 +1064,7 @@ bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags,
        flags &= FMODE_READ|FMODE_WRITE;
        rcu_read_lock();
        delegation = rcu_dereference(nfsi->delegation);
-       ret = (delegation != NULL && (delegation->type & flags) == flags);
+       ret = nfs4_is_valid_delegation(delegation, flags);
        if (ret) {
                nfs4_stateid_copy(dst, &delegation->stateid);
                nfs_mark_delegation_referenced(delegation);
index 64724d2..e9d5557 100644 (file)
@@ -32,6 +32,7 @@ enum {
        NFS_DELEGATION_REFERENCED,
        NFS_DELEGATION_RETURNING,
        NFS_DELEGATION_REVOKED,
+       NFS_DELEGATION_TEST_EXPIRED,
 };
 
 int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
@@ -47,11 +48,14 @@ void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags);
 void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
 int nfs_client_return_marked_delegations(struct nfs_client *clp);
 int nfs_delegations_present(struct nfs_client *clp);
-void nfs_remove_bad_delegation(struct inode *inode);
+void nfs_remove_bad_delegation(struct inode *inode, const nfs4_stateid *stateid);
 
 void nfs_delegation_mark_reclaim(struct nfs_client *clp);
 void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
 
+void nfs_mark_test_expired_all_delegations(struct nfs_client *clp);
+void nfs_reap_expired_delegations(struct nfs_client *clp);
+
 /* NFSv4 delegation-related procedures */
 int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);
 int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type);
@@ -62,6 +66,8 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
 int nfs4_have_delegation(struct inode *inode, fmode_t flags);
 int nfs4_check_delegation(struct inode *inode, fmode_t flags);
 bool nfs4_delegation_flush_on_close(const struct inode *inode);
+void nfs_inode_find_delegation_state_and_recover(struct inode *inode,
+               const nfs4_stateid *stateid);
 
 #endif
 
index 06e0bf0..5f1af4c 100644 (file)
@@ -435,11 +435,11 @@ int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
                return 0;
 
        nfsi = NFS_I(inode);
-       if (entry->fattr->fileid == nfsi->fileid)
-               return 1;
-       if (nfs_compare_fh(entry->fh, &nfsi->fh) == 0)
-               return 1;
-       return 0;
+       if (entry->fattr->fileid != nfsi->fileid)
+               return 0;
+       if (entry->fh->size && nfs_compare_fh(entry->fh, &nfsi->fh) != 0)
+               return 0;
+       return 1;
 }
 
 static
@@ -496,6 +496,14 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
                return;
        if (!(entry->fattr->valid & NFS_ATTR_FATTR_FSID))
                return;
+       if (filename.len == 0)
+               return;
+       /* Validate that the name doesn't contain any illegal '\0' */
+       if (strnlen(filename.name, filename.len) != filename.len)
+               return;
+       /* ...or '/' */
+       if (strnchr(filename.name, filename.len, '/'))
+               return;
        if (filename.name[0] == '.') {
                if (filename.len == 1)
                        return;
@@ -517,6 +525,8 @@ again:
                                        &entry->fattr->fsid))
                        goto out;
                if (nfs_same_file(dentry, entry)) {
+                       if (!entry->fh->size)
+                               goto out;
                        nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
                        status = nfs_refresh_inode(d_inode(dentry), entry->fattr);
                        if (!status)
@@ -529,6 +539,10 @@ again:
                        goto again;
                }
        }
+       if (!entry->fh->size) {
+               d_lookup_done(dentry);
+               goto out;
+       }
 
        inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label);
        alias = d_splice_alias(inode, dentry);
index 72b7d13..bd81bcf 100644 (file)
@@ -387,7 +387,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
                dreq->iocb->ki_complete(dreq->iocb, res, 0);
        }
 
-       complete_all(&dreq->completion);
+       complete(&dreq->completion);
 
        nfs_direct_req_release(dreq);
 }
index 2efbdde..9ea85ae 100644 (file)
@@ -520,7 +520,9 @@ const struct address_space_operations nfs_file_aops = {
        .invalidatepage = nfs_invalidate_page,
        .releasepage = nfs_release_page,
        .direct_IO = nfs_direct_IO,
+#ifdef CONFIG_MIGRATION
        .migratepage = nfs_migrate_page,
+#endif
        .launder_page = nfs_launder_page,
        .is_dirty_writeback = nfs_check_dirty_writeback,
        .error_remove_page = generic_error_remove_page,
@@ -685,11 +687,6 @@ out_noconflict:
        goto out;
 }
 
-static int do_vfs_lock(struct file *file, struct file_lock *fl)
-{
-       return locks_lock_file_wait(file, fl);
-}
-
 static int
 do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 {
@@ -722,7 +719,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
        if (!is_local)
                status = NFS_PROTO(inode)->lock(filp, cmd, fl);
        else
-               status = do_vfs_lock(filp, fl);
+               status = locks_lock_file_wait(filp, fl);
        return status;
 }
 
@@ -747,7 +744,7 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
        if (!is_local)
                status = NFS_PROTO(inode)->lock(filp, cmd, fl);
        else
-               status = do_vfs_lock(filp, fl);
+               status = locks_lock_file_wait(filp, fl);
        if (status < 0)
                goto out;
 
index 51b5136..98ace12 100644 (file)
@@ -1080,7 +1080,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
        case -NFS4ERR_BAD_STATEID:
                if (state == NULL)
                        break;
-               nfs_remove_bad_delegation(state->inode);
+               nfs_remove_bad_delegation(state->inode, NULL);
        case -NFS4ERR_OPENMODE:
                if (state == NULL)
                        break;
index a6acce6..80bcc0b 100644 (file)
@@ -534,12 +534,9 @@ void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
 }
 #endif
 
-
 #ifdef CONFIG_MIGRATION
 extern int nfs_migrate_page(struct address_space *,
                struct page *, struct page *, enum migrate_mode);
-#else
-#define nfs_migrate_page NULL
 #endif
 
 static inline int
@@ -562,7 +559,6 @@ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
 extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
 
 /* nfs4proc.c */
-extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
 extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
                            const struct nfs_client_initdata *);
 extern int nfs40_walk_client_list(struct nfs_client *clp,
@@ -571,6 +567,9 @@ extern int nfs40_walk_client_list(struct nfs_client *clp,
 extern int nfs41_walk_client_list(struct nfs_client *clp,
                                struct nfs_client **result,
                                struct rpc_cred *cred);
+extern int nfs4_test_session_trunk(struct rpc_clnt *,
+                               struct rpc_xprt *,
+                               void *);
 
 static inline struct inode *nfs_igrab_and_active(struct inode *inode)
 {
index f0e06e4..fbce0d8 100644 (file)
@@ -29,7 +29,7 @@ struct nfs_net {
        int cb_users[NFS4_MAX_MINOR_VERSION + 1];
 #endif
        spinlock_t nfs_client_lock;
-       struct timespec boot_time;
+       ktime_t boot_time;
 #ifdef CONFIG_PROC_FS
        struct proc_dir_entry *proc_nfsfs;
 #endif
index 64b43b4..6085019 100644 (file)
@@ -443,6 +443,7 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server,
        task = rpc_run_task(&task_setup);
        if (IS_ERR(task))
                return PTR_ERR(task);
+       rpc_put_task(task);
        return 0;
 }
 
index 9bf64ea..9b3a82a 100644 (file)
@@ -39,6 +39,7 @@ enum nfs4_client_state {
        NFS4CLNT_BIND_CONN_TO_SESSION,
        NFS4CLNT_MOVED,
        NFS4CLNT_LEASE_MOVED,
+       NFS4CLNT_DELEGATION_EXPIRED,
 };
 
 #define NFS4_RENEW_TIMEOUT             0x01
@@ -57,8 +58,11 @@ struct nfs4_minor_version_ops {
                        struct nfs_fsinfo *);
        void    (*free_lock_state)(struct nfs_server *,
                        struct nfs4_lock_state *);
+       int     (*test_and_free_expired)(struct nfs_server *,
+                       nfs4_stateid *, struct rpc_cred *);
        struct nfs_seqid *
                (*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
+       int     (*session_trunk)(struct rpc_clnt *, struct rpc_xprt *, void *);
        const struct rpc_call_ops *call_sync_ops;
        const struct nfs4_state_recovery_ops *reboot_recovery_ops;
        const struct nfs4_state_recovery_ops *nograce_recovery_ops;
@@ -156,6 +160,7 @@ enum {
        NFS_STATE_RECLAIM_NOGRACE,      /* OPEN stateid needs to recover state */
        NFS_STATE_POSIX_LOCKS,          /* Posix locks are supported */
        NFS_STATE_RECOVERY_FAILED,      /* OPEN stateid state recovery failed */
+       NFS_STATE_MAY_NOTIFY_LOCK,      /* server may CB_NOTIFY_LOCK */
 };
 
 struct nfs4_state {
@@ -203,6 +208,11 @@ struct nfs4_state_recovery_ops {
                struct rpc_cred *);
 };
 
+struct nfs4_add_xprt_data {
+       struct nfs_client       *clp;
+       struct rpc_cred         *cred;
+};
+
 struct nfs4_state_maintenance_ops {
        int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *, unsigned);
        struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *);
@@ -278,6 +288,8 @@ extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
                struct nfs_fsinfo *fsinfo);
 extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
                                  bool sync);
+extern int nfs4_detect_session_trunking(struct nfs_client *clp,
+               struct nfs41_exchange_id_res *res, struct rpc_xprt *xprt);
 
 static inline bool
 is_ds_only_client(struct nfs_client *clp)
@@ -439,7 +451,7 @@ extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);
 extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
 extern int nfs4_schedule_migration_recovery(const struct nfs_server *);
 extern void nfs4_schedule_lease_moved_recovery(struct nfs_client *);
-extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
+extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags, bool);
 extern void nfs41_handle_server_scope(struct nfs_client *,
                                      struct nfs41_server_scope **);
 extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
@@ -471,6 +483,7 @@ extern struct nfs_subversion nfs_v4;
 struct dentry *nfs4_try_mount(int, const char *, struct nfs_mount_info *, struct nfs_subversion *);
 extern bool nfs4_disable_idmapping;
 extern unsigned short max_session_slots;
+extern unsigned short max_session_cb_slots;
 extern unsigned short send_implementation_id;
 extern bool recover_lost_locks;
 
index cd3b7cf..074ac71 100644 (file)
@@ -199,6 +199,9 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
        clp->cl_minorversion = cl_init->minorversion;
        clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
        clp->cl_mig_gen = 1;
+#if IS_ENABLED(CONFIG_NFS_V4_1)
+       init_waitqueue_head(&clp->cl_lock_waitq);
+#endif
        return clp;
 
 error:
@@ -562,15 +565,15 @@ out:
 /*
  * Returns true if the client IDs match
  */
-static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b)
+static bool nfs4_match_clientids(u64 a, u64 b)
 {
-       if (a->cl_clientid != b->cl_clientid) {
+       if (a != b) {
                dprintk("NFS: --> %s client ID %llx does not match %llx\n",
-                       __func__, a->cl_clientid, b->cl_clientid);
+                       __func__, a, b);
                return false;
        }
        dprintk("NFS: --> %s client ID %llx matches %llx\n",
-               __func__, a->cl_clientid, b->cl_clientid);
+               __func__, a, b);
        return true;
 }
 
@@ -578,17 +581,15 @@ static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b)
  * Returns true if the server major ids match
  */
 static bool
-nfs4_check_clientid_trunking(struct nfs_client *a, struct nfs_client *b)
+nfs4_check_serverowner_major_id(struct nfs41_server_owner *o1,
+                               struct nfs41_server_owner *o2)
 {
-       struct nfs41_server_owner *o1 = a->cl_serverowner;
-       struct nfs41_server_owner *o2 = b->cl_serverowner;
-
        if (o1->major_id_sz != o2->major_id_sz)
                goto out_major_mismatch;
        if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0)
                goto out_major_mismatch;
 
-       dprintk("NFS: --> %s server owners match\n", __func__);
+       dprintk("NFS: --> %s server owner major IDs match\n", __func__);
        return true;
 
 out_major_mismatch:
@@ -597,6 +598,100 @@ out_major_mismatch:
        return false;
 }
 
+/*
+ * Returns true if server minor ids match
+ */
+static bool
+nfs4_check_serverowner_minor_id(struct nfs41_server_owner *o1,
+                               struct nfs41_server_owner *o2)
+{
+       /* Check eir_server_owner so_minor_id */
+       if (o1->minor_id != o2->minor_id)
+               goto out_minor_mismatch;
+
+       dprintk("NFS: --> %s server owner minor IDs match\n", __func__);
+       return true;
+
+out_minor_mismatch:
+       dprintk("NFS: --> %s server owner minor IDs do not match\n", __func__);
+       return false;
+}
+
+/*
+ * Returns true if the server scopes match
+ */
+static bool
+nfs4_check_server_scope(struct nfs41_server_scope *s1,
+                       struct nfs41_server_scope *s2)
+{
+       if (s1->server_scope_sz != s2->server_scope_sz)
+               goto out_scope_mismatch;
+       if (memcmp(s1->server_scope, s2->server_scope,
+                  s1->server_scope_sz) != 0)
+               goto out_scope_mismatch;
+
+       dprintk("NFS: --> %s server scopes match\n", __func__);
+       return true;
+
+out_scope_mismatch:
+       dprintk("NFS: --> %s server scopes do not match\n",
+               __func__);
+       return false;
+}
+
+/**
+ * nfs4_detect_session_trunking - Checks for session trunking.
+ *
+ * Called after a successful EXCHANGE_ID on a multi-addr connection.
+ * Upon success, add the transport.
+ *
+ * @clp:    original mount nfs_client
+ * @res:    result structure from an exchange_id using the original mount
+ *          nfs_client with a new multi_addr transport
+ *
+ * Returns zero on success, otherwise -EINVAL
+ *
+ * Note: since the exchange_id for the new multi_addr transport uses the
+ * same nfs_client from the original mount, the cl_owner_id is reused,
+ * so eir_clientowner is the same.
+ */
+int nfs4_detect_session_trunking(struct nfs_client *clp,
+                                struct nfs41_exchange_id_res *res,
+                                struct rpc_xprt *xprt)
+{
+       /* Check eir_clientid */
+       if (!nfs4_match_clientids(clp->cl_clientid, res->clientid))
+               goto out_err;
+
+       /* Check eir_server_owner so_major_id */
+       if (!nfs4_check_serverowner_major_id(clp->cl_serverowner,
+                                            res->server_owner))
+               goto out_err;
+
+       /* Check eir_server_owner so_minor_id */
+       if (!nfs4_check_serverowner_minor_id(clp->cl_serverowner,
+                                            res->server_owner))
+               goto out_err;
+
+       /* Check eir_server_scope */
+       if (!nfs4_check_server_scope(clp->cl_serverscope, res->server_scope))
+               goto out_err;
+
+       /* Session trunking passed, add the xprt */
+       rpc_clnt_xprt_switch_add_xprt(clp->cl_rpcclient, xprt);
+
+       pr_info("NFS:  %s: Session trunking succeeded for %s\n",
+               clp->cl_hostname,
+               xprt->address_strings[RPC_DISPLAY_ADDR]);
+
+       return 0;
+out_err:
+       pr_info("NFS:  %s: Session trunking failed for %s\n", clp->cl_hostname,
+               xprt->address_strings[RPC_DISPLAY_ADDR]);
+
+       return -EINVAL;
+}
+
 /**
  * nfs41_walk_client_list - Find nfs_client that matches a client/server owner
  *
@@ -650,7 +745,7 @@ int nfs41_walk_client_list(struct nfs_client *new,
                if (pos->cl_cons_state != NFS_CS_READY)
                        continue;
 
-               if (!nfs4_match_clientids(pos, new))
+               if (!nfs4_match_clientids(pos->cl_clientid, new->cl_clientid))
                        continue;
 
                /*
@@ -658,7 +753,8 @@ int nfs41_walk_client_list(struct nfs_client *new,
                 * client id trunking. In either case, we want to fall back
                 * to using the existing nfs_client.
                 */
-               if (!nfs4_check_clientid_trunking(pos, new))
+               if (!nfs4_check_serverowner_major_id(pos->cl_serverowner,
+                                                    new->cl_serverowner))
                        continue;
 
                /* Unlike NFSv4.0, we know that NFSv4.1 always uses the
index 0e32752..ad917bd 100644 (file)
@@ -99,8 +99,8 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 #ifdef CONFIG_NFS_V4_1
 static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
                struct rpc_cred *);
-static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *,
-               struct rpc_cred *);
+static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *,
+               struct rpc_cred *, bool);
 #endif
 
 #ifdef CONFIG_NFS_V4_SECURITY_LABEL
@@ -328,6 +328,33 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
        kunmap_atomic(start);
 }
 
+static void nfs4_test_and_free_stateid(struct nfs_server *server,
+               nfs4_stateid *stateid,
+               struct rpc_cred *cred)
+{
+       const struct nfs4_minor_version_ops *ops = server->nfs_client->cl_mvops;
+
+       ops->test_and_free_expired(server, stateid, cred);
+}
+
+static void __nfs4_free_revoked_stateid(struct nfs_server *server,
+               nfs4_stateid *stateid,
+               struct rpc_cred *cred)
+{
+       stateid->type = NFS4_REVOKED_STATEID_TYPE;
+       nfs4_test_and_free_stateid(server, stateid, cred);
+}
+
+static void nfs4_free_revoked_stateid(struct nfs_server *server,
+               const nfs4_stateid *stateid,
+               struct rpc_cred *cred)
+{
+       nfs4_stateid tmp;
+
+       nfs4_stateid_copy(&tmp, stateid);
+       __nfs4_free_revoked_stateid(server, &tmp, cred);
+}
+
 static long nfs4_update_delay(long *timeout)
 {
        long ret;
@@ -370,13 +397,23 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
        exception->delay = 0;
        exception->recovering = 0;
        exception->retry = 0;
+
+       if (stateid == NULL && state != NULL)
+               stateid = &state->stateid;
+
        switch(errorcode) {
                case 0:
                        return 0;
-               case -NFS4ERR_OPENMODE:
                case -NFS4ERR_DELEG_REVOKED:
                case -NFS4ERR_ADMIN_REVOKED:
+               case -NFS4ERR_EXPIRED:
                case -NFS4ERR_BAD_STATEID:
+                       if (inode != NULL && stateid != NULL) {
+                               nfs_inode_find_state_and_recover(inode,
+                                               stateid);
+                               goto wait_on_recovery;
+                       }
+               case -NFS4ERR_OPENMODE:
                        if (inode) {
                                int err;
 
@@ -395,12 +432,6 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
                        if (ret < 0)
                                break;
                        goto wait_on_recovery;
-               case -NFS4ERR_EXPIRED:
-                       if (state != NULL) {
-                               ret = nfs4_schedule_stateid_recovery(server, state);
-                               if (ret < 0)
-                                       break;
-                       }
                case -NFS4ERR_STALE_STATEID:
                case -NFS4ERR_STALE_CLIENTID:
                        nfs4_schedule_lease_recovery(clp);
@@ -616,6 +647,7 @@ int nfs40_setup_sequence(struct nfs4_slot_table *tbl,
        }
        spin_unlock(&tbl->slot_tbl_lock);
 
+       slot->privileged = args->sa_privileged ? 1 : 0;
        args->sa_slot = slot;
        res->sr_slot = slot;
 
@@ -723,12 +755,20 @@ static int nfs41_sequence_process(struct rpc_task *task,
        /* Check the SEQUENCE operation status */
        switch (res->sr_status) {
        case 0:
+               /* If previous op on slot was interrupted and we reused
+                * the seq# and got a reply from the cache, then retry
+                */
+               if (task->tk_status == -EREMOTEIO && interrupted) {
+                       ++slot->seq_nr;
+                       goto retry_nowait;
+               }
                /* Update the slot's sequence and clientid lease timer */
                slot->seq_done = 1;
                clp = session->clp;
                do_renew_lease(clp, res->sr_timestamp);
                /* Check sequence flags */
-               nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
+               nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags,
+                               !!slot->privileged);
                nfs41_update_target_slotid(slot->table, slot, res);
                break;
        case 1:
@@ -875,6 +915,7 @@ int nfs41_setup_sequence(struct nfs4_session *session,
        }
        spin_unlock(&tbl->slot_tbl_lock);
 
+       slot->privileged = args->sa_privileged ? 1 : 0;
        args->sa_slot = slot;
 
        dprintk("<-- %s slotid=%u seqid=%u\n", __func__,
@@ -1353,6 +1394,19 @@ static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode)
        nfs4_state_set_mode_locked(state, state->state | fmode);
 }
 
+#ifdef CONFIG_NFS_V4_1
+static bool nfs_open_stateid_recover_openmode(struct nfs4_state *state)
+{
+       if (state->n_rdonly && !test_bit(NFS_O_RDONLY_STATE, &state->flags))
+               return true;
+       if (state->n_wronly && !test_bit(NFS_O_WRONLY_STATE, &state->flags))
+               return true;
+       if (state->n_rdwr && !test_bit(NFS_O_RDWR_STATE, &state->flags))
+               return true;
+       return false;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state)
 {
        struct nfs_client *clp = state->owner->so_server->nfs_client;
@@ -1369,11 +1423,12 @@ static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state)
 }
 
 static bool nfs_need_update_open_stateid(struct nfs4_state *state,
-               nfs4_stateid *stateid)
+               const nfs4_stateid *stateid, nfs4_stateid *freeme)
 {
        if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0)
                return true;
        if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) {
+               nfs4_stateid_copy(freeme, &state->open_stateid);
                nfs_test_and_clear_all_open_stateid(state);
                return true;
        }
@@ -1437,7 +1492,9 @@ static void nfs_clear_open_stateid(struct nfs4_state *state,
                nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
 }
 
-static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
+static void nfs_set_open_stateid_locked(struct nfs4_state *state,
+               const nfs4_stateid *stateid, fmode_t fmode,
+               nfs4_stateid *freeme)
 {
        switch (fmode) {
                case FMODE_READ:
@@ -1449,14 +1506,18 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *
                case FMODE_READ|FMODE_WRITE:
                        set_bit(NFS_O_RDWR_STATE, &state->flags);
        }
-       if (!nfs_need_update_open_stateid(state, stateid))
+       if (!nfs_need_update_open_stateid(state, stateid, freeme))
                return;
        if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
                nfs4_stateid_copy(&state->stateid, stateid);
        nfs4_stateid_copy(&state->open_stateid, stateid);
 }
 
-static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode)
+static void __update_open_stateid(struct nfs4_state *state,
+               const nfs4_stateid *open_stateid,
+               const nfs4_stateid *deleg_stateid,
+               fmode_t fmode,
+               nfs4_stateid *freeme)
 {
        /*
         * Protect the call to nfs4_state_set_mode_locked and
@@ -1469,16 +1530,22 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s
                set_bit(NFS_DELEGATED_STATE, &state->flags);
        }
        if (open_stateid != NULL)
-               nfs_set_open_stateid_locked(state, open_stateid, fmode);
+               nfs_set_open_stateid_locked(state, open_stateid, fmode, freeme);
        write_sequnlock(&state->seqlock);
        update_open_stateflags(state, fmode);
        spin_unlock(&state->owner->so_lock);
 }
 
-static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *delegation, fmode_t fmode)
+static int update_open_stateid(struct nfs4_state *state,
+               const nfs4_stateid *open_stateid,
+               const nfs4_stateid *delegation,
+               fmode_t fmode)
 {
+       struct nfs_server *server = NFS_SERVER(state->inode);
+       struct nfs_client *clp = server->nfs_client;
        struct nfs_inode *nfsi = NFS_I(state->inode);
        struct nfs_delegation *deleg_cur;
+       nfs4_stateid freeme = {0};
        int ret = 0;
 
        fmode &= (FMODE_READ|FMODE_WRITE);
@@ -1500,7 +1567,8 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat
                goto no_delegation_unlock;
 
        nfs_mark_delegation_referenced(deleg_cur);
-       __update_open_stateid(state, open_stateid, &deleg_cur->stateid, fmode);
+       __update_open_stateid(state, open_stateid, &deleg_cur->stateid,
+                       fmode, &freeme);
        ret = 1;
 no_delegation_unlock:
        spin_unlock(&deleg_cur->lock);
@@ -1508,11 +1576,14 @@ no_delegation:
        rcu_read_unlock();
 
        if (!ret && open_stateid != NULL) {
-               __update_open_stateid(state, open_stateid, NULL, fmode);
+               __update_open_stateid(state, open_stateid, NULL, fmode, &freeme);
                ret = 1;
        }
        if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
-               nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
+               nfs4_schedule_state_manager(clp);
+       if (freeme.type != 0)
+               nfs4_test_and_free_stateid(server, &freeme,
+                               state->owner->so_cred);
 
        return ret;
 }
@@ -1889,7 +1960,6 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
                case -NFS4ERR_STALE_CLIENTID:
                case -NFS4ERR_STALE_STATEID:
                        set_bit(NFS_DELEGATED_STATE, &state->flags);
-               case -NFS4ERR_EXPIRED:
                        /* Don't recall a delegation if it was lost */
                        nfs4_schedule_lease_recovery(server->nfs_client);
                        return -EAGAIN;
@@ -1901,6 +1971,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
                        return -EAGAIN;
                case -NFS4ERR_DELEG_REVOKED:
                case -NFS4ERR_ADMIN_REVOKED:
+               case -NFS4ERR_EXPIRED:
                case -NFS4ERR_BAD_STATEID:
                case -NFS4ERR_OPENMODE:
                        nfs_inode_find_state_and_recover(state->inode,
@@ -2382,9 +2453,10 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta
        return ret;
 }
 
-static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state)
+static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state,
+               const nfs4_stateid *stateid)
 {
-       nfs_remove_bad_delegation(state->inode);
+       nfs_remove_bad_delegation(state->inode, stateid);
        write_seqlock(&state->seqlock);
        nfs4_stateid_copy(&state->stateid, &state->open_stateid);
        write_sequnlock(&state->seqlock);
@@ -2394,7 +2466,7 @@ static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state)
 static void nfs40_clear_delegation_stateid(struct nfs4_state *state)
 {
        if (rcu_access_pointer(NFS_I(state->inode)->delegation) != NULL)
-               nfs_finish_clear_delegation_stateid(state);
+               nfs_finish_clear_delegation_stateid(state, NULL);
 }
 
 static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
@@ -2404,7 +2476,45 @@ static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
        return nfs4_open_expired(sp, state);
 }
 
+static int nfs40_test_and_free_expired_stateid(struct nfs_server *server,
+               nfs4_stateid *stateid,
+               struct rpc_cred *cred)
+{
+       return -NFS4ERR_BAD_STATEID;
+}
+
 #if defined(CONFIG_NFS_V4_1)
+static int nfs41_test_and_free_expired_stateid(struct nfs_server *server,
+               nfs4_stateid *stateid,
+               struct rpc_cred *cred)
+{
+       int status;
+
+       switch (stateid->type) {
+       default:
+               break;
+       case NFS4_INVALID_STATEID_TYPE:
+       case NFS4_SPECIAL_STATEID_TYPE:
+               return -NFS4ERR_BAD_STATEID;
+       case NFS4_REVOKED_STATEID_TYPE:
+               goto out_free;
+       }
+
+       status = nfs41_test_stateid(server, stateid, cred);
+       switch (status) {
+       case -NFS4ERR_EXPIRED:
+       case -NFS4ERR_ADMIN_REVOKED:
+       case -NFS4ERR_DELEG_REVOKED:
+               break;
+       default:
+               return status;
+       }
+out_free:
+       /* Ack the revoked state to the server */
+       nfs41_free_stateid(server, stateid, cred, true);
+       return -NFS4ERR_EXPIRED;
+}
+
 static void nfs41_check_delegation_stateid(struct nfs4_state *state)
 {
        struct nfs_server *server = NFS_SERVER(state->inode);
@@ -2422,23 +2532,68 @@ static void nfs41_check_delegation_stateid(struct nfs4_state *state)
        }
 
        nfs4_stateid_copy(&stateid, &delegation->stateid);
+       if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
+               rcu_read_unlock();
+               nfs_finish_clear_delegation_stateid(state, &stateid);
+               return;
+       }
+
+       if (!test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags)) {
+               rcu_read_unlock();
+               return;
+       }
+
        cred = get_rpccred(delegation->cred);
        rcu_read_unlock();
-       status = nfs41_test_stateid(server, &stateid, cred);
+       status = nfs41_test_and_free_expired_stateid(server, &stateid, cred);
        trace_nfs4_test_delegation_stateid(state, NULL, status);
-
-       if (status != NFS_OK) {
-               /* Free the stateid unless the server explicitly
-                * informs us the stateid is unrecognized. */
-               if (status != -NFS4ERR_BAD_STATEID)
-                       nfs41_free_stateid(server, &stateid, cred);
-               nfs_finish_clear_delegation_stateid(state);
-       }
+       if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID)
+               nfs_finish_clear_delegation_stateid(state, &stateid);
 
        put_rpccred(cred);
 }
 
 /**
+ * nfs41_check_expired_locks - possibly free a lock stateid
+ *
+ * @state: NFSv4 state for an inode
+ *
+ * Returns NFS_OK if recovery for this stateid is now finished.
+ * Otherwise a negative NFS4ERR value is returned.
+ */
+static int nfs41_check_expired_locks(struct nfs4_state *state)
+{
+       int status, ret = NFS_OK;
+       struct nfs4_lock_state *lsp;
+       struct nfs_server *server = NFS_SERVER(state->inode);
+
+       if (!test_bit(LK_STATE_IN_USE, &state->flags))
+               goto out;
+       list_for_each_entry(lsp, &state->lock_states, ls_locks) {
+               if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
+                       struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
+
+                       status = nfs41_test_and_free_expired_stateid(server,
+                                       &lsp->ls_stateid,
+                                       cred);
+                       trace_nfs4_test_lock_stateid(state, lsp, status);
+                       if (status == -NFS4ERR_EXPIRED ||
+                           status == -NFS4ERR_BAD_STATEID) {
+                               clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
+                               lsp->ls_stateid.type = NFS4_INVALID_STATEID_TYPE;
+                               if (!recover_lost_locks)
+                                       set_bit(NFS_LOCK_LOST, &lsp->ls_flags);
+                       } else if (status != NFS_OK) {
+                               ret = status;
+                               break;
+                       }
+               }
+       };
+out:
+       return ret;
+}
+
+/**
  * nfs41_check_open_stateid - possibly free an open stateid
  *
  * @state: NFSv4 state for an inode
@@ -2453,26 +2608,28 @@ static int nfs41_check_open_stateid(struct nfs4_state *state)
        struct rpc_cred *cred = state->owner->so_cred;
        int status;
 
-       /* If a state reset has been done, test_stateid is unneeded */
-       if ((test_bit(NFS_O_RDONLY_STATE, &state->flags) == 0) &&
-           (test_bit(NFS_O_WRONLY_STATE, &state->flags) == 0) &&
-           (test_bit(NFS_O_RDWR_STATE, &state->flags) == 0))
+       if (test_bit(NFS_OPEN_STATE, &state->flags) == 0) {
+               if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)  {
+                       if (nfs4_have_delegation(state->inode, state->state))
+                               return NFS_OK;
+                       return -NFS4ERR_OPENMODE;
+               }
                return -NFS4ERR_BAD_STATEID;
-
-       status = nfs41_test_stateid(server, stateid, cred);
+       }
+       status = nfs41_test_and_free_expired_stateid(server, stateid, cred);
        trace_nfs4_test_open_stateid(state, NULL, status);
-       if (status != NFS_OK) {
-               /* Free the stateid unless the server explicitly
-                * informs us the stateid is unrecognized. */
-               if (status != -NFS4ERR_BAD_STATEID)
-                       nfs41_free_stateid(server, stateid, cred);
-
+       if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID) {
                clear_bit(NFS_O_RDONLY_STATE, &state->flags);
                clear_bit(NFS_O_WRONLY_STATE, &state->flags);
                clear_bit(NFS_O_RDWR_STATE, &state->flags);
                clear_bit(NFS_OPEN_STATE, &state->flags);
+               stateid->type = NFS4_INVALID_STATEID_TYPE;
        }
-       return status;
+       if (status != NFS_OK)
+               return status;
+       if (nfs_open_stateid_recover_openmode(state))
+               return -NFS4ERR_OPENMODE;
+       return NFS_OK;
 }
 
 static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
@@ -2480,6 +2637,9 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
        int status;
 
        nfs41_check_delegation_stateid(state);
+       status = nfs41_check_expired_locks(state);
+       if (status != NFS_OK)
+               return status;
        status = nfs41_check_open_stateid(state);
        if (status != NFS_OK)
                status = nfs4_open_expired(sp, state);
@@ -2537,6 +2697,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
                goto out;
        if (server->caps & NFS_CAP_POSIX_LOCK)
                set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
+       if (opendata->o_res.rflags & NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK)
+               set_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags);
 
        dentry = opendata->dentry;
        if (d_really_is_negative(dentry)) {
@@ -2899,9 +3061,12 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
                        break;
                case -NFS4ERR_ADMIN_REVOKED:
                case -NFS4ERR_STALE_STATEID:
+               case -NFS4ERR_EXPIRED:
+                       nfs4_free_revoked_stateid(server,
+                                       &calldata->arg.stateid,
+                                       task->tk_msg.rpc_cred);
                case -NFS4ERR_OLD_STATEID:
                case -NFS4ERR_BAD_STATEID:
-               case -NFS4ERR_EXPIRED:
                        if (!nfs4_stateid_match(&calldata->arg.stateid,
                                                &state->open_stateid)) {
                                rpc_restart_call_prepare(task);
@@ -4312,7 +4477,7 @@ static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, s
        if (error == 0) {
                /* block layout checks this! */
                server->pnfs_blksize = fsinfo->blksize;
-               set_pnfs_layoutdriver(server, fhandle, fsinfo->layouttype);
+               set_pnfs_layoutdriver(server, fhandle, fsinfo);
        }
 
        return error;
@@ -4399,24 +4564,25 @@ static bool nfs4_error_stateid_expired(int err)
        return false;
 }
 
-void __nfs4_read_done_cb(struct nfs_pgio_header *hdr)
-{
-       nfs_invalidate_atime(hdr->inode);
-}
-
 static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
        struct nfs_server *server = NFS_SERVER(hdr->inode);
 
        trace_nfs4_read(hdr, task->tk_status);
-       if (nfs4_async_handle_error(task, server,
-                                   hdr->args.context->state,
-                                   NULL) == -EAGAIN) {
-               rpc_restart_call_prepare(task);
-               return -EAGAIN;
+       if (task->tk_status < 0) {
+               struct nfs4_exception exception = {
+                       .inode = hdr->inode,
+                       .state = hdr->args.context->state,
+                       .stateid = &hdr->args.stateid,
+               };
+               task->tk_status = nfs4_async_handle_exception(task,
+                               server, task->tk_status, &exception);
+               if (exception.retry) {
+                       rpc_restart_call_prepare(task);
+                       return -EAGAIN;
+               }
        }
 
-       __nfs4_read_done_cb(hdr);
        if (task->tk_status > 0)
                renew_lease(server, hdr->timestamp);
        return 0;
@@ -4445,6 +4611,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
                return -EAGAIN;
        if (nfs4_read_stateid_changed(task, &hdr->args))
                return -EAGAIN;
+       if (task->tk_status > 0)
+               nfs_invalidate_atime(hdr->inode);
        return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
                                    nfs4_read_done_cb(task, hdr);
 }
@@ -4482,11 +4650,19 @@ static int nfs4_write_done_cb(struct rpc_task *task,
        struct inode *inode = hdr->inode;
 
        trace_nfs4_write(hdr, task->tk_status);
-       if (nfs4_async_handle_error(task, NFS_SERVER(inode),
-                                   hdr->args.context->state,
-                                   NULL) == -EAGAIN) {
-               rpc_restart_call_prepare(task);
-               return -EAGAIN;
+       if (task->tk_status < 0) {
+               struct nfs4_exception exception = {
+                       .inode = hdr->inode,
+                       .state = hdr->args.context->state,
+                       .stateid = &hdr->args.stateid,
+               };
+               task->tk_status = nfs4_async_handle_exception(task,
+                               NFS_SERVER(inode), task->tk_status,
+                               &exception);
+               if (exception.retry) {
+                       rpc_restart_call_prepare(task);
+                       return -EAGAIN;
+               }
        }
        if (task->tk_status >= 0) {
                renew_lease(NFS_SERVER(inode), hdr->timestamp);
@@ -5123,12 +5299,14 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp,
        if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) {
                /* An impossible timestamp guarantees this value
                 * will never match a generated boot time. */
-               verf[0] = 0;
-               verf[1] = cpu_to_be32(NSEC_PER_SEC + 1);
+               verf[0] = cpu_to_be32(U32_MAX);
+               verf[1] = cpu_to_be32(U32_MAX);
        } else {
                struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
-               verf[0] = cpu_to_be32(nn->boot_time.tv_sec);
-               verf[1] = cpu_to_be32(nn->boot_time.tv_nsec);
+               u64 ns = ktime_to_ns(nn->boot_time);
+
+               verf[0] = cpu_to_be32(ns >> 32);
+               verf[1] = cpu_to_be32(ns);
        }
        memcpy(bootverf->data, verf, sizeof(bootverf->data));
 }
@@ -5393,10 +5571,13 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
                renew_lease(data->res.server, data->timestamp);
        case -NFS4ERR_ADMIN_REVOKED:
        case -NFS4ERR_DELEG_REVOKED:
+       case -NFS4ERR_EXPIRED:
+               nfs4_free_revoked_stateid(data->res.server,
+                               data->args.stateid,
+                               task->tk_msg.rpc_cred);
        case -NFS4ERR_BAD_STATEID:
        case -NFS4ERR_OLD_STATEID:
        case -NFS4ERR_STALE_STATEID:
-       case -NFS4ERR_EXPIRED:
                task->tk_status = 0;
                if (data->roc)
                        pnfs_roc_set_barrier(data->inode, data->roc_barrier);
@@ -5528,22 +5709,6 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
        return err;
 }
 
-#define NFS4_LOCK_MINTIMEOUT (1 * HZ)
-#define NFS4_LOCK_MAXTIMEOUT (30 * HZ)
-
-/* 
- * sleep, with exponential backoff, and retry the LOCK operation. 
- */
-static unsigned long
-nfs4_set_lock_task_retry(unsigned long timeout)
-{
-       freezable_schedule_timeout_killable_unsafe(timeout);
-       timeout <<= 1;
-       if (timeout > NFS4_LOCK_MAXTIMEOUT)
-               return NFS4_LOCK_MAXTIMEOUT;
-       return timeout;
-}
-
 static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request)
 {
        struct inode *inode = state->inode;
@@ -5600,11 +5765,6 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *
        return err;
 }
 
-static int do_vfs_lock(struct inode *inode, struct file_lock *fl)
-{
-       return locks_lock_inode_wait(inode, fl);
-}
-
 struct nfs4_unlockdata {
        struct nfs_locku_args arg;
        struct nfs_locku_res res;
@@ -5657,14 +5817,18 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
        switch (task->tk_status) {
                case 0:
                        renew_lease(calldata->server, calldata->timestamp);
-                       do_vfs_lock(calldata->lsp->ls_state->inode, &calldata->fl);
+                       locks_lock_inode_wait(calldata->lsp->ls_state->inode, &calldata->fl);
                        if (nfs4_update_lock_stateid(calldata->lsp,
                                        &calldata->res.stateid))
                                break;
+               case -NFS4ERR_ADMIN_REVOKED:
+               case -NFS4ERR_EXPIRED:
+                       nfs4_free_revoked_stateid(calldata->server,
+                                       &calldata->arg.stateid,
+                                       task->tk_msg.rpc_cred);
                case -NFS4ERR_BAD_STATEID:
                case -NFS4ERR_OLD_STATEID:
                case -NFS4ERR_STALE_STATEID:
-               case -NFS4ERR_EXPIRED:
                        if (!nfs4_stateid_match(&calldata->arg.stateid,
                                                &calldata->lsp->ls_stateid))
                                rpc_restart_call_prepare(task);
@@ -5765,7 +5929,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
        mutex_lock(&sp->so_delegreturn_mutex);
        /* Exclude nfs4_reclaim_open_stateid() - note nesting! */
        down_read(&nfsi->rwsem);
-       if (do_vfs_lock(inode, request) == -ENOENT) {
+       if (locks_lock_inode_wait(inode, request) == -ENOENT) {
                up_read(&nfsi->rwsem);
                mutex_unlock(&sp->so_delegreturn_mutex);
                goto out;
@@ -5906,7 +6070,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
                                data->timestamp);
                if (data->arg.new_lock) {
                        data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
-                       if (do_vfs_lock(lsp->ls_state->inode, &data->fl) < 0) {
+                       if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0) {
                                rpc_restart_call_prepare(task);
                                break;
                        }
@@ -5965,6 +6129,7 @@ static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_
 {
        switch (error) {
        case -NFS4ERR_ADMIN_REVOKED:
+       case -NFS4ERR_EXPIRED:
        case -NFS4ERR_BAD_STATEID:
                lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
                if (new_lock_owner != 0 ||
@@ -5973,7 +6138,6 @@ static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_
                break;
        case -NFS4ERR_STALE_STATEID:
                lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
-       case -NFS4ERR_EXPIRED:
                nfs4_schedule_lease_recovery(server->nfs_client);
        };
 }
@@ -6083,52 +6247,19 @@ out:
 }
 
 #if defined(CONFIG_NFS_V4_1)
-/**
- * nfs41_check_expired_locks - possibly free a lock stateid
- *
- * @state: NFSv4 state for an inode
- *
- * Returns NFS_OK if recovery for this stateid is now finished.
- * Otherwise a negative NFS4ERR value is returned.
- */
-static int nfs41_check_expired_locks(struct nfs4_state *state)
-{
-       int status, ret = -NFS4ERR_BAD_STATEID;
-       struct nfs4_lock_state *lsp;
-       struct nfs_server *server = NFS_SERVER(state->inode);
-
-       list_for_each_entry(lsp, &state->lock_states, ls_locks) {
-               if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
-                       struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
-
-                       status = nfs41_test_stateid(server,
-                                       &lsp->ls_stateid,
-                                       cred);
-                       trace_nfs4_test_lock_stateid(state, lsp, status);
-                       if (status != NFS_OK) {
-                               /* Free the stateid unless the server
-                                * informs us the stateid is unrecognized. */
-                               if (status != -NFS4ERR_BAD_STATEID)
-                                       nfs41_free_stateid(server,
-                                                       &lsp->ls_stateid,
-                                                       cred);
-                               clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
-                               ret = status;
-                       }
-               }
-       };
-
-       return ret;
-}
-
 static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request)
 {
-       int status = NFS_OK;
+       struct nfs4_lock_state *lsp;
+       int status;
 
-       if (test_bit(LK_STATE_IN_USE, &state->flags))
-               status = nfs41_check_expired_locks(state);
-       if (status != NFS_OK)
-               status = nfs4_lock_expired(state, request);
+       status = nfs4_set_lock_state(state, request);
+       if (status != 0)
+               return status;
+       lsp = request->fl_u.nfs4_fl.owner;
+       if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) ||
+           test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
+               return 0;
+       status = nfs4_lock_expired(state, request);
        return status;
 }
 #endif
@@ -6138,17 +6269,10 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
        struct nfs_inode *nfsi = NFS_I(state->inode);
        struct nfs4_state_owner *sp = state->owner;
        unsigned char fl_flags = request->fl_flags;
-       int status = -ENOLCK;
+       int status;
 
-       if ((fl_flags & FL_POSIX) &&
-                       !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags))
-               goto out;
-       /* Is this a delegated open? */
-       status = nfs4_set_lock_state(state, request);
-       if (status != 0)
-               goto out;
        request->fl_flags |= FL_ACCESS;
-       status = do_vfs_lock(state->inode, request);
+       status = locks_lock_inode_wait(state->inode, request);
        if (status < 0)
                goto out;
        mutex_lock(&sp->so_delegreturn_mutex);
@@ -6157,7 +6281,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
                /* Yes: cache locks! */
                /* ...but avoid races with delegation recall... */
                request->fl_flags = fl_flags & ~FL_SLEEP;
-               status = do_vfs_lock(state->inode, request);
+               status = locks_lock_inode_wait(state->inode, request);
                up_read(&nfsi->rwsem);
                mutex_unlock(&sp->so_delegreturn_mutex);
                goto out;
@@ -6188,12 +6312,124 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *
        return err;
 }
 
+#define NFS4_LOCK_MINTIMEOUT (1 * HZ)
+#define NFS4_LOCK_MAXTIMEOUT (30 * HZ)
+
+static int
+nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd,
+                       struct file_lock *request)
+{
+       int             status = -ERESTARTSYS;
+       unsigned long   timeout = NFS4_LOCK_MINTIMEOUT;
+
+       while(!signalled()) {
+               status = nfs4_proc_setlk(state, cmd, request);
+               if ((status != -EAGAIN) || IS_SETLK(cmd))
+                       break;
+               freezable_schedule_timeout_interruptible(timeout);
+               timeout *= 2;
+               timeout = min_t(unsigned long, NFS4_LOCK_MAXTIMEOUT, timeout);
+               status = -ERESTARTSYS;
+       }
+       return status;
+}
+
+#ifdef CONFIG_NFS_V4_1
+struct nfs4_lock_waiter {
+       struct task_struct      *task;
+       struct inode            *inode;
+       struct nfs_lowner       *owner;
+       bool                    notified;
+};
+
+static int
+nfs4_wake_lock_waiter(wait_queue_t *wait, unsigned int mode, int flags, void *key)
+{
+       int ret;
+       struct cb_notify_lock_args *cbnl = key;
+       struct nfs4_lock_waiter *waiter = wait->private;
+       struct nfs_lowner       *lowner = &cbnl->cbnl_owner,
+                               *wowner = waiter->owner;
+
+       /* Only wake if the callback was for the same owner */
+       if (lowner->clientid != wowner->clientid ||
+           lowner->id != wowner->id             ||
+           lowner->s_dev != wowner->s_dev)
+               return 0;
+
+       /* Make sure it's for the right inode */
+       if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh))
+               return 0;
+
+       waiter->notified = true;
+
+       /* override "private" so we can use default_wake_function */
+       wait->private = waiter->task;
+       ret = autoremove_wake_function(wait, mode, flags, key);
+       wait->private = waiter;
+       return ret;
+}
+
+static int
+nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+       int status = -ERESTARTSYS;
+       unsigned long flags;
+       struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner;
+       struct nfs_server *server = NFS_SERVER(state->inode);
+       struct nfs_client *clp = server->nfs_client;
+       wait_queue_head_t *q = &clp->cl_lock_waitq;
+       struct nfs_lowner owner = { .clientid = clp->cl_clientid,
+                                   .id = lsp->ls_seqid.owner_id,
+                                   .s_dev = server->s_dev };
+       struct nfs4_lock_waiter waiter = { .task  = current,
+                                          .inode = state->inode,
+                                          .owner = &owner,
+                                          .notified = false };
+       wait_queue_t wait;
+
+       /* Don't bother with waitqueue if we don't expect a callback */
+       if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags))
+               return nfs4_retry_setlk_simple(state, cmd, request);
+
+       init_wait(&wait);
+       wait.private = &waiter;
+       wait.func = nfs4_wake_lock_waiter;
+       add_wait_queue(q, &wait);
+
+       while(!signalled()) {
+               status = nfs4_proc_setlk(state, cmd, request);
+               if ((status != -EAGAIN) || IS_SETLK(cmd))
+                       break;
+
+               status = -ERESTARTSYS;
+               spin_lock_irqsave(&q->lock, flags);
+               if (waiter.notified) {
+                       spin_unlock_irqrestore(&q->lock, flags);
+                       continue;
+               }
+               set_current_state(TASK_INTERRUPTIBLE);
+               spin_unlock_irqrestore(&q->lock, flags);
+
+               freezable_schedule_timeout_interruptible(NFS4_LOCK_MAXTIMEOUT);
+       }
+
+       finish_wait(q, &wait);
+       return status;
+}
+#else /* !CONFIG_NFS_V4_1 */
+static inline int
+nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+       return nfs4_retry_setlk_simple(state, cmd, request);
+}
+#endif
+
 static int
 nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
 {
        struct nfs_open_context *ctx;
        struct nfs4_state *state;
-       unsigned long timeout = NFS4_LOCK_MINTIMEOUT;
        int status;
 
        /* verify open state */
@@ -6220,6 +6456,11 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
 
        if (state == NULL)
                return -ENOLCK;
+
+       if ((request->fl_flags & FL_POSIX) &&
+           !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags))
+               return -ENOLCK;
+
        /*
         * Don't rely on the VFS having checked the file open mode,
         * since it won't do this for flock() locks.
@@ -6234,16 +6475,11 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
                        return -EBADF;
        }
 
-       do {
-               status = nfs4_proc_setlk(state, cmd, request);
-               if ((status != -EAGAIN) || IS_SETLK(cmd))
-                       break;
-               timeout = nfs4_set_lock_task_retry(timeout);
-               status = -ERESTARTSYS;
-               if (signalled())
-                       break;
-       } while(status < 0);
-       return status;
+       status = nfs4_set_lock_state(state, request);
+       if (status != 0)
+               return status;
+
+       return nfs4_retry_setlk(state, cmd, request);
 }
 
 int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid)
@@ -7104,75 +7340,161 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp,
        return 0;
 }
 
+struct nfs41_exchange_id_data {
+       struct nfs41_exchange_id_res res;
+       struct nfs41_exchange_id_args args;
+       struct rpc_xprt *xprt;
+       int rpc_status;
+};
+
+static void nfs4_exchange_id_done(struct rpc_task *task, void *data)
+{
+       struct nfs41_exchange_id_data *cdata =
+                                       (struct nfs41_exchange_id_data *)data;
+       struct nfs_client *clp = cdata->args.client;
+       int status = task->tk_status;
+
+       trace_nfs4_exchange_id(clp, status);
+
+       if (status == 0)
+               status = nfs4_check_cl_exchange_flags(cdata->res.flags);
+
+       if (cdata->xprt && status == 0) {
+               status = nfs4_detect_session_trunking(clp, &cdata->res,
+                                                     cdata->xprt);
+               goto out;
+       }
+
+       if (status  == 0)
+               status = nfs4_sp4_select_mode(clp, &cdata->res.state_protect);
+
+       if (status == 0) {
+               clp->cl_clientid = cdata->res.clientid;
+               clp->cl_exchange_flags = cdata->res.flags;
+               /* Client ID is not confirmed */
+               if (!(cdata->res.flags & EXCHGID4_FLAG_CONFIRMED_R)) {
+                       clear_bit(NFS4_SESSION_ESTABLISHED,
+                       &clp->cl_session->session_state);
+                       clp->cl_seqid = cdata->res.seqid;
+               }
+
+               kfree(clp->cl_serverowner);
+               clp->cl_serverowner = cdata->res.server_owner;
+               cdata->res.server_owner = NULL;
+
+               /* use the most recent implementation id */
+               kfree(clp->cl_implid);
+               clp->cl_implid = cdata->res.impl_id;
+               cdata->res.impl_id = NULL;
+
+               if (clp->cl_serverscope != NULL &&
+                   !nfs41_same_server_scope(clp->cl_serverscope,
+                                       cdata->res.server_scope)) {
+                       dprintk("%s: server_scope mismatch detected\n",
+                               __func__);
+                       set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state);
+                       kfree(clp->cl_serverscope);
+                       clp->cl_serverscope = NULL;
+               }
+
+               if (clp->cl_serverscope == NULL) {
+                       clp->cl_serverscope = cdata->res.server_scope;
+                       cdata->res.server_scope = NULL;
+               }
+               /* Save the EXCHANGE_ID verifier session trunk tests */
+               memcpy(clp->cl_confirm.data, cdata->args.verifier->data,
+                      sizeof(clp->cl_confirm.data));
+       }
+out:
+       cdata->rpc_status = status;
+       return;
+}
+
+static void nfs4_exchange_id_release(void *data)
+{
+       struct nfs41_exchange_id_data *cdata =
+                                       (struct nfs41_exchange_id_data *)data;
+
+       nfs_put_client(cdata->args.client);
+       if (cdata->xprt) {
+               xprt_put(cdata->xprt);
+               rpc_clnt_xprt_switch_put(cdata->args.client->cl_rpcclient);
+       }
+       kfree(cdata->res.impl_id);
+       kfree(cdata->res.server_scope);
+       kfree(cdata->res.server_owner);
+       kfree(cdata);
+}
+
+static const struct rpc_call_ops nfs4_exchange_id_call_ops = {
+       .rpc_call_done = nfs4_exchange_id_done,
+       .rpc_release = nfs4_exchange_id_release,
+};
+
 /*
  * _nfs4_proc_exchange_id()
  *
  * Wrapper for EXCHANGE_ID operation.
  */
 static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
-       u32 sp4_how)
+                       u32 sp4_how, struct rpc_xprt *xprt)
 {
        nfs4_verifier verifier;
-       struct nfs41_exchange_id_args args = {
-               .verifier = &verifier,
-               .client = clp,
-#ifdef CONFIG_NFS_V4_1_MIGRATION
-               .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
-                        EXCHGID4_FLAG_BIND_PRINC_STATEID |
-                        EXCHGID4_FLAG_SUPP_MOVED_MIGR,
-#else
-               .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
-                        EXCHGID4_FLAG_BIND_PRINC_STATEID,
-#endif
-       };
-       struct nfs41_exchange_id_res res = {
-               0
-       };
-       int status;
        struct rpc_message msg = {
                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID],
-               .rpc_argp = &args,
-               .rpc_resp = &res,
                .rpc_cred = cred,
        };
+       struct rpc_task_setup task_setup_data = {
+               .rpc_client = clp->cl_rpcclient,
+               .callback_ops = &nfs4_exchange_id_call_ops,
+               .rpc_message = &msg,
+               .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT,
+       };
+       struct nfs41_exchange_id_data *calldata;
+       struct rpc_task *task;
+       int status = -EIO;
+
+       if (!atomic_inc_not_zero(&clp->cl_count))
+               goto out;
+
+       status = -ENOMEM;
+       calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
+       if (!calldata)
+               goto out;
 
-       nfs4_init_boot_verifier(clp, &verifier);
+       if (!xprt)
+               nfs4_init_boot_verifier(clp, &verifier);
 
        status = nfs4_init_uniform_client_string(clp);
        if (status)
-               goto out;
+               goto out_calldata;
 
        dprintk("NFS call  exchange_id auth=%s, '%s'\n",
                clp->cl_rpcclient->cl_auth->au_ops->au_name,
                clp->cl_owner_id);
 
-       res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
-                                       GFP_NOFS);
-       if (unlikely(res.server_owner == NULL)) {
-               status = -ENOMEM;
-               goto out;
-       }
+       calldata->res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
+                                               GFP_NOFS);
+       status = -ENOMEM;
+       if (unlikely(calldata->res.server_owner == NULL))
+               goto out_calldata;
 
-       res.server_scope = kzalloc(sizeof(struct nfs41_server_scope),
+       calldata->res.server_scope = kzalloc(sizeof(struct nfs41_server_scope),
                                        GFP_NOFS);
-       if (unlikely(res.server_scope == NULL)) {
-               status = -ENOMEM;
+       if (unlikely(calldata->res.server_scope == NULL))
                goto out_server_owner;
-       }
 
-       res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS);
-       if (unlikely(res.impl_id == NULL)) {
-               status = -ENOMEM;
+       calldata->res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS);
+       if (unlikely(calldata->res.impl_id == NULL))
                goto out_server_scope;
-       }
 
        switch (sp4_how) {
        case SP4_NONE:
-               args.state_protect.how = SP4_NONE;
+               calldata->args.state_protect.how = SP4_NONE;
                break;
 
        case SP4_MACH_CRED:
-               args.state_protect = nfs4_sp4_mach_cred_request;
+               calldata->args.state_protect = nfs4_sp4_mach_cred_request;
                break;
 
        default:
@@ -7181,56 +7503,42 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
                status = -EINVAL;
                goto out_impl_id;
        }
+       if (xprt) {
+               calldata->xprt = xprt;
+               task_setup_data.rpc_xprt = xprt;
+               task_setup_data.flags =
+                               RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC;
+               calldata->args.verifier = &clp->cl_confirm;
+       } else {
+               calldata->args.verifier = &verifier;
+       }
+       calldata->args.client = clp;
+#ifdef CONFIG_NFS_V4_1_MIGRATION
+       calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
+       EXCHGID4_FLAG_BIND_PRINC_STATEID |
+       EXCHGID4_FLAG_SUPP_MOVED_MIGR,
+#else
+       calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
+       EXCHGID4_FLAG_BIND_PRINC_STATEID,
+#endif
+       msg.rpc_argp = &calldata->args;
+       msg.rpc_resp = &calldata->res;
+       task_setup_data.callback_data = calldata;
 
-       status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
-       trace_nfs4_exchange_id(clp, status);
-       if (status == 0)
-               status = nfs4_check_cl_exchange_flags(res.flags);
-
-       if (status == 0)
-               status = nfs4_sp4_select_mode(clp, &res.state_protect);
-
-       if (status == 0) {
-               clp->cl_clientid = res.clientid;
-               clp->cl_exchange_flags = res.flags;
-               /* Client ID is not confirmed */
-               if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R)) {
-                       clear_bit(NFS4_SESSION_ESTABLISHED,
-                                       &clp->cl_session->session_state);
-                       clp->cl_seqid = res.seqid;
-               }
-
-               kfree(clp->cl_serverowner);
-               clp->cl_serverowner = res.server_owner;
-               res.server_owner = NULL;
-
-               /* use the most recent implementation id */
-               kfree(clp->cl_implid);
-               clp->cl_implid = res.impl_id;
-               res.impl_id = NULL;
-
-               if (clp->cl_serverscope != NULL &&
-                   !nfs41_same_server_scope(clp->cl_serverscope,
-                                            res.server_scope)) {
-                       dprintk("%s: server_scope mismatch detected\n",
-                               __func__);
-                       set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state);
-                       kfree(clp->cl_serverscope);
-                       clp->cl_serverscope = NULL;
-               }
-
-               if (clp->cl_serverscope == NULL) {
-                       clp->cl_serverscope = res.server_scope;
-                       res.server_scope = NULL;
-               }
+       task = rpc_run_task(&task_setup_data);
+       if (IS_ERR(task)) {
+       status = PTR_ERR(task);
+               goto out_impl_id;
        }
 
-out_impl_id:
-       kfree(res.impl_id);
-out_server_scope:
-       kfree(res.server_scope);
-out_server_owner:
-       kfree(res.server_owner);
+       if (!xprt) {
+               status = rpc_wait_for_completion_task(task);
+               if (!status)
+                       status = calldata->rpc_status;
+       } else  /* session trunking test */
+               status = calldata->rpc_status;
+
+       rpc_put_task(task);
 out:
        if (clp->cl_implid != NULL)
                dprintk("NFS reply exchange_id: Server Implementation ID: "
@@ -7240,6 +7548,16 @@ out:
                        clp->cl_implid->date.nseconds);
        dprintk("NFS reply exchange_id: %d\n", status);
        return status;
+
+out_impl_id:
+       kfree(calldata->res.impl_id);
+out_server_scope:
+       kfree(calldata->res.server_scope);
+out_server_owner:
+       kfree(calldata->res.server_owner);
+out_calldata:
+       kfree(calldata);
+       goto out;
 }
 
 /*
@@ -7262,14 +7580,45 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
        /* try SP4_MACH_CRED if krb5i/p */
        if (authflavor == RPC_AUTH_GSS_KRB5I ||
            authflavor == RPC_AUTH_GSS_KRB5P) {
-               status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED);
+               status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED, NULL);
                if (!status)
                        return 0;
        }
 
        /* try SP4_NONE */
-       return _nfs4_proc_exchange_id(clp, cred, SP4_NONE);
+       return _nfs4_proc_exchange_id(clp, cred, SP4_NONE, NULL);
+}
+
+/**
+ * nfs4_test_session_trunk
+ *
+ * This is an add_xprt_test() test function called from
+ * rpc_clnt_setup_test_and_add_xprt.
+ *
+ * The rpc_xprt_switch is referrenced by rpc_clnt_setup_test_and_add_xprt
+ * and is dereferrenced in nfs4_exchange_id_release
+ *
+ * Upon success, add the new transport to the rpc_clnt
+ *
+ * @clnt: struct rpc_clnt to get new transport
+ * @xprt: the rpc_xprt to test
+ * @data: call data for _nfs4_proc_exchange_id.
+ */
+int nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
+                           void *data)
+{
+       struct nfs4_add_xprt_data *adata = (struct nfs4_add_xprt_data *)data;
+       u32 sp4_how;
+
+       dprintk("--> %s try %s\n", __func__,
+               xprt->address_strings[RPC_DISPLAY_ADDR]);
+
+       sp4_how = (adata->clp->cl_sp4_flags == 0 ? SP4_NONE : SP4_MACH_CRED);
+
+       /* Test connection for session trunking. Async exchange_id call */
+       return  _nfs4_proc_exchange_id(adata->clp, adata->cred, sp4_how, xprt);
 }
+EXPORT_SYMBOL_GPL(nfs4_test_session_trunk);
 
 static int _nfs4_proc_destroy_clientid(struct nfs_client *clp,
                struct rpc_cred *cred)
@@ -7463,7 +7812,7 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args,
        args->bc_attrs.max_resp_sz = max_bc_payload;
        args->bc_attrs.max_resp_sz_cached = 0;
        args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
-       args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS;
+       args->bc_attrs.max_reqs = min_t(unsigned short, max_session_cb_slots, 1);
 
        dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u "
                "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n",
@@ -7510,10 +7859,9 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args
                return -EINVAL;
        if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached)
                return -EINVAL;
-       /* These would render the backchannel useless: */
-       if (rcvd->max_ops != sent->max_ops)
+       if (rcvd->max_ops > sent->max_ops)
                return -EINVAL;
-       if (rcvd->max_reqs != sent->max_reqs)
+       if (rcvd->max_reqs > sent->max_reqs)
                return -EINVAL;
 out:
        return 0;
@@ -7982,6 +8330,8 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
        case -NFS4ERR_RECALLCONFLICT:
                status = -ERECALLCONFLICT;
                break;
+       case -NFS4ERR_DELEG_REVOKED:
+       case -NFS4ERR_ADMIN_REVOKED:
        case -NFS4ERR_EXPIRED:
        case -NFS4ERR_BAD_STATEID:
                exception->timeout = 0;
@@ -7993,6 +8343,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
                                        &lgp->args.ctx->state->stateid)) {
                        spin_unlock(&inode->i_lock);
                        exception->state = lgp->args.ctx->state;
+                       exception->stateid = &lgp->args.stateid;
                        break;
                }
 
@@ -8591,6 +8942,24 @@ static int _nfs41_test_stateid(struct nfs_server *server,
        return -res.status;
 }
 
+static void nfs4_handle_delay_or_session_error(struct nfs_server *server,
+               int err, struct nfs4_exception *exception)
+{
+       exception->retry = 0;
+       switch(err) {
+       case -NFS4ERR_DELAY:
+       case -NFS4ERR_RETRY_UNCACHED_REP:
+               nfs4_handle_exception(server, err, exception);
+               break;
+       case -NFS4ERR_BADSESSION:
+       case -NFS4ERR_BADSLOT:
+       case -NFS4ERR_BAD_HIGH_SLOT:
+       case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+       case -NFS4ERR_DEADSESSION:
+               nfs4_do_handle_exception(server, err, exception);
+       }
+}
+
 /**
  * nfs41_test_stateid - perform a TEST_STATEID operation
  *
@@ -8610,9 +8979,7 @@ static int nfs41_test_stateid(struct nfs_server *server,
        int err;
        do {
                err = _nfs41_test_stateid(server, stateid, cred);
-               if (err != -NFS4ERR_DELAY)
-                       break;
-               nfs4_handle_exception(server, err, &exception);
+               nfs4_handle_delay_or_session_error(server, err, &exception);
        } while (exception.retry);
        return err;
 }
@@ -8657,7 +9024,7 @@ static const struct rpc_call_ops nfs41_free_stateid_ops = {
 };
 
 static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
-               nfs4_stateid *stateid,
+               const nfs4_stateid *stateid,
                struct rpc_cred *cred,
                bool privileged)
 {
@@ -8687,7 +9054,7 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
 
        msg.rpc_argp = &data->args;
        msg.rpc_resp = &data->res;
-       nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+       nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
        if (privileged)
                nfs4_set_sequence_privileged(&data->args.seq_args);
 
@@ -8700,38 +9067,31 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
  * @server: server / transport on which to perform the operation
  * @stateid: state ID to release
  * @cred: credential
+ * @is_recovery: set to true if this call needs to be privileged
  *
- * Returns NFS_OK if the server freed "stateid".  Otherwise a
- * negative NFS4ERR value is returned.
+ * Note: this function is always asynchronous.
  */
 static int nfs41_free_stateid(struct nfs_server *server,
-               nfs4_stateid *stateid,
-               struct rpc_cred *cred)
+               const nfs4_stateid *stateid,
+               struct rpc_cred *cred,
+               bool is_recovery)
 {
        struct rpc_task *task;
-       int ret;
 
-       task = _nfs41_free_stateid(server, stateid, cred, true);
+       task = _nfs41_free_stateid(server, stateid, cred, is_recovery);
        if (IS_ERR(task))
                return PTR_ERR(task);
-       ret = rpc_wait_for_completion_task(task);
-       if (!ret)
-               ret = task->tk_status;
        rpc_put_task(task);
-       return ret;
+       return 0;
 }
 
 static void
 nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
 {
-       struct rpc_task *task;
        struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
 
-       task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
+       nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
        nfs4_free_lock_state(server, lsp);
-       if (IS_ERR(task))
-               return;
-       rpc_put_task(task);
 }
 
 static bool nfs41_match_stateid(const nfs4_stateid *s1,
@@ -8835,6 +9195,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
        .match_stateid = nfs4_match_stateid,
        .find_root_sec = nfs4_find_root_sec,
        .free_lock_state = nfs4_release_lockowner,
+       .test_and_free_expired = nfs40_test_and_free_expired_stateid,
        .alloc_seqid = nfs_alloc_seqid,
        .call_sync_ops = &nfs40_call_sync_ops,
        .reboot_recovery_ops = &nfs40_reboot_recovery_ops,
@@ -8862,7 +9223,9 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
        .match_stateid = nfs41_match_stateid,
        .find_root_sec = nfs41_find_root_sec,
        .free_lock_state = nfs41_free_lock_state,
+       .test_and_free_expired = nfs41_test_and_free_expired_stateid,
        .alloc_seqid = nfs_alloc_no_seqid,
+       .session_trunk = nfs4_test_session_trunk,
        .call_sync_ops = &nfs41_call_sync_ops,
        .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
        .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
@@ -8891,7 +9254,9 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
        .find_root_sec = nfs41_find_root_sec,
        .free_lock_state = nfs41_free_lock_state,
        .call_sync_ops = &nfs41_call_sync_ops,
+       .test_and_free_expired = nfs41_test_and_free_expired_stateid,
        .alloc_seqid = nfs_alloc_no_seqid,
+       .session_trunk = nfs4_test_session_trunk,
        .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
        .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
        .state_renewal_ops = &nfs41_state_renewal_ops,
index f703b75..dae3855 100644 (file)
@@ -9,6 +9,7 @@
 
 /* maximum number of slots to use */
 #define NFS4_DEF_SLOT_TABLE_SIZE (64U)
+#define NFS4_DEF_CB_SLOT_TABLE_SIZE (1U)
 #define NFS4_MAX_SLOT_TABLE (1024U)
 #define NFS4_NO_SLOT ((u32)-1)
 
@@ -22,6 +23,7 @@ struct nfs4_slot {
        u32                     slot_nr;
        u32                     seq_nr;
        unsigned int            interrupted : 1,
+                               privileged : 1,
                                seq_done : 1;
 };
 
index cada00a..5f4281e 100644 (file)
@@ -991,6 +991,8 @@ int nfs4_select_rw_stateid(struct nfs4_state *state,
 {
        int ret;
 
+       if (!nfs4_valid_open_stateid(state))
+               return -EIO;
        if (cred != NULL)
                *cred = NULL;
        ret = nfs4_copy_lock_stateid(dst, state, lockowner);
@@ -1303,6 +1305,8 @@ void nfs4_schedule_path_down_recovery(struct nfs_client *clp)
 static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
 {
 
+       if (!nfs4_valid_open_stateid(state))
+               return 0;
        set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
        /* Don't recover state that expired before the reboot */
        if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) {
@@ -1316,6 +1320,8 @@ static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_st
 
 int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
 {
+       if (!nfs4_valid_open_stateid(state))
+               return 0;
        set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
        clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
        set_bit(NFS_OWNER_RECLAIM_NOGRACE, &state->owner->so_flags);
@@ -1327,9 +1333,8 @@ int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_
 {
        struct nfs_client *clp = server->nfs_client;
 
-       if (!nfs4_valid_open_stateid(state))
+       if (!nfs4_state_mark_reclaim_nograce(clp, state))
                return -EBADF;
-       nfs4_state_mark_reclaim_nograce(clp, state);
        dprintk("%s: scheduling stateid recovery for server %s\n", __func__,
                        clp->cl_hostname);
        nfs4_schedule_state_manager(clp);
@@ -1337,6 +1342,35 @@ int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_
 }
 EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery);
 
+static struct nfs4_lock_state *
+nfs_state_find_lock_state_by_stateid(struct nfs4_state *state,
+               const nfs4_stateid *stateid)
+{
+       struct nfs4_lock_state *pos;
+
+       list_for_each_entry(pos, &state->lock_states, ls_locks) {
+               if (!test_bit(NFS_LOCK_INITIALIZED, &pos->ls_flags))
+                       continue;
+               if (nfs4_stateid_match_other(&pos->ls_stateid, stateid))
+                       return pos;
+       }
+       return NULL;
+}
+
+static bool nfs_state_lock_state_matches_stateid(struct nfs4_state *state,
+               const nfs4_stateid *stateid)
+{
+       bool found = false;
+
+       if (test_bit(LK_STATE_IN_USE, &state->flags)) {
+               spin_lock(&state->state_lock);
+               if (nfs_state_find_lock_state_by_stateid(state, stateid))
+                       found = true;
+               spin_unlock(&state->state_lock);
+       }
+       return found;
+}
+
 void nfs_inode_find_state_and_recover(struct inode *inode,
                const nfs4_stateid *stateid)
 {
@@ -1351,14 +1385,18 @@ void nfs_inode_find_state_and_recover(struct inode *inode,
                state = ctx->state;
                if (state == NULL)
                        continue;
-               if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
+               if (nfs4_stateid_match_other(&state->stateid, stateid) &&
+                   nfs4_state_mark_reclaim_nograce(clp, state)) {
+                       found = true;
                        continue;
-               if (!nfs4_stateid_match(&state->stateid, stateid))
-                       continue;
-               nfs4_state_mark_reclaim_nograce(clp, state);
-               found = true;
+               }
+               if (nfs_state_lock_state_matches_stateid(state, stateid) &&
+                   nfs4_state_mark_reclaim_nograce(clp, state))
+                       found = true;
        }
        spin_unlock(&inode->i_lock);
+
+       nfs_inode_find_delegation_state_and_recover(inode, stateid);
        if (found)
                nfs4_schedule_state_manager(clp);
 }
@@ -1498,6 +1536,9 @@ restart:
                                        __func__, status);
                        case -ENOENT:
                        case -ENOMEM:
+                       case -EACCES:
+                       case -EROFS:
+                       case -EIO:
                        case -ESTALE:
                                /* Open state on this file cannot be recovered */
                                nfs4_state_mark_recovery_failed(state, status);
@@ -1656,15 +1697,9 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
        put_rpccred(cred);
 }
 
-static void nfs_delegation_clear_all(struct nfs_client *clp)
-{
-       nfs_delegation_mark_reclaim(clp);
-       nfs_delegation_reap_unclaimed(clp);
-}
-
 static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
 {
-       nfs_delegation_clear_all(clp);
+       nfs_mark_test_expired_all_delegations(clp);
        nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
 }
 
@@ -2195,7 +2230,7 @@ static void nfs41_handle_all_state_revoked(struct nfs_client *clp)
 
 static void nfs41_handle_some_state_revoked(struct nfs_client *clp)
 {
-       nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
+       nfs4_state_start_reclaim_nograce(clp);
        nfs4_schedule_state_manager(clp);
 
        dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname);
@@ -2227,13 +2262,22 @@ static void nfs41_handle_cb_path_down(struct nfs_client *clp)
                nfs4_schedule_state_manager(clp);
 }
 
-void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
+void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags,
+               bool recovery)
 {
        if (!flags)
                return;
 
        dprintk("%s: \"%s\" (client ID %llx) flags=0x%08x\n",
                __func__, clp->cl_hostname, clp->cl_clientid, flags);
+       /*
+        * If we're called from the state manager thread, then assume we're
+        * already handling the RECLAIM_NEEDED and/or STATE_REVOKED.
+        * Those flags are expected to remain set until we're done
+        * recovering (see RFC5661, section 18.46.3).
+        */
+       if (recovery)
+               goto out_recovery;
 
        if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED)
                nfs41_handle_server_reboot(clp);
@@ -2246,6 +2290,7 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
                nfs4_schedule_lease_moved_recovery(clp);
        if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)
                nfs41_handle_recallable_state_revoked(clp);
+out_recovery:
        if (flags & SEQ4_STATUS_BACKCHANNEL_FAULT)
                nfs41_handle_backchannel_fault(clp);
        else if (flags & (SEQ4_STATUS_CB_PATH_DOWN |
@@ -2410,6 +2455,13 @@ static void nfs4_state_manager(struct nfs_client *clp)
                        nfs4_state_end_reclaim_reboot(clp);
                }
 
+               /* Detect expired delegations... */
+               if (test_and_clear_bit(NFS4CLNT_DELEGATION_EXPIRED, &clp->cl_state)) {
+                       section = "detect expired delegations";
+                       nfs_reap_expired_delegations(clp);
+                       continue;
+               }
+
                /* Now recover expired state... */
                if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
                        section = "reclaim nograce";
index 7bd3a5c..fc89e5e 100644 (file)
@@ -1850,7 +1850,7 @@ static void encode_create_session(struct xdr_stream *xdr,
        *p++ = cpu_to_be32(RPC_AUTH_UNIX);                      /* auth_sys */
 
        /* authsys_parms rfc1831 */
-       *p++ = cpu_to_be32(nn->boot_time.tv_nsec);      /* stamp */
+       *p++ = cpu_to_be32(ktime_to_ns(nn->boot_time)); /* stamp */
        p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen);
        *p++ = cpu_to_be32(0);                          /* UID */
        *p++ = cpu_to_be32(0);                          /* GID */
@@ -4725,34 +4725,37 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
 }
 
 /*
- * Decode potentially multiple layout types. Currently we only support
- * one layout driver per file system.
+ * Decode potentially multiple layout types.
  */
-static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,
-                                        uint32_t *layouttype)
+static int decode_pnfs_layout_types(struct xdr_stream *xdr,
+                                   struct nfs_fsinfo *fsinfo)
 {
        __be32 *p;
-       int num;
+       uint32_t i;
 
        p = xdr_inline_decode(xdr, 4);
        if (unlikely(!p))
                goto out_overflow;
-       num = be32_to_cpup(p);
+       fsinfo->nlayouttypes = be32_to_cpup(p);
 
        /* pNFS is not supported by the underlying file system */
-       if (num == 0) {
-               *layouttype = 0;
+       if (fsinfo->nlayouttypes == 0)
                return 0;
-       }
-       if (num > 1)
-               printk(KERN_INFO "NFS: %s: Warning: Multiple pNFS layout "
-                       "drivers per filesystem not supported\n", __func__);
 
        /* Decode and set first layout type, move xdr->p past unused types */
-       p = xdr_inline_decode(xdr, num * 4);
+       p = xdr_inline_decode(xdr, fsinfo->nlayouttypes * 4);
        if (unlikely(!p))
                goto out_overflow;
-       *layouttype = be32_to_cpup(p);
+
+       /* If we get too many, then just cap it at the max */
+       if (fsinfo->nlayouttypes > NFS_MAX_LAYOUT_TYPES) {
+               printk(KERN_INFO "NFS: %s: Warning: Too many (%u) pNFS layout types\n",
+                       __func__, fsinfo->nlayouttypes);
+               fsinfo->nlayouttypes = NFS_MAX_LAYOUT_TYPES;
+       }
+
+       for(i = 0; i < fsinfo->nlayouttypes; ++i)
+               fsinfo->layouttype[i] = be32_to_cpup(p++);
        return 0;
 out_overflow:
        print_overflow_msg(__func__, xdr);
@@ -4764,7 +4767,7 @@ out_overflow:
  * Note we must ensure that layouttype is set in any non-error case.
  */
 static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
-                               uint32_t *layouttype)
+                               struct nfs_fsinfo *fsinfo)
 {
        int status = 0;
 
@@ -4772,10 +4775,9 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
        if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U)))
                return -EIO;
        if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) {
-               status = decode_first_pnfs_layout_type(xdr, layouttype);
+               status = decode_pnfs_layout_types(xdr, fsinfo);
                bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES;
-       } else
-               *layouttype = 0;
+       }
        return status;
 }
 
@@ -4856,7 +4858,7 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
        status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta);
        if (status != 0)
                goto xdr_error;
-       status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
+       status = decode_attr_pnfstype(xdr, bitmap, fsinfo);
        if (status != 0)
                goto xdr_error;
 
index 2c93a85..56b2d96 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
 #include <linux/module.h>
+#include <linux/sort.h>
 #include "internal.h"
 #include "pnfs.h"
 #include "iostat.h"
@@ -99,35 +100,79 @@ unset_pnfs_layoutdriver(struct nfs_server *nfss)
 }
 
 /*
+ * When the server sends a list of layout types, we choose one in the order
+ * given in the list below.
+ *
+ * FIXME: should this list be configurable in some fashion? module param?
+ *       mount option? something else?
+ */
+static const u32 ld_prefs[] = {
+       LAYOUT_SCSI,
+       LAYOUT_BLOCK_VOLUME,
+       LAYOUT_OSD2_OBJECTS,
+       LAYOUT_FLEX_FILES,
+       LAYOUT_NFSV4_1_FILES,
+       0
+};
+
+static int
+ld_cmp(const void *e1, const void *e2)
+{
+       u32 ld1 = *((u32 *)e1);
+       u32 ld2 = *((u32 *)e2);
+       int i;
+
+       for (i = 0; ld_prefs[i] != 0; i++) {
+               if (ld1 == ld_prefs[i])
+                       return -1;
+
+               if (ld2 == ld_prefs[i])
+                       return 1;
+       }
+       return 0;
+}
+
+/*
  * Try to set the server's pnfs module to the pnfs layout type specified by id.
  * Currently only one pNFS layout driver per filesystem is supported.
  *
- * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
+ * @ids array of layout types supported by MDS.
  */
 void
 set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
-                     u32 id)
+                     struct nfs_fsinfo *fsinfo)
 {
        struct pnfs_layoutdriver_type *ld_type = NULL;
+       u32 id;
+       int i;
 
-       if (id == 0)
-               goto out_no_driver;
        if (!(server->nfs_client->cl_exchange_flags &
                 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
-               printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n",
-                       __func__, id, server->nfs_client->cl_exchange_flags);
+               printk(KERN_ERR "NFS: %s: cl_exchange_flags 0x%x\n",
+                       __func__, server->nfs_client->cl_exchange_flags);
                goto out_no_driver;
        }
-       ld_type = find_pnfs_driver(id);
-       if (!ld_type) {
-               request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
+
+       sort(fsinfo->layouttype, fsinfo->nlayouttypes,
+               sizeof(*fsinfo->layouttype), ld_cmp, NULL);
+
+       for (i = 0; i < fsinfo->nlayouttypes; i++) {
+               id = fsinfo->layouttype[i];
                ld_type = find_pnfs_driver(id);
                if (!ld_type) {
-                       dprintk("%s: No pNFS module found for %u.\n",
-                               __func__, id);
-                       goto out_no_driver;
+                       request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX,
+                                       id);
+                       ld_type = find_pnfs_driver(id);
                }
+               if (ld_type)
+                       break;
+       }
+
+       if (!ld_type) {
+               dprintk("%s: No pNFS module found!\n", __func__);
+               goto out_no_driver;
        }
+
        server->pnfs_curr_ld = ld_type;
        if (ld_type->set_layoutdriver
            && ld_type->set_layoutdriver(server, mntfh)) {
@@ -2185,10 +2230,8 @@ static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
  */
 void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
 {
-       if (likely(!hdr->pnfs_error)) {
-               __nfs4_read_done_cb(hdr);
+       if (likely(!hdr->pnfs_error))
                hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
-       }
        trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
        if (unlikely(hdr->pnfs_error))
                pnfs_ld_handle_read_error(hdr);
index 31d99b2..5c29551 100644 (file)
@@ -236,7 +236,7 @@ void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
 void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
 void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg);
 
-void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
+void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *);
 void unset_pnfs_layoutdriver(struct nfs_server *);
 void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
 int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
@@ -657,7 +657,8 @@ pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
 }
 
 static inline void set_pnfs_layoutdriver(struct nfs_server *s,
-                                        const struct nfs_fh *mntfh, u32 id)
+                                        const struct nfs_fh *mntfh,
+                                        struct nfs_fsinfo *fsinfo)
 {
 }
 
index f3468b5..53b4705 100644 (file)
@@ -690,13 +690,50 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
                dprintk("%s: DS %s: trying address %s\n",
                        __func__, ds->ds_remotestr, da->da_remotestr);
 
-               clp = nfs4_set_ds_client(mds_srv,
-                                       (struct sockaddr *)&da->da_addr,
-                                       da->da_addrlen, IPPROTO_TCP,
-                                       timeo, retrans, minor_version,
-                                       au_flavor);
-               if (!IS_ERR(clp))
-                       break;
+               if (!IS_ERR(clp) && clp->cl_mvops->session_trunk) {
+                       struct xprt_create xprt_args = {
+                               .ident = XPRT_TRANSPORT_TCP,
+                               .net = clp->cl_net,
+                               .dstaddr = (struct sockaddr *)&da->da_addr,
+                               .addrlen = da->da_addrlen,
+                               .servername = clp->cl_hostname,
+                       };
+                       struct nfs4_add_xprt_data xprtdata = {
+                               .clp = clp,
+                               .cred = nfs4_get_clid_cred(clp),
+                       };
+                       struct rpc_add_xprt_test rpcdata = {
+                               .add_xprt_test = clp->cl_mvops->session_trunk,
+                               .data = &xprtdata,
+                       };
+
+                       /**
+                       * Test this address for session trunking and
+                       * add as an alias
+                       */
+                       rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
+                                         rpc_clnt_setup_test_and_add_xprt,
+                                         &rpcdata);
+                       if (xprtdata.cred)
+                               put_rpccred(xprtdata.cred);
+               } else {
+                       clp = nfs4_set_ds_client(mds_srv,
+                                               (struct sockaddr *)&da->da_addr,
+                                               da->da_addrlen, IPPROTO_TCP,
+                                               timeo, retrans, minor_version,
+                                               au_flavor);
+                       if (IS_ERR(clp))
+                               continue;
+
+                       status = nfs4_init_ds_session(clp,
+                                       mds_srv->nfs_client->cl_lease_time);
+                       if (status) {
+                               nfs_put_client(clp);
+                               clp = ERR_PTR(-EIO);
+                               continue;
+                       }
+
+               }
        }
 
        if (IS_ERR(clp)) {
@@ -704,18 +741,11 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
                goto out;
        }
 
-       status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
-       if (status)
-               goto out_put;
-
        smp_wmb();
        ds->ds_clp = clp;
        dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
 out:
        return status;
-out_put:
-       nfs_put_client(clp);
-       goto out;
 }
 
 /*
index d396013..001796b 100644 (file)
@@ -2848,19 +2848,23 @@ out_invalid_transport_udp:
  * NFS client for backwards compatibility
  */
 unsigned int nfs_callback_set_tcpport;
+unsigned short nfs_callback_nr_threads;
 /* Default cache timeout is 10 minutes */
 unsigned int nfs_idmap_cache_timeout = 600;
 /* Turn off NFSv4 uid/gid mapping when using AUTH_SYS */
 bool nfs4_disable_idmapping = true;
 unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE;
+unsigned short max_session_cb_slots = NFS4_DEF_CB_SLOT_TABLE_SIZE;
 unsigned short send_implementation_id = 1;
 char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = "";
 bool recover_lost_locks = false;
 
+EXPORT_SYMBOL_GPL(nfs_callback_nr_threads);
 EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport);
 EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout);
 EXPORT_SYMBOL_GPL(nfs4_disable_idmapping);
 EXPORT_SYMBOL_GPL(max_session_slots);
+EXPORT_SYMBOL_GPL(max_session_cb_slots);
 EXPORT_SYMBOL_GPL(send_implementation_id);
 EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier);
 EXPORT_SYMBOL_GPL(recover_lost_locks);
@@ -2887,6 +2891,9 @@ static const struct kernel_param_ops param_ops_portnr = {
 #define param_check_portnr(name, p) __param_check(name, p, unsigned int);
 
 module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
+module_param_named(callback_nr_threads, nfs_callback_nr_threads, ushort, 0644);
+MODULE_PARM_DESC(callback_nr_threads, "Number of threads that will be "
+               "assigned to the NFSv4 callback channels.");
 module_param(nfs_idmap_cache_timeout, int, 0644);
 module_param(nfs4_disable_idmapping, bool, 0644);
 module_param_string(nfs4_unique_id, nfs4_client_id_uniquifier,
@@ -2896,6 +2903,9 @@ MODULE_PARM_DESC(nfs4_disable_idmapping,
 module_param(max_session_slots, ushort, 0644);
 MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 "
                "requests the client will negotiate");
+module_param(max_session_cb_slots, ushort, 0644);
+MODULE_PARM_DESC(max_session_slots, "Maximum number of parallel NFSv4.1 "
+               "callbacks the client will process for a given server");
 module_param(send_implementation_id, ushort, 0644);
 MODULE_PARM_DESC(send_implementation_id,
                "Send implementation ID with NFSv4.1 exchange_id");
index c6564ad..9094faf 100644 (file)
@@ -67,6 +67,7 @@ struct nfs4_stateid_struct {
                NFS4_DELEGATION_STATEID_TYPE,
                NFS4_LAYOUT_STATEID_TYPE,
                NFS4_PNFS_DS_STATEID_TYPE,
+               NFS4_REVOKED_STATEID_TYPE,
        } type;
 };
 
index 14a762d..b34097c 100644 (file)
@@ -103,6 +103,9 @@ struct nfs_client {
 #define NFS_SP4_MACH_CRED_WRITE    5   /* WRITE */
 #define NFS_SP4_MACH_CRED_COMMIT   6   /* COMMIT */
 #define NFS_SP4_MACH_CRED_PNFS_CLEANUP  7 /* LAYOUTRETURN */
+#if IS_ENABLED(CONFIG_NFS_V4_1)
+       wait_queue_head_t       cl_lock_waitq;
+#endif /* CONFIG_NFS_V4_1 */
 #endif /* CONFIG_NFS_V4 */
 
        /* Our own IP address, as a null-terminated string.
index 7cc0dee..beb1e10 100644 (file)
@@ -125,6 +125,11 @@ struct nfs_fattr {
                | NFS_ATTR_FATTR_V4_SECURITY_LABEL)
 
 /*
+ * Maximal number of supported layout drivers.
+ */
+#define NFS_MAX_LAYOUT_TYPES 8
+
+/*
  * Info on the file system
  */
 struct nfs_fsinfo {
@@ -139,7 +144,8 @@ struct nfs_fsinfo {
        __u64                   maxfilesize;
        struct timespec         time_delta; /* server time granularity */
        __u32                   lease_time; /* in seconds */
-       __u32                   layouttype; /* supported pnfs layout driver */
+       __u32                   nlayouttypes; /* number of layouttypes */
+       __u32                   layouttype[NFS_MAX_LAYOUT_TYPES]; /* supported pnfs layout driver */
        __u32                   blksize; /* preferred pnfs io block size */
        __u32                   clone_blksize; /* granularity of a CLONE operation */
 };
index 4ccf184..b1bc62b 100644 (file)
@@ -131,6 +131,7 @@ struct rpc_authops {
        struct rpc_auth *       (*create)(struct rpc_auth_create_args *, struct rpc_clnt *);
        void                    (*destroy)(struct rpc_auth *);
 
+       int                     (*hash_cred)(struct auth_cred *, unsigned int);
        struct rpc_cred *       (*lookup_cred)(struct rpc_auth *, struct auth_cred *, int);
        struct rpc_cred *       (*crcreate)(struct rpc_auth*, struct auth_cred *, int, gfp_t);
        int                     (*list_pseudoflavors)(rpc_authflavor_t *, int);
index 5c02b06..85cc819 100644 (file)
@@ -125,6 +125,13 @@ struct rpc_create_args {
        struct svc_xprt         *bc_xprt;       /* NFSv4.1 backchannel */
 };
 
+struct rpc_add_xprt_test {
+       int (*add_xprt_test)(struct rpc_clnt *,
+               struct rpc_xprt *,
+               void *calldata);
+       void *data;
+};
+
 /* Values for "flags" field */
 #define RPC_CLNT_CREATE_HARDRTRY       (1UL << 0)
 #define RPC_CLNT_CREATE_AUTOBIND       (1UL << 2)
@@ -198,6 +205,16 @@ int                rpc_clnt_add_xprt(struct rpc_clnt *, struct xprt_create *,
 void           rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt,
                        unsigned long timeo);
 
+int            rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *,
+                       struct rpc_xprt_switch *,
+                       struct rpc_xprt *,
+                       void *);
+
 const char *rpc_proc_name(const struct rpc_task *task);
+
+void rpc_clnt_xprt_switch_put(struct rpc_clnt *);
+void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *);
+bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt,
+                       const struct sockaddr *sap);
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SUNRPC_CLNT_H */
index a7da6bf..cfda6ad 100644 (file)
 #define RPCRDMA_VERSION                1
 #define rpcrdma_version                cpu_to_be32(RPCRDMA_VERSION)
 
+enum {
+       RPCRDMA_V1_DEF_INLINE_SIZE      = 1024,
+};
+
 struct rpcrdma_segment {
        __be32 rs_handle;       /* Registered memory handle */
        __be32 rs_length;       /* Length of the chunk in bytes */
index 817af0b..7ba040c 100644 (file)
@@ -239,8 +239,8 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *,
                                        void *);
 void           rpc_wake_up_status(struct rpc_wait_queue *, int);
 void           rpc_delay(struct rpc_task *, unsigned long);
-void *         rpc_malloc(struct rpc_task *, size_t);
-void           rpc_free(void *);
+int            rpc_malloc(struct rpc_task *);
+void           rpc_free(struct rpc_task *);
 int            rpciod_up(void);
 void           rpciod_down(void);
 int            __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *);
index 70c6b92..56c48c8 100644 (file)
@@ -67,6 +67,18 @@ struct xdr_buf {
                        len;            /* Length of XDR encoded message */
 };
 
+static inline void
+xdr_buf_init(struct xdr_buf *buf, void *start, size_t len)
+{
+       buf->head[0].iov_base = start;
+       buf->head[0].iov_len = len;
+       buf->tail[0].iov_len = 0;
+       buf->page_len = 0;
+       buf->flags = 0;
+       buf->len = 0;
+       buf->buflen = len;
+}
+
 /*
  * pre-xdr'ed macros.
  */
index a16070d..a5da60b 100644 (file)
@@ -83,9 +83,11 @@ struct rpc_rqst {
        void (*rq_release_snd_buf)(struct rpc_rqst *); /* release rq_enc_pages */
        struct list_head        rq_list;
 
-       __u32 *                 rq_buffer;      /* XDR encode buffer */
-       size_t                  rq_callsize,
-                               rq_rcvsize;
+       void                    *rq_xprtdata;   /* Per-xprt private data */
+       void                    *rq_buffer;     /* Call XDR encode buffer */
+       size_t                  rq_callsize;
+       void                    *rq_rbuffer;    /* Reply XDR decode buffer */
+       size_t                  rq_rcvsize;
        size_t                  rq_xmit_bytes_sent;     /* total bytes sent */
        size_t                  rq_reply_bytes_recvd;   /* total reply bytes */
                                                        /* received */
@@ -127,8 +129,8 @@ struct rpc_xprt_ops {
        void            (*rpcbind)(struct rpc_task *task);
        void            (*set_port)(struct rpc_xprt *xprt, unsigned short port);
        void            (*connect)(struct rpc_xprt *xprt, struct rpc_task *task);
-       void *          (*buf_alloc)(struct rpc_task *task, size_t size);
-       void            (*buf_free)(void *buffer);
+       int             (*buf_alloc)(struct rpc_task *task);
+       void            (*buf_free)(struct rpc_task *task);
        int             (*send_request)(struct rpc_task *task);
        void            (*set_retrans_timeout)(struct rpc_task *task);
        void            (*timer)(struct rpc_xprt *xprt, struct rpc_task *task);
index 5a9acff..507418c 100644 (file)
@@ -66,4 +66,6 @@ extern struct rpc_xprt *xprt_iter_xprt(struct rpc_xprt_iter *xpi);
 extern struct rpc_xprt *xprt_iter_get_xprt(struct rpc_xprt_iter *xpi);
 extern struct rpc_xprt *xprt_iter_get_next(struct rpc_xprt_iter *xpi);
 
+extern bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps,
+               const struct sockaddr *sap);
 #endif
index 39267dc..221b7a2 100644 (file)
@@ -53,8 +53,8 @@
 #define RPCRDMA_MAX_SLOT_TABLE (256U)
 
 #define RPCRDMA_MIN_INLINE  (1024)     /* min inline thresh */
-#define RPCRDMA_DEF_INLINE  (1024)     /* default inline thresh */
-#define RPCRDMA_MAX_INLINE  (3068)     /* max inline thresh */
+#define RPCRDMA_DEF_INLINE  (4096)     /* default inline thresh */
+#define RPCRDMA_MAX_INLINE  (65536)    /* max inline thresh */
 
 /* Memory registration strategies, by number.
  * This is part of a kernel / user space API. Do not remove. */
index a7e42f9..2bff63a 100644 (file)
@@ -551,7 +551,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
                        *entry, *new;
        unsigned int nr;
 
-       nr = hash_long(from_kuid(&init_user_ns, acred->uid), cache->hashbits);
+       nr = auth->au_ops->hash_cred(acred, cache->hashbits);
 
        rcu_read_lock();
        hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) {
index 83dffea..f1df983 100644 (file)
@@ -78,6 +78,14 @@ static struct rpc_cred *generic_bind_cred(struct rpc_task *task,
        return auth->au_ops->lookup_cred(auth, acred, lookupflags);
 }
 
+static int
+generic_hash_cred(struct auth_cred *acred, unsigned int hashbits)
+{
+       return hash_64(from_kgid(&init_user_ns, acred->gid) |
+               ((u64)from_kuid(&init_user_ns, acred->uid) <<
+                       (sizeof(gid_t) * 8)), hashbits);
+}
+
 /*
  * Lookup generic creds for current process
  */
@@ -258,6 +266,7 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
 static const struct rpc_authops generic_auth_ops = {
        .owner = THIS_MODULE,
        .au_name = "Generic",
+       .hash_cred = generic_hash_cred,
        .lookup_cred = generic_lookup_cred,
        .crcreate = generic_create_cred,
        .key_timeout = generic_key_timeout,
index 976c781..d8bd97a 100644 (file)
@@ -1298,6 +1298,12 @@ gss_destroy_cred(struct rpc_cred *cred)
        gss_destroy_nullcred(cred);
 }
 
+static int
+gss_hash_cred(struct auth_cred *acred, unsigned int hashbits)
+{
+       return hash_64(from_kuid(&init_user_ns, acred->uid), hashbits);
+}
+
 /*
  * Lookup RPCSEC_GSS cred for the current process
  */
@@ -1982,6 +1988,7 @@ static const struct rpc_authops authgss_ops = {
        .au_name        = "RPCSEC_GSS",
        .create         = gss_create,
        .destroy        = gss_destroy,
+       .hash_cred      = gss_hash_cred,
        .lookup_cred    = gss_lookup_cred,
        .crcreate       = gss_create_cred,
        .list_pseudoflavors = gss_mech_list_pseudoflavors,
index a1d768a..306fc0f 100644 (file)
@@ -46,6 +46,14 @@ unx_destroy(struct rpc_auth *auth)
        rpcauth_clear_credcache(auth->au_credcache);
 }
 
+static int
+unx_hash_cred(struct auth_cred *acred, unsigned int hashbits)
+{
+       return hash_64(from_kgid(&init_user_ns, acred->gid) |
+               ((u64)from_kuid(&init_user_ns, acred->uid) <<
+                       (sizeof(gid_t) * 8)), hashbits);
+}
+
 /*
  * Lookup AUTH_UNIX creds for current process
  */
@@ -220,6 +228,7 @@ const struct rpc_authops authunix_ops = {
        .au_name        = "UNIX",
        .create         = unx_create,
        .destroy        = unx_destroy,
+       .hash_cred      = unx_hash_cred,
        .lookup_cred    = unx_lookup_cred,
        .crcreate       = unx_create_cred,
 };
index 229956b..ac701c2 100644 (file)
@@ -76,13 +76,7 @@ static int xprt_alloc_xdr_buf(struct xdr_buf *buf, gfp_t gfp_flags)
        page = alloc_page(gfp_flags);
        if (page == NULL)
                return -ENOMEM;
-       buf->head[0].iov_base = page_address(page);
-       buf->head[0].iov_len = PAGE_SIZE;
-       buf->tail[0].iov_base = NULL;
-       buf->tail[0].iov_len = 0;
-       buf->page_len = 0;
-       buf->len = 0;
-       buf->buflen = PAGE_SIZE;
+       xdr_buf_init(buf, page_address(page), PAGE_SIZE);
        return 0;
 }
 
index 4d8e11f..8aabe12 100644 (file)
@@ -353,7 +353,7 @@ void sunrpc_init_cache_detail(struct cache_detail *cd)
        spin_unlock(&cache_list_lock);
 
        /* start the cleaning process */
-       schedule_delayed_work(&cache_cleaner, 0);
+       queue_delayed_work(system_power_efficient_wq, &cache_cleaner, 0);
 }
 EXPORT_SYMBOL_GPL(sunrpc_init_cache_detail);
 
@@ -476,7 +476,8 @@ static void do_cache_clean(struct work_struct *work)
                delay = 0;
 
        if (delay)
-               schedule_delayed_work(&cache_cleaner, delay);
+               queue_delayed_work(system_power_efficient_wq,
+                                  &cache_cleaner, delay);
 }
 
 
index 66f23b3..34dd7b2 100644 (file)
@@ -184,7 +184,6 @@ static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event,
                                   struct super_block *sb)
 {
        struct dentry *dentry;
-       int err = 0;
 
        switch (event) {
        case RPC_PIPEFS_MOUNT:
@@ -201,7 +200,7 @@ static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event,
                printk(KERN_ERR "%s: unknown event: %ld\n", __func__, event);
                return -ENOTSUPP;
        }
-       return err;
+       return 0;
 }
 
 static int __rpc_pipefs_event(struct rpc_clnt *clnt, unsigned long event,
@@ -988,7 +987,6 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
 {
 
        if (clnt != NULL) {
-               rpc_task_release_client(task);
                if (task->tk_xprt == NULL)
                        task->tk_xprt = xprt_iter_get_next(&clnt->cl_xpi);
                task->tk_client = clnt;
@@ -1693,6 +1691,7 @@ call_allocate(struct rpc_task *task)
        struct rpc_rqst *req = task->tk_rqstp;
        struct rpc_xprt *xprt = req->rq_xprt;
        struct rpc_procinfo *proc = task->tk_msg.rpc_proc;
+       int status;
 
        dprint_status(task);
 
@@ -1718,11 +1717,14 @@ call_allocate(struct rpc_task *task)
        req->rq_rcvsize = RPC_REPHDRSIZE + slack + proc->p_replen;
        req->rq_rcvsize <<= 2;
 
-       req->rq_buffer = xprt->ops->buf_alloc(task,
-                                       req->rq_callsize + req->rq_rcvsize);
-       if (req->rq_buffer != NULL)
-               return;
+       status = xprt->ops->buf_alloc(task);
        xprt_inject_disconnect(xprt);
+       if (status == 0)
+               return;
+       if (status != -ENOMEM) {
+               rpc_exit(task, status);
+               return;
+       }
 
        dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid);
 
@@ -1748,18 +1750,6 @@ rpc_task_force_reencode(struct rpc_task *task)
        task->tk_rqstp->rq_bytes_sent = 0;
 }
 
-static inline void
-rpc_xdr_buf_init(struct xdr_buf *buf, void *start, size_t len)
-{
-       buf->head[0].iov_base = start;
-       buf->head[0].iov_len = len;
-       buf->tail[0].iov_len = 0;
-       buf->page_len = 0;
-       buf->flags = 0;
-       buf->len = 0;
-       buf->buflen = len;
-}
-
 /*
  * 3.  Encode arguments of an RPC call
  */
@@ -1772,12 +1762,12 @@ rpc_xdr_encode(struct rpc_task *task)
 
        dprint_status(task);
 
-       rpc_xdr_buf_init(&req->rq_snd_buf,
-                        req->rq_buffer,
-                        req->rq_callsize);
-       rpc_xdr_buf_init(&req->rq_rcv_buf,
-                        (char *)req->rq_buffer + req->rq_callsize,
-                        req->rq_rcvsize);
+       xdr_buf_init(&req->rq_snd_buf,
+                    req->rq_buffer,
+                    req->rq_callsize);
+       xdr_buf_init(&req->rq_rcv_buf,
+                    req->rq_rbuffer,
+                    req->rq_rcvsize);
 
        p = rpc_encode_header(task);
        if (p == NULL) {
@@ -2616,6 +2606,70 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt,
 EXPORT_SYMBOL_GPL(rpc_clnt_test_and_add_xprt);
 
 /**
+ * rpc_clnt_setup_test_and_add_xprt()
+ *
+ * This is an rpc_clnt_add_xprt setup() function which returns 1 so:
+ *   1) caller of the test function must dereference the rpc_xprt_switch
+ *   and the rpc_xprt.
+ *   2) test function must call rpc_xprt_switch_add_xprt, usually in
+ *   the rpc_call_done routine.
+ *
+ * Upon success (return of 1), the test function adds the new
+ * transport to the rpc_clnt xprt switch
+ *
+ * @clnt: struct rpc_clnt to get the new transport
+ * @xps:  the rpc_xprt_switch to hold the new transport
+ * @xprt: the rpc_xprt to test
+ * @data: a struct rpc_add_xprt_test pointer that holds the test function
+ *        and test function call data
+ */
+int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt,
+                                    struct rpc_xprt_switch *xps,
+                                    struct rpc_xprt *xprt,
+                                    void *data)
+{
+       struct rpc_cred *cred;
+       struct rpc_task *task;
+       struct rpc_add_xprt_test *xtest = (struct rpc_add_xprt_test *)data;
+       int status = -EADDRINUSE;
+
+       xprt = xprt_get(xprt);
+       xprt_switch_get(xps);
+
+       if (rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr))
+               goto out_err;
+
+       /* Test the connection */
+       cred = authnull_ops.lookup_cred(NULL, NULL, 0);
+       task = rpc_call_null_helper(clnt, xprt, cred,
+                                   RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
+                                   NULL, NULL);
+       put_rpccred(cred);
+       if (IS_ERR(task)) {
+               status = PTR_ERR(task);
+               goto out_err;
+       }
+       status = task->tk_status;
+       rpc_put_task(task);
+
+       if (status < 0)
+               goto out_err;
+
+       /* rpc_xprt_switch and rpc_xprt are deferrenced by add_xprt_test() */
+       xtest->add_xprt_test(clnt, xprt, xtest->data);
+
+       /* so that rpc_clnt_add_xprt does not call rpc_xprt_switch_add_xprt */
+       return 1;
+out_err:
+       xprt_put(xprt);
+       xprt_switch_put(xps);
+       pr_info("RPC:   rpc_clnt_test_xprt failed: %d addr %s not added\n",
+               status, xprt->address_strings[RPC_DISPLAY_ADDR]);
+       return status;
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_setup_test_and_add_xprt);
+
+/**
  * rpc_clnt_add_xprt - Add a new transport to a rpc_clnt
  * @clnt: pointer to struct rpc_clnt
  * @xprtargs: pointer to struct xprt_create
@@ -2697,6 +2751,34 @@ rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo)
 }
 EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout);
 
+void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt)
+{
+       xprt_switch_put(rcu_dereference(clnt->cl_xpi.xpi_xpswitch));
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_put);
+
+void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
+{
+       rpc_xprt_switch_add_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch),
+                                xprt);
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_add_xprt);
+
+bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt,
+                                  const struct sockaddr *sap)
+{
+       struct rpc_xprt_switch *xps;
+       bool ret;
+
+       xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
+
+       rcu_read_lock();
+       ret = rpc_xprt_switch_has_addr(xps, sap);
+       rcu_read_unlock();
+       return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_has_addr);
+
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 static void rpc_show_header(void)
 {
index 9ae5885..5db68b3 100644 (file)
@@ -849,14 +849,17 @@ static void rpc_async_schedule(struct work_struct *work)
 }
 
 /**
- * rpc_malloc - allocate an RPC buffer
- * @task: RPC task that will use this buffer
- * @size: requested byte size
+ * rpc_malloc - allocate RPC buffer resources
+ * @task: RPC task
+ *
+ * A single memory region is allocated, which is split between the
+ * RPC call and RPC reply that this task is being used for. When
+ * this RPC is retired, the memory is released by calling rpc_free.
  *
  * To prevent rpciod from hanging, this allocator never sleeps,
- * returning NULL and suppressing warning if the request cannot be serviced
- * immediately.
- * The caller can arrange to sleep in a way that is safe for rpciod.
+ * returning -ENOMEM and suppressing warning if the request cannot
+ * be serviced immediately. The caller can arrange to sleep in a
+ * way that is safe for rpciod.
  *
  * Most requests are 'small' (under 2KiB) and can be serviced from a
  * mempool, ensuring that NFS reads and writes can always proceed,
@@ -865,8 +868,10 @@ static void rpc_async_schedule(struct work_struct *work)
  * In order to avoid memory starvation triggering more writebacks of
  * NFS requests, we avoid using GFP_KERNEL.
  */
-void *rpc_malloc(struct rpc_task *task, size_t size)
+int rpc_malloc(struct rpc_task *task)
 {
+       struct rpc_rqst *rqst = task->tk_rqstp;
+       size_t size = rqst->rq_callsize + rqst->rq_rcvsize;
        struct rpc_buffer *buf;
        gfp_t gfp = GFP_NOIO | __GFP_NOWARN;
 
@@ -880,28 +885,28 @@ void *rpc_malloc(struct rpc_task *task, size_t size)
                buf = kmalloc(size, gfp);
 
        if (!buf)
-               return NULL;
+               return -ENOMEM;
 
        buf->len = size;
        dprintk("RPC: %5u allocated buffer of size %zu at %p\n",
                        task->tk_pid, size, buf);
-       return &buf->data;
+       rqst->rq_buffer = buf->data;
+       rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize;
+       return 0;
 }
 EXPORT_SYMBOL_GPL(rpc_malloc);
 
 /**
- * rpc_free - free buffer allocated via rpc_malloc
- * @buffer: buffer to free
+ * rpc_free - free RPC buffer resources allocated via rpc_malloc
+ * @task: RPC task
  *
  */
-void rpc_free(void *buffer)
+void rpc_free(struct rpc_task *task)
 {
+       void *buffer = task->tk_rqstp->rq_buffer;
        size_t size;
        struct rpc_buffer *buf;
 
-       if (!buffer)
-               return;
-
        buf = container_of(buffer, struct rpc_buffer, data);
        size = buf->len;
 
index c5b0cb4..7c8070e 100644 (file)
@@ -401,6 +401,21 @@ int svc_bind(struct svc_serv *serv, struct net *net)
 }
 EXPORT_SYMBOL_GPL(svc_bind);
 
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+static void
+__svc_init_bc(struct svc_serv *serv)
+{
+       INIT_LIST_HEAD(&serv->sv_cb_list);
+       spin_lock_init(&serv->sv_cb_lock);
+       init_waitqueue_head(&serv->sv_cb_waitq);
+}
+#else
+static void
+__svc_init_bc(struct svc_serv *serv)
+{
+}
+#endif
+
 /*
  * Create an RPC service
  */
@@ -443,6 +458,8 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
        init_timer(&serv->sv_temptimer);
        spin_lock_init(&serv->sv_lock);
 
+       __svc_init_bc(serv);
+
        serv->sv_nrpools = npools;
        serv->sv_pools =
                kcalloc(serv->sv_nrpools, sizeof(struct svc_pool),
index c4f3cc0..7f1071e 100644 (file)
@@ -767,7 +767,7 @@ static void xdr_set_next_page(struct xdr_stream *xdr)
        newbase -= xdr->buf->page_base;
 
        if (xdr_set_page_base(xdr, newbase, PAGE_SIZE) < 0)
-               xdr_set_iov(xdr, xdr->buf->tail, xdr->buf->len);
+               xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2);
 }
 
 static bool xdr_set_next_buffer(struct xdr_stream *xdr)
@@ -776,7 +776,7 @@ static bool xdr_set_next_buffer(struct xdr_stream *xdr)
                xdr_set_next_page(xdr);
        else if (xdr->iov == xdr->buf->head) {
                if (xdr_set_page_base(xdr, 0, PAGE_SIZE) < 0)
-                       xdr_set_iov(xdr, xdr->buf->tail, xdr->buf->len);
+                       xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2);
        }
        return xdr->p != xdr->end;
 }
@@ -859,12 +859,15 @@ EXPORT_SYMBOL_GPL(xdr_set_scratch_buffer);
 static __be32 *xdr_copy_to_scratch(struct xdr_stream *xdr, size_t nbytes)
 {
        __be32 *p;
-       void *cpdest = xdr->scratch.iov_base;
+       char *cpdest = xdr->scratch.iov_base;
        size_t cplen = (char *)xdr->end - (char *)xdr->p;
 
        if (nbytes > xdr->scratch.iov_len)
                return NULL;
-       memcpy(cpdest, xdr->p, cplen);
+       p = __xdr_inline_decode(xdr, cplen);
+       if (p == NULL)
+               return NULL;
+       memcpy(cpdest, p, cplen);
        cpdest += cplen;
        nbytes -= cplen;
        if (!xdr_set_next_buffer(xdr))
index ea244b2..685e6d2 100644 (file)
@@ -1295,7 +1295,7 @@ void xprt_release(struct rpc_task *task)
        xprt_schedule_autodisconnect(xprt);
        spin_unlock_bh(&xprt->transport_lock);
        if (req->rq_buffer)
-               xprt->ops->buf_free(req->rq_buffer);
+               xprt->ops->buf_free(task);
        xprt_inject_disconnect(xprt);
        if (req->rq_cred != NULL)
                put_rpccred(req->rq_cred);
index 66c9d63..ae92a9e 100644 (file)
@@ -15,6 +15,7 @@
 #include <asm/cmpxchg.h>
 #include <linux/spinlock.h>
 #include <linux/sunrpc/xprt.h>
+#include <linux/sunrpc/addr.h>
 #include <linux/sunrpc/xprtmultipath.h>
 
 typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct list_head *head,
@@ -49,7 +50,8 @@ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps,
        if (xprt == NULL)
                return;
        spin_lock(&xps->xps_lock);
-       if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL)
+       if ((xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) &&
+           !rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr))
                xprt_switch_add_xprt_locked(xps, xprt);
        spin_unlock(&xps->xps_lock);
 }
@@ -232,6 +234,26 @@ struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi)
        return xprt_switch_find_current_entry(head, xpi->xpi_cursor);
 }
 
+bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps,
+                             const struct sockaddr *sap)
+{
+       struct list_head *head;
+       struct rpc_xprt *pos;
+
+       if (xps == NULL || sap == NULL)
+               return false;
+
+       head = &xps->xps_xprt_list;
+       list_for_each_entry_rcu(pos, head, xprt_switch) {
+               if (rpc_cmp_addr_port(sap, (struct sockaddr *)&pos->addr)) {
+                       pr_info("RPC:   addr %s already in xprt switch\n",
+                               pos->address_strings[RPC_DISPLAY_ADDR]);
+                       return true;
+               }
+       }
+       return false;
+}
+
 static
 struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head,
                const struct rpc_xprt *cur)
index 87762d9..2c472e1 100644 (file)
@@ -27,7 +27,7 @@ static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt,
        list_del(&req->rl_all);
        spin_unlock(&buf->rb_reqslock);
 
-       rpcrdma_destroy_req(&r_xprt->rx_ia, req);
+       rpcrdma_destroy_req(req);
 
        kfree(rqst);
 }
@@ -35,10 +35,8 @@ static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt,
 static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
                                 struct rpc_rqst *rqst)
 {
-       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
        struct rpcrdma_regbuf *rb;
        struct rpcrdma_req *req;
-       struct xdr_buf *buf;
        size_t size;
 
        req = rpcrdma_create_req(r_xprt);
@@ -46,30 +44,19 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
                return PTR_ERR(req);
        req->rl_backchannel = true;
 
-       size = RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
-       rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL);
+       rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE,
+                                 DMA_TO_DEVICE, GFP_KERNEL);
        if (IS_ERR(rb))
                goto out_fail;
        req->rl_rdmabuf = rb;
 
-       size += RPCRDMA_INLINE_READ_THRESHOLD(rqst);
-       rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL);
+       size = r_xprt->rx_data.inline_rsize;
+       rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL);
        if (IS_ERR(rb))
                goto out_fail;
-       rb->rg_owner = req;
        req->rl_sendbuf = rb;
-       /* so that rpcr_to_rdmar works when receiving a request */
-       rqst->rq_buffer = (void *)req->rl_sendbuf->rg_base;
-
-       buf = &rqst->rq_snd_buf;
-       buf->head[0].iov_base = rqst->rq_buffer;
-       buf->head[0].iov_len = 0;
-       buf->tail[0].iov_base = NULL;
-       buf->tail[0].iov_len = 0;
-       buf->page_len = 0;
-       buf->len = 0;
-       buf->buflen = size;
-
+       xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base, size);
+       rpcrdma_set_xprtdata(rqst, req);
        return 0;
 
 out_fail:
@@ -219,7 +206,6 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
        struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
        struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
        struct rpcrdma_msg *headerp;
-       size_t rpclen;
 
        headerp = rdmab_to_msg(req->rl_rdmabuf);
        headerp->rm_xid = rqst->rq_xid;
@@ -231,26 +217,9 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
        headerp->rm_body.rm_chunks[1] = xdr_zero;
        headerp->rm_body.rm_chunks[2] = xdr_zero;
 
-       rpclen = rqst->rq_svec[0].iov_len;
-
-#ifdef RPCRDMA_BACKCHANNEL_DEBUG
-       pr_info("RPC:       %s: rpclen %zd headerp 0x%p lkey 0x%x\n",
-               __func__, rpclen, headerp, rdmab_lkey(req->rl_rdmabuf));
-       pr_info("RPC:       %s: RPC/RDMA: %*ph\n",
-               __func__, (int)RPCRDMA_HDRLEN_MIN, headerp);
-       pr_info("RPC:       %s:      RPC: %*ph\n",
-               __func__, (int)rpclen, rqst->rq_svec[0].iov_base);
-#endif
-
-       req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
-       req->rl_send_iov[0].length = RPCRDMA_HDRLEN_MIN;
-       req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
-
-       req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
-       req->rl_send_iov[1].length = rpclen;
-       req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
-
-       req->rl_niovs = 2;
+       if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, RPCRDMA_HDRLEN_MIN,
+                                      &rqst->rq_snd_buf, rpcrdma_noch))
+               return -EIO;
        return 0;
 }
 
@@ -402,7 +371,7 @@ out_overflow:
 out_short:
        pr_warn("RPC/RDMA short backward direction call\n");
 
-       if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
+       if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep))
                xprt_disconnect_done(xprt);
        else
                pr_warn("RPC:       %s: reposting rep %p\n",
index 21cb3b1..1ebb09e 100644 (file)
@@ -160,9 +160,8 @@ static int
 fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
            struct rpcrdma_create_data_internal *cdata)
 {
-       rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
-                                                     RPCRDMA_MAX_DATA_SEGS /
-                                                     RPCRDMA_MAX_FMR_SGES));
+       ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
+                               RPCRDMA_MAX_FMR_SGES);
        return 0;
 }
 
@@ -274,6 +273,7 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
         */
        list_for_each_entry(mw, &req->rl_registered, mw_list)
                list_add_tail(&mw->fmr.fm_mr->list, &unmap_list);
+       r_xprt->rx_stats.local_inv_needed++;
        rc = ib_unmap_fmr(&unmap_list);
        if (rc)
                goto out_reset;
@@ -331,4 +331,5 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
        .ro_init_mr                     = fmr_op_init_mr,
        .ro_release_mr                  = fmr_op_release_mr,
        .ro_displayname                 = "fmr",
+       .ro_send_w_inv_ok               = 0,
 };
index 892b5e1..2109495 100644 (file)
@@ -67,6 +67,8 @@
  * pending send queue WRs before the transport is reconnected.
  */
 
+#include <linux/sunrpc/rpc_rdma.h>
+
 #include "xprt_rdma.h"
 
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -161,7 +163,7 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
                return PTR_ERR(f->fr_mr);
        }
 
-       dprintk("RPC:       %s: recovered FRMR %p\n", __func__, r);
+       dprintk("RPC:       %s: recovered FRMR %p\n", __func__, f);
        f->fr_state = FRMR_IS_INVALID;
        return 0;
 }
@@ -242,9 +244,8 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
                                               depth;
        }
 
-       rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
-                                                     RPCRDMA_MAX_DATA_SEGS /
-                                                     ia->ri_max_frmr_depth));
+       ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
+                               ia->ri_max_frmr_depth);
        return 0;
 }
 
@@ -329,7 +330,7 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
        frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe);
        if (wc->status != IB_WC_SUCCESS)
                __frwr_sendcompletion_flush(wc, frmr, "localinv");
-       complete_all(&frmr->fr_linv_done);
+       complete(&frmr->fr_linv_done);
 }
 
 /* Post a REG_MR Work Request to register a memory region
@@ -396,7 +397,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
                goto out_mapmr_err;
 
        dprintk("RPC:       %s: Using frmr %p to map %u segments (%u bytes)\n",
-               __func__, mw, mw->mw_nents, mr->length);
+               __func__, frmr, mw->mw_nents, mr->length);
 
        key = (u8)(mr->rkey & 0x000000FF);
        ib_update_fast_reg_key(mr, ++key);
@@ -449,6 +450,8 @@ __frwr_prepare_linv_wr(struct rpcrdma_mw *mw)
        struct rpcrdma_frmr *f = &mw->frmr;
        struct ib_send_wr *invalidate_wr;
 
+       dprintk("RPC:       %s: invalidating frmr %p\n", __func__, f);
+
        f->fr_state = FRMR_IS_INVALID;
        invalidate_wr = &f->fr_invwr;
 
@@ -472,6 +475,7 @@ static void
 frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
        struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr;
+       struct rpcrdma_rep *rep = req->rl_reply;
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
        struct rpcrdma_mw *mw, *tmp;
        struct rpcrdma_frmr *f;
@@ -487,6 +491,12 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
        f = NULL;
        invalidate_wrs = pos = prev = NULL;
        list_for_each_entry(mw, &req->rl_registered, mw_list) {
+               if ((rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) &&
+                   (mw->mw_handle == rep->rr_inv_rkey)) {
+                       mw->frmr.fr_state = FRMR_IS_INVALID;
+                       continue;
+               }
+
                pos = __frwr_prepare_linv_wr(mw);
 
                if (!invalidate_wrs)
@@ -496,6 +506,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
                prev = pos;
                f = &mw->frmr;
        }
+       if (!f)
+               goto unmap;
 
        /* Strong send queue ordering guarantees that when the
         * last WR in the chain completes, all WRs in the chain
@@ -510,6 +522,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
         * replaces the QP. The RPC reply handler won't call us
         * unless ri_id->qp is a valid pointer.
         */
+       r_xprt->rx_stats.local_inv_needed++;
        rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr);
        if (rc)
                goto reset_mrs;
@@ -521,6 +534,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
         */
 unmap:
        list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
+               dprintk("RPC:       %s: unmapping frmr %p\n",
+                       __func__, &mw->frmr);
                list_del_init(&mw->mw_list);
                ib_dma_unmap_sg(ia->ri_device,
                                mw->mw_sg, mw->mw_nents, mw->mw_dir);
@@ -576,4 +591,5 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
        .ro_init_mr                     = frwr_op_init_mr,
        .ro_release_mr                  = frwr_op_release_mr,
        .ro_displayname                 = "frwr",
+       .ro_send_w_inv_ok               = RPCRDMA_CMP_F_SND_W_INV_OK,
 };
index a47f170..d987c2d 100644 (file)
 # define RPCDBG_FACILITY       RPCDBG_TRANS
 #endif
 
-enum rpcrdma_chunktype {
-       rpcrdma_noch = 0,
-       rpcrdma_readch,
-       rpcrdma_areadch,
-       rpcrdma_writech,
-       rpcrdma_replych
-};
-
 static const char transfertypes[][12] = {
        "inline",       /* no chunks */
        "read list",    /* some argument via rdma read */
@@ -118,10 +110,12 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
        return size;
 }
 
-void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *ia,
-                                 struct rpcrdma_create_data_internal *cdata,
-                                 unsigned int maxsegs)
+void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt)
 {
+       struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+       unsigned int maxsegs = ia->ri_max_segs;
+
        ia->ri_max_inline_write = cdata->inline_wsize -
                                  rpcrdma_max_call_header_size(maxsegs);
        ia->ri_max_inline_read = cdata->inline_rsize -
@@ -155,42 +149,6 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
        return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read;
 }
 
-static int
-rpcrdma_tail_pullup(struct xdr_buf *buf)
-{
-       size_t tlen = buf->tail[0].iov_len;
-       size_t skip = tlen & 3;
-
-       /* Do not include the tail if it is only an XDR pad */
-       if (tlen < 4)
-               return 0;
-
-       /* xdr_write_pages() adds a pad at the beginning of the tail
-        * if the content in "buf->pages" is unaligned. Force the
-        * tail's actual content to land at the next XDR position
-        * after the head instead.
-        */
-       if (skip) {
-               unsigned char *src, *dst;
-               unsigned int count;
-
-               src = buf->tail[0].iov_base;
-               dst = buf->head[0].iov_base;
-               dst += buf->head[0].iov_len;
-
-               src += skip;
-               tlen -= skip;
-
-               dprintk("RPC:       %s: skip=%zu, memmove(%p, %p, %zu)\n",
-                       __func__, skip, dst, src, tlen);
-
-               for (count = tlen; count; count--)
-                       *dst++ = *src++;
-       }
-
-       return tlen;
-}
-
 /* Split "vec" on page boundaries into segments. FMR registers pages,
  * not a byte range. Other modes coalesce these segments into a single
  * MR when they can.
@@ -229,7 +187,8 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n)
 
 static int
 rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
-       enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg)
+       enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg,
+       bool reminv_expected)
 {
        int len, n, p, page_base;
        struct page **ppages;
@@ -271,6 +230,13 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
        if (type == rpcrdma_readch)
                return n;
 
+       /* When encoding the Write list, some servers need to see an extra
+        * segment for odd-length Write chunks. The upper layer provides
+        * space in the tail iovec for this purpose.
+        */
+       if (type == rpcrdma_writech && reminv_expected)
+               return n;
+
        if (xdrbuf->tail[0].iov_len) {
                /* the rpcrdma protocol allows us to omit any trailing
                 * xdr pad bytes, saving the server an RDMA operation. */
@@ -327,7 +293,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
        if (rtype == rpcrdma_areadch)
                pos = 0;
        seg = req->rl_segments;
-       nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg);
+       nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, false);
        if (nsegs < 0)
                return ERR_PTR(nsegs);
 
@@ -391,7 +357,8 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
        seg = req->rl_segments;
        nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
                                     rqst->rq_rcv_buf.head[0].iov_len,
-                                    wtype, seg);
+                                    wtype, seg,
+                                    r_xprt->rx_ia.ri_reminv_expected);
        if (nsegs < 0)
                return ERR_PTR(nsegs);
 
@@ -456,7 +423,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
        }
 
        seg = req->rl_segments;
-       nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg);
+       nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg,
+                                    r_xprt->rx_ia.ri_reminv_expected);
        if (nsegs < 0)
                return ERR_PTR(nsegs);
 
@@ -491,74 +459,184 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
        return iptr;
 }
 
-/*
- * Copy write data inline.
- * This function is used for "small" requests. Data which is passed
- * to RPC via iovecs (or page list) is copied directly into the
- * pre-registered memory buffer for this request. For small amounts
- * of data, this is efficient. The cutoff value is tunable.
+/* Prepare the RPC-over-RDMA header SGE.
  */
-static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
+static bool
+rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
+                       u32 len)
 {
-       int i, npages, curlen;
-       int copy_len;
-       unsigned char *srcp, *destp;
-       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
-       int page_base;
-       struct page **ppages;
+       struct rpcrdma_regbuf *rb = req->rl_rdmabuf;
+       struct ib_sge *sge = &req->rl_send_sge[0];
+
+       if (unlikely(!rpcrdma_regbuf_is_mapped(rb))) {
+               if (!__rpcrdma_dma_map_regbuf(ia, rb))
+                       return false;
+               sge->addr = rdmab_addr(rb);
+               sge->lkey = rdmab_lkey(rb);
+       }
+       sge->length = len;
+
+       ib_dma_sync_single_for_device(ia->ri_device, sge->addr,
+                                     sge->length, DMA_TO_DEVICE);
+       req->rl_send_wr.num_sge++;
+       return true;
+}
 
-       destp = rqst->rq_svec[0].iov_base;
-       curlen = rqst->rq_svec[0].iov_len;
-       destp += curlen;
+/* Prepare the Send SGEs. The head and tail iovec, and each entry
+ * in the page list, gets its own SGE.
+ */
+static bool
+rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
+                        struct xdr_buf *xdr, enum rpcrdma_chunktype rtype)
+{
+       unsigned int sge_no, page_base, len, remaining;
+       struct rpcrdma_regbuf *rb = req->rl_sendbuf;
+       struct ib_device *device = ia->ri_device;
+       struct ib_sge *sge = req->rl_send_sge;
+       u32 lkey = ia->ri_pd->local_dma_lkey;
+       struct page *page, **ppages;
+
+       /* The head iovec is straightforward, as it is already
+        * DMA-mapped. Sync the content that has changed.
+        */
+       if (!rpcrdma_dma_map_regbuf(ia, rb))
+               return false;
+       sge_no = 1;
+       sge[sge_no].addr = rdmab_addr(rb);
+       sge[sge_no].length = xdr->head[0].iov_len;
+       sge[sge_no].lkey = rdmab_lkey(rb);
+       ib_dma_sync_single_for_device(device, sge[sge_no].addr,
+                                     sge[sge_no].length, DMA_TO_DEVICE);
+
+       /* If there is a Read chunk, the page list is being handled
+        * via explicit RDMA, and thus is skipped here. However, the
+        * tail iovec may include an XDR pad for the page list, as
+        * well as additional content, and may not reside in the
+        * same page as the head iovec.
+        */
+       if (rtype == rpcrdma_readch) {
+               len = xdr->tail[0].iov_len;
 
-       dprintk("RPC:       %s: destp 0x%p len %d hdrlen %d\n",
-               __func__, destp, rqst->rq_slen, curlen);
+               /* Do not include the tail if it is only an XDR pad */
+               if (len < 4)
+                       goto out;
 
-       copy_len = rqst->rq_snd_buf.page_len;
+               page = virt_to_page(xdr->tail[0].iov_base);
+               page_base = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK;
 
-       if (rqst->rq_snd_buf.tail[0].iov_len) {
-               curlen = rqst->rq_snd_buf.tail[0].iov_len;
-               if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) {
-                       memmove(destp + copy_len,
-                               rqst->rq_snd_buf.tail[0].iov_base, curlen);
-                       r_xprt->rx_stats.pullup_copy_count += curlen;
+               /* If the content in the page list is an odd length,
+                * xdr_write_pages() has added a pad at the beginning
+                * of the tail iovec. Force the tail's non-pad content
+                * to land at the next XDR position in the Send message.
+                */
+               page_base += len & 3;
+               len -= len & 3;
+               goto map_tail;
+       }
+
+       /* If there is a page list present, temporarily DMA map
+        * and prepare an SGE for each page to be sent.
+        */
+       if (xdr->page_len) {
+               ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+               page_base = xdr->page_base & ~PAGE_MASK;
+               remaining = xdr->page_len;
+               while (remaining) {
+                       sge_no++;
+                       if (sge_no > RPCRDMA_MAX_SEND_SGES - 2)
+                               goto out_mapping_overflow;
+
+                       len = min_t(u32, PAGE_SIZE - page_base, remaining);
+                       sge[sge_no].addr = ib_dma_map_page(device, *ppages,
+                                                          page_base, len,
+                                                          DMA_TO_DEVICE);
+                       if (ib_dma_mapping_error(device, sge[sge_no].addr))
+                               goto out_mapping_err;
+                       sge[sge_no].length = len;
+                       sge[sge_no].lkey = lkey;
+
+                       req->rl_mapped_sges++;
+                       ppages++;
+                       remaining -= len;
+                       page_base = 0;
                }
-               dprintk("RPC:       %s: tail destp 0x%p len %d\n",
-                       __func__, destp + copy_len, curlen);
-               rqst->rq_svec[0].iov_len += curlen;
        }
-       r_xprt->rx_stats.pullup_copy_count += copy_len;
 
-       page_base = rqst->rq_snd_buf.page_base;
-       ppages = rqst->rq_snd_buf.pages + (page_base >> PAGE_SHIFT);
-       page_base &= ~PAGE_MASK;
-       npages = PAGE_ALIGN(page_base+copy_len) >> PAGE_SHIFT;
-       for (i = 0; copy_len && i < npages; i++) {
-               curlen = PAGE_SIZE - page_base;
-               if (curlen > copy_len)
-                       curlen = copy_len;
-               dprintk("RPC:       %s: page %d destp 0x%p len %d curlen %d\n",
-                       __func__, i, destp, copy_len, curlen);
-               srcp = kmap_atomic(ppages[i]);
-               memcpy(destp, srcp+page_base, curlen);
-               kunmap_atomic(srcp);
-               rqst->rq_svec[0].iov_len += curlen;
-               destp += curlen;
-               copy_len -= curlen;
-               page_base = 0;
+       /* The tail iovec is not always constructed in the same
+        * page where the head iovec resides (see, for example,
+        * gss_wrap_req_priv). To neatly accommodate that case,
+        * DMA map it separately.
+        */
+       if (xdr->tail[0].iov_len) {
+               page = virt_to_page(xdr->tail[0].iov_base);
+               page_base = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK;
+               len = xdr->tail[0].iov_len;
+
+map_tail:
+               sge_no++;
+               sge[sge_no].addr = ib_dma_map_page(device, page,
+                                                  page_base, len,
+                                                  DMA_TO_DEVICE);
+               if (ib_dma_mapping_error(device, sge[sge_no].addr))
+                       goto out_mapping_err;
+               sge[sge_no].length = len;
+               sge[sge_no].lkey = lkey;
+               req->rl_mapped_sges++;
        }
-       /* header now contains entire send message */
+
+out:
+       req->rl_send_wr.num_sge = sge_no + 1;
+       return true;
+
+out_mapping_overflow:
+       pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no);
+       return false;
+
+out_mapping_err:
+       pr_err("rpcrdma: Send mapping error\n");
+       return false;
+}
+
+bool
+rpcrdma_prepare_send_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
+                         u32 hdrlen, struct xdr_buf *xdr,
+                         enum rpcrdma_chunktype rtype)
+{
+       req->rl_send_wr.num_sge = 0;
+       req->rl_mapped_sges = 0;
+
+       if (!rpcrdma_prepare_hdr_sge(ia, req, hdrlen))
+               goto out_map;
+
+       if (rtype != rpcrdma_areadch)
+               if (!rpcrdma_prepare_msg_sges(ia, req, xdr, rtype))
+                       goto out_map;
+
+       return true;
+
+out_map:
+       pr_err("rpcrdma: failed to DMA map a Send buffer\n");
+       return false;
+}
+
+void
+rpcrdma_unmap_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
+{
+       struct ib_device *device = ia->ri_device;
+       struct ib_sge *sge;
+       int count;
+
+       sge = &req->rl_send_sge[2];
+       for (count = req->rl_mapped_sges; count--; sge++)
+               ib_dma_unmap_page(device, sge->addr, sge->length,
+                                 DMA_TO_DEVICE);
+       req->rl_mapped_sges = 0;
 }
 
 /*
  * Marshal a request: the primary job of this routine is to choose
  * the transfer modes. See comments below.
  *
- * Prepares up to two IOVs per Call message:
- *
- *  [0] -- RPC RDMA header
- *  [1] -- the RPC header/data
- *
  * Returns zero on success, otherwise a negative errno.
  */
 
@@ -626,12 +704,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
         */
        if (rpcrdma_args_inline(r_xprt, rqst)) {
                rtype = rpcrdma_noch;
-               rpcrdma_inline_pullup(rqst);
-               rpclen = rqst->rq_svec[0].iov_len;
+               rpclen = rqst->rq_snd_buf.len;
        } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
                rtype = rpcrdma_readch;
-               rpclen = rqst->rq_svec[0].iov_len;
-               rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
+               rpclen = rqst->rq_snd_buf.head[0].iov_len +
+                        rqst->rq_snd_buf.tail[0].iov_len;
        } else {
                r_xprt->rx_stats.nomsg_call_count++;
                headerp->rm_type = htonl(RDMA_NOMSG);
@@ -673,34 +750,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
                goto out_unmap;
        hdrlen = (unsigned char *)iptr - (unsigned char *)headerp;
 
-       if (hdrlen + rpclen > RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
-               goto out_overflow;
-
        dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n",
                rqst->rq_task->tk_pid, __func__,
                transfertypes[rtype], transfertypes[wtype],
                hdrlen, rpclen);
 
-       req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
-       req->rl_send_iov[0].length = hdrlen;
-       req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
-
-       req->rl_niovs = 1;
-       if (rtype == rpcrdma_areadch)
-               return 0;
-
-       req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
-       req->rl_send_iov[1].length = rpclen;
-       req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
-
-       req->rl_niovs = 2;
+       if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen,
+                                      &rqst->rq_snd_buf, rtype)) {
+               iptr = ERR_PTR(-EIO);
+               goto out_unmap;
+       }
        return 0;
 
-out_overflow:
-       pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n",
-               hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]);
-       iptr = ERR_PTR(-EIO);
-
 out_unmap:
        r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
        return PTR_ERR(iptr);
@@ -916,8 +977,10 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep)
  * allowed to timeout, to discover the errors at that time.
  */
 void
-rpcrdma_reply_handler(struct rpcrdma_rep *rep)
+rpcrdma_reply_handler(struct work_struct *work)
 {
+       struct rpcrdma_rep *rep =
+                       container_of(work, struct rpcrdma_rep, rr_work);
        struct rpcrdma_msg *headerp;
        struct rpcrdma_req *req;
        struct rpc_rqst *rqst;
@@ -1132,6 +1195,6 @@ out_duplicate:
 
 repost:
        r_xprt->rx_stats.bad_reply_count++;
-       if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
+       if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep))
                rpcrdma_recv_buffer_put(rep);
 }
index cd0c558..2d8545c 100644 (file)
@@ -159,33 +159,34 @@ out_unmap:
 /* Server-side transport endpoint wants a whole page for its send
  * buffer. The client RPC code constructs the RPC header in this
  * buffer before it invokes ->send_request.
- *
- * Returns NULL if there was a temporary allocation failure.
  */
-static void *
-xprt_rdma_bc_allocate(struct rpc_task *task, size_t size)
+static int
+xprt_rdma_bc_allocate(struct rpc_task *task)
 {
        struct rpc_rqst *rqst = task->tk_rqstp;
        struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
+       size_t size = rqst->rq_callsize;
        struct svcxprt_rdma *rdma;
        struct page *page;
 
        rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt);
 
-       /* Prevent an infinite loop: try to make this case work */
-       if (size > PAGE_SIZE)
+       if (size > PAGE_SIZE) {
                WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n",
                          size);
+               return -EINVAL;
+       }
 
        page = alloc_page(RPCRDMA_DEF_GFP);
        if (!page)
-               return NULL;
+               return -ENOMEM;
 
-       return page_address(page);
+       rqst->rq_buffer = page_address(page);
+       return 0;
 }
 
 static void
-xprt_rdma_bc_free(void *buffer)
+xprt_rdma_bc_free(struct rpc_task *task)
 {
        /* No-op: ctxt and page have already been freed. */
 }
index 81f0e87..ed5e285 100644 (file)
@@ -97,7 +97,7 @@ static struct ctl_table xr_tunables_table[] = {
                .data           = &xprt_rdma_max_inline_read,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-               .proc_handler   = proc_dointvec,
+               .proc_handler   = proc_dointvec_minmax,
                .extra1         = &min_inline_size,
                .extra2         = &max_inline_size,
        },
@@ -106,7 +106,7 @@ static struct ctl_table xr_tunables_table[] = {
                .data           = &xprt_rdma_max_inline_write,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-               .proc_handler   = proc_dointvec,
+               .proc_handler   = proc_dointvec_minmax,
                .extra1         = &min_inline_size,
                .extra2         = &max_inline_size,
        },
@@ -477,115 +477,152 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
        }
 }
 
-/*
- * The RDMA allocate/free functions need the task structure as a place
- * to hide the struct rpcrdma_req, which is necessary for the actual send/recv
- * sequence.
+/* Allocate a fixed-size buffer in which to construct and send the
+ * RPC-over-RDMA header for this request.
+ */
+static bool
+rpcrdma_get_rdmabuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+                   gfp_t flags)
+{
+       size_t size = RPCRDMA_HDRBUF_SIZE;
+       struct rpcrdma_regbuf *rb;
+
+       if (req->rl_rdmabuf)
+               return true;
+
+       rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags);
+       if (IS_ERR(rb))
+               return false;
+
+       r_xprt->rx_stats.hardway_register_count += size;
+       req->rl_rdmabuf = rb;
+       return true;
+}
+
+static bool
+rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+                   size_t size, gfp_t flags)
+{
+       struct rpcrdma_regbuf *rb;
+
+       if (req->rl_sendbuf && rdmab_length(req->rl_sendbuf) >= size)
+               return true;
+
+       rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags);
+       if (IS_ERR(rb))
+               return false;
+
+       rpcrdma_free_regbuf(req->rl_sendbuf);
+       r_xprt->rx_stats.hardway_register_count += size;
+       req->rl_sendbuf = rb;
+       return true;
+}
+
+/* The rq_rcv_buf is used only if a Reply chunk is necessary.
+ * The decision to use a Reply chunk is made later in
+ * rpcrdma_marshal_req. This buffer is registered at that time.
  *
- * The RPC layer allocates both send and receive buffers in the same call
- * (rq_send_buf and rq_rcv_buf are both part of a single contiguous buffer).
- * We may register rq_rcv_buf when using reply chunks.
+ * Otherwise, the associated RPC Reply arrives in a separate
+ * Receive buffer, arbitrarily chosen by the HCA. The buffer
+ * allocated here for the RPC Reply is not utilized in that
+ * case. See rpcrdma_inline_fixup.
+ *
+ * A regbuf is used here to remember the buffer size.
  */
-static void *
-xprt_rdma_allocate(struct rpc_task *task, size_t size)
+static bool
+rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+                   size_t size, gfp_t flags)
 {
-       struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
-       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
        struct rpcrdma_regbuf *rb;
+
+       if (req->rl_recvbuf && rdmab_length(req->rl_recvbuf) >= size)
+               return true;
+
+       rb = rpcrdma_alloc_regbuf(size, DMA_NONE, flags);
+       if (IS_ERR(rb))
+               return false;
+
+       rpcrdma_free_regbuf(req->rl_recvbuf);
+       r_xprt->rx_stats.hardway_register_count += size;
+       req->rl_recvbuf = rb;
+       return true;
+}
+
+/**
+ * xprt_rdma_allocate - allocate transport resources for an RPC
+ * @task: RPC task
+ *
+ * Return values:
+ *        0:   Success; rq_buffer points to RPC buffer to use
+ *   ENOMEM:   Out of memory, call again later
+ *      EIO:   A permanent error occurred, do not retry
+ *
+ * The RDMA allocate/free functions need the task structure as a place
+ * to hide the struct rpcrdma_req, which is necessary for the actual
+ * send/recv sequence.
+ *
+ * xprt_rdma_allocate provides buffers that are already mapped for
+ * DMA, and a local DMA lkey is provided for each.
+ */
+static int
+xprt_rdma_allocate(struct rpc_task *task)
+{
+       struct rpc_rqst *rqst = task->tk_rqstp;
+       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
        struct rpcrdma_req *req;
-       size_t min_size;
        gfp_t flags;
 
        req = rpcrdma_buffer_get(&r_xprt->rx_buf);
        if (req == NULL)
-               return NULL;
+               return -ENOMEM;
 
        flags = RPCRDMA_DEF_GFP;
        if (RPC_IS_SWAPPER(task))
                flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
 
-       if (req->rl_rdmabuf == NULL)
-               goto out_rdmabuf;
-       if (req->rl_sendbuf == NULL)
-               goto out_sendbuf;
-       if (size > req->rl_sendbuf->rg_size)
-               goto out_sendbuf;
-
-out:
-       dprintk("RPC:       %s: size %zd, request 0x%p\n", __func__, size, req);
-       req->rl_connect_cookie = 0;     /* our reserved value */
-       req->rl_task = task;
-       return req->rl_sendbuf->rg_base;
-
-out_rdmabuf:
-       min_size = RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp);
-       rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, min_size, flags);
-       if (IS_ERR(rb))
+       if (!rpcrdma_get_rdmabuf(r_xprt, req, flags))
                goto out_fail;
-       req->rl_rdmabuf = rb;
-
-out_sendbuf:
-       /* XDR encoding and RPC/RDMA marshaling of this request has not
-        * yet occurred. Thus a lower bound is needed to prevent buffer
-        * overrun during marshaling.
-        *
-        * RPC/RDMA marshaling may choose to send payload bearing ops
-        * inline, if the result is smaller than the inline threshold.
-        * The value of the "size" argument accounts for header
-        * requirements but not for the payload in these cases.
-        *
-        * Likewise, allocate enough space to receive a reply up to the
-        * size of the inline threshold.
-        *
-        * It's unlikely that both the send header and the received
-        * reply will be large, but slush is provided here to allow
-        * flexibility when marshaling.
-        */
-       min_size = RPCRDMA_INLINE_READ_THRESHOLD(task->tk_rqstp);
-       min_size += RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp);
-       if (size < min_size)
-               size = min_size;
-
-       rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags);
-       if (IS_ERR(rb))
+       if (!rpcrdma_get_sendbuf(r_xprt, req, rqst->rq_callsize, flags))
+               goto out_fail;
+       if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags))
                goto out_fail;
-       rb->rg_owner = req;
 
-       r_xprt->rx_stats.hardway_register_count += size;
-       rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_sendbuf);
-       req->rl_sendbuf = rb;
-       goto out;
+       dprintk("RPC: %5u %s: send size = %zd, recv size = %zd, req = %p\n",
+               task->tk_pid, __func__, rqst->rq_callsize,
+               rqst->rq_rcvsize, req);
+
+       req->rl_connect_cookie = 0;     /* our reserved value */
+       rpcrdma_set_xprtdata(rqst, req);
+       rqst->rq_buffer = req->rl_sendbuf->rg_base;
+       rqst->rq_rbuffer = req->rl_recvbuf->rg_base;
+       return 0;
 
 out_fail:
        rpcrdma_buffer_put(req);
-       return NULL;
+       return -ENOMEM;
 }
 
-/*
- * This function returns all RDMA resources to the pool.
+/**
+ * xprt_rdma_free - release resources allocated by xprt_rdma_allocate
+ * @task: RPC task
+ *
+ * Caller guarantees rqst->rq_buffer is non-NULL.
  */
 static void
-xprt_rdma_free(void *buffer)
+xprt_rdma_free(struct rpc_task *task)
 {
-       struct rpcrdma_req *req;
-       struct rpcrdma_xprt *r_xprt;
-       struct rpcrdma_regbuf *rb;
-
-       if (buffer == NULL)
-               return;
+       struct rpc_rqst *rqst = task->tk_rqstp;
+       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
+       struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 
-       rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]);
-       req = rb->rg_owner;
        if (req->rl_backchannel)
                return;
 
-       r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf);
-
        dprintk("RPC:       %s: called on 0x%p\n", __func__, req->rl_reply);
 
-       r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req,
-                                           !RPC_IS_ASYNC(req->rl_task));
-
+       ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task));
+       rpcrdma_unmap_sges(ia, req);
        rpcrdma_buffer_put(req);
 }
 
@@ -685,10 +722,11 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
                   r_xprt->rx_stats.failed_marshal_count,
                   r_xprt->rx_stats.bad_reply_count,
                   r_xprt->rx_stats.nomsg_call_count);
-       seq_printf(seq, "%lu %lu %lu\n",
+       seq_printf(seq, "%lu %lu %lu %lu\n",
                   r_xprt->rx_stats.mrs_recovered,
                   r_xprt->rx_stats.mrs_orphaned,
-                  r_xprt->rx_stats.mrs_allocated);
+                  r_xprt->rx_stats.mrs_allocated,
+                  r_xprt->rx_stats.local_inv_needed);
 }
 
 static int
index be3178e..ec74289 100644 (file)
@@ -129,15 +129,6 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
                       wc->status, wc->vendor_err);
 }
 
-static void
-rpcrdma_receive_worker(struct work_struct *work)
-{
-       struct rpcrdma_rep *rep =
-                       container_of(work, struct rpcrdma_rep, rr_work);
-
-       rpcrdma_reply_handler(rep);
-}
-
 /* Perform basic sanity checking to avoid using garbage
  * to update the credit grant value.
  */
@@ -161,13 +152,13 @@ rpcrdma_update_granted_credits(struct rpcrdma_rep *rep)
 }
 
 /**
- * rpcrdma_receive_wc - Invoked by RDMA provider for each polled Receive WC
+ * rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC
  * @cq:        completion queue (ignored)
  * @wc:        completed WR
  *
  */
 static void
-rpcrdma_receive_wc(struct ib_cq *cq, struct ib_wc *wc)
+rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 {
        struct ib_cqe *cqe = wc->wr_cqe;
        struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep,
@@ -185,6 +176,9 @@ rpcrdma_receive_wc(struct ib_cq *cq, struct ib_wc *wc)
                __func__, rep, wc->byte_len);
 
        rep->rr_len = wc->byte_len;
+       rep->rr_wc_flags = wc->wc_flags;
+       rep->rr_inv_rkey = wc->ex.invalidate_rkey;
+
        ib_dma_sync_single_for_cpu(rep->rr_device,
                                   rdmab_addr(rep->rr_rdmabuf),
                                   rep->rr_len, DMA_FROM_DEVICE);
@@ -204,6 +198,36 @@ out_fail:
        goto out_schedule;
 }
 
+static void
+rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
+                              struct rdma_conn_param *param)
+{
+       struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+       const struct rpcrdma_connect_private *pmsg = param->private_data;
+       unsigned int rsize, wsize;
+
+       /* Default settings for RPC-over-RDMA Version One */
+       r_xprt->rx_ia.ri_reminv_expected = false;
+       rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
+       wsize = RPCRDMA_V1_DEF_INLINE_SIZE;
+
+       if (pmsg &&
+           pmsg->cp_magic == rpcrdma_cmp_magic &&
+           pmsg->cp_version == RPCRDMA_CMP_VERSION) {
+               r_xprt->rx_ia.ri_reminv_expected = true;
+               rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
+               wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
+       }
+
+       if (rsize < cdata->inline_rsize)
+               cdata->inline_rsize = rsize;
+       if (wsize < cdata->inline_wsize)
+               cdata->inline_wsize = wsize;
+       pr_info("rpcrdma: max send %u, max recv %u\n",
+               cdata->inline_wsize, cdata->inline_rsize);
+       rpcrdma_set_max_header_sizes(r_xprt);
+}
+
 static int
 rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
 {
@@ -244,6 +268,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
                        " (%d initiator)\n",
                        __func__, attr->max_dest_rd_atomic,
                        attr->max_rd_atomic);
+               rpcrdma_update_connect_private(xprt, &event->param.conn);
                goto connected;
        case RDMA_CM_EVENT_CONNECT_ERROR:
                connstate = -ENOTCONN;
@@ -454,11 +479,12 @@ int
 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
                                struct rpcrdma_create_data_internal *cdata)
 {
+       struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
        struct ib_cq *sendcq, *recvcq;
        unsigned int max_qp_wr;
        int rc;
 
-       if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_IOVS) {
+       if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_SEND_SGES) {
                dprintk("RPC:       %s: insufficient sge's available\n",
                        __func__);
                return -ENOMEM;
@@ -487,7 +513,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
        ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
        ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
        ep->rep_attr.cap.max_recv_wr += 1;      /* drain cqe */
-       ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
+       ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_SEND_SGES;
        ep->rep_attr.cap.max_recv_sge = 1;
        ep->rep_attr.cap.max_inline_data = 0;
        ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
@@ -536,9 +562,14 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
        /* Initialize cma parameters */
        memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));
 
-       /* RPC/RDMA does not use private data */
-       ep->rep_remote_cma.private_data = NULL;
-       ep->rep_remote_cma.private_data_len = 0;
+       /* Prepare RDMA-CM private message */
+       pmsg->cp_magic = rpcrdma_cmp_magic;
+       pmsg->cp_version = RPCRDMA_CMP_VERSION;
+       pmsg->cp_flags |= ia->ri_ops->ro_send_w_inv_ok;
+       pmsg->cp_send_size = rpcrdma_encode_buffer_size(cdata->inline_wsize);
+       pmsg->cp_recv_size = rpcrdma_encode_buffer_size(cdata->inline_rsize);
+       ep->rep_remote_cma.private_data = pmsg;
+       ep->rep_remote_cma.private_data_len = sizeof(*pmsg);
 
        /* Client offers RDMA Read but does not initiate */
        ep->rep_remote_cma.initiator_depth = 0;
@@ -849,6 +880,10 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
        req->rl_cqe.done = rpcrdma_wc_send;
        req->rl_buffer = &r_xprt->rx_buf;
        INIT_LIST_HEAD(&req->rl_registered);
+       req->rl_send_wr.next = NULL;
+       req->rl_send_wr.wr_cqe = &req->rl_cqe;
+       req->rl_send_wr.sg_list = req->rl_send_sge;
+       req->rl_send_wr.opcode = IB_WR_SEND;
        return req;
 }
 
@@ -865,17 +900,21 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
        if (rep == NULL)
                goto out;
 
-       rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize,
-                                              GFP_KERNEL);
+       rep->rr_rdmabuf = rpcrdma_alloc_regbuf(cdata->inline_rsize,
+                                              DMA_FROM_DEVICE, GFP_KERNEL);
        if (IS_ERR(rep->rr_rdmabuf)) {
                rc = PTR_ERR(rep->rr_rdmabuf);
                goto out_free;
        }
 
        rep->rr_device = ia->ri_device;
-       rep->rr_cqe.done = rpcrdma_receive_wc;
+       rep->rr_cqe.done = rpcrdma_wc_receive;
        rep->rr_rxprt = r_xprt;
-       INIT_WORK(&rep->rr_work, rpcrdma_receive_worker);
+       INIT_WORK(&rep->rr_work, rpcrdma_reply_handler);
+       rep->rr_recv_wr.next = NULL;
+       rep->rr_recv_wr.wr_cqe = &rep->rr_cqe;
+       rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
+       rep->rr_recv_wr.num_sge = 1;
        return rep;
 
 out_free:
@@ -966,17 +1005,18 @@ rpcrdma_buffer_get_rep_locked(struct rpcrdma_buffer *buf)
 }
 
 static void
-rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
+rpcrdma_destroy_rep(struct rpcrdma_rep *rep)
 {
-       rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
+       rpcrdma_free_regbuf(rep->rr_rdmabuf);
        kfree(rep);
 }
 
 void
-rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
+rpcrdma_destroy_req(struct rpcrdma_req *req)
 {
-       rpcrdma_free_regbuf(ia, req->rl_sendbuf);
-       rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
+       rpcrdma_free_regbuf(req->rl_recvbuf);
+       rpcrdma_free_regbuf(req->rl_sendbuf);
+       rpcrdma_free_regbuf(req->rl_rdmabuf);
        kfree(req);
 }
 
@@ -1009,15 +1049,13 @@ rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf)
 void
 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 {
-       struct rpcrdma_ia *ia = rdmab_to_ia(buf);
-
        cancel_delayed_work_sync(&buf->rb_recovery_worker);
 
        while (!list_empty(&buf->rb_recv_bufs)) {
                struct rpcrdma_rep *rep;
 
                rep = rpcrdma_buffer_get_rep_locked(buf);
-               rpcrdma_destroy_rep(ia, rep);
+               rpcrdma_destroy_rep(rep);
        }
        buf->rb_send_count = 0;
 
@@ -1030,7 +1068,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
                list_del(&req->rl_all);
 
                spin_unlock(&buf->rb_reqslock);
-               rpcrdma_destroy_req(ia, req);
+               rpcrdma_destroy_req(req);
                spin_lock(&buf->rb_reqslock);
        }
        spin_unlock(&buf->rb_reqslock);
@@ -1129,7 +1167,7 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
        struct rpcrdma_buffer *buffers = req->rl_buffer;
        struct rpcrdma_rep *rep = req->rl_reply;
 
-       req->rl_niovs = 0;
+       req->rl_send_wr.num_sge = 0;
        req->rl_reply = NULL;
 
        spin_lock(&buffers->rb_lock);
@@ -1171,70 +1209,81 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
        spin_unlock(&buffers->rb_lock);
 }
 
-/*
- * Wrappers for internal-use kmalloc memory registration, used by buffer code.
- */
-
 /**
- * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
- * @ia: controlling rpcrdma_ia
+ * rpcrdma_alloc_regbuf - allocate and DMA-map memory for SEND/RECV buffers
  * @size: size of buffer to be allocated, in bytes
+ * @direction: direction of data movement
  * @flags: GFP flags
  *
- * Returns pointer to private header of an area of internally
- * registered memory, or an ERR_PTR. The registered buffer follows
- * the end of the private header.
+ * Returns an ERR_PTR, or a pointer to a regbuf, a buffer that
+ * can be persistently DMA-mapped for I/O.
  *
  * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
- * receiving the payload of RDMA RECV operations. regbufs are not
- * used for RDMA READ/WRITE operations, thus are registered only for
- * LOCAL access.
+ * receiving the payload of RDMA RECV operations. During Long Calls
+ * or Replies they may be registered externally via ro_map.
  */
 struct rpcrdma_regbuf *
-rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
+rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction,
+                    gfp_t flags)
 {
        struct rpcrdma_regbuf *rb;
-       struct ib_sge *iov;
 
        rb = kmalloc(sizeof(*rb) + size, flags);
        if (rb == NULL)
-               goto out;
+               return ERR_PTR(-ENOMEM);
 
-       iov = &rb->rg_iov;
-       iov->addr = ib_dma_map_single(ia->ri_device,
-                                     (void *)rb->rg_base, size,
-                                     DMA_BIDIRECTIONAL);
-       if (ib_dma_mapping_error(ia->ri_device, iov->addr))
-               goto out_free;
+       rb->rg_device = NULL;
+       rb->rg_direction = direction;
+       rb->rg_iov.length = size;
 
-       iov->length = size;
-       iov->lkey = ia->ri_pd->local_dma_lkey;
-       rb->rg_size = size;
-       rb->rg_owner = NULL;
        return rb;
+}
 
-out_free:
-       kfree(rb);
-out:
-       return ERR_PTR(-ENOMEM);
+/**
+ * __rpcrdma_map_regbuf - DMA-map a regbuf
+ * @ia: controlling rpcrdma_ia
+ * @rb: regbuf to be mapped
+ */
+bool
+__rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
+{
+       if (rb->rg_direction == DMA_NONE)
+               return false;
+
+       rb->rg_iov.addr = ib_dma_map_single(ia->ri_device,
+                                           (void *)rb->rg_base,
+                                           rdmab_length(rb),
+                                           rb->rg_direction);
+       if (ib_dma_mapping_error(ia->ri_device, rdmab_addr(rb)))
+               return false;
+
+       rb->rg_device = ia->ri_device;
+       rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey;
+       return true;
+}
+
+static void
+rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb)
+{
+       if (!rpcrdma_regbuf_is_mapped(rb))
+               return;
+
+       ib_dma_unmap_single(rb->rg_device, rdmab_addr(rb),
+                           rdmab_length(rb), rb->rg_direction);
+       rb->rg_device = NULL;
 }
 
 /**
  * rpcrdma_free_regbuf - deregister and free registered buffer
- * @ia: controlling rpcrdma_ia
  * @rb: regbuf to be deregistered and freed
  */
 void
-rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
+rpcrdma_free_regbuf(struct rpcrdma_regbuf *rb)
 {
-       struct ib_sge *iov;
-
        if (!rb)
                return;
 
-       iov = &rb->rg_iov;
-       ib_dma_unmap_single(ia->ri_device,
-                           iov->addr, iov->length, DMA_BIDIRECTIONAL);
+       rpcrdma_dma_unmap_regbuf(rb);
        kfree(rb);
 }
 
@@ -1248,39 +1297,28 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
                struct rpcrdma_ep *ep,
                struct rpcrdma_req *req)
 {
-       struct ib_device *device = ia->ri_device;
-       struct ib_send_wr send_wr, *send_wr_fail;
-       struct rpcrdma_rep *rep = req->rl_reply;
-       struct ib_sge *iov = req->rl_send_iov;
-       int i, rc;
+       struct ib_send_wr *send_wr = &req->rl_send_wr;
+       struct ib_send_wr *send_wr_fail;
+       int rc;
 
-       if (rep) {
-               rc = rpcrdma_ep_post_recv(ia, ep, rep);
+       if (req->rl_reply) {
+               rc = rpcrdma_ep_post_recv(ia, req->rl_reply);
                if (rc)
                        return rc;
                req->rl_reply = NULL;
        }
 
-       send_wr.next = NULL;
-       send_wr.wr_cqe = &req->rl_cqe;
-       send_wr.sg_list = iov;
-       send_wr.num_sge = req->rl_niovs;
-       send_wr.opcode = IB_WR_SEND;
-
-       for (i = 0; i < send_wr.num_sge; i++)
-               ib_dma_sync_single_for_device(device, iov[i].addr,
-                                             iov[i].length, DMA_TO_DEVICE);
        dprintk("RPC:       %s: posting %d s/g entries\n",
-               __func__, send_wr.num_sge);
+               __func__, send_wr->num_sge);
 
        if (DECR_CQCOUNT(ep) > 0)
-               send_wr.send_flags = 0;
+               send_wr->send_flags = 0;
        else { /* Provider must take a send completion every now and then */
                INIT_CQCOUNT(ep);
-               send_wr.send_flags = IB_SEND_SIGNALED;
+               send_wr->send_flags = IB_SEND_SIGNALED;
        }
 
-       rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
+       rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail);
        if (rc)
                goto out_postsend_err;
        return 0;
@@ -1290,32 +1328,24 @@ out_postsend_err:
        return -ENOTCONN;
 }
 
-/*
- * (Re)post a receive buffer.
- */
 int
 rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
-                    struct rpcrdma_ep *ep,
                     struct rpcrdma_rep *rep)
 {
-       struct ib_recv_wr recv_wr, *recv_wr_fail;
+       struct ib_recv_wr *recv_wr_fail;
        int rc;
 
-       recv_wr.next = NULL;
-       recv_wr.wr_cqe = &rep->rr_cqe;
-       recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
-       recv_wr.num_sge = 1;
-
-       ib_dma_sync_single_for_cpu(ia->ri_device,
-                                  rdmab_addr(rep->rr_rdmabuf),
-                                  rdmab_length(rep->rr_rdmabuf),
-                                  DMA_BIDIRECTIONAL);
-
-       rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
+       if (!rpcrdma_dma_map_regbuf(ia, rep->rr_rdmabuf))
+               goto out_map;
+       rc = ib_post_recv(ia->ri_id->qp, &rep->rr_recv_wr, &recv_wr_fail);
        if (rc)
                goto out_postrecv;
        return 0;
 
+out_map:
+       pr_err("rpcrdma: failed to DMA map the Receive buffer\n");
+       return -EIO;
+
 out_postrecv:
        pr_err("rpcrdma: ib_post_recv returned %i\n", rc);
        return -ENOTCONN;
@@ -1333,7 +1363,6 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
 {
        struct rpcrdma_buffer *buffers = &r_xprt->rx_buf;
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-       struct rpcrdma_ep *ep = &r_xprt->rx_ep;
        struct rpcrdma_rep *rep;
        int rc;
 
@@ -1344,7 +1373,7 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
                rep = rpcrdma_buffer_get_rep_locked(buffers);
                spin_unlock(&buffers->rb_lock);
 
-               rc = rpcrdma_ep_post_recv(ia, ep, rep);
+               rc = rpcrdma_ep_post_recv(ia, rep);
                if (rc)
                        goto out_rc;
        }
index a71b0f5..0d35b76 100644 (file)
@@ -70,9 +70,11 @@ struct rpcrdma_ia {
        struct ib_pd            *ri_pd;
        struct completion       ri_done;
        int                     ri_async_rc;
+       unsigned int            ri_max_segs;
        unsigned int            ri_max_frmr_depth;
        unsigned int            ri_max_inline_write;
        unsigned int            ri_max_inline_read;
+       bool                    ri_reminv_expected;
        struct ib_qp_attr       ri_qp_attr;
        struct ib_qp_init_attr  ri_qp_init_attr;
 };
@@ -87,6 +89,7 @@ struct rpcrdma_ep {
        int                     rep_connected;
        struct ib_qp_init_attr  rep_attr;
        wait_queue_head_t       rep_connect_wait;
+       struct rpcrdma_connect_private  rep_cm_private;
        struct rdma_conn_param  rep_remote_cma;
        struct sockaddr_storage rep_remote_addr;
        struct delayed_work     rep_connect_worker;
@@ -112,9 +115,9 @@ struct rpcrdma_ep {
  */
 
 struct rpcrdma_regbuf {
-       size_t                  rg_size;
-       struct rpcrdma_req      *rg_owner;
        struct ib_sge           rg_iov;
+       struct ib_device        *rg_device;
+       enum dma_data_direction rg_direction;
        __be32                  rg_base[0] __attribute__ ((aligned(256)));
 };
 
@@ -162,7 +165,10 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
  * The smallest inline threshold is 1024 bytes, ensuring that
  * at least 750 bytes are available for RPC messages.
  */
-#define RPCRDMA_MAX_HDR_SEGS   (8)
+enum {
+       RPCRDMA_MAX_HDR_SEGS = 8,
+       RPCRDMA_HDRBUF_SIZE = 256,
+};
 
 /*
  * struct rpcrdma_rep -- this structure encapsulates state required to recv
@@ -182,10 +188,13 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
 struct rpcrdma_rep {
        struct ib_cqe           rr_cqe;
        unsigned int            rr_len;
+       int                     rr_wc_flags;
+       u32                     rr_inv_rkey;
        struct ib_device        *rr_device;
        struct rpcrdma_xprt     *rr_rxprt;
        struct work_struct      rr_work;
        struct list_head        rr_list;
+       struct ib_recv_wr       rr_recv_wr;
        struct rpcrdma_regbuf   *rr_rdmabuf;
 };
 
@@ -276,19 +285,30 @@ struct rpcrdma_mr_seg {           /* chunk descriptors */
        char            *mr_offset;     /* kva if no page, else offset */
 };
 
-#define RPCRDMA_MAX_IOVS       (2)
+/* Reserve enough Send SGEs to send a maximum size inline request:
+ * - RPC-over-RDMA header
+ * - xdr_buf head iovec
+ * - RPCRDMA_MAX_INLINE bytes, possibly unaligned, in pages
+ * - xdr_buf tail iovec
+ */
+enum {
+       RPCRDMA_MAX_SEND_PAGES = PAGE_SIZE + RPCRDMA_MAX_INLINE - 1,
+       RPCRDMA_MAX_PAGE_SGES = (RPCRDMA_MAX_SEND_PAGES >> PAGE_SHIFT) + 1,
+       RPCRDMA_MAX_SEND_SGES = 1 + 1 + RPCRDMA_MAX_PAGE_SGES + 1,
+};
 
 struct rpcrdma_buffer;
 struct rpcrdma_req {
        struct list_head        rl_free;
-       unsigned int            rl_niovs;
+       unsigned int            rl_mapped_sges;
        unsigned int            rl_connect_cookie;
-       struct rpc_task         *rl_task;
        struct rpcrdma_buffer   *rl_buffer;
-       struct rpcrdma_rep      *rl_reply;/* holder for reply buffer */
-       struct ib_sge           rl_send_iov[RPCRDMA_MAX_IOVS];
-       struct rpcrdma_regbuf   *rl_rdmabuf;
-       struct rpcrdma_regbuf   *rl_sendbuf;
+       struct rpcrdma_rep      *rl_reply;
+       struct ib_send_wr       rl_send_wr;
+       struct ib_sge           rl_send_sge[RPCRDMA_MAX_SEND_SGES];
+       struct rpcrdma_regbuf   *rl_rdmabuf;    /* xprt header */
+       struct rpcrdma_regbuf   *rl_sendbuf;    /* rq_snd_buf */
+       struct rpcrdma_regbuf   *rl_recvbuf;    /* rq_rcv_buf */
 
        struct ib_cqe           rl_cqe;
        struct list_head        rl_all;
@@ -298,14 +318,16 @@ struct rpcrdma_req {
        struct rpcrdma_mr_seg   rl_segments[RPCRDMA_MAX_SEGS];
 };
 
+static inline void
+rpcrdma_set_xprtdata(struct rpc_rqst *rqst, struct rpcrdma_req *req)
+{
+       rqst->rq_xprtdata = req;
+}
+
 static inline struct rpcrdma_req *
 rpcr_to_rdmar(struct rpc_rqst *rqst)
 {
-       void *buffer = rqst->rq_buffer;
-       struct rpcrdma_regbuf *rb;
-
-       rb = container_of(buffer, struct rpcrdma_regbuf, rg_base);
-       return rb->rg_owner;
+       return rqst->rq_xprtdata;
 }
 
 /*
@@ -356,15 +378,6 @@ struct rpcrdma_create_data_internal {
        unsigned int    padding;        /* non-rdma write header padding */
 };
 
-#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \
-       (rpcx_to_rdmad(rq->rq_xprt).inline_rsize)
-
-#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\
-       (rpcx_to_rdmad(rq->rq_xprt).inline_wsize)
-
-#define RPCRDMA_INLINE_PAD_VALUE(rq)\
-       rpcx_to_rdmad(rq->rq_xprt).padding
-
 /*
  * Statistics for RPCRDMA
  */
@@ -386,6 +399,7 @@ struct rpcrdma_stats {
        unsigned long           mrs_recovered;
        unsigned long           mrs_orphaned;
        unsigned long           mrs_allocated;
+       unsigned long           local_inv_needed;
 };
 
 /*
@@ -409,6 +423,7 @@ struct rpcrdma_memreg_ops {
                                      struct rpcrdma_mw *);
        void            (*ro_release_mr)(struct rpcrdma_mw *);
        const char      *ro_displayname;
+       const int       ro_send_w_inv_ok;
 };
 
 extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops;
@@ -461,15 +476,14 @@ void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
 
 int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
                                struct rpcrdma_req *);
-int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
-                               struct rpcrdma_rep *);
+int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_rep *);
 
 /*
  * Buffer calls - xprtrdma/verbs.c
  */
 struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *);
 struct rpcrdma_rep *rpcrdma_create_rep(struct rpcrdma_xprt *);
-void rpcrdma_destroy_req(struct rpcrdma_ia *, struct rpcrdma_req *);
+void rpcrdma_destroy_req(struct rpcrdma_req *);
 int rpcrdma_buffer_create(struct rpcrdma_xprt *);
 void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
 
@@ -482,10 +496,24 @@ void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
 
 void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *);
 
-struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
-                                           size_t, gfp_t);
-void rpcrdma_free_regbuf(struct rpcrdma_ia *,
-                        struct rpcrdma_regbuf *);
+struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(size_t, enum dma_data_direction,
+                                           gfp_t);
+bool __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *);
+void rpcrdma_free_regbuf(struct rpcrdma_regbuf *);
+
+static inline bool
+rpcrdma_regbuf_is_mapped(struct rpcrdma_regbuf *rb)
+{
+       return rb->rg_device != NULL;
+}
+
+static inline bool
+rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
+{
+       if (likely(rpcrdma_regbuf_is_mapped(rb)))
+               return true;
+       return __rpcrdma_dma_map_regbuf(ia, rb);
+}
 
 int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);
 
@@ -507,15 +535,25 @@ rpcrdma_data_dir(bool writing)
  */
 void rpcrdma_connect_worker(struct work_struct *);
 void rpcrdma_conn_func(struct rpcrdma_ep *);
-void rpcrdma_reply_handler(struct rpcrdma_rep *);
+void rpcrdma_reply_handler(struct work_struct *);
 
 /*
  * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
  */
+
+enum rpcrdma_chunktype {
+       rpcrdma_noch = 0,
+       rpcrdma_readch,
+       rpcrdma_areadch,
+       rpcrdma_writech,
+       rpcrdma_replych
+};
+
+bool rpcrdma_prepare_send_sges(struct rpcrdma_ia *, struct rpcrdma_req *,
+                              u32, struct xdr_buf *, enum rpcrdma_chunktype);
+void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *);
 int rpcrdma_marshal_req(struct rpc_rqst *);
-void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *,
-                                 struct rpcrdma_create_data_internal *,
-                                 unsigned int);
+void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *);
 
 /* RPC/RDMA module init - xprtrdma/transport.c
  */
index bf16883..0137af1 100644 (file)
@@ -473,7 +473,16 @@ static int xs_nospace(struct rpc_task *task)
        spin_unlock_bh(&xprt->transport_lock);
 
        /* Race breaker in case memory is freed before above code is called */
-       sk->sk_write_space(sk);
+       if (ret == -EAGAIN) {
+               struct socket_wq *wq;
+
+               rcu_read_lock();
+               wq = rcu_dereference(sk->sk_wq);
+               set_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags);
+               rcu_read_unlock();
+
+               sk->sk_write_space(sk);
+       }
        return ret;
 }
 
@@ -2533,35 +2542,38 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
  * we allocate pages instead doing a kmalloc like rpc_malloc is because we want
  * to use the server side send routines.
  */
-static void *bc_malloc(struct rpc_task *task, size_t size)
+static int bc_malloc(struct rpc_task *task)
 {
+       struct rpc_rqst *rqst = task->tk_rqstp;
+       size_t size = rqst->rq_callsize;
        struct page *page;
        struct rpc_buffer *buf;
 
-       WARN_ON_ONCE(size > PAGE_SIZE - sizeof(struct rpc_buffer));
-       if (size > PAGE_SIZE - sizeof(struct rpc_buffer))
-               return NULL;
+       if (size > PAGE_SIZE - sizeof(struct rpc_buffer)) {
+               WARN_ONCE(1, "xprtsock: large bc buffer request (size %zu)\n",
+                         size);
+               return -EINVAL;
+       }
 
        page = alloc_page(GFP_KERNEL);
        if (!page)
-               return NULL;
+               return -ENOMEM;
 
        buf = page_address(page);
        buf->len = PAGE_SIZE;
 
-       return buf->data;
+       rqst->rq_buffer = buf->data;
+       return 0;
 }
 
 /*
  * Free the space allocated in the bc_alloc routine
  */
-static void bc_free(void *buffer)
+static void bc_free(struct rpc_task *task)
 {
+       void *buffer = task->tk_rqstp->rq_buffer;
        struct rpc_buffer *buf;
 
-       if (!buffer)
-               return;
-
        buf = container_of(buffer, struct rpc_buffer, data);
        free_page((unsigned long)buf);
 }