Merge branches 'cma', 'cxgb4', 'flowsteer', 'ipoib', 'misc', 'mlx4', 'mlx5', 'nes...
author Roland Dreier <roland@purestorage.com>
Sun, 17 Nov 2013 16:22:19 +0000 (08:22 -0800)
committer Roland Dreier <roland@purestorage.com>
Sun, 17 Nov 2013 16:22:19 +0000 (08:22 -0800)
45 files changed:
Documentation/ABI/stable/sysfs-driver-ib_srp
Documentation/ABI/stable/sysfs-transport-srp
drivers/infiniband/Kconfig
drivers/infiniband/core/cm.c
drivers/infiniband/core/cma.c
drivers/infiniband/core/netlink.c
drivers/infiniband/core/sysfs.c
drivers/infiniband/core/ucma.c
drivers/infiniband/core/uverbs.h
drivers/infiniband/core/uverbs_cmd.c
drivers/infiniband/core/uverbs_main.c
drivers/infiniband/core/verbs.c
drivers/infiniband/hw/cxgb4/device.c
drivers/infiniband/hw/ipath/ipath_user_sdma.c
drivers/infiniband/hw/mlx4/cq.c
drivers/infiniband/hw/mlx4/main.c
drivers/infiniband/hw/mlx5/cq.c
drivers/infiniband/hw/mlx5/main.c
drivers/infiniband/hw/mlx5/mlx5_ib.h
drivers/infiniband/hw/mlx5/mr.c
drivers/infiniband/hw/mlx5/qp.c
drivers/infiniband/hw/mlx5/srq.c
drivers/infiniband/hw/nes/nes_verbs.c
drivers/infiniband/hw/ocrdma/ocrdma.h
drivers/infiniband/hw/ocrdma/ocrdma_hw.c
drivers/infiniband/hw/ocrdma/ocrdma_main.c
drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
drivers/infiniband/hw/qib/qib_iba7322.c
drivers/infiniband/hw/qib/qib_mad.h
drivers/infiniband/hw/qib/qib_user_sdma.c
drivers/infiniband/hw/qib/qib_verbs.h
drivers/infiniband/ulp/srp/ib_srp.c
drivers/infiniband/ulp/srp/ib_srp.h
drivers/net/ethernet/mellanox/mlx5/core/cmd.c
drivers/net/ethernet/mellanox/mlx5/core/debugfs.c
drivers/net/ethernet/mellanox/mlx5/core/eq.c
drivers/net/ethernet/mellanox/mlx5/core/main.c
drivers/net/ethernet/mellanox/mlx5/core/mr.c
drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
drivers/scsi/scsi_transport_srp.c
include/linux/mlx5/device.h
include/linux/mlx5/driver.h
include/rdma/ib_verbs.h
include/scsi/scsi_transport_srp.h
include/uapi/rdma/ib_user_verbs.h

index 5c53d28..b9688de 100644 (file)
@@ -61,6 +61,12 @@ Description: Interface for making ib_srp connect to a new target.
                  interrupt is handled by a different CPU, then the comp_vector
                  parameter can be used to spread the SRP completion workload
                  over multiple CPUs.
+               * tl_retry_count, a number in the range 2..7 specifying the
+                 IB RC retry count.
+               * queue_size, the maximum number of commands that the
+                 initiator is allowed to queue per SCSI host. The default
+                 value for this parameter is 62. The lowest supported value
+                 is 2.
 
 What:          /sys/class/infiniband_srp/srp-<hca>-<port_number>/ibdev
 Date:          January 2, 2006
@@ -153,6 +159,13 @@ Contact:   linux-rdma@vger.kernel.org
 Description:   InfiniBand service ID used for establishing communication with
                the SRP target.
 
+What:          /sys/class/scsi_host/host<n>/sgid
+Date:          February 1, 2014
+KernelVersion: 3.13
+Contact:       linux-rdma@vger.kernel.org
+Description:   InfiniBand GID of the source port used for communication with
+               the SRP target.
+
 What:          /sys/class/scsi_host/host<n>/zero_req_lim
 Date:          September 20, 2006
 KernelVersion: 2.6.18
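
For illustration, a minimal userspace sketch of driving the add_target interface with the new queue_size and tl_retry_count parameters, then reading the new sgid attribute back. The HCA name, target identifiers, and host number below are placeholders, not values from this commit:

    /*
     * Sketch only: add an SRP target, then read the sgid attribute of the
     * resulting SCSI host. All identifiers below are placeholders.
     */
    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/sys/class/infiniband_srp/srp-mlx4_0-1/add_target", "w");
            char sgid[128] = "";

            if (!f)
                    return 1;
            /* queue_size >= 2 (default 62); tl_retry_count in 2..7 */
            fprintf(f, "id_ext=0x0002c90300a44000,ioc_guid=0x0002c90300a44000,"
                       "dgid=fe800000000000000002c90300a44001,pkey=ffff,"
                       "service_id=0x0002c90300a44000,"
                       "queue_size=128,tl_retry_count=7\n");
            fclose(f);

            f = fopen("/sys/class/scsi_host/host7/sgid", "r");
            if (f) {
                    fgets(sgid, sizeof(sgid), f);
                    fclose(f);
            }
            printf("source GID: %s", sgid);
            return 0;
    }
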
index b36fb0d..ec7af69 100644 (file)
@@ -5,6 +5,24 @@ Contact:       linux-scsi@vger.kernel.org, linux-rdma@vger.kernel.org
 Description:   Instructs an SRP initiator to disconnect from a target and to
                remove all LUNs imported from that target.
 
+What:          /sys/class/srp_remote_ports/port-<h>:<n>/dev_loss_tmo
+Date:          February 1, 2014
+KernelVersion: 3.13
+Contact:       linux-scsi@vger.kernel.org, linux-rdma@vger.kernel.org
+Description:   Number of seconds the SCSI layer will wait after a transport
+               layer error has been observed before removing a target port.
+               Zero means immediate removal. Setting this attribute to "off"
+               will disable the dev_loss timer.
+
+What:          /sys/class/srp_remote_ports/port-<h>:<n>/fast_io_fail_tmo
+Date:          February 1, 2014
+KernelVersion: 3.13
+Contact:       linux-scsi@vger.kernel.org, linux-rdma@vger.kernel.org
+Description:   Number of seconds the SCSI layer will wait after a transport
+               layer error has been observed before failing I/O. Zero means
+               failing I/O immediately. Setting this attribute to "off" will
+               disable the fast_io_fail timer.
+
 What:          /sys/class/srp_remote_ports/port-<h>:<n>/port_id
 Date:          June 27, 2007
 KernelVersion: 2.6.24
@@ -12,8 +30,29 @@ Contact:     linux-scsi@vger.kernel.org
 Description:   16-byte local SRP port identifier in hexadecimal format. An
                example: 4c:49:4e:55:58:20:56:49:4f:00:00:00:00:00:00:00.
 
+What:          /sys/class/srp_remote_ports/port-<h>:<n>/reconnect_delay
+Date:          February 1, 2014
+KernelVersion: 3.13
+Contact:       linux-scsi@vger.kernel.org, linux-rdma@vger.kernel.org
+Description:   Number of seconds the SCSI layer will wait after a failed
+               reconnect attempt before retrying. Setting this attribute to
+               "off" will disable time-based reconnecting.
+
 What:          /sys/class/srp_remote_ports/port-<h>:<n>/roles
 Date:          June 27, 2007
 KernelVersion: 2.6.24
 Contact:       linux-scsi@vger.kernel.org
 Description:   Role of the remote port. Either "SRP Initiator" or "SRP Target".
+
+What:          /sys/class/srp_remote_ports/port-<h>:<n>/state
+Date:          February 1, 2014
+KernelVersion: 3.13
+Contact:       linux-scsi@vger.kernel.org, linux-rdma@vger.kernel.org
+Description:   State of the transport layer used for communication with the
+               remote port. "running" if the transport layer is operational;
+               "blocked" if a transport layer error has been encountered but
+               the fast_io_fail_tmo timer has not yet fired; "fail-fast"
+               after the fast_io_fail_tmo timer has fired and before the
+               "dev_loss_tmo" timer has fired; "lost" after the
+               "dev_loss_tmo" timer has fired and before the port is finally
+               removed.
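
A sketch in the same vein for the new srp_remote_ports attributes. The port name and timeout values are placeholders, and the expectation that fast_io_fail_tmo stays at or below dev_loss_tmo is an assumption about the transport layer's validation, not text from this commit:

    /*
     * Sketch only: tune the new SRP transport timers for one remote port
     * and poll its state. "port-7:1" is a placeholder.
     */
    #include <stdio.h>

    static int write_attr(const char *path, const char *val)
    {
            FILE *f = fopen(path, "w");

            if (!f)
                    return -1;
            fputs(val, f);
            return fclose(f);
    }

    int main(void)
    {
            char state[32] = "";
            FILE *f;

            /* Fail I/O quickly but keep the port around for reconnects;
             * fast_io_fail_tmo is assumed to be <= dev_loss_tmo. */
            write_attr("/sys/class/srp_remote_ports/port-7:1/fast_io_fail_tmo", "5");
            write_attr("/sys/class/srp_remote_ports/port-7:1/dev_loss_tmo", "60");
            write_attr("/sys/class/srp_remote_ports/port-7:1/reconnect_delay", "10");

            f = fopen("/sys/class/srp_remote_ports/port-7:1/state", "r");
            if (f) {
                    fgets(state, sizeof(state), f);
                    fclose(f);
            }
            /* one of "running", "blocked", "fail-fast" or "lost" */
            printf("port state: %s", state);
            return 0;
    }
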
index b84791f..5ceda71 100644 (file)
@@ -31,17 +31,6 @@ config INFINIBAND_USER_ACCESS
          libibverbs, libibcm and a hardware driver library from
          <http://www.openfabrics.org/git/>.
 
-config INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING
-       bool "Experimental and unstable ABI for userspace access to flow steering verbs"
-       depends on INFINIBAND_USER_ACCESS
-       depends on STAGING
-       ---help---
-         The final ABI for userspace access to flow steering verbs
-         has not been defined.  To use the current ABI, *WHICH WILL
-         CHANGE IN THE FUTURE*, say Y here.
-
-         If unsure, say N.
-
 config INFINIBAND_USER_MEM
        bool
        depends on INFINIBAND_USER_ACCESS != n
index 784b97c..f2ef7ef 100644 (file)
@@ -383,14 +383,11 @@ static int cm_alloc_id(struct cm_id_private *cm_id_priv)
 {
        unsigned long flags;
        int id;
-       static int next_id;
 
        idr_preload(GFP_KERNEL);
        spin_lock_irqsave(&cm.lock, flags);
 
-       id = idr_alloc(&cm.local_id_table, cm_id_priv, next_id, 0, GFP_NOWAIT);
-       if (id >= 0)
-               next_id = max(id + 1, 0);
+       id = idr_alloc_cyclic(&cm.local_id_table, cm_id_priv, 0, 0, GFP_NOWAIT);
 
        spin_unlock_irqrestore(&cm.lock, flags);
        idr_preload_end();
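
The removed next_id bookkeeping is exactly what idr_alloc_cyclic() encapsulates: allocation resumes after the most recently returned id and wraps around on exhaustion. A short illustrative sketch of the semantics, using the call as it appears above:

    /* idr_alloc_cyclic(idr, ptr, start, end, gfp): like idr_alloc(), but
     * remembers the last allocated id internally and begins the next search
     * after it, wrapping back to 'start' when the range is exhausted.
     * end == 0 means no upper bound. */
    id = idr_alloc_cyclic(&cm.local_id_table, cm_id_priv, 0, 0, GFP_NOWAIT);
    if (id < 0)
            return id;      /* -ENOMEM or -ENOSPC, as with idr_alloc() */
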
index dab4b41..830c983 100644 (file)
@@ -328,28 +328,6 @@ static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey)
        return ret;
 }
 
-static int find_gid_port(struct ib_device *device, union ib_gid *gid, u8 port_num)
-{
-       int i;
-       int err;
-       struct ib_port_attr props;
-       union ib_gid tmp;
-
-       err = ib_query_port(device, port_num, &props);
-       if (err)
-               return err;
-
-       for (i = 0; i < props.gid_tbl_len; ++i) {
-               err = ib_query_gid(device, port_num, i, &tmp);
-               if (err)
-                       return err;
-               if (!memcmp(&tmp, gid, sizeof tmp))
-                       return 0;
-       }
-
-       return -EADDRNOTAVAIL;
-}
-
 static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr)
 {
        dev_addr->dev_type = ARPHRD_INFINIBAND;
@@ -371,13 +349,14 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a
        return ret;
 }
 
-static int cma_acquire_dev(struct rdma_id_private *id_priv)
+static int cma_acquire_dev(struct rdma_id_private *id_priv,
+                          struct rdma_id_private *listen_id_priv)
 {
        struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
        struct cma_device *cma_dev;
        union ib_gid gid, iboe_gid;
        int ret = -ENODEV;
-       u8 port;
+       u8 port, found_port;
        enum rdma_link_layer dev_ll = dev_addr->dev_type == ARPHRD_INFINIBAND ?
                IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET;
 
@@ -389,17 +368,39 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv)
        iboe_addr_get_sgid(dev_addr, &iboe_gid);
        memcpy(&gid, dev_addr->src_dev_addr +
               rdma_addr_gid_offset(dev_addr), sizeof gid);
+       if (listen_id_priv &&
+           rdma_port_get_link_layer(listen_id_priv->id.device,
+                                    listen_id_priv->id.port_num) == dev_ll) {
+               cma_dev = listen_id_priv->cma_dev;
+               port = listen_id_priv->id.port_num;
+               if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB &&
+                   rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET)
+                       ret = ib_find_cached_gid(cma_dev->device, &iboe_gid,
+                                                &found_port, NULL);
+               else
+                       ret = ib_find_cached_gid(cma_dev->device, &gid,
+                                                &found_port, NULL);
+
+               if (!ret && (port == found_port)) {
+                       id_priv->id.port_num = found_port;
+                       goto out;
+               }
+       }
        list_for_each_entry(cma_dev, &dev_list, list) {
                for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) {
+                       if (listen_id_priv &&
+                           listen_id_priv->cma_dev == cma_dev &&
+                           listen_id_priv->id.port_num == port)
+                               continue;
                        if (rdma_port_get_link_layer(cma_dev->device, port) == dev_ll) {
                                if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB &&
                                    rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET)
-                                       ret = find_gid_port(cma_dev->device, &iboe_gid, port);
+                                       ret = ib_find_cached_gid(cma_dev->device, &iboe_gid, &found_port, NULL);
                                else
-                                       ret = find_gid_port(cma_dev->device, &gid, port);
+                                       ret = ib_find_cached_gid(cma_dev->device, &gid, &found_port, NULL);
 
-                               if (!ret) {
-                                       id_priv->id.port_num = port;
+                               if (!ret && (port == found_port)) {
+                                       id_priv->id.port_num = found_port;
                                        goto out;
                                }
                        }
@@ -1292,7 +1293,7 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
        }
 
        mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
-       ret = cma_acquire_dev(conn_id);
+       ret = cma_acquire_dev(conn_id, listen_id);
        if (ret)
                goto err2;
 
@@ -1451,7 +1452,6 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
 {
        struct rdma_cm_id *new_cm_id;
        struct rdma_id_private *listen_id, *conn_id;
-       struct net_device *dev = NULL;
        struct rdma_cm_event event;
        int ret;
        struct ib_device_attr attr;
@@ -1481,7 +1481,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
                goto out;
        }
 
-       ret = cma_acquire_dev(conn_id);
+       ret = cma_acquire_dev(conn_id, listen_id);
        if (ret) {
                mutex_unlock(&conn_id->handler_mutex);
                rdma_destroy_id(new_cm_id);
@@ -1529,8 +1529,6 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
        cma_deref_id(conn_id);
 
 out:
-       if (dev)
-               dev_put(dev);
        mutex_unlock(&listen_id->handler_mutex);
        return ret;
 }
@@ -2050,7 +2048,7 @@ static void addr_handler(int status, struct sockaddr *src_addr,
                goto out;
 
        if (!status && !id_priv->cma_dev)
-               status = cma_acquire_dev(id_priv);
+               status = cma_acquire_dev(id_priv, NULL);
 
        if (status) {
                if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED,
@@ -2547,7 +2545,7 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
                if (ret)
                        goto err1;
 
-               ret = cma_acquire_dev(id_priv);
+               ret = cma_acquire_dev(id_priv, NULL);
                if (ret)
                        goto err1;
        }
index da06abd..a1e9cba 100644 (file)
@@ -148,7 +148,7 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
        list_for_each_entry(client, &client_list, list) {
                if (client->index == index) {
                        if (op < 0 || op >= client->nops ||
-                           !client->cb_table[RDMA_NL_GET_OP(op)].dump)
+                           !client->cb_table[op].dump)
                                return -EINVAL;
 
                        {
index cde1e7b..faad2ca 100644 (file)
@@ -612,6 +612,7 @@ static ssize_t show_node_type(struct device *device,
        switch (dev->node_type) {
        case RDMA_NODE_IB_CA:     return sprintf(buf, "%d: CA\n", dev->node_type);
        case RDMA_NODE_RNIC:      return sprintf(buf, "%d: RNIC\n", dev->node_type);
+       case RDMA_NODE_USNIC:     return sprintf(buf, "%d: usNIC\n", dev->node_type);
        case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type);
        case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type);
        default:                  return sprintf(buf, "%d: <unknown>\n", dev->node_type);
index b0f189b..ab8b1c3 100644 (file)
@@ -57,7 +57,7 @@ MODULE_LICENSE("Dual BSD/GPL");
 static unsigned int max_backlog = 1024;
 
 static struct ctl_table_header *ucma_ctl_table_hdr;
-static ctl_table ucma_ctl_table[] = {
+static struct ctl_table ucma_ctl_table[] = {
        {
                .procname       = "max_backlog",
                .data           = &max_backlog,
@@ -271,7 +271,7 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id,
                        goto out;
                }
                ctx->backlog--;
-       } else if (!ctx->uid) {
+       } else if (!ctx->uid || ctx->cm_id != cm_id) {
                /*
                 * We ignore events for new connections until userspace has set
                 * their context.  This can only happen if an error occurs on a
index d8f9c6c..bdc842e 100644 (file)
 #include <rdma/ib_umem.h>
 #include <rdma/ib_user_verbs.h>
 
+#define INIT_UDATA(udata, ibuf, obuf, ilen, olen)                      \
+       do {                                                            \
+               (udata)->inbuf  = (void __user *) (ibuf);               \
+               (udata)->outbuf = (void __user *) (obuf);               \
+               (udata)->inlen  = (ilen);                               \
+               (udata)->outlen = (olen);                               \
+       } while (0)
+
 /*
  * Our lifetime rules for these structs are the following:
  *
@@ -178,6 +186,22 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler,
                             struct ib_event *event);
 void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd);
 
+struct ib_uverbs_flow_spec {
+       union {
+               union {
+                       struct ib_uverbs_flow_spec_hdr hdr;
+                       struct {
+                               __u32 type;
+                               __u16 size;
+                               __u16 reserved;
+                       };
+               };
+               struct ib_uverbs_flow_spec_eth     eth;
+               struct ib_uverbs_flow_spec_ipv4    ipv4;
+               struct ib_uverbs_flow_spec_tcp_udp tcp_udp;
+       };
+};
+
 #define IB_UVERBS_DECLARE_CMD(name)                                    \
        ssize_t ib_uverbs_##name(struct ib_uverbs_file *file,           \
                                 const char __user *buf, int in_len,    \
@@ -217,9 +241,13 @@ IB_UVERBS_DECLARE_CMD(destroy_srq);
 IB_UVERBS_DECLARE_CMD(create_xsrq);
 IB_UVERBS_DECLARE_CMD(open_xrcd);
 IB_UVERBS_DECLARE_CMD(close_xrcd);
-#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING
-IB_UVERBS_DECLARE_CMD(create_flow);
-IB_UVERBS_DECLARE_CMD(destroy_flow);
-#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */
+
+#define IB_UVERBS_DECLARE_EX_CMD(name)                         \
+       int ib_uverbs_ex_##name(struct ib_uverbs_file *file,    \
+                               struct ib_udata *ucore,         \
+                               struct ib_udata *uhw)
+
+IB_UVERBS_DECLARE_EX_CMD(create_flow);
+IB_UVERBS_DECLARE_EX_CMD(destroy_flow);
 
 #endif /* UVERBS_H */
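
Hoisting INIT_UDATA into uverbs.h lets the extended-command path in uverbs_main.c build its two ib_udata regions the same way the legacy commands do. A sketch of the intended usage, with placeholder lengths:

    /* Sketch: split one user buffer into a core (uverbs-defined) region and
     * a provider (hardware-specific) region. Lengths are placeholders. */
    struct ib_udata ucore, uhw;

    INIT_UDATA(&ucore, buf, response, core_in_len, core_out_len);
    INIT_UDATA(&uhw, buf + core_in_len, response + core_out_len,
               hw_in_len, hw_out_len);
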
index 2f0f01b..65f6e7d 100644 (file)
@@ -54,17 +54,7 @@ static struct uverbs_lock_class qp_lock_class        = { .name = "QP-uobj" };
 static struct uverbs_lock_class ah_lock_class  = { .name = "AH-uobj" };
 static struct uverbs_lock_class srq_lock_class = { .name = "SRQ-uobj" };
 static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" };
-#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING
 static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" };
-#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */
-
-#define INIT_UDATA(udata, ibuf, obuf, ilen, olen)                      \
-       do {                                                            \
-               (udata)->inbuf  = (void __user *) (ibuf);               \
-               (udata)->outbuf = (void __user *) (obuf);               \
-               (udata)->inlen  = (ilen);                               \
-               (udata)->outlen = (olen);                               \
-       } while (0)
 
 /*
  * The ib_uobject locking scheme is as follows:
@@ -939,13 +929,9 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
        if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))
                return -EINVAL;
 
-       /*
-        * Local write permission is required if remote write or
-        * remote atomic permission is also requested.
-        */
-       if (cmd.access_flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) &&
-           !(cmd.access_flags & IB_ACCESS_LOCAL_WRITE))
-               return -EINVAL;
+       ret = ib_check_mr_access(cmd.access_flags);
+       if (ret)
+               return ret;
 
        uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
        if (!uobj)
@@ -2128,6 +2114,9 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
                        }
                        next->wr.ud.remote_qpn  = user_wr->wr.ud.remote_qpn;
                        next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey;
+                       if (next->opcode == IB_WR_SEND_WITH_IMM)
+                               next->ex.imm_data =
+                                       (__be32 __force) user_wr->ex.imm_data;
                } else {
                        switch (next->opcode) {
                        case IB_WR_RDMA_WRITE_WITH_IMM:
@@ -2601,8 +2590,7 @@ out_put:
        return ret ? ret : in_len;
 }
 
-#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING
-static int kern_spec_to_ib_spec(struct ib_kern_spec *kern_spec,
+static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec,
                                union ib_flow_spec *ib_spec)
 {
        ib_spec->type = kern_spec->type;
@@ -2642,28 +2630,31 @@ static int kern_spec_to_ib_spec(struct ib_kern_spec *kern_spec,
        return 0;
 }
 
-ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file,
-                             const char __user *buf, int in_len,
-                             int out_len)
+int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
+                            struct ib_udata *ucore,
+                            struct ib_udata *uhw)
 {
        struct ib_uverbs_create_flow      cmd;
        struct ib_uverbs_create_flow_resp resp;
        struct ib_uobject                 *uobj;
        struct ib_flow                    *flow_id;
-       struct ib_kern_flow_attr          *kern_flow_attr;
+       struct ib_uverbs_flow_attr        *kern_flow_attr;
        struct ib_flow_attr               *flow_attr;
        struct ib_qp                      *qp;
        int err = 0;
        void *kern_spec;
        void *ib_spec;
        int i;
-       int kern_attr_size;
 
-       if (out_len < sizeof(resp))
+       if (ucore->outlen < sizeof(resp))
                return -ENOSPC;
 
-       if (copy_from_user(&cmd, buf, sizeof(cmd)))
-               return -EFAULT;
+       err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+       if (err)
+               return err;
+
+       ucore->inbuf += sizeof(cmd);
+       ucore->inlen -= sizeof(cmd);
 
        if (cmd.comp_mask)
                return -EINVAL;
@@ -2672,32 +2663,27 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file,
             !capable(CAP_NET_ADMIN)) || !capable(CAP_NET_RAW))
                return -EPERM;
 
-       if (cmd.flow_attr.num_of_specs < 0 ||
-           cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS)
+       if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS)
                return -EINVAL;
 
-       kern_attr_size = cmd.flow_attr.size - sizeof(cmd) -
-                        sizeof(struct ib_uverbs_cmd_hdr_ex);
-
-       if (cmd.flow_attr.size < 0 || cmd.flow_attr.size > in_len ||
-           kern_attr_size < 0 || kern_attr_size >
-           (cmd.flow_attr.num_of_specs * sizeof(struct ib_kern_spec)))
+       if (cmd.flow_attr.size > ucore->inlen ||
+           cmd.flow_attr.size >
+           (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec)))
                return -EINVAL;
 
        if (cmd.flow_attr.num_of_specs) {
-               kern_flow_attr = kmalloc(cmd.flow_attr.size, GFP_KERNEL);
+               kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + cmd.flow_attr.size,
+                                        GFP_KERNEL);
                if (!kern_flow_attr)
                        return -ENOMEM;
 
                memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr));
-               if (copy_from_user(kern_flow_attr + 1, buf + sizeof(cmd),
-                                  kern_attr_size)) {
-                       err = -EFAULT;
+               err = ib_copy_from_udata(kern_flow_attr + 1, ucore,
+                                        cmd.flow_attr.size);
+               if (err)
                        goto err_free_attr;
-               }
        } else {
                kern_flow_attr = &cmd.flow_attr;
-               kern_attr_size = sizeof(cmd.flow_attr);
        }
 
        uobj = kmalloc(sizeof(*uobj), GFP_KERNEL);
@@ -2714,7 +2700,7 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file,
                goto err_uobj;
        }
 
-       flow_attr = kmalloc(cmd.flow_attr.size, GFP_KERNEL);
+       flow_attr = kmalloc(sizeof(*flow_attr) + cmd.flow_attr.size, GFP_KERNEL);
        if (!flow_attr) {
                err = -ENOMEM;
                goto err_put;
@@ -2729,19 +2715,22 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file,
 
        kern_spec = kern_flow_attr + 1;
        ib_spec = flow_attr + 1;
-       for (i = 0; i < flow_attr->num_of_specs && kern_attr_size > 0; i++) {
+       for (i = 0; i < flow_attr->num_of_specs &&
+            cmd.flow_attr.size > offsetof(struct ib_uverbs_flow_spec, reserved) &&
+            cmd.flow_attr.size >=
+            ((struct ib_uverbs_flow_spec *)kern_spec)->size; i++) {
                err = kern_spec_to_ib_spec(kern_spec, ib_spec);
                if (err)
                        goto err_free;
                flow_attr->size +=
                        ((union ib_flow_spec *) ib_spec)->size;
-               kern_attr_size -= ((struct ib_kern_spec *) kern_spec)->size;
-               kern_spec += ((struct ib_kern_spec *) kern_spec)->size;
+               cmd.flow_attr.size -= ((struct ib_uverbs_flow_spec *)kern_spec)->size;
+               kern_spec += ((struct ib_uverbs_flow_spec *) kern_spec)->size;
                ib_spec += ((union ib_flow_spec *) ib_spec)->size;
        }
-       if (kern_attr_size) {
-               pr_warn("create flow failed, %d bytes left from uverb cmd\n",
-                       kern_attr_size);
+       if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) {
+               pr_warn("create flow failed, flow %d: %d bytes left from uverb cmd\n",
+                       i, cmd.flow_attr.size);
                goto err_free;
        }
        flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER);
@@ -2760,11 +2749,10 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file,
        memset(&resp, 0, sizeof(resp));
        resp.flow_handle = uobj->id;
 
-       if (copy_to_user((void __user *)(unsigned long) cmd.response,
-                        &resp, sizeof(resp))) {
-               err = -EFAULT;
+       err = ib_copy_to_udata(ucore,
+                              &resp, sizeof(resp));
+       if (err)
                goto err_copy;
-       }
 
        put_qp_read(qp);
        mutex_lock(&file->mutex);
@@ -2777,7 +2765,7 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file,
        kfree(flow_attr);
        if (cmd.flow_attr.num_of_specs)
                kfree(kern_flow_attr);
-       return in_len;
+       return 0;
 err_copy:
        idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
 destroy_flow:
@@ -2794,16 +2782,18 @@ err_free_attr:
        return err;
 }
 
-ssize_t ib_uverbs_destroy_flow(struct ib_uverbs_file *file,
-                              const char __user *buf, int in_len,
-                              int out_len) {
+int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
+                             struct ib_udata *ucore,
+                             struct ib_udata *uhw)
+{
        struct ib_uverbs_destroy_flow   cmd;
        struct ib_flow                  *flow_id;
        struct ib_uobject               *uobj;
        int                             ret;
 
-       if (copy_from_user(&cmd, buf, sizeof(cmd)))
-               return -EFAULT;
+       ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+       if (ret)
+               return ret;
 
        uobj = idr_write_uobj(&ib_uverbs_rule_idr, cmd.flow_handle,
                              file->ucontext);
@@ -2825,9 +2815,8 @@ ssize_t ib_uverbs_destroy_flow(struct ib_uverbs_file *file,
 
        put_uobj(uobj);
 
-       return ret ? ret : in_len;
+       return ret;
 }
-#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */
 
 static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
                                struct ib_uverbs_create_xsrq *cmd,
index 2df31f6..3438694 100644 (file)
@@ -115,10 +115,13 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
        [IB_USER_VERBS_CMD_CLOSE_XRCD]          = ib_uverbs_close_xrcd,
        [IB_USER_VERBS_CMD_CREATE_XSRQ]         = ib_uverbs_create_xsrq,
        [IB_USER_VERBS_CMD_OPEN_QP]             = ib_uverbs_open_qp,
-#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING
-       [IB_USER_VERBS_CMD_CREATE_FLOW]         = ib_uverbs_create_flow,
-       [IB_USER_VERBS_CMD_DESTROY_FLOW]        = ib_uverbs_destroy_flow
-#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */
+};
+
+static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
+                                   struct ib_udata *ucore,
+                                   struct ib_udata *uhw) = {
+       [IB_USER_VERBS_EX_CMD_CREATE_FLOW]      = ib_uverbs_ex_create_flow,
+       [IB_USER_VERBS_EX_CMD_DESTROY_FLOW]     = ib_uverbs_ex_destroy_flow
 };
 
 static void ib_uverbs_add_one(struct ib_device *device);
@@ -589,6 +592,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 {
        struct ib_uverbs_file *file = filp->private_data;
        struct ib_uverbs_cmd_hdr hdr;
+       __u32 flags;
 
        if (count < sizeof hdr)
                return -EINVAL;
@@ -596,45 +600,105 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
        if (copy_from_user(&hdr, buf, sizeof hdr))
                return -EFAULT;
 
-       if (hdr.command >= ARRAY_SIZE(uverbs_cmd_table) ||
-           !uverbs_cmd_table[hdr.command])
-               return -EINVAL;
+       flags = (hdr.command &
+                IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT;
 
-       if (!file->ucontext &&
-           hdr.command != IB_USER_VERBS_CMD_GET_CONTEXT)
-               return -EINVAL;
+       if (!flags) {
+               __u32 command;
 
-       if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << hdr.command)))
-               return -ENOSYS;
+               if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
+                                          IB_USER_VERBS_CMD_COMMAND_MASK))
+                       return -EINVAL;
 
-#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING
-       if (hdr.command >= IB_USER_VERBS_CMD_THRESHOLD) {
-               struct ib_uverbs_cmd_hdr_ex hdr_ex;
+               command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
 
-               if (copy_from_user(&hdr_ex, buf, sizeof(hdr_ex)))
-                       return -EFAULT;
+               if (command >= ARRAY_SIZE(uverbs_cmd_table) ||
+                   !uverbs_cmd_table[command])
+                       return -EINVAL;
 
-               if (((hdr_ex.in_words + hdr_ex.provider_in_words) * 4) != count)
+               if (!file->ucontext &&
+                   command != IB_USER_VERBS_CMD_GET_CONTEXT)
                        return -EINVAL;
 
-               return uverbs_cmd_table[hdr.command](file,
-                                                    buf + sizeof(hdr_ex),
-                                                    (hdr_ex.in_words +
-                                                     hdr_ex.provider_in_words) * 4,
-                                                    (hdr_ex.out_words +
-                                                     hdr_ex.provider_out_words) * 4);
-       } else {
-#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */
+               if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << command)))
+                       return -ENOSYS;
+
                if (hdr.in_words * 4 != count)
                        return -EINVAL;
 
-               return uverbs_cmd_table[hdr.command](file,
-                                                    buf + sizeof(hdr),
-                                                    hdr.in_words * 4,
-                                                    hdr.out_words * 4);
-#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING
+               return uverbs_cmd_table[command](file,
+                                                buf + sizeof(hdr),
+                                                hdr.in_words * 4,
+                                                hdr.out_words * 4);
+
+       } else if (flags == IB_USER_VERBS_CMD_FLAG_EXTENDED) {
+               __u32 command;
+
+               struct ib_uverbs_ex_cmd_hdr ex_hdr;
+               struct ib_udata ucore;
+               struct ib_udata uhw;
+               int err;
+               size_t written_count = count;
+
+               if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
+                                          IB_USER_VERBS_CMD_COMMAND_MASK))
+                       return -EINVAL;
+
+               command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
+
+               if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) ||
+                   !uverbs_ex_cmd_table[command])
+                       return -ENOSYS;
+
+               if (!file->ucontext)
+                       return -EINVAL;
+
+               if (!(file->device->ib_dev->uverbs_ex_cmd_mask & (1ull << command)))
+                       return -ENOSYS;
+
+               if (count < (sizeof(hdr) + sizeof(ex_hdr)))
+                       return -EINVAL;
+
+               if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr)))
+                       return -EFAULT;
+
+               count -= sizeof(hdr) + sizeof(ex_hdr);
+               buf += sizeof(hdr) + sizeof(ex_hdr);
+
+               if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count)
+                       return -EINVAL;
+
+               if (ex_hdr.response) {
+                       if (!hdr.out_words && !ex_hdr.provider_out_words)
+                               return -EINVAL;
+               } else {
+                       if (hdr.out_words || ex_hdr.provider_out_words)
+                               return -EINVAL;
+               }
+
+               INIT_UDATA(&ucore,
+                          (hdr.in_words) ? buf : 0,
+                          (unsigned long)ex_hdr.response,
+                          hdr.in_words * 8,
+                          hdr.out_words * 8);
+
+               INIT_UDATA(&uhw,
+                          (ex_hdr.provider_in_words) ? buf + ucore.inlen : 0,
+                          (ex_hdr.provider_out_words) ? (unsigned long)ex_hdr.response + ucore.outlen : 0,
+                          ex_hdr.provider_in_words * 8,
+                          ex_hdr.provider_out_words * 8);
+
+               err = uverbs_ex_cmd_table[command](file,
+                                                  &ucore,
+                                                  &uhw);
+
+               if (err)
+                       return err;
+
+               return written_count;
        }
-#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */
+
+       return -ENOSYS;
 }
 
 static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
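
Putting the dispatcher above together, a hedged sketch of what a userspace caller writes for an extended command: the flag lives in the high bits of hdr.command, a second header carries the response pointer and provider word counts, and all word counts are in units of 8 bytes rather than 4. Struct layouts follow include/uapi/rdma/ib_user_verbs.h as extended by this series; the payload and response types are examples:

    struct {
            struct ib_uverbs_cmd_hdr     hdr;    /* command word + word counts */
            struct ib_uverbs_ex_cmd_hdr  ex_hdr; /* response ptr + provider words */
            struct ib_uverbs_create_flow cmd;    /* core payload */
    } req;
    struct ib_uverbs_create_flow_resp resp;

    req.hdr.command   = IB_USER_VERBS_EX_CMD_CREATE_FLOW |
                        (IB_USER_VERBS_CMD_FLAG_EXTENDED <<
                         IB_USER_VERBS_CMD_FLAGS_SHIFT);
    req.hdr.in_words  = sizeof(req.cmd) / 8;    /* 8-byte words, not 4 */
    req.hdr.out_words = sizeof(resp) / 8;
    req.ex_hdr.response = (__u64) (uintptr_t) &resp;
    req.ex_hdr.provider_in_words  = 0;          /* no provider payload here */
    req.ex_hdr.provider_out_words = 0;
    /* write(fd, &req, sizeof(req)) returns sizeof(req) on success */
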
index a321df2..d4f6ddf 100644 (file)
@@ -114,6 +114,8 @@ rdma_node_get_transport(enum rdma_node_type node_type)
                return RDMA_TRANSPORT_IB;
        case RDMA_NODE_RNIC:
                return RDMA_TRANSPORT_IWARP;
+       case RDMA_NODE_USNIC:
+               return RDMA_TRANSPORT_USNIC;
        default:
                BUG();
                return 0;
@@ -130,6 +132,7 @@ enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_
        case RDMA_TRANSPORT_IB:
                return IB_LINK_LAYER_INFINIBAND;
        case RDMA_TRANSPORT_IWARP:
+       case RDMA_TRANSPORT_USNIC:
                return IB_LINK_LAYER_ETHERNET;
        default:
                return IB_LINK_LAYER_UNSPECIFIED;
@@ -958,6 +961,11 @@ EXPORT_SYMBOL(ib_resize_cq);
 struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
 {
        struct ib_mr *mr;
+       int err;
+
+       err = ib_check_mr_access(mr_access_flags);
+       if (err)
+               return ERR_PTR(err);
 
        mr = pd->device->get_dma_mr(pd, mr_access_flags);
 
@@ -980,6 +988,11 @@ struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
                             u64 *iova_start)
 {
        struct ib_mr *mr;
+       int err;
+
+       err = ib_check_mr_access(mr_access_flags);
+       if (err)
+               return ERR_PTR(err);
 
        if (!pd->device->reg_phys_mr)
                return ERR_PTR(-ENOSYS);
@@ -1010,6 +1023,10 @@ int ib_rereg_phys_mr(struct ib_mr *mr,
        struct ib_pd *old_pd;
        int ret;
 
+       ret = ib_check_mr_access(mr_access_flags);
+       if (ret)
+               return ret;
+
        if (!mr->device->rereg_phys_mr)
                return -ENOSYS;
 
index 33d2cc6..4a03385 100644 (file)
@@ -602,10 +602,10 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev)
             rdev->lldi.vr->qp.size,
             rdev->lldi.vr->cq.start,
             rdev->lldi.vr->cq.size);
-       PDBG("udb len 0x%x udb base %p db_reg %p gts_reg %p qpshift %lu "
+       PDBG("udb len 0x%x udb base %llx db_reg %p gts_reg %p qpshift %lu "
             "qpmask 0x%x cqshift %lu cqmask 0x%x\n",
             (unsigned)pci_resource_len(rdev->lldi.pdev, 2),
-            (void *)(unsigned long)pci_resource_start(rdev->lldi.pdev, 2),
+            (u64)pci_resource_start(rdev->lldi.pdev, 2),
             rdev->lldi.db_reg,
             rdev->lldi.gts_reg,
             rdev->qpshift, rdev->qpmask,
index f5cb13b..cc04b7b 100644 (file)
@@ -280,9 +280,7 @@ static int ipath_user_sdma_pin_pages(const struct ipath_devdata *dd,
        int j;
        int ret;
 
-       ret = get_user_pages(current, current->mm, addr,
-                            npages, 0, 1, pages, NULL);
-
+       ret = get_user_pages_fast(addr, npages, 0, pages);
        if (ret != npages) {
                int i;
 
@@ -811,10 +809,7 @@ int ipath_user_sdma_writev(struct ipath_devdata *dd,
        while (dim) {
                const int mxp = 8;
 
-               down_write(&current->mm->mmap_sem);
                ret = ipath_user_sdma_queue_pkts(dd, pq, &list, iov, dim, mxp);
-               up_write(&current->mm->mmap_sem);
-
                if (ret <= 0)
                        goto done_unlock;
                else {
index d5e60f4..66dbf80 100644 (file)
@@ -324,7 +324,7 @@ static int mlx4_ib_get_outstanding_cqes(struct mlx4_ib_cq *cq)
        u32 i;
 
        i = cq->mcq.cons_index;
-       while (get_sw_cqe(cq, i & cq->ibcq.cqe))
+       while (get_sw_cqe(cq, i))
                ++i;
 
        return i - cq->mcq.cons_index;
@@ -365,7 +365,7 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
 
        mutex_lock(&cq->resize_mutex);
 
-       if (entries < 1 || entries > dev->dev->caps.max_cqes) {
+       if (entries < 1) {
                err = -EINVAL;
                goto out;
        }
@@ -376,6 +376,11 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
                goto out;
        }
 
+       if (entries > dev->dev->caps.max_cqes) {
+               err = -EINVAL;
+               goto out;
+       }
+
        if (ibcq->uobject) {
                err = mlx4_alloc_resize_umem(dev, cq, entries, udata);
                if (err)
index f061264..1aad9b3 100644 (file)
@@ -1691,11 +1691,9 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
                ibdev->ib_dev.create_flow       = mlx4_ib_create_flow;
                ibdev->ib_dev.destroy_flow      = mlx4_ib_destroy_flow;
 
-#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING
-               ibdev->ib_dev.uverbs_cmd_mask   |=
-                       (1ull << IB_USER_VERBS_CMD_CREATE_FLOW) |
-                       (1ull << IB_USER_VERBS_CMD_DESTROY_FLOW);
-#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */
+               ibdev->ib_dev.uverbs_ex_cmd_mask        |=
+                       (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
+                       (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
        }
 
        mlx4_ib_alloc_eqs(dev, ibdev);
index 344ab03..b726274 100644 (file)
@@ -556,7 +556,7 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
                goto err_db;
        }
        mlx5_ib_populate_pas(dev, cq->buf.umem, page_shift, (*cqb)->pas, 0);
-       (*cqb)->ctx.log_pg_sz = page_shift - PAGE_SHIFT;
+       (*cqb)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
 
        *index = to_mucontext(context)->uuari.uars[0].index;
 
@@ -620,7 +620,7 @@ static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
        }
        mlx5_fill_page_array(&cq->buf.buf, (*cqb)->pas);
 
-       (*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - PAGE_SHIFT;
+       (*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT;
        *index = dev->mdev.priv.uuari.uars[0].index;
 
        return 0;
@@ -653,8 +653,11 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries,
        int eqn;
        int err;
 
+       if (entries < 0)
+               return ERR_PTR(-EINVAL);
+
        entries = roundup_pow_of_two(entries + 1);
-       if (entries < 1 || entries > dev->mdev.caps.max_cqes)
+       if (entries > dev->mdev.caps.max_cqes)
                return ERR_PTR(-EINVAL);
 
        cq = kzalloc(sizeof(*cq), GFP_KERNEL);
@@ -747,17 +750,9 @@ int mlx5_ib_destroy_cq(struct ib_cq *cq)
        return 0;
 }
 
-static int is_equal_rsn(struct mlx5_cqe64 *cqe64, struct mlx5_ib_srq *srq,
-                       u32 rsn)
+static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn)
 {
-       u32 lrsn;
-
-       if (srq)
-               lrsn = be32_to_cpu(cqe64->srqn) & 0xffffff;
-       else
-               lrsn = be32_to_cpu(cqe64->sop_drop_qpn) & 0xffffff;
-
-       return rsn == lrsn;
+       return rsn == (ntohl(cqe64->sop_drop_qpn) & 0xffffff);
 }
 
 void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 rsn, struct mlx5_ib_srq *srq)
@@ -787,8 +782,8 @@ void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 rsn, struct mlx5_ib_srq *srq)
        while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) {
                cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
                cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
-               if (is_equal_rsn(cqe64, srq, rsn)) {
-                       if (srq)
+               if (is_equal_rsn(cqe64, rsn)) {
+                       if (srq && (ntohl(cqe64->srqn) & 0xffffff))
                                mlx5_ib_free_srq_wqe(srq, be16_to_cpu(cqe64->wqe_counter));
                        ++nfreed;
                } else if (nfreed) {
index b1a6cb3..3065341 100644 (file)
@@ -745,7 +745,8 @@ static int alloc_pa_mkey(struct mlx5_ib_dev *dev, u32 *key, u32 pdn)
        seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
        seg->start_addr = 0;
 
-       err = mlx5_core_create_mkey(&dev->mdev, &mr, in, sizeof(*in));
+       err = mlx5_core_create_mkey(&dev->mdev, &mr, in, sizeof(*in),
+                                   NULL, NULL, NULL);
        if (err) {
                mlx5_ib_warn(dev, "failed to create mkey, %d\n", err);
                goto err_in;
index 836be91..4c134d9 100644 (file)
@@ -262,6 +262,9 @@ struct mlx5_ib_mr {
        int                     npages;
        struct completion       done;
        enum ib_wc_status       status;
+       struct mlx5_ib_dev     *dev;
+       struct mlx5_create_mkey_mbox_out out;
+       unsigned long           start;
 };
 
 struct mlx5_ib_fast_reg_page_list {
@@ -323,6 +326,7 @@ struct mlx5_cache_ent {
        struct mlx5_ib_dev     *dev;
        struct work_struct      work;
        struct delayed_work     dwork;
+       int                     pending;
 };
 
 struct mlx5_mr_cache {
@@ -358,6 +362,8 @@ struct mlx5_ib_dev {
        spinlock_t                      mr_lock;
        struct mlx5_ib_resources        devr;
        struct mlx5_mr_cache            cache;
+       struct timer_list               delay_timer;
+       int                             fill_delay;
 };
 
 static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
index 3453580..039c3e4 100644 (file)
 #include <linux/random.h>
 #include <linux/debugfs.h>
 #include <linux/export.h>
+#include <linux/delay.h>
 #include <rdma/ib_umem.h>
 #include "mlx5_ib.h"
 
 enum {
-       DEF_CACHE_SIZE  = 10,
+       MAX_PENDING_REG_MR = 8,
 };
 
 enum {
@@ -63,6 +64,51 @@ static int order2idx(struct mlx5_ib_dev *dev, int order)
                return order - cache->ent[0].order;
 }
 
+static void reg_mr_callback(int status, void *context)
+{
+       struct mlx5_ib_mr *mr = context;
+       struct mlx5_ib_dev *dev = mr->dev;
+       struct mlx5_mr_cache *cache = &dev->cache;
+       int c = order2idx(dev, mr->order);
+       struct mlx5_cache_ent *ent = &cache->ent[c];
+       u8 key;
+       unsigned long flags;
+
+       spin_lock_irqsave(&ent->lock, flags);
+       ent->pending--;
+       spin_unlock_irqrestore(&ent->lock, flags);
+       if (status) {
+               mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
+               kfree(mr);
+               dev->fill_delay = 1;
+               mod_timer(&dev->delay_timer, jiffies + HZ);
+               return;
+       }
+
+       if (mr->out.hdr.status) {
+               mlx5_ib_warn(dev, "failed - status %d, syndorme 0x%x\n",
+                            mr->out.hdr.status,
+                            be32_to_cpu(mr->out.hdr.syndrome));
+               kfree(mr);
+               dev->fill_delay = 1;
+               mod_timer(&dev->delay_timer, jiffies + HZ);
+               return;
+       }
+
+       spin_lock_irqsave(&dev->mdev.priv.mkey_lock, flags);
+       key = dev->mdev.priv.mkey_key++;
+       spin_unlock_irqrestore(&dev->mdev.priv.mkey_lock, flags);
+       mr->mmr.key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key;
+
+       cache->last_add = jiffies;
+
+       spin_lock_irqsave(&ent->lock, flags);
+       list_add_tail(&mr->list, &ent->head);
+       ent->cur++;
+       ent->size++;
+       spin_unlock_irqrestore(&ent->lock, flags);
+}
+
 static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
 {
        struct mlx5_mr_cache *cache = &dev->cache;
@@ -78,36 +124,39 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
                return -ENOMEM;
 
        for (i = 0; i < num; i++) {
+               if (ent->pending >= MAX_PENDING_REG_MR) {
+                       err = -EAGAIN;
+                       break;
+               }
+
                mr = kzalloc(sizeof(*mr), GFP_KERNEL);
                if (!mr) {
                        err = -ENOMEM;
-                       goto out;
+                       break;
                }
                mr->order = ent->order;
                mr->umred = 1;
+               mr->dev = dev;
                in->seg.status = 1 << 6;
                in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2);
                in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
                in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN;
                in->seg.log2_page_size = 12;
 
+               spin_lock_irq(&ent->lock);
+               ent->pending++;
+               spin_unlock_irq(&ent->lock);
+               mr->start = jiffies;
                err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in,
-                                           sizeof(*in));
+                                           sizeof(*in), reg_mr_callback,
+                                           mr, &mr->out);
                if (err) {
                        mlx5_ib_warn(dev, "create mkey failed %d\n", err);
                        kfree(mr);
-                       goto out;
+                       break;
                }
-               cache->last_add = jiffies;
-
-               spin_lock(&ent->lock);
-               list_add_tail(&mr->list, &ent->head);
-               ent->cur++;
-               ent->size++;
-               spin_unlock(&ent->lock);
        }
 
-out:
        kfree(in);
        return err;
 }
@@ -121,16 +170,16 @@ static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
        int i;
 
        for (i = 0; i < num; i++) {
-               spin_lock(&ent->lock);
+               spin_lock_irq(&ent->lock);
                if (list_empty(&ent->head)) {
-                       spin_unlock(&ent->lock);
+                       spin_unlock_irq(&ent->lock);
                        return;
                }
                mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
                list_del(&mr->list);
                ent->cur--;
                ent->size--;
-               spin_unlock(&ent->lock);
+               spin_unlock_irq(&ent->lock);
                err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr);
                if (err)
                        mlx5_ib_warn(dev, "failed destroy mkey\n");
@@ -162,9 +211,13 @@ static ssize_t size_write(struct file *filp, const char __user *buf,
                return -EINVAL;
 
        if (var > ent->size) {
-               err = add_keys(dev, c, var - ent->size);
-               if (err)
-                       return err;
+               do {
+                       err = add_keys(dev, c, var - ent->size);
+                       if (err && err != -EAGAIN)
+                               return err;
+
+                       usleep_range(3000, 5000);
+               } while (err);
        } else if (var < ent->size) {
                remove_keys(dev, c, ent->size - var);
        }
@@ -280,23 +333,37 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
        struct mlx5_ib_dev *dev = ent->dev;
        struct mlx5_mr_cache *cache = &dev->cache;
        int i = order2idx(dev, ent->order);
+       int err;
 
        if (cache->stopped)
                return;
 
        ent = &dev->cache.ent[i];
-       if (ent->cur < 2 * ent->limit) {
-               add_keys(dev, i, 1);
-               if (ent->cur < 2 * ent->limit)
-                       queue_work(cache->wq, &ent->work);
+       if (ent->cur < 2 * ent->limit && !dev->fill_delay) {
+               err = add_keys(dev, i, 1);
+               if (ent->cur < 2 * ent->limit) {
+                       if (err == -EAGAIN) {
+                               mlx5_ib_dbg(dev, "returned eagain, order %d\n",
+                                           i + 2);
+                               queue_delayed_work(cache->wq, &ent->dwork,
+                                                  msecs_to_jiffies(3));
+                       } else if (err) {
+                               mlx5_ib_warn(dev, "command failed order %d, err %d\n",
+                                            i + 2, err);
+                               queue_delayed_work(cache->wq, &ent->dwork,
+                                                  msecs_to_jiffies(1000));
+                       } else {
+                               queue_work(cache->wq, &ent->work);
+                       }
+               }
        } else if (ent->cur > 2 * ent->limit) {
                if (!someone_adding(cache) &&
-                   time_after(jiffies, cache->last_add + 60 * HZ)) {
+                   time_after(jiffies, cache->last_add + 300 * HZ)) {
                        remove_keys(dev, i, 1);
                        if (ent->cur > ent->limit)
                                queue_work(cache->wq, &ent->work);
                } else {
-                       queue_delayed_work(cache->wq, &ent->dwork, 60 * HZ);
+                       queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
                }
        }
 }
@@ -336,18 +403,18 @@ static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
 
                mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i);
 
-               spin_lock(&ent->lock);
+               spin_lock_irq(&ent->lock);
                if (!list_empty(&ent->head)) {
                        mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
                                              list);
                        list_del(&mr->list);
                        ent->cur--;
-                       spin_unlock(&ent->lock);
+                       spin_unlock_irq(&ent->lock);
                        if (ent->cur < ent->limit)
                                queue_work(cache->wq, &ent->work);
                        break;
                }
-               spin_unlock(&ent->lock);
+               spin_unlock_irq(&ent->lock);
 
                queue_work(cache->wq, &ent->work);
 
@@ -374,12 +441,12 @@ static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
                return;
        }
        ent = &cache->ent[c];
-       spin_lock(&ent->lock);
+       spin_lock_irq(&ent->lock);
        list_add_tail(&mr->list, &ent->head);
        ent->cur++;
        if (ent->cur > 2 * ent->limit)
                shrink = 1;
-       spin_unlock(&ent->lock);
+       spin_unlock_irq(&ent->lock);
 
        if (shrink)
                queue_work(cache->wq, &ent->work);
@@ -394,16 +461,16 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c)
 
        cancel_delayed_work(&ent->dwork);
        while (1) {
-               spin_lock(&ent->lock);
+               spin_lock_irq(&ent->lock);
                if (list_empty(&ent->head)) {
-                       spin_unlock(&ent->lock);
+                       spin_unlock_irq(&ent->lock);
                        return;
                }
                mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
                list_del(&mr->list);
                ent->cur--;
                ent->size--;
-               spin_unlock(&ent->lock);
+               spin_unlock_irq(&ent->lock);
                err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr);
                if (err)
                        mlx5_ib_warn(dev, "failed destroy mkey\n");
@@ -464,12 +531,18 @@ static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
        debugfs_remove_recursive(dev->cache.root);
 }
 
+static void delay_time_func(unsigned long ctx)
+{
+       struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx;
+
+       dev->fill_delay = 0;
+}
+
 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 {
        struct mlx5_mr_cache *cache = &dev->cache;
        struct mlx5_cache_ent *ent;
        int limit;
-       int size;
        int err;
        int i;
 
@@ -479,6 +552,7 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
                return -ENOMEM;
        }
 
+       setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev);
        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
                INIT_LIST_HEAD(&cache->ent[i].head);
                spin_lock_init(&cache->ent[i].lock);
@@ -489,13 +563,11 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
                ent->order = i + 2;
                ent->dev = dev;
 
-               if (dev->mdev.profile->mask & MLX5_PROF_MASK_MR_CACHE) {
-                       size = dev->mdev.profile->mr_cache[i].size;
+               if (dev->mdev.profile->mask & MLX5_PROF_MASK_MR_CACHE)
                        limit = dev->mdev.profile->mr_cache[i].limit;
-               } else {
-                       size = DEF_CACHE_SIZE;
+               else
                        limit = 0;
-               }
+
                INIT_WORK(&ent->work, cache_work_func);
                INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
                ent->limit = limit;
@@ -522,6 +594,7 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
                clean_keys(dev, i);
 
        destroy_workqueue(dev->cache.wq);
+       del_timer_sync(&dev->delay_timer);
 
        return 0;
 }
@@ -551,7 +624,8 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
        seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
        seg->start_addr = 0;
 
-       err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in));
+       err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in), NULL, NULL,
+                                   NULL);
        if (err)
                goto err_in;
 
@@ -660,14 +734,14 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
        int err;
        int i;
 
-       for (i = 0; i < 10; i++) {
+       for (i = 0; i < 1; i++) {
                mr = alloc_cached_mr(dev, order);
                if (mr)
                        break;
 
                err = add_keys(dev, order2idx(dev, order), 1);
-               if (err) {
-                       mlx5_ib_warn(dev, "add_keys failed\n");
+               if (err && err != -EAGAIN) {
+                       mlx5_ib_warn(dev, "add_keys failed, err %d\n", err);
                        break;
                }
        }
@@ -759,8 +833,10 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
        in->seg.xlt_oct_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift));
        in->seg.log2_page_size = page_shift;
        in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
-       in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift));
-       err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, inlen);
+       in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length,
+                                                        1 << page_shift));
+       err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, inlen, NULL,
+                                   NULL, NULL);
        if (err) {
                mlx5_ib_warn(dev, "create mkey failed\n");
                goto err_2;
@@ -944,7 +1020,8 @@ struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd,
         * TBD not needed - issue 197292 */
        in->seg.log2_page_size = PAGE_SHIFT;
 
-       err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, sizeof(*in));
+       err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, sizeof(*in), NULL,
+                                   NULL, NULL);
        kfree(in);
        if (err)
                goto err_free;
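
All three mlx5_core_create_mkey() call sites in this file gain the same trailing NULL, NULL, NULL arguments, which fits a signature extended for asynchronous mkey creation. The following prototype is inferred from these call sites, not confirmed by the hunks themselves:

        /* Assumed extended prototype: callback/context/out enable async
         * creation; passing all three as NULL keeps the call synchronous,
         * which is what every converted call site here does.
         *
         * int mlx5_core_create_mkey(struct mlx5_core_dev *dev,
         *                           struct mlx5_core_mr *mr,
         *                           struct mlx5_create_mkey_mbox_in *in,
         *                           int inlen,
         *                           mlx5_cmd_cbk_t callback, void *context,
         *                           struct mlx5_create_mkey_mbox_out *out);
         */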
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 5659ea8..7c6b4ba 100644
@@ -551,7 +551,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
        }
        mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0);
        (*in)->ctx.log_pg_sz_remote_qpn =
-               cpu_to_be32((page_shift - PAGE_SHIFT) << 24);
+               cpu_to_be32((page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24);
        (*in)->ctx.params2 = cpu_to_be32(offset << 6);
 
        (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index);
@@ -648,7 +648,8 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev,
                goto err_buf;
        }
        (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index);
-       (*in)->ctx.log_pg_sz_remote_qpn = cpu_to_be32((qp->buf.page_shift - PAGE_SHIFT) << 24);
+       (*in)->ctx.log_pg_sz_remote_qpn =
+               cpu_to_be32((qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24);
        /* Set "fast registration enabled" for all kernel QPs */
        (*in)->ctx.params1 |= cpu_to_be32(1 << 11);
        (*in)->ctx.sq_crq_size |= cpu_to_be16(1 << 4);
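
Both log_pg_sz conversions here (and the matching ones in srq.c below) replace PAGE_SHIFT with MLX5_ADAPTER_PAGE_SHIFT: the hardware field encodes the buffer page size relative to the device's fixed 4 KiB page, so subtracting the host PAGE_SHIFT only happened to produce the right value on 4 KiB-page kernels. An illustrative helper, assuming MLX5_ADAPTER_PAGE_SHIFT == 12:

static inline u8 example_log_pg_sz(int page_shift)
{
        /* 0 encodes 4 KiB; a 64 KiB-page host must report 4 here, and
         * the old (page_shift - PAGE_SHIFT) expression would yield 0.
         */
        return page_shift - MLX5_ADAPTER_PAGE_SHIFT;
}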
@@ -1317,9 +1318,11 @@ static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_Q
                                          MLX5_QP_OPTPAR_RAE            |
                                          MLX5_QP_OPTPAR_RWE            |
                                          MLX5_QP_OPTPAR_RNR_TIMEOUT    |
-                                         MLX5_QP_OPTPAR_PM_STATE,
+                                         MLX5_QP_OPTPAR_PM_STATE       |
+                                         MLX5_QP_OPTPAR_ALT_ADDR_PATH,
                        [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE            |
-                                         MLX5_QP_OPTPAR_PM_STATE,
+                                         MLX5_QP_OPTPAR_PM_STATE       |
+                                         MLX5_QP_OPTPAR_ALT_ADDR_PATH,
                        [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY          |
                                          MLX5_QP_OPTPAR_SRQN           |
                                          MLX5_QP_OPTPAR_CQN_RCV,
@@ -1550,7 +1553,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
        mlx5_cur = to_mlx5_state(cur_state);
        mlx5_new = to_mlx5_state(new_state);
        mlx5_st = to_mlx5_st(ibqp->qp_type);
-       if (mlx5_cur < 0 || mlx5_new < 0 || mlx5_st < 0)
+       if (mlx5_st < 0)
                goto out;
 
        optpar = ib_mask_to_mlx5_opt(attr_mask);
@@ -1744,6 +1747,7 @@ static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
                        MLX5_MKEY_MASK_PD               |
                        MLX5_MKEY_MASK_LR               |
                        MLX5_MKEY_MASK_LW               |
+                       MLX5_MKEY_MASK_KEY              |
                        MLX5_MKEY_MASK_RR               |
                        MLX5_MKEY_MASK_RW               |
                        MLX5_MKEY_MASK_A                |
@@ -1800,7 +1804,8 @@ static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *w
        seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start);
        seg->len = cpu_to_be64(wr->wr.fast_reg.length);
        seg->log2_page_size = wr->wr.fast_reg.page_shift;
-       seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
+       seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 |
+                                      mlx5_mkey_variant(wr->wr.fast_reg.rkey));
 }
 
 static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg,
@@ -1913,6 +1918,10 @@ static int set_frwr_li_wr(void **seg, struct ib_send_wr *wr, int *size,
        if (unlikely((*seg == qp->sq.qend)))
                *seg = mlx5_get_send_wqe(qp, 0);
        if (!li) {
+               if (unlikely(wr->wr.fast_reg.page_list_len >
+                            wr->wr.fast_reg.page_list->max_page_list_len))
+                       return -ENOMEM;
+
                set_frwr_pages(*seg, wr, mdev, pd, writ);
                *seg += sizeof(struct mlx5_wqe_data_seg);
                *size += (sizeof(struct mlx5_wqe_data_seg) / 16);
diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
index 0aa478b..210b3ea 100644
@@ -123,7 +123,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
                goto err_in;
        }
 
-       (*in)->ctx.log_pg_sz = page_shift - PAGE_SHIFT;
+       (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
        (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26);
 
        return 0;
@@ -192,7 +192,7 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq,
        }
        srq->wq_sig = !!srq_signature;
 
-       (*in)->ctx.log_pg_sz = page_shift - PAGE_SHIFT;
+       (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
 
        return 0;
 
@@ -390,9 +390,7 @@ int mlx5_ib_destroy_srq(struct ib_srq *srq)
                mlx5_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db);
                ib_umem_release(msrq->umem);
        } else {
-               kfree(msrq->wrid);
-               mlx5_buf_free(&dev->mdev, &msrq->buf);
-               mlx5_db_free(&dev->mdev, &msrq->db);
+               destroy_srq_kernel(dev, msrq);
        }
 
        kfree(srq);
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index 5b53ca5..8308e36 100644
@@ -2834,7 +2834,7 @@ static int nes_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
        init_attr->qp_context = nesqp->ibqp.qp_context;
        init_attr->send_cq = nesqp->ibqp.send_cq;
        init_attr->recv_cq = nesqp->ibqp.recv_cq;
-       init_attr->srq = nesqp->ibqp.srq = nesqp->ibqp.srq;
+       init_attr->srq = nesqp->ibqp.srq;
        init_attr->cap = attr->cap;
 
        return 0;
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h
index adc11d1..294dd27 100644
@@ -122,6 +122,32 @@ struct mqe_ctx {
        bool cmd_done;
 };
 
+struct ocrdma_hw_mr {
+       u32 lkey;
+       u8 fr_mr;
+       u8 remote_atomic;
+       u8 remote_rd;
+       u8 remote_wr;
+       u8 local_rd;
+       u8 local_wr;
+       u8 mw_bind;
+       u8 rsvd;
+       u64 len;
+       struct ocrdma_pbl *pbl_table;
+       u32 num_pbls;
+       u32 num_pbes;
+       u32 pbl_size;
+       u32 pbe_size;
+       u64 fbo;
+       u64 va;
+};
+
+struct ocrdma_mr {
+       struct ib_mr ibmr;
+       struct ib_umem *umem;
+       struct ocrdma_hw_mr hwmr;
+};
+
 struct ocrdma_dev {
        struct ib_device ibdev;
        struct ocrdma_dev_attr attr;
@@ -169,7 +195,7 @@ struct ocrdma_dev {
        struct list_head entry;
        struct rcu_head rcu;
        int id;
-       u64 stag_arr[OCRDMA_MAX_STAG];
+       struct ocrdma_mr *stag_arr[OCRDMA_MAX_STAG];
        u16 pvid;
 };
 
@@ -294,31 +320,6 @@ struct ocrdma_qp {
        u16 db_cache;
 };
 
-struct ocrdma_hw_mr {
-       u32 lkey;
-       u8 fr_mr;
-       u8 remote_atomic;
-       u8 remote_rd;
-       u8 remote_wr;
-       u8 local_rd;
-       u8 local_wr;
-       u8 mw_bind;
-       u8 rsvd;
-       u64 len;
-       struct ocrdma_pbl *pbl_table;
-       u32 num_pbls;
-       u32 num_pbes;
-       u32 pbl_size;
-       u32 pbe_size;
-       u64 fbo;
-       u64 va;
-};
-
-struct ocrdma_mr {
-       struct ib_mr ibmr;
-       struct ib_umem *umem;
-       struct ocrdma_hw_mr hwmr;
-};
 
 struct ocrdma_ucontext {
        struct ib_ucontext ibucontext;
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
index 50219ab..56bf32f 100644
@@ -1783,7 +1783,7 @@ static int ocrdma_set_create_qp_sq_cmd(struct ocrdma_create_qp_req *cmd,
        u32 max_sges = attrs->cap.max_send_sge;
 
        /* QP1 may exceed 127 */
-       max_wqe_allocated = min_t(int, attrs->cap.max_send_wr + 1,
+       max_wqe_allocated = min_t(u32, attrs->cap.max_send_wr + 1,
                                dev->attr.max_wqe);
 
        status = ocrdma_build_q_conf(&max_wqe_allocated,
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
index 0ce7674..91443bc 100644
@@ -452,9 +452,6 @@ static void ocrdma_remove_free(struct rcu_head *rcu)
 {
        struct ocrdma_dev *dev = container_of(rcu, struct ocrdma_dev, rcu);
 
-       ocrdma_free_resources(dev);
-       ocrdma_cleanup_hw(dev);
-
        idr_remove(&ocrdma_dev_id, dev->id);
        kfree(dev->mbx_cmd);
        ib_dealloc_device(&dev->ibdev);
@@ -470,6 +467,10 @@ static void ocrdma_remove(struct ocrdma_dev *dev)
        spin_lock(&ocrdma_devlist_lock);
        list_del_rcu(&dev->entry);
        spin_unlock(&ocrdma_devlist_lock);
+
+       ocrdma_free_resources(dev);
+       ocrdma_cleanup_hw(dev);
+
        call_rcu(&dev->rcu, ocrdma_remove_free);
 }
 
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index 69f1d12..7686dce 100644
@@ -1981,9 +1981,7 @@ static int ocrdma_build_fr(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr,
 
        wqe_size = roundup(wqe_size, OCRDMA_WQE_ALIGN_BYTES);
 
-       if ((wr->wr.fast_reg.page_list_len >
-               qp->dev->attr.max_pages_per_frmr) ||
-               (wr->wr.fast_reg.length > 0xffffffffULL))
+       if (wr->wr.fast_reg.page_list_len > qp->dev->attr.max_pages_per_frmr)
                return -EINVAL;
 
        hdr->cw |= (OCRDMA_FR_MR << OCRDMA_WQE_OPCODE_SHIFT);
@@ -2839,7 +2837,7 @@ struct ib_mr *ocrdma_alloc_frmr(struct ib_pd *ibpd, int max_page_list_len)
                goto mbx_err;
        mr->ibmr.rkey = mr->hwmr.lkey;
        mr->ibmr.lkey = mr->hwmr.lkey;
-       dev->stag_arr[(mr->hwmr.lkey >> 8) & (OCRDMA_MAX_STAG - 1)] = (unsigned long) mr;
+       dev->stag_arr[(mr->hwmr.lkey >> 8) & (OCRDMA_MAX_STAG - 1)] = mr;
        return &mr->ibmr;
 mbx_err:
        ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr);
diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c
index 016e742..5bfc02f 100644
@@ -6190,21 +6190,20 @@ static int setup_txselect(const char *str, struct kernel_param *kp)
 {
        struct qib_devdata *dd;
        unsigned long val;
-       int ret;
-
+       char *n;
        if (strlen(str) >= MAX_ATTEN_LEN) {
                pr_info("txselect_values string too long\n");
                return -ENOSPC;
        }
-       ret = kstrtoul(str, 0, &val);
-       if (ret || val >= (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ +
+       val = simple_strtoul(str, &n, 0);
+       if (n == str || val >= (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ +
                                TXDDS_MFG_SZ)) {
                pr_info("txselect_values must start with a number < %d\n",
                        TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + TXDDS_MFG_SZ);
-               return ret ? ret : -EINVAL;
+               return -EINVAL;
        }
-
        strcpy(txselect_list, str);
+
        list_for_each_entry(dd, &qib_dev_list, list)
                if (dd->deviceid == PCI_DEVICE_ID_QLOGIC_IB_7322)
                        set_no_qsfp_atten(dd, 1);
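
Going from kstrtoul() back to simple_strtoul() is deliberate: kstrtoul() rejects the whole string if anything follows the number, while a txselect_values string may carry more than the leading value; note the message only requires it to "start with a number". A sketch of the distinction, assuming comma-separated input:

static bool example_starts_with_number(const char *str, unsigned long *val)
{
        char *end;

        *val = simple_strtoul(str, &end, 0);    /* stops at ',' etc. */
        return end != str;      /* kstrtoul("10,11,12", 0, &v) would fail */
}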
diff --git a/drivers/infiniband/hw/qib/qib_mad.h b/drivers/infiniband/hw/qib/qib_mad.h
index 28874f8..941d4d5 100644
@@ -54,7 +54,7 @@ struct ib_node_info {
        __be32 revision;
        u8 local_port_num;
        u8 vendor_id[3];
-} __attribute__ ((packed));
+} __packed;
 
 struct ib_mad_notice_attr {
        u8 generic_type;
@@ -73,7 +73,7 @@ struct ib_mad_notice_attr {
                        __be16  reserved;
                        __be16  lid;            /* where violation happened */
                        u8      port_num;       /* where violation happened */
-               } __attribute__ ((packed)) ntc_129_131;
+               } __packed ntc_129_131;
 
                struct {
                        __be16  reserved;
@@ -83,14 +83,14 @@ struct ib_mad_notice_attr {
                        __be32  new_cap_mask;   /* new capability mask */
                        u8      reserved3;
                        u8      change_flags;   /* low 3 bits only */
-               } __attribute__ ((packed)) ntc_144;
+               } __packed ntc_144;
 
                struct {
                        __be16  reserved;
                        __be16  lid;            /* lid where sys guid changed */
                        __be16  reserved2;
                        __be64  new_sys_guid;
-               } __attribute__ ((packed)) ntc_145;
+               } __packed ntc_145;
 
                struct {
                        __be16  reserved;
@@ -104,7 +104,7 @@ struct ib_mad_notice_attr {
                        u8      reserved3;
                        u8      dr_trunc_hop;
                        u8      dr_rtn_path[30];
-               } __attribute__ ((packed)) ntc_256;
+               } __packed ntc_256;
 
                struct {
                        __be16          reserved;
@@ -115,7 +115,7 @@ struct ib_mad_notice_attr {
                        __be32          qp2;    /* high 8 bits reserved */
                        union ib_gid    gid1;
                        union ib_gid    gid2;
-               } __attribute__ ((packed)) ntc_257_258;
+               } __packed ntc_257_258;
 
        } details;
 };
@@ -209,7 +209,7 @@ struct ib_pma_portcounters_cong {
        __be64 port_rcv_packets;
        __be64 port_xmit_wait;
        __be64 port_adr_events;
-} __attribute__ ((packed));
+} __packed;
 
 #define IB_PMA_CONG_HW_CONTROL_TIMER            0x00
 #define IB_PMA_CONG_HW_CONTROL_SAMPLE           0x01
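
These hunks are a pure style conversion: __packed is the kernel's <linux/compiler.h> shorthand for __attribute__ ((packed)) and produces an identical layout. For illustration:

struct example_hdr {
        u8      type;
        __be32  id;             /* no padding inserted before this field */
} __packed;                     /* same as __attribute__ ((packed)) */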
diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c
index d0a0ea0..165aee2 100644
@@ -594,8 +594,7 @@ static int qib_user_sdma_pin_pages(const struct qib_devdata *dd,
                else
                        j = npages;
 
-               ret = get_user_pages(current, current->mm, addr,
-                            j, 0, 1, pages, NULL);
+               ret = get_user_pages_fast(addr, j, 0, pages);
                if (ret != j) {
                        i = 0;
                        j = ret;
@@ -1294,11 +1293,8 @@ int qib_user_sdma_writev(struct qib_ctxtdata *rcd,
                int mxp = 8;
                int ndesc = 0;
 
-               down_write(&current->mm->mmap_sem);
                ret = qib_user_sdma_queue_pkts(dd, ppd, pq,
                                iov, dim, &list, &mxp, &ndesc);
-               up_write(&current->mm->mmap_sem);
-
                if (ret < 0)
                        goto done_unlock;
                else {
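
Replacing get_user_pages() with get_user_pages_fast() is what lets the second hunk drop the down_write(&current->mm->mmap_sem)/up_write() pair around qib_user_sdma_queue_pkts(): the fast variant must be called without mmap_sem held and takes it internally only when the lockless path fails. A minimal pin/unpin usage sketch:

#include <linux/mm.h>

static int example_pin(unsigned long addr, int npages, struct page **pages)
{
        int i, got;

        got = get_user_pages_fast(addr, npages, 0 /* read-only */, pages);
        if (got < 0)
                return got;
        if (got != npages) {            /* partial pin: undo and fail */
                for (i = 0; i < got; i++)
                        put_page(pages[i]);
                return -EFAULT;
        }
        return 0;
}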
diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h
index 012e2c7..a01c7d2 100644
@@ -150,14 +150,14 @@ struct ib_reth {
        __be64 vaddr;
        __be32 rkey;
        __be32 length;
-} __attribute__ ((packed));
+} __packed;
 
 struct ib_atomic_eth {
        __be32 vaddr[2];        /* unaligned so access as 2 32-bit words */
        __be32 rkey;
        __be64 swap_data;
        __be64 compare_data;
-} __attribute__ ((packed));
+} __packed;
 
 struct qib_other_headers {
        __be32 bth[3];
@@ -178,7 +178,7 @@ struct qib_other_headers {
                __be32 aeth;
                struct ib_atomic_eth atomic_eth;
        } u;
-} __attribute__ ((packed));
+} __packed;
 
 /*
  * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes
@@ -195,12 +195,12 @@ struct qib_ib_header {
                } l;
                struct qib_other_headers oth;
        } u;
-} __attribute__ ((packed));
+} __packed;
 
 struct qib_pio_header {
        __le32 pbc[2];
        struct qib_ib_header hdr;
-} __attribute__ ((packed));
+} __packed;
 
 /*
  * There is one struct qib_mcast for each multicast GID.
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index f93baf8..a886319 100644
@@ -46,6 +46,7 @@
 #include <scsi/scsi.h>
 #include <scsi/scsi_device.h>
 #include <scsi/scsi_dbg.h>
+#include <scsi/scsi_tcq.h>
 #include <scsi/srp.h>
 #include <scsi/scsi_transport_srp.h>
 
@@ -86,6 +87,32 @@ module_param(topspin_workarounds, int, 0444);
 MODULE_PARM_DESC(topspin_workarounds,
                 "Enable workarounds for Topspin/Cisco SRP target bugs if != 0");
 
+static struct kernel_param_ops srp_tmo_ops;
+
+static int srp_reconnect_delay = 10;
+module_param_cb(reconnect_delay, &srp_tmo_ops, &srp_reconnect_delay,
+               S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(reconnect_delay, "Time between successive reconnect attempts");
+
+static int srp_fast_io_fail_tmo = 15;
+module_param_cb(fast_io_fail_tmo, &srp_tmo_ops, &srp_fast_io_fail_tmo,
+               S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(fast_io_fail_tmo,
+                "Number of seconds between the observation of a transport"
+                " layer error and failing all I/O. \"off\" means that this"
+                " functionality is disabled.");
+
+static int srp_dev_loss_tmo = 600;
+module_param_cb(dev_loss_tmo, &srp_tmo_ops, &srp_dev_loss_tmo,
+               S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dev_loss_tmo,
+                "Maximum number of seconds that the SRP transport should"
+                " insulate transport layer errors. After this time has been"
+                " exceeded the SCSI host is removed. Should be"
+                " between 1 and " __stringify(SCSI_DEVICE_BLOCK_MAX_TIMEOUT)
+                " if fast_io_fail_tmo has not been set. \"off\" means that"
+                " this functionality is disabled.");
+
 static void srp_add_one(struct ib_device *device);
 static void srp_remove_one(struct ib_device *device);
 static void srp_recv_completion(struct ib_cq *cq, void *target_ptr);
@@ -102,6 +129,48 @@ static struct ib_client srp_client = {
 
 static struct ib_sa_client srp_sa_client;
 
+static int srp_tmo_get(char *buffer, const struct kernel_param *kp)
+{
+       int tmo = *(int *)kp->arg;
+
+       if (tmo >= 0)
+               return sprintf(buffer, "%d", tmo);
+       else
+               return sprintf(buffer, "off");
+}
+
+static int srp_tmo_set(const char *val, const struct kernel_param *kp)
+{
+       int tmo, res;
+
+       if (strncmp(val, "off", 3) != 0) {
+               res = kstrtoint(val, 0, &tmo);
+               if (res)
+                       goto out;
+       } else {
+               tmo = -1;
+       }
+       if (kp->arg == &srp_reconnect_delay)
+               res = srp_tmo_valid(tmo, srp_fast_io_fail_tmo,
+                                   srp_dev_loss_tmo);
+       else if (kp->arg == &srp_fast_io_fail_tmo)
+               res = srp_tmo_valid(srp_reconnect_delay, tmo, srp_dev_loss_tmo);
+       else
+               res = srp_tmo_valid(srp_reconnect_delay, srp_fast_io_fail_tmo,
+                                   tmo);
+       if (res)
+               goto out;
+       *(int *)kp->arg = tmo;
+
+out:
+       return res;
+}
+
+static struct kernel_param_ops srp_tmo_ops = {
+       .get = srp_tmo_get,
+       .set = srp_tmo_set,
+};
+
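
With srp_tmo_ops in place, every write to these parameters through /sys/module/ib_srp/parameters/ is funneled through srp_tmo_set(), so an inconsistent combination of reconnect_delay, fast_io_fail_tmo and dev_loss_tmo is rejected before the backing variable changes. A generic sketch of the module_param_cb() pattern, with hypothetical names:

static int example_tmo = 15;

static int example_tmo_set(const char *val, const struct kernel_param *kp)
{
        int tmo;
        int res = kstrtoint(val, 0, &tmo);

        if (res)
                return res;
        if (tmo < -1)                   /* assume -1 encodes "off" */
                return -EINVAL;
        *(int *)kp->arg = tmo;
        return 0;
}

static struct kernel_param_ops example_tmo_ops = {
        .set = example_tmo_set,
        .get = param_get_int,           /* stock getter suffices */
};
module_param_cb(example_tmo, &example_tmo_ops, &example_tmo,
                S_IRUGO | S_IWUSR);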
 static inline struct srp_target_port *host_to_target(struct Scsi_Host *host)
 {
        return (struct srp_target_port *) host->hostdata;
@@ -231,16 +300,16 @@ static int srp_create_target_ib(struct srp_target_port *target)
                return -ENOMEM;
 
        recv_cq = ib_create_cq(target->srp_host->srp_dev->dev,
-                              srp_recv_completion, NULL, target, SRP_RQ_SIZE,
-                              target->comp_vector);
+                              srp_recv_completion, NULL, target,
+                              target->queue_size, target->comp_vector);
        if (IS_ERR(recv_cq)) {
                ret = PTR_ERR(recv_cq);
                goto err;
        }
 
        send_cq = ib_create_cq(target->srp_host->srp_dev->dev,
-                              srp_send_completion, NULL, target, SRP_SQ_SIZE,
-                              target->comp_vector);
+                              srp_send_completion, NULL, target,
+                              target->queue_size, target->comp_vector);
        if (IS_ERR(send_cq)) {
                ret = PTR_ERR(send_cq);
                goto err_recv_cq;
@@ -249,8 +318,8 @@ static int srp_create_target_ib(struct srp_target_port *target)
        ib_req_notify_cq(recv_cq, IB_CQ_NEXT_COMP);
 
        init_attr->event_handler       = srp_qp_event;
-       init_attr->cap.max_send_wr     = SRP_SQ_SIZE;
-       init_attr->cap.max_recv_wr     = SRP_RQ_SIZE;
+       init_attr->cap.max_send_wr     = target->queue_size;
+       init_attr->cap.max_recv_wr     = target->queue_size;
        init_attr->cap.max_recv_sge    = 1;
        init_attr->cap.max_send_sge    = 1;
        init_attr->sq_sig_type         = IB_SIGNAL_ALL_WR;
@@ -296,6 +365,10 @@ err:
        return ret;
 }
 
+/*
+ * Note: this function may be called without srp_alloc_iu_bufs() having been
+ * invoked. Hence the target->[rt]x_ring checks.
+ */
 static void srp_free_target_ib(struct srp_target_port *target)
 {
        int i;
@@ -307,10 +380,18 @@ static void srp_free_target_ib(struct srp_target_port *target)
        target->qp = NULL;
        target->send_cq = target->recv_cq = NULL;
 
-       for (i = 0; i < SRP_RQ_SIZE; ++i)
-               srp_free_iu(target->srp_host, target->rx_ring[i]);
-       for (i = 0; i < SRP_SQ_SIZE; ++i)
-               srp_free_iu(target->srp_host, target->tx_ring[i]);
+       if (target->rx_ring) {
+               for (i = 0; i < target->queue_size; ++i)
+                       srp_free_iu(target->srp_host, target->rx_ring[i]);
+               kfree(target->rx_ring);
+               target->rx_ring = NULL;
+       }
+       if (target->tx_ring) {
+               for (i = 0; i < target->queue_size; ++i)
+                       srp_free_iu(target->srp_host, target->tx_ring[i]);
+               kfree(target->tx_ring);
+               target->tx_ring = NULL;
+       }
 }
 
 static void srp_path_rec_completion(int status,
@@ -390,7 +471,7 @@ static int srp_send_req(struct srp_target_port *target)
        req->param.responder_resources        = 4;
        req->param.remote_cm_response_timeout = 20;
        req->param.local_cm_response_timeout  = 20;
-       req->param.retry_count                = 7;
+       req->param.retry_count                = target->tl_retry_count;
        req->param.rnr_retry_count            = 7;
        req->param.max_cm_retries             = 15;
 
@@ -496,7 +577,11 @@ static void srp_free_req_data(struct srp_target_port *target)
        struct srp_request *req;
        int i;
 
-       for (i = 0, req = target->req_ring; i < SRP_CMD_SQ_SIZE; ++i, ++req) {
+       if (!target->req_ring)
+               return;
+
+       for (i = 0; i < target->req_ring_size; ++i) {
+               req = &target->req_ring[i];
                kfree(req->fmr_list);
                kfree(req->map_page);
                if (req->indirect_dma_addr) {
@@ -506,6 +591,50 @@ static void srp_free_req_data(struct srp_target_port *target)
                }
                kfree(req->indirect_desc);
        }
+
+       kfree(target->req_ring);
+       target->req_ring = NULL;
+}
+
+static int srp_alloc_req_data(struct srp_target_port *target)
+{
+       struct srp_device *srp_dev = target->srp_host->srp_dev;
+       struct ib_device *ibdev = srp_dev->dev;
+       struct srp_request *req;
+       dma_addr_t dma_addr;
+       int i, ret = -ENOMEM;
+
+       INIT_LIST_HEAD(&target->free_reqs);
+
+       target->req_ring = kzalloc(target->req_ring_size *
+                                  sizeof(*target->req_ring), GFP_KERNEL);
+       if (!target->req_ring)
+               goto out;
+
+       for (i = 0; i < target->req_ring_size; ++i) {
+               req = &target->req_ring[i];
+               req->fmr_list = kmalloc(target->cmd_sg_cnt * sizeof(void *),
+                                       GFP_KERNEL);
+               req->map_page = kmalloc(SRP_FMR_SIZE * sizeof(void *),
+                                       GFP_KERNEL);
+               req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL);
+               if (!req->fmr_list || !req->map_page || !req->indirect_desc)
+                       goto out;
+
+               dma_addr = ib_dma_map_single(ibdev, req->indirect_desc,
+                                            target->indirect_size,
+                                            DMA_TO_DEVICE);
+               if (ib_dma_mapping_error(ibdev, dma_addr))
+                       goto out;
+
+               req->indirect_dma_addr = dma_addr;
+               req->index = i;
+               list_add_tail(&req->list, &target->free_reqs);
+       }
+       ret = 0;
+
+out:
+       return ret;
 }
 
 /**
@@ -528,12 +657,20 @@ static void srp_remove_target(struct srp_target_port *target)
        WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED);
 
        srp_del_scsi_host_attr(target->scsi_host);
+       srp_rport_get(target->rport);
        srp_remove_host(target->scsi_host);
        scsi_remove_host(target->scsi_host);
        srp_disconnect_target(target);
        ib_destroy_cm_id(target->cm_id);
        srp_free_target_ib(target);
+       cancel_work_sync(&target->tl_err_work);
+       srp_rport_put(target->rport);
        srp_free_req_data(target);
+
+       spin_lock(&target->srp_host->target_lock);
+       list_del(&target->list);
+       spin_unlock(&target->srp_host->target_lock);
+
        scsi_host_put(target->scsi_host);
 }
 
@@ -545,10 +682,6 @@ static void srp_remove_work(struct work_struct *work)
        WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED);
 
        srp_remove_target(target);
-
-       spin_lock(&target->srp_host->target_lock);
-       list_del(&target->list);
-       spin_unlock(&target->srp_host->target_lock);
 }
 
 static void srp_rport_delete(struct srp_rport *rport)
@@ -686,23 +819,42 @@ static void srp_free_req(struct srp_target_port *target,
        spin_unlock_irqrestore(&target->lock, flags);
 }
 
-static void srp_reset_req(struct srp_target_port *target, struct srp_request *req)
+static void srp_finish_req(struct srp_target_port *target,
+                          struct srp_request *req, int result)
 {
        struct scsi_cmnd *scmnd = srp_claim_req(target, req, NULL);
 
        if (scmnd) {
                srp_free_req(target, req, scmnd, 0);
-               scmnd->result = DID_RESET << 16;
+               scmnd->result = result;
                scmnd->scsi_done(scmnd);
        }
 }
 
-static int srp_reconnect_target(struct srp_target_port *target)
+static void srp_terminate_io(struct srp_rport *rport)
 {
-       struct Scsi_Host *shost = target->scsi_host;
-       int i, ret;
+       struct srp_target_port *target = rport->lld_data;
+       int i;
 
-       scsi_target_block(&shost->shost_gendev);
+       for (i = 0; i < target->req_ring_size; ++i) {
+               struct srp_request *req = &target->req_ring[i];
+               srp_finish_req(target, req, DID_TRANSPORT_FAILFAST << 16);
+       }
+}
+
+/*
+ * It is up to the caller to ensure that srp_rport_reconnect() calls are
+ * serialized and that no concurrent srp_queuecommand(), srp_abort(),
+ * srp_reset_device() or srp_reset_host() calls will occur while this function
+ * is in progress. One way to realize that is not to call this function
+ * directly but to call srp_reconnect_rport() instead since that last function
+ * serializes calls of this function via rport->mutex and also blocks
+ * srp_queuecommand() calls before invoking this function.
+ */
+static int srp_rport_reconnect(struct srp_rport *rport)
+{
+       struct srp_target_port *target = rport->lld_data;
+       int i, ret;
 
        srp_disconnect_target(target);
        /*
@@ -721,41 +873,21 @@ static int srp_reconnect_target(struct srp_target_port *target)
        else
                srp_create_target_ib(target);
 
-       for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) {
+       for (i = 0; i < target->req_ring_size; ++i) {
                struct srp_request *req = &target->req_ring[i];
-               if (req->scmnd)
-                       srp_reset_req(target, req);
+               srp_finish_req(target, req, DID_RESET << 16);
        }
 
        INIT_LIST_HEAD(&target->free_tx);
-       for (i = 0; i < SRP_SQ_SIZE; ++i)
+       for (i = 0; i < target->queue_size; ++i)
                list_add(&target->tx_ring[i]->list, &target->free_tx);
 
        if (ret == 0)
                ret = srp_connect_target(target);
 
-       scsi_target_unblock(&shost->shost_gendev, ret == 0 ? SDEV_RUNNING :
-                           SDEV_TRANSPORT_OFFLINE);
-       target->transport_offline = !!ret;
-
-       if (ret)
-               goto err;
-
-       shost_printk(KERN_INFO, target->scsi_host, PFX "reconnect succeeded\n");
-
-       return ret;
-
-err:
-       shost_printk(KERN_ERR, target->scsi_host,
-                    PFX "reconnect failed (%d), removing target port.\n", ret);
-
-       /*
-        * We couldn't reconnect, so kill our target port off.
-        * However, we have to defer the real removal because we
-        * are in the context of the SCSI error handler now, which
-        * will deadlock if we call scsi_remove_host().
-        */
-       srp_queue_remove_work(target);
+       if (ret == 0)
+               shost_printk(KERN_INFO, target->scsi_host,
+                            PFX "reconnect succeeded\n");
 
        return ret;
 }
@@ -1302,15 +1434,30 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc)
                             PFX "Recv failed with error code %d\n", res);
 }
 
-static void srp_handle_qp_err(enum ib_wc_status wc_status,
-                             enum ib_wc_opcode wc_opcode,
+/**
+ * srp_tl_err_work() - handle a transport layer error
+ *
+ * Note: This function may get invoked before the rport has been created,
+ * hence the target->rport test.
+ */
+static void srp_tl_err_work(struct work_struct *work)
+{
+       struct srp_target_port *target;
+
+       target = container_of(work, struct srp_target_port, tl_err_work);
+       if (target->rport)
+               srp_start_tl_fail_timers(target->rport);
+}
+
+static void srp_handle_qp_err(enum ib_wc_status wc_status, bool send_err,
                              struct srp_target_port *target)
 {
        if (target->connected && !target->qp_in_error) {
                shost_printk(KERN_ERR, target->scsi_host,
                             PFX "failed %s status %d\n",
-                            wc_opcode & IB_WC_RECV ? "receive" : "send",
+                            send_err ? "send" : "receive",
                             wc_status);
+               queue_work(system_long_wq, &target->tl_err_work);
        }
        target->qp_in_error = true;
 }
@@ -1325,7 +1472,7 @@ static void srp_recv_completion(struct ib_cq *cq, void *target_ptr)
                if (likely(wc.status == IB_WC_SUCCESS)) {
                        srp_handle_recv(target, &wc);
                } else {
-                       srp_handle_qp_err(wc.status, wc.opcode, target);
+                       srp_handle_qp_err(wc.status, false, target);
                }
        }
 }
@@ -1341,7 +1488,7 @@ static void srp_send_completion(struct ib_cq *cq, void *target_ptr)
                        iu = (struct srp_iu *) (uintptr_t) wc.wr_id;
                        list_add(&iu->list, &target->free_tx);
                } else {
-                       srp_handle_qp_err(wc.status, wc.opcode, target);
+                       srp_handle_qp_err(wc.status, true, target);
                }
        }
 }
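
The new bool send_err parameter exists because ib_wc.opcode is not defined for completions whose status is not IB_WC_SUCCESS, so the old wc_opcode & IB_WC_RECV test read an undefined field; the direction has to come from the caller, which already knows which CQ it is draining. A sketch of the rule:

static void example_poll(struct ib_cq *cq, bool is_send_cq)
{
        struct ib_wc wc;

        while (ib_poll_cq(cq, 1, &wc) > 0) {
                if (wc.status != IB_WC_SUCCESS) {
                        /* wc.opcode is undefined here; use is_send_cq */
                        example_handle_err(wc.status, is_send_cq); /* hypothetical */
                        continue;
                }
                /* wc.opcode may only be trusted on this success path */
        }
}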
@@ -1349,17 +1496,29 @@ static void srp_send_completion(struct ib_cq *cq, void *target_ptr)
 static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
 {
        struct srp_target_port *target = host_to_target(shost);
+       struct srp_rport *rport = target->rport;
        struct srp_request *req;
        struct srp_iu *iu;
        struct srp_cmd *cmd;
        struct ib_device *dev;
        unsigned long flags;
-       int len;
+       int len, result;
+       const bool in_scsi_eh = !in_interrupt() && current == shost->ehandler;
+
+       /*
+        * The SCSI EH thread is the only context from which srp_queuecommand()
+        * can get invoked for blocked devices (SDEV_BLOCK /
+        * SDEV_CREATED_BLOCK). Avoid racing with srp_reconnect_rport() by
+        * locking the rport mutex if invoked from inside the SCSI EH.
+        */
+       if (in_scsi_eh)
+               mutex_lock(&rport->mutex);
 
-       if (unlikely(target->transport_offline)) {
-               scmnd->result = DID_NO_CONNECT << 16;
+       result = srp_chkready(target->rport);
+       if (unlikely(result)) {
+               scmnd->result = result;
                scmnd->scsi_done(scmnd);
-               return 0;
+               goto unlock_rport;
        }
 
        spin_lock_irqsave(&target->lock, flags);
@@ -1404,6 +1563,10 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
                goto err_unmap;
        }
 
+unlock_rport:
+       if (in_scsi_eh)
+               mutex_unlock(&rport->mutex);
+
        return 0;
 
 err_unmap:
@@ -1418,14 +1581,30 @@ err_iu:
 err_unlock:
        spin_unlock_irqrestore(&target->lock, flags);
 
+       if (in_scsi_eh)
+               mutex_unlock(&rport->mutex);
+
        return SCSI_MLQUEUE_HOST_BUSY;
 }
 
+/*
+ * Note: the resources allocated in this function are freed in
+ * srp_free_target_ib().
+ */
 static int srp_alloc_iu_bufs(struct srp_target_port *target)
 {
        int i;
 
-       for (i = 0; i < SRP_RQ_SIZE; ++i) {
+       target->rx_ring = kzalloc(target->queue_size * sizeof(*target->rx_ring),
+                                 GFP_KERNEL);
+       if (!target->rx_ring)
+               goto err_no_ring;
+       target->tx_ring = kzalloc(target->queue_size * sizeof(*target->tx_ring),
+                                 GFP_KERNEL);
+       if (!target->tx_ring)
+               goto err_no_ring;
+
+       for (i = 0; i < target->queue_size; ++i) {
                target->rx_ring[i] = srp_alloc_iu(target->srp_host,
                                                  target->max_ti_iu_len,
                                                  GFP_KERNEL, DMA_FROM_DEVICE);
@@ -1433,7 +1612,7 @@ static int srp_alloc_iu_bufs(struct srp_target_port *target)
                        goto err;
        }
 
-       for (i = 0; i < SRP_SQ_SIZE; ++i) {
+       for (i = 0; i < target->queue_size; ++i) {
                target->tx_ring[i] = srp_alloc_iu(target->srp_host,
                                                  target->max_iu_len,
                                                  GFP_KERNEL, DMA_TO_DEVICE);
@@ -1446,16 +1625,18 @@ static int srp_alloc_iu_bufs(struct srp_target_port *target)
        return 0;
 
 err:
-       for (i = 0; i < SRP_RQ_SIZE; ++i) {
+       for (i = 0; i < target->queue_size; ++i) {
                srp_free_iu(target->srp_host, target->rx_ring[i]);
-               target->rx_ring[i] = NULL;
-       }
-
-       for (i = 0; i < SRP_SQ_SIZE; ++i) {
                srp_free_iu(target->srp_host, target->tx_ring[i]);
-               target->tx_ring[i] = NULL;
        }
 
+
+err_no_ring:
+       kfree(target->tx_ring);
+       target->tx_ring = NULL;
+       kfree(target->rx_ring);
+       target->rx_ring = NULL;
+
        return -ENOMEM;
 }
 
@@ -1506,6 +1687,9 @@ static void srp_cm_rep_handler(struct ib_cm_id *cm_id,
                target->scsi_host->can_queue
                        = min(target->req_lim - SRP_TSK_MGMT_SQ_SIZE,
                              target->scsi_host->can_queue);
+               target->scsi_host->cmd_per_lun
+                       = min_t(int, target->scsi_host->can_queue,
+                               target->scsi_host->cmd_per_lun);
        } else {
                shost_printk(KERN_WARNING, target->scsi_host,
                             PFX "Unhandled RSP opcode %#x\n", lrsp->opcode);
@@ -1513,7 +1697,7 @@ static void srp_cm_rep_handler(struct ib_cm_id *cm_id,
                goto error;
        }
 
-       if (!target->rx_ring[0]) {
+       if (!target->rx_ring) {
                ret = srp_alloc_iu_bufs(target);
                if (ret)
                        goto error;
@@ -1533,7 +1717,7 @@ static void srp_cm_rep_handler(struct ib_cm_id *cm_id,
        if (ret)
                goto error_free;
 
-       for (i = 0; i < SRP_RQ_SIZE; i++) {
+       for (i = 0; i < target->queue_size; i++) {
                struct srp_iu *iu = target->rx_ring[i];
                ret = srp_post_recv(target, iu);
                if (ret)
@@ -1672,6 +1856,7 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
                if (ib_send_cm_drep(cm_id, NULL, 0))
                        shost_printk(KERN_ERR, target->scsi_host,
                                     PFX "Sending CM DREP failed\n");
+               queue_work(system_long_wq, &target->tl_err_work);
                break;
 
        case IB_CM_TIMEWAIT_EXIT:
@@ -1698,9 +1883,61 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
        return 0;
 }
 
+/**
+ * srp_change_queue_type - changing device queue tag type
+ * @sdev: scsi device struct
+ * @tag_type: requested tag type
+ *
+ * Returns queue tag type.
+ */
+static int
+srp_change_queue_type(struct scsi_device *sdev, int tag_type)
+{
+       if (sdev->tagged_supported) {
+               scsi_set_tag_type(sdev, tag_type);
+               if (tag_type)
+                       scsi_activate_tcq(sdev, sdev->queue_depth);
+               else
+                       scsi_deactivate_tcq(sdev, sdev->queue_depth);
+       } else
+               tag_type = 0;
+
+       return tag_type;
+}
+
+/**
+ * srp_change_queue_depth - setting device queue depth
+ * @sdev: scsi device struct
+ * @qdepth: requested queue depth
+ * @reason: SCSI_QDEPTH_DEFAULT/SCSI_QDEPTH_QFULL/SCSI_QDEPTH_RAMP_UP
+ * (see include/scsi/scsi_host.h for definition)
+ *
+ * Returns queue depth.
+ */
+static int
+srp_change_queue_depth(struct scsi_device *sdev, int qdepth, int reason)
+{
+       struct Scsi_Host *shost = sdev->host;
+       int max_depth;
+       if (reason == SCSI_QDEPTH_DEFAULT || reason == SCSI_QDEPTH_RAMP_UP) {
+               max_depth = shost->can_queue;
+               if (!sdev->tagged_supported)
+                       max_depth = 1;
+               if (qdepth > max_depth)
+                       qdepth = max_depth;
+               scsi_adjust_queue_depth(sdev, scsi_get_tag_type(sdev), qdepth);
+       } else if (reason == SCSI_QDEPTH_QFULL)
+               scsi_track_queue_full(sdev, qdepth);
+       else
+               return -EOPNOTSUPP;
+
+       return sdev->queue_depth;
+}
+
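
The SCSI midlayer drives the hook above for explicit depth changes (SCSI_QDEPTH_DEFAULT), for automatic ramp-up after a quiet period (SCSI_QDEPTH_RAMP_UP), and on QUEUE FULL conditions (SCSI_QDEPTH_QFULL), where scsi_track_queue_full() gradually lowers the depth. An illustrative call, normally issued by the core rather than by the driver itself:

static void example_ramp_up(struct scsi_device *sdev)
{
        /* Illustrative only: ask for one more slot after a quiet period */
        srp_change_queue_depth(sdev, sdev->queue_depth + 1,
                               SCSI_QDEPTH_RAMP_UP);
}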
 static int srp_send_tsk_mgmt(struct srp_target_port *target,
                             u64 req_tag, unsigned int lun, u8 func)
 {
+       struct srp_rport *rport = target->rport;
        struct ib_device *dev = target->srp_host->srp_dev->dev;
        struct srp_iu *iu;
        struct srp_tsk_mgmt *tsk_mgmt;
@@ -1710,12 +1947,20 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target,
 
        init_completion(&target->tsk_mgmt_done);
 
+       /*
+        * Lock the rport mutex to avoid that srp_create_target_ib() is
+        * invoked while a task management function is being sent.
+        */
+       mutex_lock(&rport->mutex);
        spin_lock_irq(&target->lock);
        iu = __srp_get_tx_iu(target, SRP_IU_TSK_MGMT);
        spin_unlock_irq(&target->lock);
 
-       if (!iu)
+       if (!iu) {
+               mutex_unlock(&rport->mutex);
+
                return -1;
+       }
 
        ib_dma_sync_single_for_cpu(dev, iu->dma, sizeof *tsk_mgmt,
                                   DMA_TO_DEVICE);
@@ -1732,8 +1977,11 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target,
                                      DMA_TO_DEVICE);
        if (srp_post_send(target, iu, sizeof *tsk_mgmt)) {
                srp_put_tx_iu(target, iu, SRP_IU_TSK_MGMT);
+               mutex_unlock(&rport->mutex);
+
                return -1;
        }
+       mutex_unlock(&rport->mutex);
 
        if (!wait_for_completion_timeout(&target->tsk_mgmt_done,
                                         msecs_to_jiffies(SRP_ABORT_TIMEOUT_MS)))
@@ -1751,11 +1999,11 @@ static int srp_abort(struct scsi_cmnd *scmnd)
        shost_printk(KERN_ERR, target->scsi_host, "SRP abort called\n");
 
        if (!req || !srp_claim_req(target, req, scmnd))
-               return FAILED;
+               return SUCCESS;
        if (srp_send_tsk_mgmt(target, req->index, scmnd->device->lun,
                              SRP_TSK_ABORT_TASK) == 0)
                ret = SUCCESS;
-       else if (target->transport_offline)
+       else if (target->rport->state == SRP_RPORT_LOST)
                ret = FAST_IO_FAIL;
        else
                ret = FAILED;
@@ -1779,10 +2027,10 @@ static int srp_reset_device(struct scsi_cmnd *scmnd)
        if (target->tsk_mgmt_status)
                return FAILED;
 
-       for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) {
+       for (i = 0; i < target->req_ring_size; ++i) {
                struct srp_request *req = &target->req_ring[i];
                if (req->scmnd && req->scmnd->device == scmnd->device)
-                       srp_reset_req(target, req);
+                       srp_finish_req(target, req, DID_RESET << 16);
        }
 
        return SUCCESS;
@@ -1791,14 +2039,10 @@ static int srp_reset_device(struct scsi_cmnd *scmnd)
 static int srp_reset_host(struct scsi_cmnd *scmnd)
 {
        struct srp_target_port *target = host_to_target(scmnd->device->host);
-       int ret = FAILED;
 
        shost_printk(KERN_ERR, target->scsi_host, PFX "SRP reset_host called\n");
 
-       if (!srp_reconnect_target(target))
-               ret = SUCCESS;
-
-       return ret;
+       return srp_reconnect_rport(target->rport) == 0 ? SUCCESS : FAILED;
 }
 
 static int srp_slave_configure(struct scsi_device *sdev)
@@ -1851,6 +2095,14 @@ static ssize_t show_pkey(struct device *dev, struct device_attribute *attr,
        return sprintf(buf, "0x%04x\n", be16_to_cpu(target->path.pkey));
 }
 
+static ssize_t show_sgid(struct device *dev, struct device_attribute *attr,
+                        char *buf)
+{
+       struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+       return sprintf(buf, "%pI6\n", target->path.sgid.raw);
+}
+
 static ssize_t show_dgid(struct device *dev, struct device_attribute *attr,
                         char *buf)
 {
@@ -1907,6 +2159,14 @@ static ssize_t show_comp_vector(struct device *dev,
        return sprintf(buf, "%d\n", target->comp_vector);
 }
 
+static ssize_t show_tl_retry_count(struct device *dev,
+                                  struct device_attribute *attr, char *buf)
+{
+       struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+       return sprintf(buf, "%d\n", target->tl_retry_count);
+}
+
 static ssize_t show_cmd_sg_entries(struct device *dev,
                                   struct device_attribute *attr, char *buf)
 {
@@ -1927,6 +2187,7 @@ static DEVICE_ATTR(id_ext,            S_IRUGO, show_id_ext,          NULL);
 static DEVICE_ATTR(ioc_guid,       S_IRUGO, show_ioc_guid,        NULL);
 static DEVICE_ATTR(service_id,     S_IRUGO, show_service_id,      NULL);
 static DEVICE_ATTR(pkey,           S_IRUGO, show_pkey,            NULL);
+static DEVICE_ATTR(sgid,           S_IRUGO, show_sgid,            NULL);
 static DEVICE_ATTR(dgid,           S_IRUGO, show_dgid,            NULL);
 static DEVICE_ATTR(orig_dgid,      S_IRUGO, show_orig_dgid,       NULL);
 static DEVICE_ATTR(req_lim,         S_IRUGO, show_req_lim,         NULL);
@@ -1934,6 +2195,7 @@ static DEVICE_ATTR(zero_req_lim,    S_IRUGO, show_zero_req_lim,      NULL);
 static DEVICE_ATTR(local_ib_port,   S_IRUGO, show_local_ib_port,   NULL);
 static DEVICE_ATTR(local_ib_device, S_IRUGO, show_local_ib_device, NULL);
 static DEVICE_ATTR(comp_vector,     S_IRUGO, show_comp_vector,     NULL);
+static DEVICE_ATTR(tl_retry_count,  S_IRUGO, show_tl_retry_count,  NULL);
 static DEVICE_ATTR(cmd_sg_entries,  S_IRUGO, show_cmd_sg_entries,  NULL);
 static DEVICE_ATTR(allow_ext_sg,    S_IRUGO, show_allow_ext_sg,    NULL);
 
@@ -1942,6 +2204,7 @@ static struct device_attribute *srp_host_attrs[] = {
        &dev_attr_ioc_guid,
        &dev_attr_service_id,
        &dev_attr_pkey,
+       &dev_attr_sgid,
        &dev_attr_dgid,
        &dev_attr_orig_dgid,
        &dev_attr_req_lim,
@@ -1949,6 +2212,7 @@ static struct device_attribute *srp_host_attrs[] = {
        &dev_attr_local_ib_port,
        &dev_attr_local_ib_device,
        &dev_attr_comp_vector,
+       &dev_attr_tl_retry_count,
        &dev_attr_cmd_sg_entries,
        &dev_attr_allow_ext_sg,
        NULL
@@ -1961,14 +2225,16 @@ static struct scsi_host_template srp_template = {
        .slave_configure                = srp_slave_configure,
        .info                           = srp_target_info,
        .queuecommand                   = srp_queuecommand,
+       .change_queue_depth             = srp_change_queue_depth,
+       .change_queue_type              = srp_change_queue_type,
        .eh_abort_handler               = srp_abort,
        .eh_device_reset_handler        = srp_reset_device,
        .eh_host_reset_handler          = srp_reset_host,
        .skip_settle_delay              = true,
        .sg_tablesize                   = SRP_DEF_SG_TABLESIZE,
-       .can_queue                      = SRP_CMD_SQ_SIZE,
+       .can_queue                      = SRP_DEFAULT_CMD_SQ_SIZE,
        .this_id                        = -1,
-       .cmd_per_lun                    = SRP_CMD_SQ_SIZE,
+       .cmd_per_lun                    = SRP_DEFAULT_CMD_SQ_SIZE,
        .use_clustering                 = ENABLE_CLUSTERING,
        .shost_attrs                    = srp_host_attrs
 };
@@ -1994,6 +2260,7 @@ static int srp_add_target(struct srp_host *host, struct srp_target_port *target)
        }
 
        rport->lld_data = target;
+       target->rport = rport;
 
        spin_lock(&host->target_lock);
        list_add_tail(&target->list, &host->target_list);
@@ -2073,6 +2340,8 @@ enum {
        SRP_OPT_ALLOW_EXT_SG    = 1 << 10,
        SRP_OPT_SG_TABLESIZE    = 1 << 11,
        SRP_OPT_COMP_VECTOR     = 1 << 12,
+       SRP_OPT_TL_RETRY_COUNT  = 1 << 13,
+       SRP_OPT_QUEUE_SIZE      = 1 << 14,
        SRP_OPT_ALL             = (SRP_OPT_ID_EXT       |
                                   SRP_OPT_IOC_GUID     |
                                   SRP_OPT_DGID         |
@@ -2094,6 +2363,8 @@ static const match_table_t srp_opt_tokens = {
        { SRP_OPT_ALLOW_EXT_SG,         "allow_ext_sg=%u"       },
        { SRP_OPT_SG_TABLESIZE,         "sg_tablesize=%u"       },
        { SRP_OPT_COMP_VECTOR,          "comp_vector=%u"        },
+       { SRP_OPT_TL_RETRY_COUNT,       "tl_retry_count=%u"     },
+       { SRP_OPT_QUEUE_SIZE,           "queue_size=%d"         },
        { SRP_OPT_ERR,                  NULL                    }
 };
 
@@ -2188,13 +2459,25 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target)
                        target->scsi_host->max_sectors = token;
                        break;
 
+               case SRP_OPT_QUEUE_SIZE:
+                       if (match_int(args, &token) || token < 1) {
+                               pr_warn("bad queue_size parameter '%s'\n", p);
+                               goto out;
+                       }
+                       target->scsi_host->can_queue = token;
+                       target->queue_size = token + SRP_RSP_SQ_SIZE +
+                                            SRP_TSK_MGMT_SQ_SIZE;
+                       if (!(opt_mask & SRP_OPT_MAX_CMD_PER_LUN))
+                               target->scsi_host->cmd_per_lun = token;
+                       break;
+
                case SRP_OPT_MAX_CMD_PER_LUN:
-                       if (match_int(args, &token)) {
+                       if (match_int(args, &token) || token < 1) {
                                pr_warn("bad max cmd_per_lun parameter '%s'\n",
                                        p);
                                goto out;
                        }
-                       target->scsi_host->cmd_per_lun = min(token, SRP_CMD_SQ_SIZE);
+                       target->scsi_host->cmd_per_lun = token;
                        break;
 
                case SRP_OPT_IO_CLASS:
@@ -2257,6 +2540,15 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target)
                        target->comp_vector = token;
                        break;
 
+               case SRP_OPT_TL_RETRY_COUNT:
+                       if (match_int(args, &token) || token < 2 || token > 7) {
+                               pr_warn("bad tl_retry_count parameter '%s' (must be a number between 2 and 7)\n",
+                                       p);
+                               goto out;
+                       }
+                       target->tl_retry_count = token;
+                       break;
+
                default:
                        pr_warn("unknown parameter or missing value '%s' in target creation request\n",
                                p);
@@ -2273,6 +2565,12 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target)
                                pr_warn("target creation request is missing parameter '%s'\n",
                                        srp_opt_tokens[i].pattern);
 
+       if (target->scsi_host->cmd_per_lun > target->scsi_host->can_queue
+           && (opt_mask & SRP_OPT_MAX_CMD_PER_LUN))
+               pr_warn("cmd_per_lun = %d > queue_size = %d\n",
+                       target->scsi_host->cmd_per_lun,
+                       target->scsi_host->can_queue);
+
 out:
        kfree(options);
        return ret;
@@ -2287,8 +2585,7 @@ static ssize_t srp_create_target(struct device *dev,
        struct Scsi_Host *target_host;
        struct srp_target_port *target;
        struct ib_device *ibdev = host->srp_dev->dev;
-       dma_addr_t dma_addr;
-       int i, ret;
+       int ret;
 
        target_host = scsi_host_alloc(&srp_template,
                                      sizeof (struct srp_target_port));
@@ -2311,11 +2608,15 @@ static ssize_t srp_create_target(struct device *dev,
        target->cmd_sg_cnt      = cmd_sg_entries;
        target->sg_tablesize    = indirect_sg_entries ? : cmd_sg_entries;
        target->allow_ext_sg    = allow_ext_sg;
+       target->tl_retry_count  = 7;
+       target->queue_size      = SRP_DEFAULT_QUEUE_SIZE;
 
        ret = srp_parse_options(buf, target);
        if (ret)
                goto err;
 
+       target->req_ring_size = target->queue_size - SRP_TSK_MGMT_SQ_SIZE;
+
        if (!srp_conn_unique(target->srp_host, target)) {
                shost_printk(KERN_INFO, target->scsi_host,
                             PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;initiator_ext=%016llx\n",
@@ -2339,31 +2640,13 @@ static ssize_t srp_create_target(struct device *dev,
                             sizeof (struct srp_indirect_buf) +
                             target->cmd_sg_cnt * sizeof (struct srp_direct_buf);
 
+       INIT_WORK(&target->tl_err_work, srp_tl_err_work);
        INIT_WORK(&target->remove_work, srp_remove_work);
        spin_lock_init(&target->lock);
        INIT_LIST_HEAD(&target->free_tx);
-       INIT_LIST_HEAD(&target->free_reqs);
-       for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) {
-               struct srp_request *req = &target->req_ring[i];
-
-               req->fmr_list = kmalloc(target->cmd_sg_cnt * sizeof (void *),
-                                       GFP_KERNEL);
-               req->map_page = kmalloc(SRP_FMR_SIZE * sizeof (void *),
-                                       GFP_KERNEL);
-               req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL);
-               if (!req->fmr_list || !req->map_page || !req->indirect_desc)
-                       goto err_free_mem;
-
-               dma_addr = ib_dma_map_single(ibdev, req->indirect_desc,
-                                            target->indirect_size,
-                                            DMA_TO_DEVICE);
-               if (ib_dma_mapping_error(ibdev, dma_addr))
-                       goto err_free_mem;
-
-               req->indirect_dma_addr = dma_addr;
-               req->index = i;
-               list_add_tail(&req->list, &target->free_reqs);
-       }
+       ret = srp_alloc_req_data(target);
+       if (ret)
+               goto err_free_mem;
 
        ib_query_gid(ibdev, host->port, 0, &target->path.sgid);
 
@@ -2612,7 +2895,14 @@ static void srp_remove_one(struct ib_device *device)
 }
 
 static struct srp_function_template ib_srp_transport_functions = {
+       .has_rport_state         = true,
+       .reset_timer_if_blocked  = true,
+       .reconnect_delay         = &srp_reconnect_delay,
+       .fast_io_fail_tmo        = &srp_fast_io_fail_tmo,
+       .dev_loss_tmo            = &srp_dev_loss_tmo,
+       .reconnect               = srp_rport_reconnect,
        .rport_delete            = srp_rport_delete,
+       .terminate_rport_io      = srp_terminate_io,
 };
 
 static int __init srp_init_module(void)
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index e641088..5756810 100644
@@ -57,14 +57,11 @@ enum {
        SRP_MAX_LUN             = 512,
        SRP_DEF_SG_TABLESIZE    = 12,
 
-       SRP_RQ_SHIFT            = 6,
-       SRP_RQ_SIZE             = 1 << SRP_RQ_SHIFT,
-
-       SRP_SQ_SIZE             = SRP_RQ_SIZE,
+       SRP_DEFAULT_QUEUE_SIZE  = 1 << 6,
        SRP_RSP_SQ_SIZE         = 1,
-       SRP_REQ_SQ_SIZE         = SRP_SQ_SIZE - SRP_RSP_SQ_SIZE,
        SRP_TSK_MGMT_SQ_SIZE    = 1,
-       SRP_CMD_SQ_SIZE         = SRP_REQ_SQ_SIZE - SRP_TSK_MGMT_SQ_SIZE,
+       SRP_DEFAULT_CMD_SQ_SIZE = SRP_DEFAULT_QUEUE_SIZE - SRP_RSP_SQ_SIZE -
+                                 SRP_TSK_MGMT_SQ_SIZE,
 
        SRP_TAG_NO_REQ          = ~0U,
        SRP_TAG_TSK_MGMT        = 1U << 31,
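
This arithmetic is where the default can_queue of 62 comes from: the default 64-slot send queue loses one slot to SRP RSP responses and one to task management requests. A sanity-check sketch:

static inline int example_default_can_queue(void)
{
        /* 64 - 1 (RSP) - 1 (task mgmt) == 62 */
        return SRP_DEFAULT_QUEUE_SIZE - SRP_RSP_SQ_SIZE -
               SRP_TSK_MGMT_SQ_SIZE;
}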
@@ -140,7 +137,6 @@ struct srp_target_port {
        unsigned int            cmd_sg_cnt;
        unsigned int            indirect_size;
        bool                    allow_ext_sg;
-       bool                    transport_offline;
 
        /* Everything above this point is used in the hot path of
         * command processing. Try to keep them packed into cachelines.
@@ -153,10 +149,14 @@ struct srp_target_port {
        u16                     io_class;
        struct srp_host        *srp_host;
        struct Scsi_Host       *scsi_host;
+       struct srp_rport       *rport;
        char                    target_name[32];
        unsigned int            scsi_id;
        unsigned int            sg_tablesize;
+       int                     queue_size;
+       int                     req_ring_size;
        int                     comp_vector;
+       int                     tl_retry_count;
 
        struct ib_sa_path_rec   path;
        __be16                  orig_dgid[8];
@@ -172,10 +172,11 @@ struct srp_target_port {
 
        int                     zero_req_lim;
 
-       struct srp_iu          *tx_ring[SRP_SQ_SIZE];
-       struct srp_iu          *rx_ring[SRP_RQ_SIZE];
-       struct srp_request      req_ring[SRP_CMD_SQ_SIZE];
+       struct srp_iu          **tx_ring;
+       struct srp_iu          **rx_ring;
+       struct srp_request      *req_ring;
 
+       struct work_struct      tl_err_work;
        struct work_struct      remove_work;
 
        struct list_head        list;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 6ca3073..8675d26 100644
@@ -98,6 +98,7 @@ enum {
 static struct mlx5_cmd_work_ent *alloc_cmd(struct mlx5_cmd *cmd,
                                           struct mlx5_cmd_msg *in,
                                           struct mlx5_cmd_msg *out,
+                                          void *uout, int uout_size,
                                           mlx5_cmd_cbk_t cbk,
                                           void *context, int page_queue)
 {
@@ -110,6 +111,8 @@ static struct mlx5_cmd_work_ent *alloc_cmd(struct mlx5_cmd *cmd,
 
        ent->in         = in;
        ent->out        = out;
+       ent->uout       = uout;
+       ent->uout_size  = uout_size;
        ent->callback   = cbk;
        ent->context    = context;
        ent->cmd        = cmd;
@@ -534,6 +537,7 @@ static void cmd_work_handler(struct work_struct *work)
        ent->lay = lay;
        memset(lay, 0, sizeof(*lay));
        memcpy(lay->in, ent->in->first.data, sizeof(lay->in));
+       ent->op = be32_to_cpu(lay->in[0]) >> 16;
        if (ent->in->next)
                lay->in_ptr = cpu_to_be64(ent->in->next->dma);
        lay->inlen = cpu_to_be32(ent->in->len);
@@ -628,7 +632,8 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent)
 *    2. page queue commands do not support asynchronous completion
  */
 static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in,
-                          struct mlx5_cmd_msg *out, mlx5_cmd_cbk_t callback,
+                          struct mlx5_cmd_msg *out, void *uout, int uout_size,
+                          mlx5_cmd_cbk_t callback,
                           void *context, int page_queue, u8 *status)
 {
        struct mlx5_cmd *cmd = &dev->cmd;
@@ -642,7 +647,8 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in,
        if (callback && page_queue)
                return -EINVAL;
 
-       ent = alloc_cmd(cmd, in, out, callback, context, page_queue);
+       ent = alloc_cmd(cmd, in, out, uout, uout_size, callback, context,
+                       page_queue);
        if (IS_ERR(ent))
                return PTR_ERR(ent);
 
@@ -670,10 +676,10 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in,
                op = be16_to_cpu(((struct mlx5_inbox_hdr *)in->first.data)->opcode);
                if (op < ARRAY_SIZE(cmd->stats)) {
                        stats = &cmd->stats[op];
-                       spin_lock(&stats->lock);
+                       spin_lock_irq(&stats->lock);
                        stats->sum += ds;
                        ++stats->n;
-                       spin_unlock(&stats->lock);
+                       spin_unlock_irq(&stats->lock);
                }
                mlx5_core_dbg_mask(dev, 1 << MLX5_CMD_TIME,
                                   "fw exec time for %s is %lld nsec\n",
@@ -826,7 +832,7 @@ static struct mlx5_cmd_msg *mlx5_alloc_cmd_msg(struct mlx5_core_dev *dev,
        int n;
        int i;
 
-       msg = kzalloc(sizeof(*msg), GFP_KERNEL);
+       msg = kzalloc(sizeof(*msg), flags);
        if (!msg)
                return ERR_PTR(-ENOMEM);
 
@@ -1109,6 +1115,19 @@ void mlx5_cmd_use_polling(struct mlx5_core_dev *dev)
                up(&cmd->sem);
 }
 
+static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg)
+{
+       unsigned long flags;
+
+       if (msg->cache) {
+               spin_lock_irqsave(&msg->cache->lock, flags);
+               list_add_tail(&msg->list, &msg->cache->head);
+               spin_unlock_irqrestore(&msg->cache->lock, flags);
+       } else {
+               mlx5_free_cmd_msg(dev, msg);
+       }
+}
+
 void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector)
 {
        struct mlx5_cmd *cmd = &dev->cmd;
@@ -1117,6 +1136,10 @@ void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector)
        void *context;
        int err;
        int i;
+       ktime_t t1, t2, delta;
+       s64 ds;
+       struct mlx5_cmd_stats *stats;
+       unsigned long flags;
 
        for (i = 0; i < (1 << cmd->log_sz); i++) {
                if (test_bit(i, &vector)) {
@@ -1141,9 +1164,29 @@ void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector)
                        }
                        free_ent(cmd, ent->idx);
                        if (ent->callback) {
+                               t1 = timespec_to_ktime(ent->ts1);
+                               t2 = timespec_to_ktime(ent->ts2);
+                               delta = ktime_sub(t2, t1);
+                               ds = ktime_to_ns(delta);
+                               if (ent->op < ARRAY_SIZE(cmd->stats)) {
+                                       stats = &cmd->stats[ent->op];
+                                       spin_lock_irqsave(&stats->lock, flags);
+                                       stats->sum += ds;
+                                       ++stats->n;
+                                       spin_unlock_irqrestore(&stats->lock, flags);
+                               }
+
                                callback = ent->callback;
                                context = ent->context;
                                err = ent->ret;
+                               if (!err)
+                                       err = mlx5_copy_from_msg(ent->uout,
+                                                                ent->out,
+                                                                ent->uout_size);
+
+                               mlx5_free_cmd_msg(dev, ent->out);
+                               free_msg(dev, ent->in);
+
                                free_cmd(ent);
                                callback(err, context);
                        } else {
@@ -1160,7 +1203,8 @@ static int status_to_err(u8 status)
        return status ? -1 : 0; /* TBD more meaningful codes */
 }
 
-static struct mlx5_cmd_msg *alloc_msg(struct mlx5_core_dev *dev, int in_size)
+static struct mlx5_cmd_msg *alloc_msg(struct mlx5_core_dev *dev, int in_size,
+                                     gfp_t gfp)
 {
        struct mlx5_cmd_msg *msg = ERR_PTR(-ENOMEM);
        struct mlx5_cmd *cmd = &dev->cmd;
@@ -1172,7 +1216,7 @@ static struct mlx5_cmd_msg *alloc_msg(struct mlx5_core_dev *dev, int in_size)
                ent = &cmd->cache.med;
 
        if (ent) {
-               spin_lock(&ent->lock);
+               spin_lock_irq(&ent->lock);
                if (!list_empty(&ent->head)) {
                        msg = list_entry(ent->head.next, typeof(*msg), list);
                        /* For cached lists, we must explicitly state what is
@@ -1181,43 +1225,34 @@ static struct mlx5_cmd_msg *alloc_msg(struct mlx5_core_dev *dev, int in_size)
                        msg->len = in_size;
                        list_del(&msg->list);
                }
-               spin_unlock(&ent->lock);
+               spin_unlock_irq(&ent->lock);
        }
 
        if (IS_ERR(msg))
-               msg = mlx5_alloc_cmd_msg(dev, GFP_KERNEL, in_size);
+               msg = mlx5_alloc_cmd_msg(dev, gfp, in_size);
 
        return msg;
 }
 
-static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg)
-{
-       if (msg->cache) {
-               spin_lock(&msg->cache->lock);
-               list_add_tail(&msg->list, &msg->cache->head);
-               spin_unlock(&msg->cache->lock);
-       } else {
-               mlx5_free_cmd_msg(dev, msg);
-       }
-}
-
 static int is_manage_pages(struct mlx5_inbox_hdr *in)
 {
        return be16_to_cpu(in->opcode) == MLX5_CMD_OP_MANAGE_PAGES;
 }
 
-int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
-                 int out_size)
+static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
+                   int out_size, mlx5_cmd_cbk_t callback, void *context)
 {
        struct mlx5_cmd_msg *inb;
        struct mlx5_cmd_msg *outb;
        int pages_queue;
+       gfp_t gfp;
        int err;
        u8 status = 0;
 
        pages_queue = is_manage_pages(in);
+       gfp = callback ? GFP_ATOMIC : GFP_KERNEL;
 
-       inb = alloc_msg(dev, in_size);
+       inb = alloc_msg(dev, in_size, gfp);
        if (IS_ERR(inb)) {
                err = PTR_ERR(inb);
                return err;
@@ -1229,13 +1264,14 @@ int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
                goto out_in;
        }
 
-       outb = mlx5_alloc_cmd_msg(dev, GFP_KERNEL, out_size);
+       outb = mlx5_alloc_cmd_msg(dev, gfp, out_size);
        if (IS_ERR(outb)) {
                err = PTR_ERR(outb);
                goto out_in;
        }
 
-       err = mlx5_cmd_invoke(dev, inb, outb, NULL, NULL, pages_queue, &status);
+       err = mlx5_cmd_invoke(dev, inb, outb, out, out_size, callback, context,
+                             pages_queue, &status);
        if (err)
                goto out_out;
 
@@ -1248,14 +1284,30 @@ int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
        err = mlx5_copy_from_msg(out, outb, out_size);
 
 out_out:
-       mlx5_free_cmd_msg(dev, outb);
+       if (!callback)
+               mlx5_free_cmd_msg(dev, outb);
 
 out_in:
-       free_msg(dev, inb);
+       if (!callback)
+               free_msg(dev, inb);
        return err;
 }
+
+int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
+                 int out_size)
+{
+       return cmd_exec(dev, in, in_size, out, out_size, NULL, NULL);
+}
 EXPORT_SYMBOL(mlx5_cmd_exec);
 
+int mlx5_cmd_exec_cb(struct mlx5_core_dev *dev, void *in, int in_size,
+                    void *out, int out_size, mlx5_cmd_cbk_t callback,
+                    void *context)
+{
+       return cmd_exec(dev, in, in_size, out, out_size, callback, context);
+}
+EXPORT_SYMBOL(mlx5_cmd_exec_cb);
+
 static void destroy_msg_cache(struct mlx5_core_dev *dev)
 {
        struct mlx5_cmd *cmd = &dev->cmd;
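
With the callback plumbing above, consumers can issue mailbox commands without
sleeping. A hedged sketch of an asynchronous caller (the context structure and
helpers are illustrative, not from this patch); the out buffer must outlive the
command because mlx5_cmd_comp_handler() copies the result into it and frees the
mailboxes itself, and allocations on this path use GFP_ATOMIC:

struct my_async_ctx {
        struct completion       done;
        int                     status;
        u8                      out[64];        /* must outlive the command */
};

static void my_cmd_done(int status, void *context)
{
        struct my_async_ctx *ctx = context;

        /* invoked from the command completion path; no sleeping here */
        ctx->status = status;
        complete(&ctx->done);
}

static int my_exec_async(struct mlx5_core_dev *dev, void *in, int in_size,
                         struct my_async_ctx *ctx)
{
        init_completion(&ctx->done);
        return mlx5_cmd_exec_cb(dev, in, in_size, ctx->out, sizeof(ctx->out),
                                my_cmd_done, ctx);
}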
index 9c7194b..80f6d12 100644 (file)
@@ -154,10 +154,10 @@ static ssize_t average_read(struct file *filp, char __user *buf, size_t count,
                return 0;
 
        stats = filp->private_data;
-       spin_lock(&stats->lock);
+       spin_lock_irq(&stats->lock);
        if (stats->n)
                field = div64_u64(stats->sum, stats->n);
-       spin_unlock(&stats->lock);
+       spin_unlock_irq(&stats->lock);
        ret = snprintf(tbuf, sizeof(tbuf), "%llu\n", field);
        if (ret > 0) {
                if (copy_to_user(buf, tbuf, ret))
@@ -175,10 +175,10 @@ static ssize_t average_write(struct file *filp, const char __user *buf,
        struct mlx5_cmd_stats *stats;
 
        stats = filp->private_data;
-       spin_lock(&stats->lock);
+       spin_lock_irq(&stats->lock);
        stats->sum = 0;
        stats->n = 0;
-       spin_unlock(&stats->lock);
+       spin_unlock_irq(&stats->lock);
 
        *pos += count;
 
index 2231d93..64a61b2 100644 (file)
@@ -354,7 +354,7 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
        in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_EQ);
        in->ctx.log_sz_usr_page = cpu_to_be32(ilog2(eq->nent) << 24 | uar->index);
        in->ctx.intr = vecidx;
-       in->ctx.log_page_size = PAGE_SHIFT - 12;
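+       /* the device takes log_page_size in units of 4K adapter pages,
+        * e.g. 64K kernel pages give log_page_size = 16 - 12 = 4
+        */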
+       in->ctx.log_page_size = eq->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT;
        in->events_mask = cpu_to_be64(mask);
 
        err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out));
index bc0f5fb..40a9f5e 100644 (file)
@@ -159,6 +159,36 @@ struct mlx5_reg_host_endianess {
        u8      rsvd[15];
 };
 
+
+#define CAP_MASK(pos, size) ((u64)((1 << (size)) - 1) << (pos))
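+/* e.g. CAP_MASK(MLX5_CAP_OFF_CMDIF_CSUM, 2) covers flags bits 46..47 */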
+
+enum {
+       MLX5_CAP_BITS_RW_MASK   = CAP_MASK(MLX5_CAP_OFF_CMDIF_CSUM, 2) |
+                                 CAP_MASK(MLX5_CAP_OFF_DCT, 1),
+};
+
+/* selectively copy writable fields clearing any reserved area
+ */
+static void copy_rw_fields(struct mlx5_hca_cap *to, struct mlx5_hca_cap *from)
+{
+       u64 v64;
+
+       to->log_max_qp = from->log_max_qp & 0x1f;
+       to->log_max_ra_req_dc = from->log_max_ra_req_dc & 0x3f;
+       to->log_max_ra_res_dc = from->log_max_ra_res_dc & 0x3f;
+       to->log_max_ra_req_qp = from->log_max_ra_req_qp & 0x3f;
+       to->log_max_ra_res_qp = from->log_max_ra_res_qp & 0x3f;
+       to->log_max_atomic_size_qp = from->log_max_atomic_size_qp;
+       to->log_max_atomic_size_dc = from->log_max_atomic_size_dc;
+       v64 = be64_to_cpu(from->flags) & MLX5_CAP_BITS_RW_MASK;
+       to->flags = cpu_to_be64(v64);
+}
+
+enum {
+       HCA_CAP_OPMOD_GET_MAX   = 0,
+       HCA_CAP_OPMOD_GET_CUR   = 1,
+};
+
 static int handle_hca_cap(struct mlx5_core_dev *dev)
 {
        struct mlx5_cmd_query_hca_cap_mbox_out *query_out = NULL;
@@ -180,7 +210,7 @@ static int handle_hca_cap(struct mlx5_core_dev *dev)
        }
 
        query_ctx.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_HCA_CAP);
-       query_ctx.hdr.opmod  = cpu_to_be16(0x1);
+       query_ctx.hdr.opmod  = cpu_to_be16(HCA_CAP_OPMOD_GET_CUR);
        err = mlx5_cmd_exec(dev, &query_ctx, sizeof(query_ctx),
                                 query_out, sizeof(*query_out));
        if (err)
@@ -192,8 +222,7 @@ static int handle_hca_cap(struct mlx5_core_dev *dev)
                goto query_ex;
        }
 
-       memcpy(&set_ctx->hca_cap, &query_out->hca_cap,
-              sizeof(set_ctx->hca_cap));
+       copy_rw_fields(&set_ctx->hca_cap, &query_out->hca_cap);
 
        if (dev->profile->mask & MLX5_PROF_MASK_QP_SIZE)
                set_ctx->hca_cap.log_max_qp = dev->profile->log_max_qp;
index 5b44e2e..35e514d 100644 (file)
 #include "mlx5_core.h"
 
 int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr,
-                         struct mlx5_create_mkey_mbox_in *in, int inlen)
+                         struct mlx5_create_mkey_mbox_in *in, int inlen,
+                         mlx5_cmd_cbk_t callback, void *context,
+                         struct mlx5_create_mkey_mbox_out *out)
 {
-       struct mlx5_create_mkey_mbox_out out;
+       struct mlx5_create_mkey_mbox_out lout;
        int err;
        u8 key;
 
-       memset(&out, 0, sizeof(out));
-       spin_lock(&dev->priv.mkey_lock);
+       memset(&lout, 0, sizeof(lout));
+       spin_lock_irq(&dev->priv.mkey_lock);
        key = dev->priv.mkey_key++;
-       spin_unlock(&dev->priv.mkey_lock);
+       spin_unlock_irq(&dev->priv.mkey_lock);
        in->seg.qpn_mkey7_0 |= cpu_to_be32(key);
        in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_MKEY);
-       err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out));
+       if (callback) {
+               err = mlx5_cmd_exec_cb(dev, in, inlen, out, sizeof(*out),
+                                      callback, context);
+               return err;
+       } else {
+               err = mlx5_cmd_exec(dev, in, inlen, &lout, sizeof(lout));
+       }
+
        if (err) {
                mlx5_core_dbg(dev, "cmd exec failed %d\n", err);
                return err;
        }
 
-       if (out.hdr.status) {
-               mlx5_core_dbg(dev, "status %d\n", out.hdr.status);
-               return mlx5_cmd_status_to_err(&out.hdr);
+       if (lout.hdr.status) {
+               mlx5_core_dbg(dev, "status %d\n", lout.hdr.status);
+               return mlx5_cmd_status_to_err(&lout.hdr);
        }
 
-       mr->key = mlx5_idx_to_mkey(be32_to_cpu(out.mkey) & 0xffffff) | key;
-       mlx5_core_dbg(dev, "out 0x%x, key 0x%x, mkey 0x%x\n", be32_to_cpu(out.mkey), key, mr->key);
+       mr->key = mlx5_idx_to_mkey(be32_to_cpu(lout.mkey) & 0xffffff) | key;
+       mlx5_core_dbg(dev, "out 0x%x, key 0x%x, mkey 0x%x\n",
+                     be32_to_cpu(lout.mkey), key, mr->key);
 
        return err;
 }
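
Callers that cannot block can now create mkeys asynchronously. A hedged sketch
of the callback path (the context structure is illustrative; the out mailbox
must stay allocated until the callback fires, and deriving mr->key from the
returned index is left to the caller on this path):

struct my_mkey_ctx {
        struct mlx5_create_mkey_mbox_out out; /* written before the callback */
        struct mlx5_core_mr              mr;
};

static void my_mkey_done(int status, void *context)
{
        struct my_mkey_ctx *ctx = context;

        if (status || ctx->out.hdr.status) {
                pr_warn("mkey creation failed\n");
                return;
        }
        pr_debug("mkey index 0x%x\n", be32_to_cpu(ctx->out.mkey) & 0xffffff);
}

static int my_create_mkey_async(struct mlx5_core_dev *dev,
                                struct mlx5_create_mkey_mbox_in *in, int inlen,
                                struct my_mkey_ctx *ctx)
{
        return mlx5_core_create_mkey(dev, &ctx->mr, in, inlen,
                                     my_mkey_done, ctx, &ctx->out);
}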
index 7b12acf..37b6ad1 100644 (file)
@@ -57,10 +57,13 @@ struct mlx5_pages_req {
 };
 
 struct fw_page {
-       struct rb_node  rb_node;
-       u64             addr;
-       struct page     *page;
-       u16             func_id;
+       struct rb_node          rb_node;
+       u64                     addr;
+       struct page            *page;
+       u16                     func_id;
+       unsigned long           bitmask;
+       struct list_head        list;
+       unsigned                free_count;
 };
 
 struct mlx5_query_pages_inbox {
@@ -94,6 +97,11 @@ enum {
        MAX_RECLAIM_TIME_MSECS  = 5000,
 };
 
+enum {
+       MLX5_MAX_RECLAIM_TIME_MILI      = 5000,
+       MLX5_NUM_4K_IN_PAGE             = PAGE_SIZE / 4096,
+};
+
 static int insert_page(struct mlx5_core_dev *dev, u64 addr, struct page *page, u16 func_id)
 {
        struct rb_root *root = &dev->priv.page_root;
@@ -101,6 +109,7 @@ static int insert_page(struct mlx5_core_dev *dev, u64 addr, struct page *page, u
        struct rb_node *parent = NULL;
        struct fw_page *nfp;
        struct fw_page *tfp;
+       int i;
 
        while (*new) {
                parent = *new;
@@ -113,25 +122,29 @@ static int insert_page(struct mlx5_core_dev *dev, u64 addr, struct page *page, u
                        return -EEXIST;
        }
 
-       nfp = kmalloc(sizeof(*nfp), GFP_KERNEL);
+       nfp = kzalloc(sizeof(*nfp), GFP_KERNEL);
        if (!nfp)
                return -ENOMEM;
 
        nfp->addr = addr;
        nfp->page = page;
        nfp->func_id = func_id;
+       nfp->free_count = MLX5_NUM_4K_IN_PAGE;
+       for (i = 0; i < MLX5_NUM_4K_IN_PAGE; i++)
+               set_bit(i, &nfp->bitmask);
 
        rb_link_node(&nfp->rb_node, parent, new);
        rb_insert_color(&nfp->rb_node, root);
+       list_add(&nfp->list, &dev->priv.free_list);
 
        return 0;
 }
 
-static struct page *remove_page(struct mlx5_core_dev *dev, u64 addr)
+static struct fw_page *find_fw_page(struct mlx5_core_dev *dev, u64 addr)
 {
        struct rb_root *root = &dev->priv.page_root;
        struct rb_node *tmp = root->rb_node;
-       struct page *result = NULL;
+       struct fw_page *result = NULL;
        struct fw_page *tfp;
 
        while (tmp) {
@@ -141,9 +154,7 @@ static struct page *remove_page(struct mlx5_core_dev *dev, u64 addr)
                } else if (tfp->addr > addr) {
                        tmp = tmp->rb_right;
                } else {
-                       rb_erase(&tfp->rb_node, root);
-                       result = tfp->page;
-                       kfree(tfp);
+                       result = tfp;
                        break;
                }
        }
@@ -176,12 +187,98 @@ static int mlx5_cmd_query_pages(struct mlx5_core_dev *dev, u16 *func_id,
        return err;
 }
 
+static int alloc_4k(struct mlx5_core_dev *dev, u64 *addr)
+{
+       struct fw_page *fp;
+       unsigned n;
+
+       if (list_empty(&dev->priv.free_list)) {
+               mlx5_core_warn(dev, "no free pages available\n");
+               return -ENOMEM;
+       }
+
+       fp = list_entry(dev->priv.free_list.next, struct fw_page, list);
+       n = find_first_bit(&fp->bitmask, 8 * sizeof(fp->bitmask));
+       if (n >= MLX5_NUM_4K_IN_PAGE) {
+               mlx5_core_warn(dev, "alloc 4k bug\n");
+               return -ENOENT;
+       }
+       clear_bit(n, &fp->bitmask);
+       fp->free_count--;
+       if (!fp->free_count)
+               list_del(&fp->list);
+
+       *addr = fp->addr + n * 4096;
+
+       return 0;
+}
+
+static void free_4k(struct mlx5_core_dev *dev, u64 addr)
+{
+       struct fw_page *fwp;
+       int n;
+
+       fwp = find_fw_page(dev, addr & PAGE_MASK);
+       if (!fwp) {
+               mlx5_core_warn(dev, "page not found\n");
+               return;
+       }
+
+       /* index of the 4K unit within the host page */
+       n = (addr & ~PAGE_MASK) / 4096;
+       fwp->free_count++;
+       set_bit(n, &fwp->bitmask);
+       if (fwp->free_count == MLX5_NUM_4K_IN_PAGE) {
+               rb_erase(&fwp->rb_node, &dev->priv.page_root);
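+               /* when MLX5_NUM_4K_IN_PAGE == 1 the page goes straight from
+                * fully used to fully free and was never on the free list
+                */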
+               if (fwp->free_count != 1)
+                       list_del(&fwp->list);
+               dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
+               __free_page(fwp->page);
+               kfree(fwp);
+       } else if (fwp->free_count == 1) {
+               list_add(&fwp->list, &dev->priv.free_list);
+       }
+}
+
+static int alloc_system_page(struct mlx5_core_dev *dev, u16 func_id)
+{
+       struct page *page;
+       u64 addr;
+       int err;
+
+       page = alloc_page(GFP_HIGHUSER);
+       if (!page) {
+               mlx5_core_warn(dev, "failed to allocate page\n");
+               return -ENOMEM;
+       }
+       addr = dma_map_page(&dev->pdev->dev, page, 0,
+                           PAGE_SIZE, DMA_BIDIRECTIONAL);
+       if (dma_mapping_error(&dev->pdev->dev, addr)) {
+               mlx5_core_warn(dev, "failed dma mapping page\n");
+               err = -ENOMEM;
+               goto out_alloc;
+       }
+       err = insert_page(dev, addr, page, func_id);
+       if (err) {
+               mlx5_core_err(dev, "failed to track allocated page\n");
+               goto out_mapping;
+       }
+
+       return 0;
+
+out_mapping:
+       dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
+
+out_alloc:
+       __free_page(page);
+
+       return err;
+}
+
 static int give_pages(struct mlx5_core_dev *dev, u16 func_id, int npages,
                      int notify_fail)
 {
        struct mlx5_manage_pages_inbox *in;
        struct mlx5_manage_pages_outbox out;
-       struct page *page;
+       struct mlx5_manage_pages_inbox *nin;
        int inlen;
        u64 addr;
        int err;
@@ -196,27 +293,15 @@ static int give_pages(struct mlx5_core_dev *dev, u16 func_id, int npages,
        memset(&out, 0, sizeof(out));
 
        for (i = 0; i < npages; i++) {
-               page = alloc_page(GFP_HIGHUSER);
-               if (!page) {
-                       err = -ENOMEM;
-                       mlx5_core_warn(dev, "failed to allocate page\n");
-                       goto out_alloc;
-               }
-               addr = dma_map_page(&dev->pdev->dev, page, 0,
-                                   PAGE_SIZE, DMA_BIDIRECTIONAL);
-               if (dma_mapping_error(&dev->pdev->dev, addr)) {
-                       mlx5_core_warn(dev, "failed dma mapping page\n");
-                       __free_page(page);
-                       err = -ENOMEM;
-                       goto out_alloc;
-               }
-               err = insert_page(dev, addr, page, func_id);
+retry:
+               err = alloc_4k(dev, &addr);
                if (err) {
-                       mlx5_core_err(dev, "failed to track allocated page\n");
-                       dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
-                       __free_page(page);
-                       err = -ENOMEM;
-                       goto out_alloc;
+                       if (err == -ENOMEM)
+                               err = alloc_system_page(dev, func_id);
+                       if (err)
+                               goto out_4k;
+
+                       goto retry;
                }
                in->pas[i] = cpu_to_be64(addr);
        }
@@ -226,7 +311,6 @@ static int give_pages(struct mlx5_core_dev *dev, u16 func_id, int npages,
        in->func_id = cpu_to_be16(func_id);
        in->num_entries = cpu_to_be32(npages);
        err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out));
-       mlx5_core_dbg(dev, "err %d\n", err);
        if (err) {
                mlx5_core_warn(dev, "func_id 0x%x, npages %d, err %d\n", func_id, npages, err);
                goto out_alloc;
@@ -247,25 +331,22 @@ static int give_pages(struct mlx5_core_dev *dev, u16 func_id, int npages,
 
 out_alloc:
        if (notify_fail) {
-               memset(in, 0, inlen);
-               memset(&out, 0, sizeof(out));
-               in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MANAGE_PAGES);
-               in->hdr.opmod = cpu_to_be16(MLX5_PAGES_CANT_GIVE);
-               if (mlx5_cmd_exec(dev, in, sizeof(*in), &out, sizeof(out)))
-                       mlx5_core_warn(dev, "\n");
-       }
-       for (i--; i >= 0; i--) {
-               addr = be64_to_cpu(in->pas[i]);
-               page = remove_page(dev, addr);
-               if (!page) {
-                       mlx5_core_err(dev, "BUG: can't remove page at addr 0x%llx\n",
-                                     addr);
-                       continue;
+               nin = kzalloc(sizeof(*nin), GFP_KERNEL);
+               if (!nin) {
+                       mlx5_core_warn(dev, "allocation failed\n");
+                       goto out_4k;
                }
-               dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
-               __free_page(page);
+               memset(&out, 0, sizeof(out));
+               nin->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MANAGE_PAGES);
+               nin->hdr.opmod = cpu_to_be16(MLX5_PAGES_CANT_GIVE);
+               if (mlx5_cmd_exec(dev, nin, sizeof(*nin), &out, sizeof(out)))
+                       mlx5_core_warn(dev, "page notify failed\n");
+               kfree(nin);
        }
 
+out_4k:
+       for (i--; i >= 0; i--)
+               free_4k(dev, be64_to_cpu(in->pas[i]));
 out_free:
        mlx5_vfree(in);
        return err;
@@ -276,7 +357,6 @@ static int reclaim_pages(struct mlx5_core_dev *dev, u32 func_id, int npages,
 {
        struct mlx5_manage_pages_inbox   in;
        struct mlx5_manage_pages_outbox *out;
-       struct page *page;
        int num_claimed;
        int outlen;
        u64 addr;
@@ -315,13 +395,7 @@ static int reclaim_pages(struct mlx5_core_dev *dev, u32 func_id, int npages,
 
        for (i = 0; i < num_claimed; i++) {
                addr = be64_to_cpu(out->pas[i]);
-               page = remove_page(dev, addr);
-               if (!page) {
-                       mlx5_core_warn(dev, "FW reported unknown DMA address 0x%llx\n", addr);
-               } else {
-                       dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
-                       __free_page(page);
-               }
+               free_4k(dev, addr);
        }
 
 out_free:
@@ -381,14 +455,19 @@ int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot)
        return give_pages(dev, func_id, npages, 0);
 }
 
+enum {
+       MLX5_BLKS_FOR_RECLAIM_PAGES = 12
+};
+
 static int optimal_reclaimed_pages(void)
 {
        struct mlx5_cmd_prot_block *block;
        struct mlx5_cmd_layout *lay;
        int ret;
 
-       ret = (sizeof(lay->in) + sizeof(block->data) -
-              sizeof(struct mlx5_manage_pages_outbox)) / 8;
+       ret = (sizeof(lay->out) + MLX5_BLKS_FOR_RECLAIM_PAGES * sizeof(block->data) -
+              sizeof(struct mlx5_manage_pages_outbox)) /
+              FIELD_SIZEOF(struct mlx5_manage_pages_outbox, pas[0]);
 
        return ret;
 }
@@ -427,6 +506,7 @@ int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev)
 void mlx5_pagealloc_init(struct mlx5_core_dev *dev)
 {
        dev->priv.page_root = RB_ROOT;
+       INIT_LIST_HEAD(&dev->priv.free_list);
 }
 
 void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev)
index f379c7f..2700a5a 100644 (file)
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <linux/delay.h>
 
 #include <scsi/scsi.h>
+#include <scsi/scsi_cmnd.h>
 #include <scsi/scsi_device.h>
 #include <scsi/scsi_host.h>
 #include <scsi/scsi_transport.h>
 #include <scsi/scsi_transport_srp.h>
+#include "scsi_priv.h"
 #include "scsi_transport_srp_internal.h"
 
 struct srp_host_attrs {
@@ -38,7 +41,7 @@ struct srp_host_attrs {
 #define to_srp_host_attrs(host)        ((struct srp_host_attrs *)(host)->shost_data)
 
 #define SRP_HOST_ATTRS 0
-#define SRP_RPORT_ATTRS 3
+#define SRP_RPORT_ATTRS 8
 
 struct srp_internal {
        struct scsi_transport_template t;
@@ -54,6 +57,36 @@ struct srp_internal {
 
 #define        dev_to_rport(d) container_of(d, struct srp_rport, dev)
 #define transport_class_to_srp_rport(dev) dev_to_rport((dev)->parent)
+static inline struct Scsi_Host *rport_to_shost(struct srp_rport *r)
+{
+       return dev_to_shost(r->dev.parent);
+}
+
+/**
+ * srp_tmo_valid() - check timeout combination validity
+ *
+ * The combination of the timeout parameters must be such that SCSI commands
+ * are finished in a reasonable time. Hence do not allow the fast I/O fail
+ * timeout to exceed SCSI_DEVICE_BLOCK_MAX_TIMEOUT. Furthermore, these
+ * parameters must be such that multipath can detect failed paths in a
+ * timely manner. Hence do not allow all three parameters to be disabled
+ * simultaneously.
+ */
+int srp_tmo_valid(int reconnect_delay, int fast_io_fail_tmo, int dev_loss_tmo)
+{
+       if (reconnect_delay < 0 && fast_io_fail_tmo < 0 && dev_loss_tmo < 0)
+               return -EINVAL;
+       if (reconnect_delay == 0)
+               return -EINVAL;
+       if (fast_io_fail_tmo > SCSI_DEVICE_BLOCK_MAX_TIMEOUT)
+               return -EINVAL;
+       if (dev_loss_tmo >= LONG_MAX / HZ)
+               return -EINVAL;
+       if (fast_io_fail_tmo >= 0 && dev_loss_tmo >= 0 &&
+           fast_io_fail_tmo >= dev_loss_tmo)
+               return -EINVAL;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(srp_tmo_valid);
 
 static int srp_host_setup(struct transport_container *tc, struct device *dev,
                          struct device *cdev)
@@ -134,10 +167,465 @@ static ssize_t store_srp_rport_delete(struct device *dev,
 
 static DEVICE_ATTR(delete, S_IWUSR, NULL, store_srp_rport_delete);
 
+static ssize_t show_srp_rport_state(struct device *dev,
+                                   struct device_attribute *attr,
+                                   char *buf)
+{
+       static const char *const state_name[] = {
+               [SRP_RPORT_RUNNING]     = "running",
+               [SRP_RPORT_BLOCKED]     = "blocked",
+               [SRP_RPORT_FAIL_FAST]   = "fail-fast",
+               [SRP_RPORT_LOST]        = "lost",
+       };
+       struct srp_rport *rport = transport_class_to_srp_rport(dev);
+       enum srp_rport_state state = rport->state;
+
+       return sprintf(buf, "%s\n",
+                      (unsigned)state < ARRAY_SIZE(state_name) ?
+                      state_name[state] : "???");
+}
+
+static DEVICE_ATTR(state, S_IRUGO, show_srp_rport_state, NULL);
+
+static ssize_t srp_show_tmo(char *buf, int tmo)
+{
+       return tmo >= 0 ? sprintf(buf, "%d\n", tmo) : sprintf(buf, "off\n");
+}
+
+static int srp_parse_tmo(int *tmo, const char *buf)
+{
+       int res = 0;
+
+       if (strncmp(buf, "off", 3) != 0)
+               res = kstrtoint(buf, 0, tmo);
+       else
+               *tmo = -1;
+
+       return res;
+}
+
+static ssize_t show_reconnect_delay(struct device *dev,
+                                   struct device_attribute *attr, char *buf)
+{
+       struct srp_rport *rport = transport_class_to_srp_rport(dev);
+
+       return srp_show_tmo(buf, rport->reconnect_delay);
+}
+
+static ssize_t store_reconnect_delay(struct device *dev,
+                                    struct device_attribute *attr,
+                                    const char *buf, const size_t count)
+{
+       struct srp_rport *rport = transport_class_to_srp_rport(dev);
+       int res, delay;
+
+       res = srp_parse_tmo(&delay, buf);
+       if (res)
+               goto out;
+       res = srp_tmo_valid(delay, rport->fast_io_fail_tmo,
+                           rport->dev_loss_tmo);
+       if (res)
+               goto out;
+
+       if (rport->reconnect_delay <= 0 && delay > 0 &&
+           rport->state != SRP_RPORT_RUNNING) {
+               queue_delayed_work(system_long_wq, &rport->reconnect_work,
+                                  delay * HZ);
+       } else if (delay <= 0) {
+               cancel_delayed_work(&rport->reconnect_work);
+       }
+       rport->reconnect_delay = delay;
+       res = count;
+
+out:
+       return res;
+}
+
+static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR, show_reconnect_delay,
+                  store_reconnect_delay);
+
+static ssize_t show_failed_reconnects(struct device *dev,
+                                     struct device_attribute *attr, char *buf)
+{
+       struct srp_rport *rport = transport_class_to_srp_rport(dev);
+
+       return sprintf(buf, "%d\n", rport->failed_reconnects);
+}
+
+static DEVICE_ATTR(failed_reconnects, S_IRUGO, show_failed_reconnects, NULL);
+
+static ssize_t show_srp_rport_fast_io_fail_tmo(struct device *dev,
+                                              struct device_attribute *attr,
+                                              char *buf)
+{
+       struct srp_rport *rport = transport_class_to_srp_rport(dev);
+
+       return srp_show_tmo(buf, rport->fast_io_fail_tmo);
+}
+
+static ssize_t store_srp_rport_fast_io_fail_tmo(struct device *dev,
+                                               struct device_attribute *attr,
+                                               const char *buf, size_t count)
+{
+       struct srp_rport *rport = transport_class_to_srp_rport(dev);
+       int res;
+       int fast_io_fail_tmo;
+
+       res = srp_parse_tmo(&fast_io_fail_tmo, buf);
+       if (res)
+               goto out;
+       res = srp_tmo_valid(rport->reconnect_delay, fast_io_fail_tmo,
+                           rport->dev_loss_tmo);
+       if (res)
+               goto out;
+       rport->fast_io_fail_tmo = fast_io_fail_tmo;
+       res = count;
+
+out:
+       return res;
+}
+
+static DEVICE_ATTR(fast_io_fail_tmo, S_IRUGO | S_IWUSR,
+                  show_srp_rport_fast_io_fail_tmo,
+                  store_srp_rport_fast_io_fail_tmo);
+
+static ssize_t show_srp_rport_dev_loss_tmo(struct device *dev,
+                                          struct device_attribute *attr,
+                                          char *buf)
+{
+       struct srp_rport *rport = transport_class_to_srp_rport(dev);
+
+       return srp_show_tmo(buf, rport->dev_loss_tmo);
+}
+
+static ssize_t store_srp_rport_dev_loss_tmo(struct device *dev,
+                                           struct device_attribute *attr,
+                                           const char *buf, size_t count)
+{
+       struct srp_rport *rport = transport_class_to_srp_rport(dev);
+       int res;
+       int dev_loss_tmo;
+
+       res = srp_parse_tmo(&dev_loss_tmo, buf);
+       if (res)
+               goto out;
+       res = srp_tmo_valid(rport->reconnect_delay, rport->fast_io_fail_tmo,
+                           dev_loss_tmo);
+       if (res)
+               goto out;
+       rport->dev_loss_tmo = dev_loss_tmo;
+       res = count;
+
+out:
+       return res;
+}
+
+static DEVICE_ATTR(dev_loss_tmo, S_IRUGO | S_IWUSR,
+                  show_srp_rport_dev_loss_tmo,
+                  store_srp_rport_dev_loss_tmo);
+
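+/*
+ * Valid transitions: BLOCKED can only be entered from RUNNING; RUNNING and
+ * FAIL_FAST can be entered from any state except LOST; LOST can be entered
+ * from any state.
+ */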
+static int srp_rport_set_state(struct srp_rport *rport,
+                              enum srp_rport_state new_state)
+{
+       enum srp_rport_state old_state = rport->state;
+
+       lockdep_assert_held(&rport->mutex);
+
+       switch (new_state) {
+       case SRP_RPORT_RUNNING:
+               switch (old_state) {
+               case SRP_RPORT_LOST:
+                       goto invalid;
+               default:
+                       break;
+               }
+               break;
+       case SRP_RPORT_BLOCKED:
+               switch (old_state) {
+               case SRP_RPORT_RUNNING:
+                       break;
+               default:
+                       goto invalid;
+               }
+               break;
+       case SRP_RPORT_FAIL_FAST:
+               switch (old_state) {
+               case SRP_RPORT_LOST:
+                       goto invalid;
+               default:
+                       break;
+               }
+               break;
+       case SRP_RPORT_LOST:
+               break;
+       }
+       rport->state = new_state;
+       return 0;
+
+invalid:
+       return -EINVAL;
+}
+
+/**
+ * srp_reconnect_work() - reconnect and schedule a new attempt if necessary
+ */
+static void srp_reconnect_work(struct work_struct *work)
+{
+       struct srp_rport *rport = container_of(to_delayed_work(work),
+                                       struct srp_rport, reconnect_work);
+       struct Scsi_Host *shost = rport_to_shost(rport);
+       int delay, res;
+
+       res = srp_reconnect_rport(rport);
+       if (res != 0) {
+               shost_printk(KERN_ERR, shost,
+                            "reconnect attempt %d failed (%d)\n",
+                            ++rport->failed_reconnects, res);
+               delay = rport->reconnect_delay *
+                       min(100, max(1, rport->failed_reconnects - 10));
+               if (delay > 0)
+                       queue_delayed_work(system_long_wq,
+                                          &rport->reconnect_work, delay * HZ);
+       }
+}
+
+static void __rport_fail_io_fast(struct srp_rport *rport)
+{
+       struct Scsi_Host *shost = rport_to_shost(rport);
+       struct srp_internal *i;
+
+       lockdep_assert_held(&rport->mutex);
+
+       if (srp_rport_set_state(rport, SRP_RPORT_FAIL_FAST))
+               return;
+       scsi_target_unblock(rport->dev.parent, SDEV_TRANSPORT_OFFLINE);
+
+       /* Involve the LLD if possible to terminate all I/O on the rport. */
+       i = to_srp_internal(shost->transportt);
+       if (i->f->terminate_rport_io)
+               i->f->terminate_rport_io(rport);
+}
+
+/**
+ * rport_fast_io_fail_timedout() - fast I/O failure timeout handler
+ */
+static void rport_fast_io_fail_timedout(struct work_struct *work)
+{
+       struct srp_rport *rport = container_of(to_delayed_work(work),
+                                       struct srp_rport, fast_io_fail_work);
+       struct Scsi_Host *shost = rport_to_shost(rport);
+
+       pr_info("fast_io_fail_tmo expired for SRP %s / %s.\n",
+               dev_name(&rport->dev), dev_name(&shost->shost_gendev));
+
+       mutex_lock(&rport->mutex);
+       if (rport->state == SRP_RPORT_BLOCKED)
+               __rport_fail_io_fast(rport);
+       mutex_unlock(&rport->mutex);
+}
+
+/**
+ * rport_dev_loss_timedout() - device loss timeout handler
+ */
+static void rport_dev_loss_timedout(struct work_struct *work)
+{
+       struct srp_rport *rport = container_of(to_delayed_work(work),
+                                       struct srp_rport, dev_loss_work);
+       struct Scsi_Host *shost = rport_to_shost(rport);
+       struct srp_internal *i = to_srp_internal(shost->transportt);
+
+       pr_info("dev_loss_tmo expired for SRP %s / %s.\n",
+               dev_name(&rport->dev), dev_name(&shost->shost_gendev));
+
+       mutex_lock(&rport->mutex);
+       WARN_ON(srp_rport_set_state(rport, SRP_RPORT_LOST) != 0);
+       scsi_target_unblock(rport->dev.parent, SDEV_TRANSPORT_OFFLINE);
+       mutex_unlock(&rport->mutex);
+
+       i->f->rport_delete(rport);
+}
+
+static void __srp_start_tl_fail_timers(struct srp_rport *rport)
+{
+       struct Scsi_Host *shost = rport_to_shost(rport);
+       int delay, fast_io_fail_tmo, dev_loss_tmo;
+
+       lockdep_assert_held(&rport->mutex);
+
+       if (!rport->deleted) {
+               delay = rport->reconnect_delay;
+               fast_io_fail_tmo = rport->fast_io_fail_tmo;
+               dev_loss_tmo = rport->dev_loss_tmo;
+               pr_debug("%s current state: %d\n",
+                        dev_name(&shost->shost_gendev), rport->state);
+
+               if (delay > 0)
+                       queue_delayed_work(system_long_wq,
+                                          &rport->reconnect_work,
+                                          1UL * delay * HZ);
+               if (fast_io_fail_tmo >= 0 &&
+                   srp_rport_set_state(rport, SRP_RPORT_BLOCKED) == 0) {
+                       pr_debug("%s new state: %d\n",
+                                dev_name(&shost->shost_gendev),
+                                rport->state);
+                       scsi_target_block(&shost->shost_gendev);
+                       queue_delayed_work(system_long_wq,
+                                          &rport->fast_io_fail_work,
+                                          1UL * fast_io_fail_tmo * HZ);
+               }
+               if (dev_loss_tmo >= 0)
+                       queue_delayed_work(system_long_wq,
+                                          &rport->dev_loss_work,
+                                          1UL * dev_loss_tmo * HZ);
+       } else {
+               pr_debug("%s has already been deleted\n",
+                        dev_name(&shost->shost_gendev));
+               srp_rport_set_state(rport, SRP_RPORT_FAIL_FAST);
+               scsi_target_unblock(&shost->shost_gendev,
+                                   SDEV_TRANSPORT_OFFLINE);
+       }
+}
+
+/**
+ * srp_start_tl_fail_timers() - start the transport layer failure timers
+ *
+ * Start the transport layer fast I/O failure and device loss timers. Do not
+ * modify a timer that was already started.
+ */
+void srp_start_tl_fail_timers(struct srp_rport *rport)
+{
+       mutex_lock(&rport->mutex);
+       __srp_start_tl_fail_timers(rport);
+       mutex_unlock(&rport->mutex);
+}
+EXPORT_SYMBOL(srp_start_tl_fail_timers);
+
+/**
+ * scsi_request_fn_active() - number of kernel threads inside scsi_request_fn()
+ */
+static int scsi_request_fn_active(struct Scsi_Host *shost)
+{
+       struct scsi_device *sdev;
+       struct request_queue *q;
+       int request_fn_active = 0;
+
+       shost_for_each_device(sdev, shost) {
+               q = sdev->request_queue;
+
+               spin_lock_irq(q->queue_lock);
+               request_fn_active += q->request_fn_active;
+               spin_unlock_irq(q->queue_lock);
+       }
+
+       return request_fn_active;
+}
+
+/**
+ * srp_reconnect_rport() - reconnect to an SRP target port
+ *
+ * Blocks SCSI command queueing before invoking reconnect() such that
+ * queuecommand() won't be invoked concurrently with reconnect() from outside
+ * the SCSI EH. This is important since a reconnect() implementation may
+ * reallocate resources needed by queuecommand().
+ *
+ * Notes:
+ * - This function neither waits until outstanding requests have finished nor
+ *   tries to abort these. It is the responsibility of the reconnect()
+ *   function to finish outstanding commands before reconnecting to the target
+ *   port.
+ * - It is the responsibility of the caller to ensure that the resources
+ *   reallocated by the reconnect() function won't be used while this function
+ *   is in progress. One possible strategy is to invoke this function from
+ *   the context of the SCSI EH thread only. Another possible strategy is to
+ *   lock the rport mutex inside each SCSI LLD callback that can be invoked by
+ *   the SCSI EH (the scsi_host_template.eh_*() functions and also the
+ *   scsi_host_template.queuecommand() function).
+ */
+int srp_reconnect_rport(struct srp_rport *rport)
+{
+       struct Scsi_Host *shost = rport_to_shost(rport);
+       struct srp_internal *i = to_srp_internal(shost->transportt);
+       struct scsi_device *sdev;
+       int res;
+
+       pr_debug("SCSI host %s\n", dev_name(&shost->shost_gendev));
+
+       res = mutex_lock_interruptible(&rport->mutex);
+       if (res)
+               goto out;
+       scsi_target_block(&shost->shost_gendev);
+       while (scsi_request_fn_active(shost))
+               msleep(20);
+       res = i->f->reconnect(rport);
+       pr_debug("%s (state %d): transport.reconnect() returned %d\n",
+                dev_name(&shost->shost_gendev), rport->state, res);
+       if (res == 0) {
+               cancel_delayed_work(&rport->fast_io_fail_work);
+               cancel_delayed_work(&rport->dev_loss_work);
+
+               rport->failed_reconnects = 0;
+               srp_rport_set_state(rport, SRP_RPORT_RUNNING);
+               scsi_target_unblock(&shost->shost_gendev, SDEV_RUNNING);
+               /*
+                * If the SCSI error handler has offlined one or more devices,
+                * invoking scsi_target_unblock() won't change the state of
+                * these devices into running so do that explicitly.
+                */
+               spin_lock_irq(shost->host_lock);
+               __shost_for_each_device(sdev, shost)
+                       if (sdev->sdev_state == SDEV_OFFLINE)
+                               sdev->sdev_state = SDEV_RUNNING;
+               spin_unlock_irq(shost->host_lock);
+       } else if (rport->state == SRP_RPORT_RUNNING) {
+               /*
+                * srp_reconnect_rport() was invoked with fast_io_fail
+                * off. Mark the port as failed and start the TL failure
+                * timers if these had not yet been started.
+                */
+               __rport_fail_io_fast(rport);
+               scsi_target_unblock(&shost->shost_gendev,
+                                   SDEV_TRANSPORT_OFFLINE);
+               __srp_start_tl_fail_timers(rport);
+       } else if (rport->state != SRP_RPORT_BLOCKED) {
+               scsi_target_unblock(&shost->shost_gendev,
+                                   SDEV_TRANSPORT_OFFLINE);
+       }
+       mutex_unlock(&rport->mutex);
+
+out:
+       return res;
+}
+EXPORT_SYMBOL(srp_reconnect_rport);
+
+/**
+ * srp_timed_out() - SRP transport intercept of the SCSI timeout EH
+ *
+ * If a timeout occurs while an rport is in the blocked state, ask the SCSI
+ * EH to continue waiting (BLK_EH_RESET_TIMER). Otherwise let the SCSI core
+ * handle the timeout (BLK_EH_NOT_HANDLED).
+ *
+ * Note: This function is called from soft-IRQ context and with the request
+ * queue lock held.
+ */
+static enum blk_eh_timer_return srp_timed_out(struct scsi_cmnd *scmd)
+{
+       struct scsi_device *sdev = scmd->device;
+       struct Scsi_Host *shost = sdev->host;
+       struct srp_internal *i = to_srp_internal(shost->transportt);
+
+       pr_debug("timeout for sdev %s\n", dev_name(&sdev->sdev_gendev));
+       return i->f->reset_timer_if_blocked && scsi_device_blocked(sdev) ?
+               BLK_EH_RESET_TIMER : BLK_EH_NOT_HANDLED;
+}
+
 static void srp_rport_release(struct device *dev)
 {
        struct srp_rport *rport = dev_to_rport(dev);
 
+       cancel_delayed_work_sync(&rport->reconnect_work);
+       cancel_delayed_work_sync(&rport->fast_io_fail_work);
+       cancel_delayed_work_sync(&rport->dev_loss_work);
+
        put_device(dev->parent);
        kfree(rport);
 }
@@ -185,6 +673,24 @@ static int srp_host_match(struct attribute_container *cont, struct device *dev)
 }
 
 /**
+ * srp_rport_get() - increment rport reference count
+ */
+void srp_rport_get(struct srp_rport *rport)
+{
+       get_device(&rport->dev);
+}
+EXPORT_SYMBOL(srp_rport_get);
+
+/**
+ * srp_rport_put() - decrement rport reference count
+ */
+void srp_rport_put(struct srp_rport *rport)
+{
+       put_device(&rport->dev);
+}
+EXPORT_SYMBOL(srp_rport_put);
+
+/**
  * srp_rport_add - add a SRP remote port to the device hierarchy
  * @shost:     scsi host the remote port is connected to.
  * @ids:       The port id for the remote port.
@@ -196,12 +702,15 @@ struct srp_rport *srp_rport_add(struct Scsi_Host *shost,
 {
        struct srp_rport *rport;
        struct device *parent = &shost->shost_gendev;
+       struct srp_internal *i = to_srp_internal(shost->transportt);
        int id, ret;
 
        rport = kzalloc(sizeof(*rport), GFP_KERNEL);
        if (!rport)
                return ERR_PTR(-ENOMEM);
 
+       mutex_init(&rport->mutex);
+
        device_initialize(&rport->dev);
 
        rport->dev.parent = get_device(parent);
@@ -210,6 +719,17 @@ struct srp_rport *srp_rport_add(struct Scsi_Host *shost,
        memcpy(rport->port_id, ids->port_id, sizeof(rport->port_id));
        rport->roles = ids->roles;
 
+       if (i->f->reconnect)
+               rport->reconnect_delay = i->f->reconnect_delay ?
+                       *i->f->reconnect_delay : 10;
+       INIT_DELAYED_WORK(&rport->reconnect_work, srp_reconnect_work);
+       rport->fast_io_fail_tmo = i->f->fast_io_fail_tmo ?
+               *i->f->fast_io_fail_tmo : 15;
+       rport->dev_loss_tmo = i->f->dev_loss_tmo ? *i->f->dev_loss_tmo : 60;
+       INIT_DELAYED_WORK(&rport->fast_io_fail_work,
+                         rport_fast_io_fail_timedout);
+       INIT_DELAYED_WORK(&rport->dev_loss_work, rport_dev_loss_timedout);
+
        id = atomic_inc_return(&to_srp_host_attrs(shost)->next_port_id);
        dev_set_name(&rport->dev, "port-%d:%d", shost->host_no, id);
 
@@ -259,6 +779,13 @@ void srp_rport_del(struct srp_rport *rport)
        transport_remove_device(dev);
        device_del(dev);
        transport_destroy_device(dev);
+
+       mutex_lock(&rport->mutex);
+       if (rport->state == SRP_RPORT_BLOCKED)
+               __rport_fail_io_fast(rport);
+       rport->deleted = true;
+       mutex_unlock(&rport->mutex);
+
        put_device(dev);
 }
 EXPORT_SYMBOL_GPL(srp_rport_del);
@@ -310,6 +837,8 @@ srp_attach_transport(struct srp_function_template *ft)
        if (!i)
                return NULL;
 
+       i->t.eh_timed_out = srp_timed_out;
+
        i->t.tsk_mgmt_response = srp_tsk_mgmt_response;
        i->t.it_nexus_response = srp_it_nexus_response;
 
@@ -327,6 +856,15 @@ srp_attach_transport(struct srp_function_template *ft)
        count = 0;
        i->rport_attrs[count++] = &dev_attr_port_id;
        i->rport_attrs[count++] = &dev_attr_roles;
+       if (ft->has_rport_state) {
+               i->rport_attrs[count++] = &dev_attr_state;
+               i->rport_attrs[count++] = &dev_attr_fast_io_fail_tmo;
+               i->rport_attrs[count++] = &dev_attr_dev_loss_tmo;
+       }
+       if (ft->reconnect) {
+               i->rport_attrs[count++] = &dev_attr_reconnect_delay;
+               i->rport_attrs[count++] = &dev_attr_failed_reconnects;
+       }
        if (ft->rport_delete)
                i->rport_attrs[count++] = &dev_attr_delete;
        i->rport_attrs[count++] = NULL;
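
An LLD that exposes these timeouts as module parameters can reuse
srp_tmo_valid() to reject invalid combinations up front. A hedged sketch
(the parameter names are illustrative):

static int my_reconnect_delay = 10;
static int my_fast_io_fail_tmo = 15;
static int my_dev_loss_tmo = 600;

static int my_validate_tmo_params(void)
{
        int ret;

        /* rejects e.g. all three timeouts disabled at once, or
         * fast_io_fail_tmo >= dev_loss_tmo
         */
        ret = srp_tmo_valid(my_reconnect_delay, my_fast_io_fail_tmo,
                            my_dev_loss_tmo);
        if (ret)
                pr_err("invalid timeout parameter combination\n");
        return ret;
}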
index 5eb4e31..da78875 100644 (file)
@@ -230,6 +230,15 @@ enum {
        MLX5_MAX_PAGE_SHIFT             = 31
 };
 
+enum {
+       MLX5_ADAPTER_PAGE_SHIFT         = 12
+};
+
+enum {
+       MLX5_CAP_OFF_DCT                = 41,
+       MLX5_CAP_OFF_CMDIF_CSUM         = 46,
+};
+
 struct mlx5_inbox_hdr {
        __be16          opcode;
        u8              rsvd[4];
@@ -319,9 +328,9 @@ struct mlx5_hca_cap {
        u8      rsvd25[42];
        __be16  log_uar_page_sz;
        u8      rsvd26[28];
-       u8      log_msx_atomic_size_qp;
+       u8      log_max_atomic_size_qp;
        u8      rsvd27[2];
-       u8      log_msx_atomic_size_dc;
+       u8      log_max_atomic_size_dc;
        u8      rsvd28[76];
 };
 
index 6b8c496..554548c 100644 (file)
@@ -483,6 +483,7 @@ struct mlx5_priv {
        struct rb_root          page_root;
        int                     fw_pages;
        int                     reg_pages;
+       struct list_head        free_list;
 
        struct mlx5_core_health health;
 
@@ -557,9 +558,11 @@ typedef void (*mlx5_cmd_cbk_t)(int status, void *context);
 struct mlx5_cmd_work_ent {
        struct mlx5_cmd_msg    *in;
        struct mlx5_cmd_msg    *out;
+       void                   *uout;
+       int                     uout_size;
        mlx5_cmd_cbk_t          callback;
        void                   *context;
-       int idx;
+       int                     idx;
        struct completion       done;
        struct mlx5_cmd        *cmd;
        struct work_struct      work;
@@ -570,6 +573,7 @@ struct mlx5_cmd_work_ent {
        u8                      token;
        struct timespec         ts1;
        struct timespec         ts2;
+       u16                     op;
 };
 
 struct mlx5_pas {
@@ -653,6 +657,9 @@ void mlx5_cmd_use_polling(struct mlx5_core_dev *dev);
 int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr);
 int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
                  int out_size);
+int mlx5_cmd_exec_cb(struct mlx5_core_dev *dev, void *in, int in_size,
+                    void *out, int out_size, mlx5_cmd_cbk_t callback,
+                    void *context);
 int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn);
 int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn);
 int mlx5_alloc_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari);
@@ -676,7 +683,9 @@ int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
 int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
                      u16 lwm, int is_srq);
 int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr,
-                         struct mlx5_create_mkey_mbox_in *in, int inlen);
+                         struct mlx5_create_mkey_mbox_in *in, int inlen,
+                         mlx5_cmd_cbk_t callback, void *context,
+                         struct mlx5_create_mkey_mbox_out *out);
 int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr);
 int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr,
                         struct mlx5_query_mkey_mbox_out *out, int outlen);
@@ -745,6 +754,11 @@ static inline u32 mlx5_idx_to_mkey(u32 mkey_idx)
        return mkey_idx << 8;
 }
 
+static inline u8 mlx5_mkey_variant(u32 mkey)
+{
+       return mkey & 0xff;
+}
+
 enum {
        MLX5_PROF_MASK_QP_SIZE          = (u64)1 << 0,
        MLX5_PROF_MASK_MR_CACHE         = (u64)1 << 1,
index e393171..979874c 100644 (file)
@@ -67,12 +67,14 @@ enum rdma_node_type {
        RDMA_NODE_IB_CA         = 1,
        RDMA_NODE_IB_SWITCH,
        RDMA_NODE_IB_ROUTER,
-       RDMA_NODE_RNIC
+       RDMA_NODE_RNIC,
+       RDMA_NODE_USNIC,
 };
 
 enum rdma_transport_type {
        RDMA_TRANSPORT_IB,
-       RDMA_TRANSPORT_IWARP
+       RDMA_TRANSPORT_IWARP,
+       RDMA_TRANSPORT_USNIC
 };
 
 enum rdma_transport_type
@@ -1436,6 +1438,7 @@ struct ib_device {
 
        int                          uverbs_abi_ver;
        u64                          uverbs_cmd_mask;
+       u64                          uverbs_ex_cmd_mask;
 
        char                         node_desc[64];
        __be64                       node_guid;
@@ -2384,4 +2387,17 @@ struct ib_flow *ib_create_flow(struct ib_qp *qp,
                               struct ib_flow_attr *flow_attr, int domain);
 int ib_destroy_flow(struct ib_flow *flow_id);
 
+static inline int ib_check_mr_access(int flags)
+{
+       /*
+        * Local write permission is required if remote write or
+        * remote atomic permission is also requested.
+        */
+       if (flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) &&
+           !(flags & IB_ACCESS_LOCAL_WRITE))
+               return -EINVAL;
+
+       return 0;
+}
+
 #endif /* IB_VERBS_H */
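
The helper encodes the verbs rule that remote write or remote atomic rights
require local write. Illustrative calls:

        ib_check_mr_access(IB_ACCESS_REMOTE_WRITE);     /* -EINVAL: no local write */
        ib_check_mr_access(IB_ACCESS_LOCAL_WRITE |
                           IB_ACCESS_REMOTE_WRITE);     /* 0: permitted */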
index ff0f04a..4ebf691 100644 (file)
@@ -13,6 +13,27 @@ struct srp_rport_identifiers {
        u8 roles;
 };
 
+/**
+ * enum srp_rport_state - SRP transport layer state
+ * @SRP_RPORT_RUNNING:   Transport layer operational.
+ * @SRP_RPORT_BLOCKED:   Transport layer not operational; fast I/O fail timer
+ *                       is running and I/O has been blocked.
+ * @SRP_RPORT_FAIL_FAST: Fast I/O fail timer has expired; fail I/O fast.
+ * @SRP_RPORT_LOST:      Device loss timer has expired; port is being removed.
+ */
+enum srp_rport_state {
+       SRP_RPORT_RUNNING,
+       SRP_RPORT_BLOCKED,
+       SRP_RPORT_FAIL_FAST,
+       SRP_RPORT_LOST,
+};
+
+/**
+ * struct srp_rport
+ * @lld_data: LLD private data.
+ * @mutex:    Protects against concurrent rport reconnect / fast_io_fail /
+ *   dev_loss_tmo activity.
+ */
 struct srp_rport {
        /* for initiator and target drivers */
 
@@ -23,11 +44,43 @@ struct srp_rport {
 
        /* for initiator drivers */
 
-       void *lld_data; /* LLD private data */
+       void                    *lld_data;
+
+       struct mutex            mutex;
+       enum srp_rport_state    state;
+       bool                    deleted;
+       int                     reconnect_delay;
+       int                     failed_reconnects;
+       struct delayed_work     reconnect_work;
+       int                     fast_io_fail_tmo;
+       int                     dev_loss_tmo;
+       struct delayed_work     fast_io_fail_work;
+       struct delayed_work     dev_loss_work;
 };
 
+/**
+ * struct srp_function_template
+ * @has_rport_state: Whether or not to create the state, fast_io_fail_tmo and
+ *     dev_loss_tmo sysfs attribute for an rport.
+ * @reset_timer_if_blocked: Whether or not srp_timed_out() should reset the command
+ *     timer if the device on which it has been queued is blocked.
+ * @reconnect_delay: If not NULL, points to the default reconnect_delay value.
+ * @fast_io_fail_tmo: If not NULL, points to the default fast_io_fail_tmo value.
+ * @dev_loss_tmo: If not NULL, points to the default dev_loss_tmo value.
+ * @reconnect: Callback function for reconnecting to the target. See also
+ *     srp_reconnect_rport().
+ * @terminate_rport_io: Callback function for terminating all outstanding I/O
+ *     requests for an rport.
+ */
 struct srp_function_template {
        /* for initiator drivers */
+       bool has_rport_state;
+       bool reset_timer_if_blocked;
+       int *reconnect_delay;
+       int *fast_io_fail_tmo;
+       int *dev_loss_tmo;
+       int (*reconnect)(struct srp_rport *rport);
+       void (*terminate_rport_io)(struct srp_rport *rport);
        void (*rport_delete)(struct srp_rport *rport);
        /* for target drivers */
        int (* tsk_mgmt_response)(struct Scsi_Host *, u64, u64, int);
@@ -38,10 +91,36 @@ extern struct scsi_transport_template *
 srp_attach_transport(struct srp_function_template *);
 extern void srp_release_transport(struct scsi_transport_template *);
 
+extern void srp_rport_get(struct srp_rport *rport);
+extern void srp_rport_put(struct srp_rport *rport);
 extern struct srp_rport *srp_rport_add(struct Scsi_Host *,
                                       struct srp_rport_identifiers *);
 extern void srp_rport_del(struct srp_rport *);
-
+extern int srp_tmo_valid(int reconnect_delay, int fast_io_fail_tmo,
+                        int dev_loss_tmo);
+extern int srp_reconnect_rport(struct srp_rport *rport);
+extern void srp_start_tl_fail_timers(struct srp_rport *rport);
 extern void srp_remove_host(struct Scsi_Host *);
 
+/**
+ * srp_chkready() - evaluate the transport layer state before I/O
+ * @rport: SRP target port pointer.
+ *
+ * Returns a SCSI result code that can be returned by the LLD queuecommand()
+ * implementation. The role of this function is similar to that of
+ * fc_remote_port_chkready().
+ */
+static inline int srp_chkready(struct srp_rport *rport)
+{
+       switch (rport->state) {
+       case SRP_RPORT_RUNNING:
+       case SRP_RPORT_BLOCKED:
+       default:
+               return 0;
+       case SRP_RPORT_FAIL_FAST:
+               return DID_TRANSPORT_FAILFAST << 16;
+       case SRP_RPORT_LOST:
+               return DID_NO_CONNECT << 16;
+       }
+}
+
 #endif
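
A usage sketch for srp_chkready(): an initiator driver's queuecommand()
implementation fails commands fast while the rport is not operational instead
of posting them. The xyz names below are hypothetical; the pattern follows
what ib_srp does:

static int xyz_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
{
	struct xyz_target *target = shost_priv(shost);	/* hypothetical */

	scmnd->result = srp_chkready(target->rport);
	if (unlikely(scmnd->result)) {
		/* DID_TRANSPORT_FAILFAST or DID_NO_CONNECT: complete the
		 * command immediately instead of sending it to the target. */
		scmnd->scsi_done(scmnd);
		return 0;
	}

	/* ... build and post the SRP_CMD request ... */
	return 0;
}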
index e3ddd86..cbfdd4c 100644 (file)
@@ -87,10 +87,11 @@ enum {
        IB_USER_VERBS_CMD_CLOSE_XRCD,
        IB_USER_VERBS_CMD_CREATE_XSRQ,
        IB_USER_VERBS_CMD_OPEN_QP,
-#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING
-       IB_USER_VERBS_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD,
-       IB_USER_VERBS_CMD_DESTROY_FLOW
-#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */
+};
+
+enum {
+       IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD,
+       IB_USER_VERBS_EX_CMD_DESTROY_FLOW
 };
 
 /*
@@ -122,22 +123,24 @@ struct ib_uverbs_comp_event_desc {
  * the rest of the command struct based on these values.
  */
 
+#define IB_USER_VERBS_CMD_COMMAND_MASK 0xff
+#define IB_USER_VERBS_CMD_FLAGS_MASK 0xff000000u
+#define IB_USER_VERBS_CMD_FLAGS_SHIFT 24
+
+#define IB_USER_VERBS_CMD_FLAG_EXTENDED 0x80
+
 struct ib_uverbs_cmd_hdr {
        __u32 command;
        __u16 in_words;
        __u16 out_words;
 };
 
-#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING
-struct ib_uverbs_cmd_hdr_ex {
-       __u32 command;
-       __u16 in_words;
-       __u16 out_words;
+struct ib_uverbs_ex_cmd_hdr {
+       __u64 response;
        __u16 provider_in_words;
        __u16 provider_out_words;
        __u32 cmd_hdr_reserved;
 };
-#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */
 
 struct ib_uverbs_get_context {
        __u64 response;
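
With the config option gone, extended commands are distinguished at runtime:
the IB_USER_VERBS_CMD_FLAG_EXTENDED bit in the flags byte (bits 31:24 of the
command word) marks a command as extended, and such a write carries struct
ib_uverbs_cmd_hdr followed by struct ib_uverbs_ex_cmd_hdr and the command
payload. A sketch of composing and decomposing the command word (variable
names are illustrative only):

/* userspace side: request the extended create-flow command */
__u32 command = IB_USER_VERBS_EX_CMD_CREATE_FLOW |
		((__u32)IB_USER_VERBS_CMD_FLAG_EXTENDED <<
		 IB_USER_VERBS_CMD_FLAGS_SHIFT);

/* kernel side: split the word back into flags and opcode */
__u32 flags  = (command & IB_USER_VERBS_CMD_FLAGS_MASK) >>
	       IB_USER_VERBS_CMD_FLAGS_SHIFT;	/* IB_USER_VERBS_CMD_FLAG_EXTENDED */
__u32 opcode = command & IB_USER_VERBS_CMD_COMMAND_MASK; /* ..._EX_CMD_CREATE_FLOW */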
@@ -700,62 +703,71 @@ struct ib_uverbs_detach_mcast {
        __u64 driver_data[0];
 };
 
-#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING
-struct ib_kern_eth_filter {
+struct ib_uverbs_flow_spec_hdr {
+       __u32 type;
+       __u16 size;
+       __u16 reserved;
+       /* followed by flow_spec */
+       __u64 flow_spec_data[0];
+};
+
+struct ib_uverbs_flow_eth_filter {
        __u8  dst_mac[6];
        __u8  src_mac[6];
        __be16 ether_type;
        __be16 vlan_tag;
 };
 
-struct ib_kern_spec_eth {
-       __u32  type;
-       __u16  size;
-       __u16  reserved;
-       struct ib_kern_eth_filter val;
-       struct ib_kern_eth_filter mask;
+struct ib_uverbs_flow_spec_eth {
+       union {
+               struct ib_uverbs_flow_spec_hdr hdr;
+               struct {
+                       __u32 type;
+                       __u16 size;
+                       __u16 reserved;
+               };
+       };
+       struct ib_uverbs_flow_eth_filter val;
+       struct ib_uverbs_flow_eth_filter mask;
 };
 
-struct ib_kern_ipv4_filter {
+struct ib_uverbs_flow_ipv4_filter {
        __be32 src_ip;
        __be32 dst_ip;
 };
 
-struct ib_kern_spec_ipv4 {
-       __u32  type;
-       __u16  size;
-       __u16  reserved;
-       struct ib_kern_ipv4_filter val;
-       struct ib_kern_ipv4_filter mask;
+struct ib_uverbs_flow_spec_ipv4 {
+       union {
+               struct ib_uverbs_flow_spec_hdr hdr;
+               struct {
+                       __u32 type;
+                       __u16 size;
+                       __u16 reserved;
+               };
+       };
+       struct ib_uverbs_flow_ipv4_filter val;
+       struct ib_uverbs_flow_ipv4_filter mask;
 };
 
-struct ib_kern_tcp_udp_filter {
+struct ib_uverbs_flow_tcp_udp_filter {
        __be16 dst_port;
        __be16 src_port;
 };
 
-struct ib_kern_spec_tcp_udp {
-       __u32  type;
-       __u16  size;
-       __u16  reserved;
-       struct ib_kern_tcp_udp_filter val;
-       struct ib_kern_tcp_udp_filter mask;
-};
-
-struct ib_kern_spec {
+struct ib_uverbs_flow_spec_tcp_udp {
        union {
+               struct ib_uverbs_flow_spec_hdr hdr;
                struct {
                        __u32 type;
                        __u16 size;
                        __u16 reserved;
                };
-               struct ib_kern_spec_eth     eth;
-               struct ib_kern_spec_ipv4    ipv4;
-               struct ib_kern_spec_tcp_udp tcp_udp;
        };
+       struct ib_uverbs_flow_tcp_udp_filter val;
+       struct ib_uverbs_flow_tcp_udp_filter mask;
 };
 
-struct ib_kern_flow_attr {
+struct ib_uverbs_flow_attr {
        __u32 type;
        __u16 size;
        __u16 priority;
@@ -767,13 +779,13 @@ struct ib_kern_flow_attr {
-        * struct ib_flow_spec_xxx
-        * struct ib_flow_spec_yyy
+        * struct ib_uverbs_flow_spec_xxx
+        * struct ib_uverbs_flow_spec_yyy
         */
+       struct ib_uverbs_flow_spec_hdr flow_specs[0];
 };
 
 struct ib_uverbs_create_flow  {
        __u32 comp_mask;
-       __u64 response;
        __u32 qp_handle;
-       struct ib_kern_flow_attr flow_attr;
+       struct ib_uverbs_flow_attr flow_attr;
 };
 
 struct ib_uverbs_create_flow_resp {
@@ -785,7 +797,6 @@ struct ib_uverbs_destroy_flow  {
        __u32 comp_mask;
        __u32 flow_handle;
 };
-#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */
 
 struct ib_uverbs_create_srq {
        __u64 response;
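
Since every flow spec now starts with struct ib_uverbs_flow_spec_hdr (via the
anonymous union), userspace lays the specs out back to back behind struct
ib_uverbs_flow_attr and the kernel walks them using hdr.size. A minimal
layout sketch for a single L2 spec; qp_handle and mac are assumed to exist,
num_of_specs lives in the part of ib_uverbs_flow_attr elided above, and the
type values come from enum ib_flow_spec_type in <rdma/ib_verbs.h>:

struct {
	struct ib_uverbs_create_flow	cmd;
	struct ib_uverbs_flow_spec_eth	eth;
} req;

memset(&req, 0, sizeof(req));
req.cmd.qp_handle		= qp_handle;
req.cmd.flow_attr.type		= IB_FLOW_ATTR_NORMAL;
req.cmd.flow_attr.num_of_specs	= 1;
req.cmd.flow_attr.size		= sizeof(req.eth);	/* trailing specs only */
req.eth.hdr.type		= IB_FLOW_SPEC_ETH;
req.eth.hdr.size		= sizeof(req.eth);	/* lets the kernel step
							   to the next spec */
memset(req.eth.mask.dst_mac, 0xff, 6);	/* match the destination MAC exactly */
memcpy(req.eth.val.dst_mac, mac, 6);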