smc: remote memory buffers (RMBs)
author Ursula Braun <ubraun@linux.vnet.ibm.com>
Mon, 9 Jan 2017 15:55:18 +0000 (16:55 +0100)
committer David S. Miller <davem@davemloft.net>
Mon, 9 Jan 2017 21:07:39 +0000 (16:07 -0500)
* allocate data RMB memory for sending and receiving
* size depends on the maximum socket send and receive buffers
* allocated RMBs are kept during life time of the owning link group
* map the allocated RMBs to DMA

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
net/smc/af_smc.c
net/smc/smc.h
net/smc/smc_clc.c
net/smc/smc_core.c
net/smc/smc_core.h
net/smc/smc_ib.c
net/smc/smc_ib.h

index 5fda37d..a38f470 100644 (file)
@@ -249,6 +249,8 @@ static void smc_conn_save_peer_info(struct smc_sock *smc,
                                    struct smc_clc_msg_accept_confirm *clc)
 {
        smc->conn.peer_conn_idx = clc->conn_idx;
+       smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
+       atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
 }
 
 static void smc_link_save_peer_info(struct smc_link *link,
@@ -323,6 +325,18 @@ static int smc_connect_rdma(struct smc_sock *smc)
        link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
 
        smc_conn_save_peer_info(smc, &aclc);
+
+       rc = smc_sndbuf_create(smc);
+       if (rc) {
+               reason_code = SMC_CLC_DECL_MEM;
+               goto decline_rdma_unlock;
+       }
+       rc = smc_rmb_create(smc);
+       if (rc) {
+               reason_code = SMC_CLC_DECL_MEM;
+               goto decline_rdma_unlock;
+       }
+
        if (local_contact == SMC_FIRST_CONTACT)
                smc_link_save_peer_info(link, &aclc);
        /* tbd in follow-on patch: more steps to setup RDMA communcication,
@@ -598,9 +612,16 @@ static void smc_listen_work(struct work_struct *work)
        }
        link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
 
-       /* tbd in follow-on patch: more steps to setup RDMA communcication,
-        * create rmbs, map rmbs
-        */
+       rc = smc_sndbuf_create(new_smc);
+       if (rc) {
+               reason_code = SMC_CLC_DECL_MEM;
+               goto decline_rdma;
+       }
+       rc = smc_rmb_create(new_smc);
+       if (rc) {
+               reason_code = SMC_CLC_DECL_MEM;
+               goto decline_rdma;
+       }
 
        rc = smc_clc_send_accept(new_smc, local_contact);
        if (rc)
@@ -1047,6 +1068,8 @@ static int smc_create(struct net *net, struct socket *sock, int protocol,
                              IPPROTO_TCP, &smc->clcsock);
        if (rc)
                sk_common_release(sk);
+       if (rc)
+               goto out; /* clcsock creation failed; sk was released above */
+       /* start from the buffer sizes of the clcsock, but never go below
+        * the minimum RMB size
+        */
+       smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
+       smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
 
 out:
        return rc;
index 11265bd..2bf5044 100644 (file)
@@ -34,6 +34,16 @@ struct smc_connection {
        struct smc_link_group   *lgr;           /* link group of connection */
        u32                     alert_token_local; /* unique conn. id */
        u8                      peer_conn_idx;  /* from tcp handshake */
+       int                     peer_rmbe_size; /* size of peer rx buffer */
+       atomic_t                peer_rmbe_space;/* remaining free bytes in peer
+                                                * rmbe
+                                                */
+
+       struct smc_buf_desc     *sndbuf_desc;   /* send buffer descriptor */
+       int                     sndbuf_size;    /* sndbuf size <== sock wmem */
+       struct smc_buf_desc     *rmb_desc;      /* RMBE descriptor */
+       int                     rmbe_size;      /* RMBE size <== sock rmem */
+       int                     rmbe_size_short;/* compressed notation */
 };
 
 struct smc_sock {                              /* smc sock container */
@@ -76,6 +86,41 @@ static inline u32 ntoh24(u8 *net)
        return be32_to_cpu(t);
 }
 
+#define SMC_BUF_MIN_SIZE 16384         /* minimum size of an RMB */
+
+#define SMC_RMBE_SIZES 16      /* number of distinct sizes for an RMBE */
+/* theoretically, the RFC states that largest size would be 512K,
+ * i.e. compressed 5 and thus 6 sizes (0..5), despite
+ * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15)
+ */
+
+/* Convert a buffer size in bytes into the compressed notation used for RMBs.
+ * Sizes are rounded up to the next power of 2 (unlike plain ilog2), so the
+ * socket application gets at least its desired sndbuf / rcvbuf size.
+ * Everything up to SMC_BUF_MIN_SIZE (16K) maps to 0; the result is capped at
+ * SMC_RMBE_SIZES - 1.
+ */
+static inline u8 smc_compress_bufsize(int size)
+{
+       int exponent;
+
+       if (size <= SMC_BUF_MIN_SIZE)
+               return 0;
+
+       /* size - 1 makes exact powers of 2 stay in their own size class */
+       exponent = ilog2((size - 1) >> 14) + 1;
+       if (exponent >= SMC_RMBE_SIZES)
+               return SMC_RMBE_SIZES - 1;
+       return exponent;
+}
+
+/* convert the RMB size from compressed notation into integer:
+ * the result is 1 << (compressed + 14) bytes, i.e. 16K for compressed == 0
+ */
+static inline int smc_uncompress_bufsize(u8 compressed)
+{
+       int shift = ((int)compressed) + 14;
+       u32 bytes = 0x00000001 << shift;
+
+       return (int)bytes;
+}
+
 #ifdef CONFIG_XFRM
 static inline bool using_ipsec(struct smc_sock *smc)
 {
index f8e47c0..4b475dd 100644 (file)
@@ -252,13 +252,13 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
               SMC_GID_SIZE);
        memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1],
               sizeof(link->smcibdev->mac[link->ibport - 1]));
-
-       /* tbd in follow-on patch: fill in rmb-related values */
-
        hton24(aclc.qpn, link->roce_qp->qp_num);
        aclc.conn_idx = 1;                      /* as long as 1 RMB = 1 RMBE */
        aclc.rmbe_alert_token = htonl(conn->alert_token_local);
        aclc.qp_mtu = link->path_mtu;
+       /* fill in the rmb-related values: compressed RMBE size and the
+        * mapped address of the RMB on the single link
+        */
+       aclc.rmbe_size = conn->rmbe_size_short;
+       aclc.rmb_dma_addr =
+               cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]);
        hton24(aclc.psn, link->psn_initial);
        memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
 
index b88a829..e1b9572 100644 (file)
@@ -133,6 +133,7 @@ static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr,
        struct smc_link *lnk;
        u8 rndvec[3];
        int rc = 0;
+       int i;
 
        lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
        if (!lgr) {
@@ -144,6 +145,12 @@ static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr,
        lgr->daddr = peer_in_addr;
        memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
        lgr->vlan_id = vlan_id;
+       rwlock_init(&lgr->sndbufs_lock);
+       rwlock_init(&lgr->rmbs_lock);
+       for (i = 0; i < SMC_RMBE_SIZES; i++) {
+               INIT_LIST_HEAD(&lgr->sndbufs[i]);
+               INIT_LIST_HEAD(&lgr->rmbs[i]);
+       }
        INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
        lgr->conns_all = RB_ROOT;
 
@@ -164,6 +171,22 @@ out:
        return rc;
 }
 
+/* mark the send buffer of a connection as unused again; the descriptor
+ * pointer in the connection is left untouched
+ */
+static void smc_sndbuf_unuse(struct smc_connection *conn)
+{
+       struct smc_buf_desc *desc = conn->sndbuf_desc;
+
+       if (!desc)
+               return; /* connection never got a send buffer */
+       desc->used = 0;
+       conn->sndbuf_size = 0;
+}
+
+/* mark the RMB of a connection as unused again; the descriptor pointer in
+ * the connection is left untouched
+ */
+static void smc_rmb_unuse(struct smc_connection *conn)
+{
+       struct smc_buf_desc *desc = conn->rmb_desc;
+
+       if (!desc)
+               return; /* connection never got an RMB */
+       desc->used = 0;
+       conn->rmbe_size = 0;
+}
+
 /* remove a finished connection from its link group */
 void smc_conn_free(struct smc_connection *conn)
 {
@@ -172,6 +195,8 @@ void smc_conn_free(struct smc_connection *conn)
        if (!lgr)
                return;
        smc_lgr_unregister_conn(conn);
+       smc_rmb_unuse(conn);
+       smc_sndbuf_unuse(conn);
 }
 
 static void smc_link_clear(struct smc_link *lnk)
@@ -179,9 +204,39 @@ static void smc_link_clear(struct smc_link *lnk)
        lnk->peer_qpn = 0;
 }
 
+/* free all send buffers of a link group: data area and descriptor of every
+ * entry in each of the per-size lists
+ *
+ * NOTE(review): the entries are neither unlinked nor DMA-unmapped here;
+ * confirm that the mapping set up by smc_ib_buf_map() is released elsewhere.
+ */
+static void smc_lgr_free_sndbufs(struct smc_link_group *lgr)
+{
+       struct smc_buf_desc *cur, *next;
+       struct list_head *bucket;
+       int idx;
+
+       for (idx = 0; idx < SMC_RMBE_SIZES; idx++) {
+               bucket = &lgr->sndbufs[idx];
+               /* _safe iteration, since cur is freed in the loop body */
+               list_for_each_entry_safe(cur, next, bucket, list) {
+                       kfree(cur->cpu_addr);
+                       kfree(cur);
+               }
+       }
+}
+
+/* free all RMBs of a link group: data area and descriptor of every entry in
+ * each of the per-size lists
+ *
+ * NOTE(review): the entries are neither unlinked nor DMA-unmapped here;
+ * confirm that the mapping set up by smc_ib_buf_map() is released elsewhere.
+ */
+static void smc_lgr_free_rmbs(struct smc_link_group *lgr)
+{
+       struct smc_buf_desc *cur, *next;
+       struct list_head *bucket;
+       int idx;
+
+       for (idx = 0; idx < SMC_RMBE_SIZES; idx++) {
+               bucket = &lgr->rmbs[idx];
+               /* _safe iteration, since cur is freed in the loop body */
+               list_for_each_entry_safe(cur, next, bucket, list) {
+                       kfree(cur->cpu_addr);
+                       kfree(cur);
+               }
+       }
+}
+
 /* remove a link group */
 void smc_lgr_free(struct smc_link_group *lgr)
 {
+       smc_lgr_free_rmbs(lgr);         /* rx buffers; list heads live in lgr */
+       smc_lgr_free_sndbufs(lgr);      /* tx buffers; list heads live in lgr */
        smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
        kfree(lgr);
 }
@@ -300,7 +355,9 @@ int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr,
                            sizeof(lcl->mac)) &&
                    !lgr->sync_err &&
                    (lgr->role == role) &&
-                   (lgr->vlan_id == vlan_id)) {
+                   (lgr->vlan_id == vlan_id) &&
+                   ((role == SMC_CLNT) ||
+                    (lgr->conns_num < SMC_RMBS_PER_LGR_MAX))) {
                        /* link group found */
                        local_contact = SMC_REUSE_CONTACT;
                        conn->lgr = lgr;
@@ -334,3 +391,168 @@ create:
 out:
        return rc ? rc : local_contact;
 }
+
+/* Look for an unused send buffer descriptor in the link group's list for the
+ * given compressed buffer size and claim it by atomically switching its
+ * "used" marker from 0 to 1. Returns the claimed descriptor, or NULL if the
+ * list holds no unused entry.
+ */
+static inline
+struct smc_buf_desc *smc_sndbuf_get_slot(struct smc_link_group *lgr,
+                                        int compressed_bufsize)
+{
+       struct smc_buf_desc *cur, *claimed = NULL;
+
+       read_lock_bh(&lgr->sndbufs_lock);
+       list_for_each_entry(cur, &lgr->sndbufs[compressed_bufsize], list) {
+               if (cmpxchg(&cur->used, 0, 1) != 0)
+                       continue; /* already in use */
+               claimed = cur;
+               break;
+       }
+       read_unlock_bh(&lgr->sndbufs_lock);
+       return claimed;
+}
+
+/* Look for an unused RMB descriptor in the link group's list for the given
+ * compressed rmbe size and claim it by atomically switching its "used"
+ * marker from 0 to 1. Returns the claimed descriptor, or NULL if the list
+ * holds no unused entry.
+ */
+static inline
+struct smc_buf_desc *smc_rmb_get_slot(struct smc_link_group *lgr,
+                                     int compressed_bufsize)
+{
+       struct smc_buf_desc *cur, *claimed = NULL;
+
+       read_lock_bh(&lgr->rmbs_lock);
+       list_for_each_entry(cur, &lgr->rmbs[compressed_bufsize], list) {
+               if (cmpxchg(&cur->used, 0, 1) != 0)
+                       continue; /* already in use */
+               claimed = cur;
+               break;
+       }
+       read_unlock_bh(&lgr->rmbs_lock);
+       return claimed;
+}
+
+/* create the tx buffer for an SMC socket
+ *
+ * Starting with the compressed size derived from half of sk_sndbuf, either
+ * claim an unused send buffer of the link group or allocate and DMA-map a
+ * new one. If allocating or mapping the data area fails, the next smaller
+ * size is tried.
+ *
+ * On success the descriptor and its size are stored in smc->conn and
+ * sk_sndbuf is set to twice the buffer size.
+ *
+ * Returns 0 on success, -ENOMEM if no send buffer could be obtained.
+ */
+int smc_sndbuf_create(struct smc_sock *smc)
+{
+       struct smc_connection *conn = &smc->conn;
+       struct smc_link_group *lgr = conn->lgr;
+       int tmp_bufsize, tmp_bufsize_short;
+       struct smc_buf_desc *sndbuf_desc = NULL;
+       int rc;
+
+       /* use socket send buffer size (w/o overhead) as start value */
+       for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_sndbuf / 2);
+            tmp_bufsize_short >= 0; tmp_bufsize_short--) {
+               tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short);
+               /* check for reusable sndbuf_slot in the link group */
+               sndbuf_desc = smc_sndbuf_get_slot(lgr, tmp_bufsize_short);
+               if (sndbuf_desc) {
+                       memset(sndbuf_desc->cpu_addr, 0, tmp_bufsize);
+                       break; /* found reusable slot */
+               }
+               /* try to alloc a new send buffer */
+               sndbuf_desc = kzalloc(sizeof(*sndbuf_desc), GFP_KERNEL);
+               if (!sndbuf_desc)
+                       break; /* give up with -ENOMEM */
+               sndbuf_desc->cpu_addr = kzalloc(tmp_bufsize,
+                                               GFP_KERNEL | __GFP_NOWARN |
+                                               __GFP_NOMEMALLOC |
+                                               __GFP_NORETRY);
+               if (!sndbuf_desc->cpu_addr) {
+                       kfree(sndbuf_desc);
+                       /* the check behind the loop inspects sndbuf_desc,
+                        * so it must not keep pointing to freed memory
+                        */
+                       sndbuf_desc = NULL;
+                       /* if send buffer allocation has failed,
+                        * try a smaller one
+                        */
+                       continue;
+               }
+               rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
+                                   tmp_bufsize, sndbuf_desc,
+                                   DMA_TO_DEVICE);
+               if (rc) {
+                       kfree(sndbuf_desc->cpu_addr);
+                       kfree(sndbuf_desc);
+                       sndbuf_desc = NULL; /* see above: no stale pointer */
+                       continue; /* if mapping failed, try smaller one */
+               }
+               /* new buffers start out owned by this connection */
+               sndbuf_desc->used = 1;
+               write_lock_bh(&lgr->sndbufs_lock);
+               list_add(&sndbuf_desc->list,
+                        &lgr->sndbufs[tmp_bufsize_short]);
+               write_unlock_bh(&lgr->sndbufs_lock);
+               break;
+       }
+       if (!sndbuf_desc || !sndbuf_desc->cpu_addr)
+               return -ENOMEM;
+
+       conn->sndbuf_desc = sndbuf_desc;
+       conn->sndbuf_size = tmp_bufsize;
+       smc->sk.sk_sndbuf = tmp_bufsize * 2;
+       return 0;
+}
+
+/* create the RMB for an SMC socket (even though the SMC protocol
+ * allows more than one RMB-element per RMB, the Linux implementation
+ * uses just one RMB-element per RMB, i.e. uses an extra RMB for every
+ * connection in a link group)
+ *
+ * Starting with the compressed size derived from half of sk_rcvbuf, either
+ * claim an unused RMB of the link group or allocate and DMA-map a new one.
+ * If allocating or mapping the data area fails, the next smaller size is
+ * tried.
+ *
+ * On success the descriptor and its size (plain and compressed) are stored
+ * in smc->conn and sk_rcvbuf is set to twice the buffer size.
+ *
+ * Returns 0 on success, -ENOMEM if no RMB could be obtained.
+ */
+int smc_rmb_create(struct smc_sock *smc)
+{
+       struct smc_connection *conn = &smc->conn;
+       struct smc_link_group *lgr = conn->lgr;
+       int tmp_bufsize, tmp_bufsize_short;
+       struct smc_buf_desc *rmb_desc = NULL;
+       int rc;
+
+       /* use socket recv buffer size (w/o overhead) as start value */
+       for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_rcvbuf / 2);
+            tmp_bufsize_short >= 0; tmp_bufsize_short--) {
+               tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short);
+               /* check for reusable rmb_slot in the link group */
+               rmb_desc = smc_rmb_get_slot(lgr, tmp_bufsize_short);
+               if (rmb_desc) {
+                       memset(rmb_desc->cpu_addr, 0, tmp_bufsize);
+                       break; /* found reusable slot */
+               }
+               /* try to alloc a new RMB */
+               rmb_desc = kzalloc(sizeof(*rmb_desc), GFP_KERNEL);
+               if (!rmb_desc)
+                       break; /* give up with -ENOMEM */
+               rmb_desc->cpu_addr = kzalloc(tmp_bufsize,
+                                            GFP_KERNEL | __GFP_NOWARN |
+                                            __GFP_NOMEMALLOC |
+                                            __GFP_NORETRY);
+               if (!rmb_desc->cpu_addr) {
+                       kfree(rmb_desc);
+                       /* the check behind the loop inspects rmb_desc,
+                        * so it must not keep pointing to freed memory
+                        */
+                       rmb_desc = NULL;
+                       /* if RMB allocation has failed,
+                        * try a smaller one
+                        */
+                       continue;
+               }
+               rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
+                                   tmp_bufsize, rmb_desc,
+                                   DMA_FROM_DEVICE);
+               if (rc) {
+                       kfree(rmb_desc->cpu_addr);
+                       kfree(rmb_desc);
+                       rmb_desc = NULL; /* see above: no stale pointer */
+                       continue; /* if mapping failed, try smaller one */
+               }
+               /* new buffers start out owned by this connection */
+               rmb_desc->used = 1;
+               write_lock_bh(&lgr->rmbs_lock);
+               list_add(&rmb_desc->list,
+                        &lgr->rmbs[tmp_bufsize_short]);
+               write_unlock_bh(&lgr->rmbs_lock);
+               break;
+       }
+       if (!rmb_desc || !rmb_desc->cpu_addr)
+               return -ENOMEM;
+
+       conn->rmb_desc = rmb_desc;
+       conn->rmbe_size = tmp_bufsize;
+       conn->rmbe_size_short = tmp_bufsize_short;
+       smc->sk.sk_rcvbuf = tmp_bufsize * 2;
+       return 0;
+}
index 14b787a..bf0026d 100644 (file)
@@ -16,6 +16,8 @@
 #include "smc.h"
 #include "smc_ib.h"
 
+#define SMC_RMBS_PER_LGR_MAX   255     /* max. # of RMBs per link group */
+
 struct smc_lgr_list {                  /* list of link group definition */
        struct list_head        list;
        spinlock_t              lock;   /* protects list of link groups */
@@ -52,6 +54,15 @@ struct smc_link {
 #define SMC_FIRST_CONTACT      1               /* first contact to a peer */
 #define SMC_REUSE_CONTACT      0               /* follow-on contact to a peer*/
 
+/* tx/rx buffer list element for sndbufs list and rmbs list of a lgr;
+ * one descriptor per buffer, kept on the lgr list matching its compressed
+ * buffer size
+ */
+struct smc_buf_desc {
+       struct list_head        list;           /* entry in lgr sndbufs/rmbs */
+       u64                     dma_addr[SMC_LINKS_PER_LGR_MAX];
+                                               /* mapped address of buffer;
+                                                * 0 is treated as "not yet
+                                                * mapped" by smc_ib_buf_map()
+                                                */
+       void                    *cpu_addr;      /* virtual address of buffer */
+       u32                     used;           /* currently used / unused:
+                                                * 1 while a connection owns
+                                                * the buffer, 0 otherwise
+                                                */
+};
+
 struct smc_link_group {
        struct list_head        list;
        enum smc_lgr_role       role;           /* client or server */
@@ -63,6 +74,11 @@ struct smc_link_group {
        rwlock_t                conns_lock;     /* protects conns_all */
        unsigned int            conns_num;      /* current # of connections */
        unsigned short          vlan_id;        /* vlan id of link group */
+
+       struct list_head        sndbufs[SMC_RMBE_SIZES];/* tx buffers */
+       rwlock_t                sndbufs_lock;   /* protects tx buffers */
+       struct list_head        rmbs[SMC_RMBE_SIZES];   /* rx buffers */
+       rwlock_t                rmbs_lock;      /* protects rx buffers */
        struct delayed_work     free_work;      /* delayed freeing of an lgr */
        bool                    sync_err;       /* lgr no longer fits to peer */
 };
@@ -100,7 +116,12 @@ static inline struct smc_connection *smc_lgr_find_conn(
        return res;
 }
 
+struct smc_sock;
+struct smc_clc_msg_accept_confirm;
+
 void smc_lgr_free(struct smc_link_group *lgr);
 void smc_lgr_terminate(struct smc_link_group *lgr);
+int smc_sndbuf_create(struct smc_sock *smc);
+int smc_rmb_create(struct smc_sock *smc);
 
 #endif
index 5b037f4..762b7e1 100644 (file)
@@ -16,6 +16,7 @@
 
 #include "smc_pnet.h"
 #include "smc_ib.h"
+#include "smc_core.h"
 #include "smc.h"
 
 struct smc_ib_devices smc_ib_devices = {       /* smc-registered ib devices */
@@ -29,6 +30,24 @@ u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET;      /* unique system
                                                                 * identifier
                                                                 */
 
+/* map a new TX or RX buffer to DMA
+ *
+ * @smcibdev:       device whose ibdev is used for the mapping
+ * @buf_size:       size of the buffer behind buf_slot->cpu_addr in bytes
+ * @buf_slot:       buffer descriptor; the mapped address is stored in
+ *                  dma_addr[SMC_SINGLE_LINK]
+ * @data_direction: DMA direction handed to ib_dma_map_single()
+ *
+ * A non-zero dma_addr[SMC_SINGLE_LINK] is taken as "already mapped" and
+ * left alone.
+ *
+ * Returns 0 if the buffer is mapped, -EIO if the mapping failed.
+ */
+int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
+                  struct smc_buf_desc *buf_slot,
+                  enum dma_data_direction data_direction)
+{
+       if (buf_slot->dma_addr[SMC_SINGLE_LINK])
+               return 0; /* already mapped */
+       buf_slot->dma_addr[SMC_SINGLE_LINK] =
+               ib_dma_map_single(smcibdev->ibdev, buf_slot->cpu_addr,
+                                 buf_size, data_direction);
+       if (ib_dma_mapping_error(smcibdev->ibdev,
+                                buf_slot->dma_addr[SMC_SINGLE_LINK])) {
+               /* do not keep the error value: a later call would
+                * otherwise take it for an existing mapping
+                */
+               buf_slot->dma_addr[SMC_SINGLE_LINK] = 0;
+               return -EIO;
+       }
+       return 0;
+}
+
 static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport)
 {
        struct net_device *ndev;
index 63613e7..c3b6172 100644 (file)
@@ -32,9 +32,14 @@ struct smc_ib_device {                               /* ib-device infos for smc */
        u8                      initialized : 1; /* ib dev CQ, evthdl done */
 };
 
+struct smc_buf_desc;
+
 int smc_ib_register_client(void) __init;
 void smc_ib_unregister_client(void);
 bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport);
 int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport);
+int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
+                  struct smc_buf_desc *buf_slot,
+                  enum dma_data_direction data_direction);
 
 #endif