b62b69c6c718bdfff8e38d262991d3dc8dd403ae
[platform/kernel/linux-rpi.git] / net / smc / af_smc.c
1 /*
2  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
3  *
4  *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
5  *  applies to SOCK_STREAM sockets only
6  *  offers an alternative communication option for TCP-protocol sockets
7  *  applicable with RoCE-cards only
8  *
9  *  Initial restrictions:
10  *    - non-blocking connect postponed
11  *    - IPv6 support postponed
12  *    - support for alternate links postponed
13  *    - partial support for non-blocking sockets only
14  *    - support for urgent data postponed
15  *
16  *  Copyright IBM Corp. 2016
17  *
18  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
19  *              based on prototype from Frank Blaschka
20  */
21
22 #define KMSG_COMPONENT "smc"
23 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
24
25 #include <linux/module.h>
26 #include <linux/socket.h>
27 #include <linux/inetdevice.h>
28 #include <linux/workqueue.h>
29 #include <linux/in.h>
30 #include <net/sock.h>
31 #include <net/tcp.h>
32
33 #include "smc.h"
34 #include "smc_clc.h"
35 #include "smc_llc.h"
36 #include "smc_cdc.h"
37 #include "smc_core.h"
38 #include "smc_ib.h"
39 #include "smc_pnet.h"
40 #include "smc_tx.h"
41
static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
						 * creation
						 */

/* list of all established link groups; protected by its spinlock */
struct smc_lgr_list smc_lgr_list = {		/* established link groups */
	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
	.list = LIST_HEAD_INIT(smc_lgr_list.list),
};

/* forward declaration: needed by smc_sock_alloc() before the worker
 * function is defined further below
 */
static void smc_tcp_listen_work(struct work_struct *);
52
53 static void smc_set_keepalive(struct sock *sk, int val)
54 {
55         struct smc_sock *smc = smc_sk(sk);
56
57         smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
58 }
59
/* proto descriptor for AF_SMC socks; the slab is marked
 * SLAB_DESTROY_BY_RCU so freed sock memory stays type-stable for
 * concurrent RCU lookups
 */
static struct proto smc_proto = {
	.name		= "SMC",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.obj_size	= sizeof(struct smc_sock),
	.slab_flags	= SLAB_DESTROY_BY_RCU,
};
67
/* release an smc socket: tear down the paired CLC/TCP socket first,
 * then orphan the smc sock and drop the allocation reference
 * Returns 0 always (proto_ops release convention).
 */
static int smc_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;

	if (!sk)
		goto out;	/* nothing attached - already released */

	smc = smc_sk(sk);
	lock_sock(sk);

	sk->sk_state = SMC_CLOSED;
	if (smc->clcsock) {
		/* release internal TCP socket used for the CLC handshake */
		sock_release(smc->clcsock);
		smc->clcsock = NULL;
	}

	/* detach socket */
	sock_orphan(sk);
	sock->sk = NULL;
	release_sock(sk);

	sock_put(sk);	/* drop reference taken in smc_sock_alloc() */
out:
	return 0;
}
94
95 static void smc_destruct(struct sock *sk)
96 {
97         if (sk->sk_state != SMC_CLOSED)
98                 return;
99         if (!sock_flag(sk, SOCK_DEAD))
100                 return;
101
102         sk_refcnt_debug_dec(sk);
103 }
104
/* allocate and initialize a new smc sock
 * @net:  network namespace the sock belongs to
 * @sock: socket to graft the sock onto; may be NULL (e.g. for server
 *        socks created before a user-space accept)
 * Returns the new sock (refcount 1) or NULL on allocation failure.
 */
static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
{
	struct smc_sock *smc;
	struct sock *sk;

	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
	sk->sk_state = SMC_INIT;
	sk->sk_destruct = smc_destruct;
	sk->sk_protocol = SMCPROTO_SMC;
	smc = smc_sk(sk);
	/* prepare listen infrastructure; only used once sock listens */
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	INIT_LIST_HEAD(&smc->accept_q);
	spin_lock_init(&smc->accept_q_lock);
	sk_refcnt_debug_inc(sk);

	return sk;
}
126
/* bind the smc socket to a local IPv4 address by delegating to the
 * internal CLC/TCP socket; replicates inet_bind()'s sanity checks
 * Returns 0 on success or a negative errno.
 */
static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
		    int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);

	/* replicate tests from inet_bind(), to be safe wrt. future changes */
	rc = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	rc = -EAFNOSUPPORT;
	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
	if ((addr->sin_family != AF_INET) &&
	    ((addr->sin_family != AF_UNSPEC) ||
	     (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
		goto out;

	lock_sock(sk);

	/* Check if socket is already active */
	rc = -EINVAL;
	if (sk->sk_state != SMC_INIT)
		goto out_rel;

	/* mirror SO_REUSEADDR to the TCP socket and bind it */
	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
	rc = kernel_bind(smc->clcsock, uaddr, addr_len);

out_rel:
	release_sock(sk);
out:
	return rc;
}
164
165 static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
166                                    unsigned long mask)
167 {
168         /* options we don't get control via setsockopt for */
169         nsk->sk_type = osk->sk_type;
170         nsk->sk_sndbuf = osk->sk_sndbuf;
171         nsk->sk_rcvbuf = osk->sk_rcvbuf;
172         nsk->sk_sndtimeo = osk->sk_sndtimeo;
173         nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
174         nsk->sk_mark = osk->sk_mark;
175         nsk->sk_priority = osk->sk_priority;
176         nsk->sk_rcvlowat = osk->sk_rcvlowat;
177         nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
178         nsk->sk_err = osk->sk_err;
179
180         nsk->sk_flags &= ~mask;
181         nsk->sk_flags |= osk->sk_flags & mask;
182 }
183
/* SOL_SOCKET flag bits that must be mirrored from the smc socket to
 * the internal CLC/TCP socket
 */
#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_BROADCAST) | \
			     (1UL << SOCK_TIMESTAMP) | \
			     (1UL << SOCK_DBG) | \
			     (1UL << SOCK_RCVTSTAMP) | \
			     (1UL << SOCK_RCVTSTAMPNS) | \
			     (1UL << SOCK_LOCALROUTE) | \
			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
			     (1UL << SOCK_RXQ_OVFL) | \
			     (1UL << SOCK_WIFI_STATUS) | \
			     (1UL << SOCK_NOFCS) | \
			     (1UL << SOCK_FILTER_LOCKED))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 * clc socket (since smc is not called for these options from net/core)
 */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
}
205
/* SOL_SOCKET flag bits that must be mirrored back from the internal
 * CLC/TCP socket to the smc socket
 */
#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}
215
216 /* determine subnet and mask of internal TCP socket */
217 int smc_netinfo_by_tcpsk(struct socket *clcsock,
218                          __be32 *subnet, u8 *prefix_len)
219 {
220         struct dst_entry *dst = sk_dst_get(clcsock->sk);
221         struct sockaddr_in addr;
222         int rc = -ENOENT;
223         int len;
224
225         if (!dst) {
226                 rc = -ENOTCONN;
227                 goto out;
228         }
229         if (!dst->dev) {
230                 rc = -ENODEV;
231                 goto out_rel;
232         }
233
234         /* get address to which the internal TCP socket is bound */
235         kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len);
236         /* analyze IPv4 specific data of net_device belonging to TCP socket */
237         for_ifa(dst->dev->ip_ptr) {
238                 if (ifa->ifa_address != addr.sin_addr.s_addr)
239                         continue;
240                 *prefix_len = inet_mask_len(ifa->ifa_mask);
241                 *subnet = ifa->ifa_address & ifa->ifa_mask;
242                 rc = 0;
243                 break;
244         } endfor_ifa(dst->dev->ip_ptr);
245
246 out_rel:
247         dst_release(dst);
248 out:
249         return rc;
250 }
251
/* client side of the CONFIRM LINK handshake for the first link of a
 * new link group: wait for the server's request, move the QP to RTS
 * and send the response
 * Returns 0 on success, a positive SMC_CLC_DECL_* reason code on
 * handshake failure, or a negative errno from smc_clc_wait_msg().
 */
static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];
	/* receive CONFIRM LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		/* timeout or interruption: expect a CLC DECLINE instead */
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	rc = smc_ib_modify_qp_rts(link);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	smc_wr_remember_qp_attr(link);
	/* send CONFIRM LINK response over RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       gid, SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	return rc;
}
286
287 static void smc_conn_save_peer_info(struct smc_sock *smc,
288                                     struct smc_clc_msg_accept_confirm *clc)
289 {
290         smc->conn.peer_conn_idx = clc->conn_idx;
291         smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
292         smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
293         atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
294 }
295
296 static void smc_link_save_peer_info(struct smc_link *link,
297                                     struct smc_clc_msg_accept_confirm *clc)
298 {
299         link->peer_qpn = ntoh24(clc->qpn);
300         memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
301         memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
302         link->peer_psn = ntoh24(clc->psn);
303         link->peer_mtu = clc->qp_mtu;
304 }
305
/* setup for RDMA connection of client
 * Performs the CLC handshake (proposal/accept/confirm), creates the
 * connection and link group, and brings up the first link if needed.
 * On any recoverable problem the connection falls back to plain TCP.
 * Returns a negative errno on fatal error, otherwise 0 or the positive
 * local_contact value.
 */
static int smc_connect_rdma(struct smc_sock *smc)
{
	struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr;
	struct smc_clc_msg_accept_confirm aclc;
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_ib_device *smcibdev;
	struct smc_link *link;
	u8 srv_first_contact;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* do inband token exchange */
	reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0) /* configuration error */
		goto decline_rdma;
	/* receive SMC Accept CLC message */
	reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
				       SMC_CLC_ACCEPT);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0)
		goto decline_rdma;

	srv_first_contact = aclc.hdr.flag;
	/* serialize link group creation against other handshakes */
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev,
					ibport, &aclc.lcl, srv_first_contact);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
		else if (rc == -ENOLINK)
			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
		goto decline_rdma_unlock;
	}
	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	smc_conn_save_peer_info(smc, &aclc);

	/* allocate send and receive (RMB) buffers */
	rc = smc_sndbuf_create(smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}
	rc = smc_rmb_create(smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &aclc);

	rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	/* a brand-new link must be transitioned to a usable state */
	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
	}

	rc = smc_clc_send_confirm(smc);
	if (rc)
		goto out_err_unlock;

	if (local_contact == SMC_FIRST_CONTACT) {
		/* QP confirmation over RoCE fabric */
		reason_code = smc_clnt_conf_first_link(
			smc, &smcibdev->gid[ibport - 1]);
		if (reason_code < 0) {
			rc = reason_code;
			goto out_err_unlock;
		}
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	mutex_unlock(&smc_create_lgr_pending);
	smc_tx_init(smc);

out_connected:
	smc_copy_sock_settings_to_clc(smc);
	smc->sk.sk_state = SMC_ACTIVE;

	return rc ? rc : local_contact;

decline_rdma_unlock:
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		/* tell the peer why we decline; short write is fatal */
		rc = smc_clc_send_decline(smc, reason_code, 0);
		if (rc < sizeof(struct smc_clc_msg_decline))
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
out_err:
	return rc;
}
441
/* connect the smc socket: always connect the internal TCP socket
 * first, then try to establish the RDMA connection (which falls back
 * to plain TCP on failure)
 * Returns 0 on success (including fallback) or a negative errno.
 */
static int smc_connect(struct socket *sock, struct sockaddr *addr,
		       int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;

	smc = smc_sk(sk);

	/* separate smc parameter checking to be safe */
	if (alen < sizeof(addr->sa_family))
		goto out_err;
	if (addr->sa_family != AF_INET)
		goto out_err;
	smc->addr = addr;	/* needed for nonblocking connect */

	lock_sock(sk);
	switch (sk->sk_state) {
	default:
		goto out;	/* connect only valid in SMC_INIT state */
	case SMC_ACTIVE:
		rc = -EISCONN;
		goto out;
	case SMC_INIT:
		rc = 0;
		break;
	}

	smc_copy_sock_settings_to_clc(smc);
	rc = kernel_connect(smc->clcsock, addr, alen, flags);
	if (rc)
		goto out;

	/* setup RDMA connection */
	rc = smc_connect_rdma(smc);
	if (rc < 0)
		goto out;
	else
		rc = 0; /* success cases including fallback */

out:
	release_sock(sk);
out_err:
	return rc;
}
487
/* accept one connection on the internal TCP socket of listen sock
 * @lsmc and wrap it in a freshly allocated smc sock
 * @lsmc:    listening smc sock (must be locked by the caller; the lock
 *           is dropped around the blocking kernel_accept() call)
 * @new_smc: out: new smc sock, or NULL on failure
 * Returns 0 on success or a negative errno.
 */
static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
	struct sock *sk = &lsmc->sk;
	struct socket *new_clcsock;
	struct sock *new_sk;
	int rc;

	/* drop the sock lock while allocating and blocking in accept */
	release_sock(&lsmc->sk);
	new_sk = smc_sock_alloc(sock_net(sk), NULL);
	if (!new_sk) {
		rc = -ENOMEM;
		lsmc->sk.sk_err = ENOMEM;
		*new_smc = NULL;
		lock_sock(&lsmc->sk);
		goto out;
	}
	*new_smc = smc_sk(new_sk);

	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
	lock_sock(&lsmc->sk);
	if  (rc < 0) {
		/* accept failed: discard the prepared smc sock */
		lsmc->sk.sk_err = -rc;
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		sock_put(new_sk);
		*new_smc = NULL;
		goto out;
	}
	if (lsmc->sk.sk_state == SMC_CLOSED) {
		/* listen sock was closed while we were blocked in accept */
		if (new_clcsock)
			sock_release(new_clcsock);
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		sock_put(new_sk);
		*new_smc = NULL;
		goto out;
	}

	(*new_smc)->clcsock = new_clcsock;
out:
	return rc;
}
530
531 /* add a just created sock to the accept queue of the listen sock as
532  * candidate for a following socket accept call from user space
533  */
534 static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
535 {
536         struct smc_sock *par = smc_sk(parent);
537
538         sock_hold(sk);
539         spin_lock(&par->accept_q_lock);
540         list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
541         spin_unlock(&par->accept_q_lock);
542         sk_acceptq_added(parent);
543 }
544
545 /* remove a socket from the accept queue of its parental listening socket */
546 static void smc_accept_unlink(struct sock *sk)
547 {
548         struct smc_sock *par = smc_sk(sk)->listen_smc;
549
550         spin_lock(&par->accept_q_lock);
551         list_del_init(&smc_sk(sk)->accept_q);
552         spin_unlock(&par->accept_q_lock);
553         sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
554         sock_put(sk);
555 }
556
/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space
 * @parent:   listening smc sock owning the accept queue
 * @new_sock: socket to graft the dequeued sock onto; may be NULL to
 *            just drain an entry
 * Returns the dequeued sock, or NULL if the queue holds no usable one.
 */
static struct sock *smc_accept_dequeue(struct sock *parent,
				       struct socket *new_sock)
{
	struct smc_sock *isk, *n;
	struct sock *new_sk;

	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
		new_sk = (struct sock *)isk;

		smc_accept_unlink(new_sk);
		if (new_sk->sk_state == SMC_CLOSED) {
			/* tbd in follow-on patch: close this sock */
			continue;
		}
		if (new_sock)
			sock_graft(new_sk, new_sock);
		return new_sk;
	}
	return NULL;
}
580
581 /* clean up for a created but never accepted sock */
582 static void smc_close_non_accepted(struct sock *sk)
583 {
584         struct smc_sock *smc = smc_sk(sk);
585
586         sock_hold(sk);
587         if (smc->clcsock) {
588                 struct socket *tcp;
589
590                 tcp = smc->clcsock;
591                 smc->clcsock = NULL;
592                 sock_release(tcp);
593         }
594         /* more closing stuff to be added with socket closing patch */
595         sock_put(sk);
596 }
597
/* server side of the CONFIRM LINK handshake for the first link of a
 * new link group: send the request and wait for the client's response
 * Returns 0 on success, SMC_CLC_DECL_TCL if the request could not be
 * sent, or the result of smc_clc_wait_msg() on timeout/interrupt.
 */
static int smc_serv_conf_first_link(struct smc_sock *smc)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];
	/* send CONFIRM LINK request to client over the RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive CONFIRM LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm_resp,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		/* timeout or interruption: expect a CLC DECLINE instead */
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
	}

	return rc;
}
627
/* setup for RDMA connection of server
 * Worker servicing one freshly accepted CLC/TCP connection: runs the
 * server side of the CLC handshake, creates connection and link group,
 * and enqueues the resulting smc sock on the listen sock's accept
 * queue. Falls back to plain TCP on any recoverable failure.
 */
static void smc_listen_work(struct work_struct *work)
{
	struct smc_sock *new_smc = container_of(work, struct smc_sock,
						smc_listen_work);
	struct socket *newclcsock = new_smc->clcsock;
	struct smc_sock *lsmc = new_smc->listen_smc;
	struct smc_clc_msg_accept_confirm cclc;
	int local_contact = SMC_REUSE_CONTACT;
	struct sock *newsmcsk = &new_smc->sk;
	struct smc_clc_msg_proposal pclc;
	struct smc_ib_device *smcibdev;
	struct sockaddr_in peeraddr;
	struct smc_link *link;
	int reason_code = 0;
	int rc = 0, len;
	__be32 subnet;
	u8 prefix_len;
	u8 ibport;

	/* do inband token exchange -
	 *wait for and receive SMC Proposal CLC message
	 */
	reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc),
				       SMC_CLC_PROPOSAL);
	if (reason_code < 0)
		goto out_err;
	if (reason_code > 0)
		goto decline_rdma;

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(new_smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* determine subnet and mask from internal TCP socket */
	rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
	if (rc) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}
	/* the client's proposed subnet must match our own */
	if ((pclc.outgoing_subnet != subnet) ||
	    (pclc.prefix_len != prefix_len)) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* get address of the peer connected to the internal TCP socket */
	kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len);

	/* allocate connection / link group */
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
					smcibdev, ibport, &pclc.lcl, 0);
	if (local_contact == SMC_REUSE_CONTACT)
		/* lock no longer needed, free it due to following
		 * smc_clc_wait_msg() call
		 */
		mutex_unlock(&smc_create_lgr_pending);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
		else if (rc == -ENOLINK)
			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
		goto decline_rdma;
	}
	link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	/* allocate send and receive (RMB) buffers */
	rc = smc_sndbuf_create(new_smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma;
	}
	rc = smc_rmb_create(new_smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma;
	}

	rc = smc_clc_send_accept(new_smc, local_contact);
	if (rc)
		goto out_err;

	/* receive SMC Confirm CLC message */
	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
				       SMC_CLC_CONFIRM);
	if (reason_code < 0)
		goto out_err;
	if (reason_code > 0)
		goto decline_rdma;
	smc_conn_save_peer_info(new_smc, &cclc);
	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &cclc);

	rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma;
	}

	/* a brand-new link must be transitioned to a usable state */
	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma;
		}
		/* QP confirmation over RoCE fabric */
		reason_code = smc_serv_conf_first_link(new_smc);
		if (reason_code < 0) {
			/* peer is not aware of a problem */
			rc = reason_code;
			goto out_err;
		}
		if (reason_code > 0)
			goto decline_rdma;
	}

	smc_tx_init(new_smc);

out_connected:
	sk_refcnt_debug_inc(newsmcsk);
	newsmcsk->sk_state = SMC_ACTIVE;
enqueue:
	/* lock is still held if this was a first contact */
	if (local_contact == SMC_FIRST_CONTACT)
		mutex_unlock(&smc_create_lgr_pending);
	lock_sock(&lsmc->sk);
	if (lsmc->sk.sk_state == SMC_LISTEN) {
		smc_accept_enqueue(&lsmc->sk, newsmcsk);
	} else { /* no longer listening */
		smc_close_non_accepted(newsmcsk);
	}
	release_sock(&lsmc->sk);

	/* Wake up accept */
	lsmc->sk.sk_data_ready(&lsmc->sk);
	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
	return;

decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc_conn_free(&new_smc->conn);
	new_smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		/* tell the peer why we decline; short write is fatal */
		rc = smc_clc_send_decline(new_smc, reason_code, 0);
		if (rc < sizeof(struct smc_clc_msg_decline))
			goto out_err;
	}
	goto out_connected;

out_err:
	newsmcsk->sk_state = SMC_CLOSED;
	goto enqueue; /* queue new sock with sk_err set */
}
793
/* worker looping on the listening CLC/TCP socket: accept incoming
 * connections and schedule a smc_listen_work() handshake for each
 */
static void smc_tcp_listen_work(struct work_struct *work)
{
	struct smc_sock *lsmc = container_of(work, struct smc_sock,
					     tcp_listen_work);
	struct smc_sock *new_smc;
	int rc = 0;

	lock_sock(&lsmc->sk);
	while (lsmc->sk.sk_state == SMC_LISTEN) {
		rc = smc_clcsock_accept(lsmc, &new_smc);
		if (rc)
			goto out;
		if (!new_smc)
			continue;	/* listen sock closed meanwhile */

		new_smc->listen_smc = lsmc;
		new_smc->use_fallback = false; /* assume rdma capability first*/
		sock_hold(&lsmc->sk); /* sock_put in smc_listen_work */
		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
		smc_copy_sock_settings_to_smc(new_smc);
		schedule_work(&new_smc->smc_listen_work);
	}

out:
	release_sock(&lsmc->sk);
	lsmc->sk.sk_data_ready(&lsmc->sk); /* no more listening, wake accept */
}
821
/* put the smc socket into listening state: start the internal TCP
 * socket listening and kick off the tcp_listen worker
 * Returns 0 on success or a negative errno.
 */
static int smc_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);
	lock_sock(sk);

	rc = -EINVAL;
	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
		goto out;

	rc = 0;
	if (sk->sk_state == SMC_LISTEN) {
		/* already listening; only update the backlog */
		sk->sk_max_ack_backlog = backlog;
		goto out;
	}
	/* some socket options are handled in core, so we could not apply
	 * them to the clc socket -- copy smc socket options to clc socket
	 */
	smc_copy_sock_settings_to_clc(smc);

	rc = kernel_listen(smc->clcsock, backlog);
	if (rc)
		goto out;
	sk->sk_max_ack_backlog = backlog;
	sk->sk_ack_backlog = 0;
	sk->sk_state = SMC_LISTEN;
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	schedule_work(&smc->tcp_listen_work);

out:
	release_sock(sk);
	return rc;
}
858
/* accept a connection from the smc accept queue, waiting (subject to
 * the socket's receive timeout) until one becomes available
 * Returns 0 on success, -EINVAL if not listening, -EAGAIN on timeout,
 * or a signal-derived errno.
 */
static int smc_accept(struct socket *sock, struct socket *new_sock,
		      int flags)
{
	struct sock *sk = sock->sk, *nsk;
	DECLARE_WAITQUEUE(wait, current);
	struct smc_sock *lsmc;
	long timeo;
	int rc = 0;

	lsmc = smc_sk(sk);
	lock_sock(sk);

	if (lsmc->sk.sk_state != SMC_LISTEN) {
		rc = -EINVAL;
		goto out;
	}

	/* Wait for an incoming connection */
	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
	add_wait_queue_exclusive(sk_sleep(sk), &wait);
	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!timeo) {
			rc = -EAGAIN;
			break;
		}
		/* drop the sock lock while sleeping */
		release_sock(sk);
		timeo = schedule_timeout(timeo);
		/* wakeup by sk_data_ready in smc_listen_work() */
		sched_annotate_sleep();
		lock_sock(sk);
		if (signal_pending(current)) {
			rc = sock_intr_errno(timeo);
			break;
		}
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(sk_sleep(sk), &wait);

	if (!rc)
		/* propagate a pending error of the new sock, if any */
		rc = sock_error(nsk);

out:
	release_sock(sk);
	return rc;
}
905
906 static int smc_getname(struct socket *sock, struct sockaddr *addr,
907                        int *len, int peer)
908 {
909         struct smc_sock *smc;
910
911         if (peer && (sock->sk->sk_state != SMC_ACTIVE))
912                 return -ENOTCONN;
913
914         smc = smc_sk(sock->sk);
915
916         return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer);
917 }
918
919 static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
920 {
921         struct sock *sk = sock->sk;
922         struct smc_sock *smc;
923         int rc = -EPIPE;
924
925         smc = smc_sk(sk);
926         lock_sock(sk);
927         if (sk->sk_state != SMC_ACTIVE)
928                 goto out;
929         if (smc->use_fallback)
930                 rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
931         else
932                 rc = smc_tx_sendmsg(smc, msg, len);
933 out:
934         release_sock(sk);
935         return rc;
936 }
937
938 static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
939                        int flags)
940 {
941         struct sock *sk = sock->sk;
942         struct smc_sock *smc;
943         int rc = -ENOTCONN;
944
945         smc = smc_sk(sk);
946         lock_sock(sk);
947         if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
948                 goto out;
949
950         if (smc->use_fallback)
951                 rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
952         else
953                 rc = sock_no_recvmsg(sock, msg, len, flags);
954 out:
955         release_sock(sk);
956         return rc;
957 }
958
959 static unsigned int smc_accept_poll(struct sock *parent)
960 {
961         struct smc_sock *isk;
962         struct sock *sk;
963
964         lock_sock(parent);
965         list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) {
966                 sk = (struct sock *)isk;
967
968                 if (sk->sk_state == SMC_ACTIVE) {
969                         release_sock(parent);
970                         return POLLIN | POLLRDNORM;
971                 }
972         }
973         release_sock(parent);
974
975         return 0;
976 }
977
/* Poll an SMC socket.
 *
 * While still in INIT state, or after falling back to TCP, polling is
 * delegated to the internal CLC socket; this is also where a previously
 * started non-blocking connect is completed (POLLOUT on the CLC socket
 * means the TCP handshake finished, so the SMC/RDMA setup can run now).
 * Otherwise the SMC state itself is evaluated.
 */
static unsigned int smc_poll(struct file *file, struct socket *sock,
                             poll_table *wait)
{
        struct sock *sk = sock->sk;
        unsigned int mask = 0;
        struct smc_sock *smc;
        int rc;

        smc = smc_sk(sock->sk);
        if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
                /* delegate to CLC child sock */
                mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
                /* if non-blocking connect finished ... */
                lock_sock(sk);
                if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) {
                        sk->sk_err = smc->clcsock->sk->sk_err;
                        if (sk->sk_err) {
                                mask |= POLLERR;
                        } else {
                                /* TCP part is up; run the SMC handshake */
                                rc = smc_connect_rdma(smc);
                                if (rc < 0)
                                        mask |= POLLERR;
                                else
                                        /* success cases including fallback */
                                        mask |= POLLOUT | POLLWRNORM;
                        }
                }
                release_sock(sk);
        } else {
                sock_poll_wait(file, sk_sleep(sk), wait);
                if (sk->sk_state == SMC_LISTEN)
                        /* woken up by sk_data_ready in smc_listen_work() */
                        mask |= smc_accept_poll(sk);
                if (sk->sk_err)
                        mask |= POLLERR;
                if (atomic_read(&smc->conn.sndbuf_space)) {
                        mask |= POLLOUT | POLLWRNORM;
                } else {
                        /* no send buffer space: arm the NOSPACE bits so a
                         * later write-space wakeup reaches this poller
                         */
                        sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
                        set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                }
                /* for now - to be enhanced in follow-on patch */
        }

        return mask;
}
1024
/* Shut down one or both directions of an SMC socket.
 *
 * In fallback mode the shutdown is forwarded to the internal TCP socket
 * and the SMC state mirrors its result; the native SMC path does not
 * support shutdown yet (sock_no_shutdown returns an error).
 *
 * Returns 0 on success, -EINVAL for a bad 'how', -ENOTCONN if already
 * closed, or the error from the delegated shutdown.
 */
static int smc_shutdown(struct socket *sock, int how)
{
        struct sock *sk = sock->sk;
        struct smc_sock *smc;
        int rc = -EINVAL;

        smc = smc_sk(sk);

        /* validate 'how' before taking the sock lock */
        if ((how < SHUT_RD) || (how > SHUT_RDWR))
                goto out_err;

        lock_sock(sk);

        rc = -ENOTCONN;
        if (sk->sk_state == SMC_CLOSED)
                goto out;
        if (smc->use_fallback) {
                rc = kernel_sock_shutdown(smc->clcsock, how);
                /* mirror the TCP socket's shutdown state */
                sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
                if (sk->sk_shutdown == SHUTDOWN_MASK)
                        sk->sk_state = SMC_CLOSED;
        } else {
                rc = sock_no_shutdown(sock, how);
        }

out:
        release_sock(sk);

out_err:
        return rc;
}
1056
1057 static int smc_setsockopt(struct socket *sock, int level, int optname,
1058                           char __user *optval, unsigned int optlen)
1059 {
1060         struct sock *sk = sock->sk;
1061         struct smc_sock *smc;
1062
1063         smc = smc_sk(sk);
1064
1065         /* generic setsockopts reaching us here always apply to the
1066          * CLC socket
1067          */
1068         return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
1069                                              optval, optlen);
1070 }
1071
1072 static int smc_getsockopt(struct socket *sock, int level, int optname,
1073                           char __user *optval, int __user *optlen)
1074 {
1075         struct smc_sock *smc;
1076
1077         smc = smc_sk(sock->sk);
1078         /* socket options apply to the CLC socket */
1079         return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1080                                              optval, optlen);
1081 }
1082
1083 static int smc_ioctl(struct socket *sock, unsigned int cmd,
1084                      unsigned long arg)
1085 {
1086         struct smc_sock *smc;
1087
1088         smc = smc_sk(sock->sk);
1089         if (smc->use_fallback)
1090                 return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1091         else
1092                 return sock_no_ioctl(sock, cmd, arg);
1093 }
1094
1095 static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1096                             int offset, size_t size, int flags)
1097 {
1098         struct sock *sk = sock->sk;
1099         struct smc_sock *smc;
1100         int rc = -EPIPE;
1101
1102         smc = smc_sk(sk);
1103         lock_sock(sk);
1104         if (sk->sk_state != SMC_ACTIVE)
1105                 goto out;
1106         if (smc->use_fallback)
1107                 rc = kernel_sendpage(smc->clcsock, page, offset,
1108                                      size, flags);
1109         else
1110                 rc = sock_no_sendpage(sock, page, offset, size, flags);
1111
1112 out:
1113         release_sock(sk);
1114         return rc;
1115 }
1116
1117 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1118                                struct pipe_inode_info *pipe, size_t len,
1119                                     unsigned int flags)
1120 {
1121         struct sock *sk = sock->sk;
1122         struct smc_sock *smc;
1123         int rc = -ENOTCONN;
1124
1125         smc = smc_sk(sk);
1126         lock_sock(sk);
1127         if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
1128                 goto out;
1129         if (smc->use_fallback) {
1130                 rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1131                                                     pipe, len, flags);
1132         } else {
1133                 rc = -EOPNOTSUPP;
1134         }
1135 out:
1136         release_sock(sk);
1137         return rc;
1138 }
1139
/* must look like tcp */
/* proto_ops for the AF_SMC family: mirrors the TCP operations table so
 * applications can switch between AF_INET/TCP and AF_SMC transparently;
 * unsupported operations use the sock_no_* stubs
 */
static const struct proto_ops smc_sock_ops = {
        .family         = PF_SMC,
        .owner          = THIS_MODULE,
        .release        = smc_release,
        .bind           = smc_bind,
        .connect        = smc_connect,
        .socketpair     = sock_no_socketpair,
        .accept         = smc_accept,
        .getname        = smc_getname,
        .poll           = smc_poll,
        .ioctl          = smc_ioctl,
        .listen         = smc_listen,
        .shutdown       = smc_shutdown,
        .setsockopt     = smc_setsockopt,
        .getsockopt     = smc_getsockopt,
        .sendmsg        = smc_sendmsg,
        .recvmsg        = smc_recvmsg,
        .mmap           = sock_no_mmap,
        .sendpage       = smc_sendpage,
        .splice_read    = smc_splice_read,
};
1162
1163 static int smc_create(struct net *net, struct socket *sock, int protocol,
1164                       int kern)
1165 {
1166         struct smc_sock *smc;
1167         struct sock *sk;
1168         int rc;
1169
1170         rc = -ESOCKTNOSUPPORT;
1171         if (sock->type != SOCK_STREAM)
1172                 goto out;
1173
1174         rc = -EPROTONOSUPPORT;
1175         if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
1176                 goto out;
1177
1178         rc = -ENOBUFS;
1179         sock->ops = &smc_sock_ops;
1180         sk = smc_sock_alloc(net, sock);
1181         if (!sk)
1182                 goto out;
1183
1184         /* create internal TCP socket for CLC handshake and fallback */
1185         smc = smc_sk(sk);
1186         smc->use_fallback = false; /* assume rdma capability first */
1187         rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
1188                               IPPROTO_TCP, &smc->clcsock);
1189         if (rc)
1190                 sk_common_release(sk);
1191         smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
1192         smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
1193
1194 out:
1195         return rc;
1196 }
1197
/* registration entry for the PF_SMC address family; smc_create() is
 * invoked for every socket(AF_SMC, ...) call
 */
static const struct net_proto_family smc_sock_family_ops = {
        .family = PF_SMC,
        .owner  = THIS_MODULE,
        .create = smc_create,
};
1203
/* Module init: bring up the SMC subsystems in dependency order
 * (pnet table, LLC, CDC, proto, socket family, IB client) and unwind in
 * reverse order via the goto labels on any failure.
 */
static int __init smc_init(void)
{
        int rc;

        rc = smc_pnet_init();
        if (rc)
                return rc;

        rc = smc_llc_init();
        if (rc) {
                pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
                goto out_pnet;
        }

        rc = smc_cdc_init();
        if (rc) {
                pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
                goto out_pnet;
        }

        rc = proto_register(&smc_proto, 1);
        if (rc) {
                pr_err("%s: proto_register fails with %d\n", __func__, rc);
                goto out_pnet;
        }

        rc = sock_register(&smc_sock_family_ops);
        if (rc) {
                pr_err("%s: sock_register fails with %d\n", __func__, rc);
                goto out_proto;
        }

        rc = smc_ib_register_client();
        if (rc) {
                pr_err("%s: ib_register fails with %d\n", __func__, rc);
                goto out_sock;
        }

        return 0;

out_sock:
        sock_unregister(PF_SMC);
out_proto:
        proto_unregister(&smc_proto);
out_pnet:
        smc_pnet_exit();
        return rc;
}
1252
/* Module exit: free all remaining link groups, then unregister the
 * subsystems in reverse order of smc_init().
 */
static void __exit smc_exit(void)
{
        struct smc_link_group *lgr, *lg;
        LIST_HEAD(lgr_freeing_list);

        /* detach the global link group list under its lock, then free the
         * entries without holding the spinlock
         */
        spin_lock_bh(&smc_lgr_list.lock);
        if (!list_empty(&smc_lgr_list.list))
                list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
        spin_unlock_bh(&smc_lgr_list.lock);
        list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
                list_del_init(&lgr->list);
                smc_lgr_free(lgr); /* free link group */
        }
        smc_ib_unregister_client();
        sock_unregister(PF_SMC);
        proto_unregister(&smc_proto);
        smc_pnet_exit();
}
1271
1272 module_init(smc_init);
1273 module_exit(smc_exit);
1274
1275 MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
1276 MODULE_DESCRIPTION("smc socket address family");
1277 MODULE_LICENSE("GPL");
1278 MODULE_ALIAS_NETPROTO(PF_SMC);