net/smc: handle unregistered buffers
[platform/kernel/linux-rpi.git] / net / smc / af_smc.c
1 /*
2  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
3  *
4  *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
5  *  applies to SOCK_STREAM sockets only
6  *  offers an alternative communication option for TCP-protocol sockets
7  *  applicable with RoCE-cards only
8  *
9  *  Initial restrictions:
10  *    - support for alternate links postponed
11  *    - partial support for non-blocking sockets only
12  *    - support for urgent data postponed
13  *
14  *  Copyright IBM Corp. 2016, 2018
15  *
16  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
17  *              based on prototype from Frank Blaschka
18  */
19
20 #define KMSG_COMPONENT "smc"
21 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
22
23 #include <linux/module.h>
24 #include <linux/socket.h>
25 #include <linux/workqueue.h>
26 #include <linux/in.h>
27 #include <linux/sched/signal.h>
28
29 #include <net/sock.h>
30 #include <net/tcp.h>
31 #include <net/smc.h>
32
33 #include "smc.h"
34 #include "smc_clc.h"
35 #include "smc_llc.h"
36 #include "smc_cdc.h"
37 #include "smc_core.h"
38 #include "smc_ib.h"
39 #include "smc_pnet.h"
40 #include "smc_tx.h"
41 #include "smc_rx.h"
42 #include "smc_close.h"
43
44 static DEFINE_MUTEX(smc_create_lgr_pending);    /* serialize link group
45                                                  * creation
46                                                  */
47
48 struct smc_lgr_list smc_lgr_list = {            /* established link groups */
49         .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
50         .list = LIST_HEAD_INIT(smc_lgr_list.list),
51 };
52
53 static void smc_tcp_listen_work(struct work_struct *);
54
55 static void smc_set_keepalive(struct sock *sk, int val)
56 {
57         struct smc_sock *smc = smc_sk(sk);
58
59         smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
60 }
61
62 static struct smc_hashinfo smc_v4_hashinfo = {  /* used by smc_proto */
63         .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
64 };
65
66 static struct smc_hashinfo smc_v6_hashinfo = {  /* used by smc_proto6 */
67         .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
68 };
69
70 int smc_hash_sk(struct sock *sk)
71 {
72         struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
73         struct hlist_head *head;
74
75         head = &h->ht;
76
77         write_lock_bh(&h->lock);
78         sk_add_node(sk, head);
79         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
80         write_unlock_bh(&h->lock);
81
82         return 0;
83 }
84 EXPORT_SYMBOL_GPL(smc_hash_sk);
85
86 void smc_unhash_sk(struct sock *sk)
87 {
88         struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
89
90         write_lock_bh(&h->lock);
91         if (sk_del_node_init(sk))
92                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
93         write_unlock_bh(&h->lock);
94 }
95 EXPORT_SYMBOL_GPL(smc_unhash_sk);
96
97 struct proto smc_proto = {
98         .name           = "SMC",
99         .owner          = THIS_MODULE,
100         .keepalive      = smc_set_keepalive,
101         .hash           = smc_hash_sk,
102         .unhash         = smc_unhash_sk,
103         .obj_size       = sizeof(struct smc_sock),
104         .h.smc_hash     = &smc_v4_hashinfo,
105         .slab_flags     = SLAB_TYPESAFE_BY_RCU,
106 };
107 EXPORT_SYMBOL_GPL(smc_proto);
108
109 struct proto smc_proto6 = {
110         .name           = "SMC6",
111         .owner          = THIS_MODULE,
112         .keepalive      = smc_set_keepalive,
113         .hash           = smc_hash_sk,
114         .unhash         = smc_unhash_sk,
115         .obj_size       = sizeof(struct smc_sock),
116         .h.smc_hash     = &smc_v6_hashinfo,
117         .slab_flags     = SLAB_TYPESAFE_BY_RCU,
118 };
119 EXPORT_SYMBOL_GPL(smc_proto6);
120
121 static int smc_release(struct socket *sock)
122 {
123         struct sock *sk = sock->sk;
124         struct smc_sock *smc;
125         int rc = 0;
126
127         if (!sk)
128                 goto out;
129
130         smc = smc_sk(sk);
131         if (sk->sk_state == SMC_LISTEN)
132                 /* smc_close_non_accepted() is called and acquires
133                  * sock lock for child sockets again
134                  */
135                 lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
136         else
137                 lock_sock(sk);
138
139         if (!smc->use_fallback) {
140                 rc = smc_close_active(smc);
141                 sock_set_flag(sk, SOCK_DEAD);
142                 sk->sk_shutdown |= SHUTDOWN_MASK;
143         }
144         if (smc->clcsock) {
145                 sock_release(smc->clcsock);
146                 smc->clcsock = NULL;
147         }
148         if (smc->use_fallback) {
149                 sock_put(sk); /* passive closing */
150                 sk->sk_state = SMC_CLOSED;
151                 sk->sk_state_change(sk);
152         }
153
154         /* detach socket */
155         sock_orphan(sk);
156         sock->sk = NULL;
157         if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
158                 smc_conn_free(&smc->conn);
159         release_sock(sk);
160
161         sk->sk_prot->unhash(sk);
162         sock_put(sk); /* final sock_put */
163 out:
164         return rc;
165 }
166
167 static void smc_destruct(struct sock *sk)
168 {
169         if (sk->sk_state != SMC_CLOSED)
170                 return;
171         if (!sock_flag(sk, SOCK_DEAD))
172                 return;
173
174         sk_refcnt_debug_dec(sk);
175 }
176
177 static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
178                                    int protocol)
179 {
180         struct smc_sock *smc;
181         struct proto *prot;
182         struct sock *sk;
183
184         prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
185         sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
186         if (!sk)
187                 return NULL;
188
189         sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
190         sk->sk_state = SMC_INIT;
191         sk->sk_destruct = smc_destruct;
192         sk->sk_protocol = protocol;
193         smc = smc_sk(sk);
194         INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
195         INIT_LIST_HEAD(&smc->accept_q);
196         spin_lock_init(&smc->accept_q_lock);
197         sk->sk_prot->hash(sk);
198         sk_refcnt_debug_inc(sk);
199
200         return sk;
201 }
202
203 static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
204                     int addr_len)
205 {
206         struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
207         struct sock *sk = sock->sk;
208         struct smc_sock *smc;
209         int rc;
210
211         smc = smc_sk(sk);
212
213         /* replicate tests from inet_bind(), to be safe wrt. future changes */
214         rc = -EINVAL;
215         if (addr_len < sizeof(struct sockaddr_in))
216                 goto out;
217
218         rc = -EAFNOSUPPORT;
219         if (addr->sin_family != AF_INET &&
220             addr->sin_family != AF_INET6 &&
221             addr->sin_family != AF_UNSPEC)
222                 goto out;
223         /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
224         if (addr->sin_family == AF_UNSPEC &&
225             addr->sin_addr.s_addr != htonl(INADDR_ANY))
226                 goto out;
227
228         lock_sock(sk);
229
230         /* Check if socket is already active */
231         rc = -EINVAL;
232         if (sk->sk_state != SMC_INIT)
233                 goto out_rel;
234
235         smc->clcsock->sk->sk_reuse = sk->sk_reuse;
236         rc = kernel_bind(smc->clcsock, uaddr, addr_len);
237
238 out_rel:
239         release_sock(sk);
240 out:
241         return rc;
242 }
243
244 static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
245                                    unsigned long mask)
246 {
247         /* options we don't get control via setsockopt for */
248         nsk->sk_type = osk->sk_type;
249         nsk->sk_sndbuf = osk->sk_sndbuf;
250         nsk->sk_rcvbuf = osk->sk_rcvbuf;
251         nsk->sk_sndtimeo = osk->sk_sndtimeo;
252         nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
253         nsk->sk_mark = osk->sk_mark;
254         nsk->sk_priority = osk->sk_priority;
255         nsk->sk_rcvlowat = osk->sk_rcvlowat;
256         nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
257         nsk->sk_err = osk->sk_err;
258
259         nsk->sk_flags &= ~mask;
260         nsk->sk_flags |= osk->sk_flags & mask;
261 }
262
263 #define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
264                              (1UL << SOCK_KEEPOPEN) | \
265                              (1UL << SOCK_LINGER) | \
266                              (1UL << SOCK_BROADCAST) | \
267                              (1UL << SOCK_TIMESTAMP) | \
268                              (1UL << SOCK_DBG) | \
269                              (1UL << SOCK_RCVTSTAMP) | \
270                              (1UL << SOCK_RCVTSTAMPNS) | \
271                              (1UL << SOCK_LOCALROUTE) | \
272                              (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
273                              (1UL << SOCK_RXQ_OVFL) | \
274                              (1UL << SOCK_WIFI_STATUS) | \
275                              (1UL << SOCK_NOFCS) | \
276                              (1UL << SOCK_FILTER_LOCKED))
277 /* copy only relevant settings and flags of SOL_SOCKET level from smc to
278  * clc socket (since smc is not called for these options from net/core)
279  */
280 static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
281 {
282         smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
283 }
284
285 #define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
286                              (1UL << SOCK_KEEPOPEN) | \
287                              (1UL << SOCK_LINGER) | \
288                              (1UL << SOCK_DBG))
289 /* copy only settings and flags relevant for smc from clc to smc socket */
290 static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
291 {
292         smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
293 }
294
295 /* register a new rmb */
296 static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc)
297 {
298         /* register memory region for new rmb */
299         if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
300                 rmb_desc->regerr = 1;
301                 return -EFAULT;
302         }
303         return 0;
304 }
305
306 static int smc_clnt_conf_first_link(struct smc_sock *smc)
307 {
308         struct smc_link_group *lgr = smc->conn.lgr;
309         struct smc_link *link;
310         int rest;
311         int rc;
312
313         link = &lgr->lnk[SMC_SINGLE_LINK];
314         /* receive CONFIRM LINK request from server over RoCE fabric */
315         rest = wait_for_completion_interruptible_timeout(
316                 &link->llc_confirm,
317                 SMC_LLC_WAIT_FIRST_TIME);
318         if (rest <= 0) {
319                 struct smc_clc_msg_decline dclc;
320
321                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
322                                       SMC_CLC_DECLINE);
323                 return rc;
324         }
325
326         if (link->llc_confirm_rc)
327                 return SMC_CLC_DECL_RMBE_EC;
328
329         rc = smc_ib_modify_qp_rts(link);
330         if (rc)
331                 return SMC_CLC_DECL_INTERR;
332
333         smc_wr_remember_qp_attr(link);
334
335         if (smc_reg_rmb(link, smc->conn.rmb_desc))
336                 return SMC_CLC_DECL_INTERR;
337
338         /* send CONFIRM LINK response over RoCE fabric */
339         rc = smc_llc_send_confirm_link(link,
340                                        link->smcibdev->mac[link->ibport - 1],
341                                        &link->smcibdev->gid[link->ibport - 1],
342                                        SMC_LLC_RESP);
343         if (rc < 0)
344                 return SMC_CLC_DECL_TCL;
345
346         /* receive ADD LINK request from server over RoCE fabric */
347         rest = wait_for_completion_interruptible_timeout(&link->llc_add,
348                                                          SMC_LLC_WAIT_TIME);
349         if (rest <= 0) {
350                 struct smc_clc_msg_decline dclc;
351
352                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
353                                       SMC_CLC_DECLINE);
354                 return rc;
355         }
356
357         /* send add link reject message, only one link supported for now */
358         rc = smc_llc_send_add_link(link,
359                                    link->smcibdev->mac[link->ibport - 1],
360                                    &link->smcibdev->gid[link->ibport - 1],
361                                    SMC_LLC_RESP);
362         if (rc < 0)
363                 return SMC_CLC_DECL_TCL;
364
365         link->state = SMC_LNK_ACTIVE;
366
367         return 0;
368 }
369
370 static void smc_conn_save_peer_info(struct smc_sock *smc,
371                                     struct smc_clc_msg_accept_confirm *clc)
372 {
373         smc->conn.peer_conn_idx = clc->conn_idx;
374         smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
375         smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
376         atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
377 }
378
379 static void smc_link_save_peer_info(struct smc_link *link,
380                                     struct smc_clc_msg_accept_confirm *clc)
381 {
382         link->peer_qpn = ntoh24(clc->qpn);
383         memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
384         memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
385         link->peer_psn = ntoh24(clc->psn);
386         link->peer_mtu = clc->qp_mtu;
387 }
388
389 /* setup for RDMA connection of client
 *
 * Runs after the internal CLC socket has been connected by smc_connect().
 * Steps, in order: take a sock reference, check whether the peer signalled
 * SMC capability (syn_smc), rule out IPsec, look up a RoCE device/port,
 * exchange CLC proposal/accept, create the connection and its buffers
 * under smc_create_lgr_pending, ready the link (first contact) or register
 * the rmb (reused link group), send CLC confirm and, for a first contact,
 * confirm the first link.
 *
 * Whenever a step cannot be completed but the CLC socket is still usable,
 * the socket switches to fallback mode (use_fallback) and, if reason_code
 * is set and is not SMC_CLC_DECL_REPLY, a CLC decline is sent.
 *
 * Return: via out_connected, rc if it is non-zero, otherwise local_contact;
 * via out_err, rc. smc_connect() treats every non-negative value as
 * success.
 */
390 static int smc_connect_rdma(struct smc_sock *smc)
391 {
392         struct smc_clc_msg_accept_confirm aclc;
393         int local_contact = SMC_FIRST_CONTACT;
394         struct smc_ib_device *smcibdev;
395         struct smc_link *link;
396         u8 srv_first_contact;
397         int reason_code = 0;
398         int rc = 0;
399         u8 ibport;
400
401         sock_hold(&smc->sk); /* sock put in passive closing */
402
403         if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
404                 /* peer has not signalled SMC-capability */
405                 smc->use_fallback = true;
406                 goto out_connected;
407         }
408
409         /* IPSec connections opt out of SMC-R optimizations */
410         if (using_ipsec(smc)) {
411                 reason_code = SMC_CLC_DECL_IPSEC;
412                 goto decline_rdma;
413         }
414
415         /* PNET table look up: search active ib_device and port
416          * within same PNETID that also contains the ethernet device
417          * used for the internal TCP socket
418          */
419         smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
420         if (!smcibdev) {
421                 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
422                 goto decline_rdma;
423         }
424
425         /* do inband token exchange */
426         reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
427         if (reason_code < 0) {
428                 rc = reason_code;
429                 goto out_err;
430         }
431         if (reason_code > 0) /* configuration error */
432                 goto decline_rdma;
433         /* receive SMC Accept CLC message */
434         reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
435                                        SMC_CLC_ACCEPT);
436         if (reason_code < 0) {
437                 rc = reason_code;
438                 goto out_err;
439         }
440         if (reason_code > 0)
441                 goto decline_rdma;
442
443         srv_first_contact = aclc.hdr.flag;
        /* from here on every exit has to go through one of the *_unlock
         * labels until the mutex is released again
         */
444         mutex_lock(&smc_create_lgr_pending);
445         local_contact = smc_conn_create(smc, smcibdev, ibport, &aclc.lcl,
446                                         srv_first_contact);
447         if (local_contact < 0) {
448                 rc = local_contact;
449                 if (rc == -ENOMEM)
450                         reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
451                 else if (rc == -ENOLINK)
452                         reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
453                 goto decline_rdma_unlock;
454         }
455         link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
456
457         smc_conn_save_peer_info(smc, &aclc);
458
459         /* create send buffer and rmb */
460         rc = smc_buf_create(smc);
461         if (rc) {
462                 reason_code = SMC_CLC_DECL_MEM;
463                 goto decline_rdma_unlock;
464         }
465
466         if (local_contact == SMC_FIRST_CONTACT)
467                 smc_link_save_peer_info(link, &aclc);
468
469         rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
470         if (rc) {
471                 reason_code = SMC_CLC_DECL_INTERR;
472                 goto decline_rdma_unlock;
473         }
474
475         smc_close_init(smc);
476         smc_rx_init(smc);
477
478         if (local_contact == SMC_FIRST_CONTACT) {
479                 rc = smc_ib_ready_link(link);
480                 if (rc) {
481                         reason_code = SMC_CLC_DECL_INTERR;
482                         goto decline_rdma_unlock;
483                 }
484         } else {
                /* link is in use already; only an rmb that is not marked
                 * as reused gets registered here
                 */
485                 if (!smc->conn.rmb_desc->reused) {
486                         if (smc_reg_rmb(link, smc->conn.rmb_desc)) {
487                                 reason_code = SMC_CLC_DECL_INTERR;
488                                 goto decline_rdma_unlock;
489                         }
490                 }
491         }
492         smc_rmb_sync_sg_for_device(&smc->conn);
493
494         rc = smc_clc_send_confirm(smc);
495         if (rc)
496                 goto out_err_unlock;
497
498         if (local_contact == SMC_FIRST_CONTACT) {
499                 /* QP confirmation over RoCE fabric */
500                 reason_code = smc_clnt_conf_first_link(smc);
501                 if (reason_code < 0) {
502                         rc = reason_code;
503                         goto out_err_unlock;
504                 }
505                 if (reason_code > 0)
506                         goto decline_rdma_unlock;
507         }
508
509         mutex_unlock(&smc_create_lgr_pending);
510         smc_tx_init(smc);
511
        /* reached on success as well as on every fallback path */
512 out_connected:
513         smc_copy_sock_settings_to_clc(smc);
514         if (smc->sk.sk_state == SMC_INIT)
515                 smc->sk.sk_state = SMC_ACTIVE;
516
517         return rc ? rc : local_contact;
518
        /* undo the connection setup, release the mutex, then decline */
519 decline_rdma_unlock:
520         if (local_contact == SMC_FIRST_CONTACT)
521                 smc_lgr_forget(smc->conn.lgr);
522         mutex_unlock(&smc_create_lgr_pending);
523         smc_conn_free(&smc->conn);
524 decline_rdma:
525         /* RDMA setup failed, switch back to TCP */
526         smc->use_fallback = true;
        /* no decline is sent for reason_code 0 or SMC_CLC_DECL_REPLY */
527         if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
528                 rc = smc_clc_send_decline(smc, reason_code);
529                 if (rc < 0)
530                         goto out_err;
531         }
532         goto out_connected;
533
        /* same unwinding as decline_rdma_unlock, but without fallback */
534 out_err_unlock:
535         if (local_contact == SMC_FIRST_CONTACT)
536                 smc_lgr_forget(smc->conn.lgr);
537         mutex_unlock(&smc_create_lgr_pending);
538         smc_conn_free(&smc->conn);
539 out_err:
        /* the reference taken on entry is dropped here only while the
         * sock is still in state SMC_INIT
         */
540         if (smc->sk.sk_state == SMC_INIT)
541                 sock_put(&smc->sk); /* passive closing */
542         return rc;
543 }
544
545 static int smc_connect(struct socket *sock, struct sockaddr *addr,
546                        int alen, int flags)
547 {
548         struct sock *sk = sock->sk;
549         struct smc_sock *smc;
550         int rc = -EINVAL;
551
552         smc = smc_sk(sk);
553
554         /* separate smc parameter checking to be safe */
555         if (alen < sizeof(addr->sa_family))
556                 goto out_err;
557         if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
558                 goto out_err;
559
560         lock_sock(sk);
561         switch (sk->sk_state) {
562         default:
563                 goto out;
564         case SMC_ACTIVE:
565                 rc = -EISCONN;
566                 goto out;
567         case SMC_INIT:
568                 rc = 0;
569                 break;
570         }
571
572         smc_copy_sock_settings_to_clc(smc);
573         tcp_sk(smc->clcsock->sk)->syn_smc = 1;
574         rc = kernel_connect(smc->clcsock, addr, alen, flags);
575         if (rc)
576                 goto out;
577
578         /* setup RDMA connection */
579         rc = smc_connect_rdma(smc);
580         if (rc < 0)
581                 goto out;
582         else
583                 rc = 0; /* success cases including fallback */
584
585 out:
586         release_sock(sk);
587 out_err:
588         return rc;
589 }
590
591 static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
592 {
593         struct socket *new_clcsock = NULL;
594         struct sock *lsk = &lsmc->sk;
595         struct sock *new_sk;
596         int rc;
597
598         release_sock(lsk);
599         new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
600         if (!new_sk) {
601                 rc = -ENOMEM;
602                 lsk->sk_err = ENOMEM;
603                 *new_smc = NULL;
604                 lock_sock(lsk);
605                 goto out;
606         }
607         *new_smc = smc_sk(new_sk);
608
609         rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
610         lock_sock(lsk);
611         if  (rc < 0)
612                 lsk->sk_err = -rc;
613         if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
614                 if (new_clcsock)
615                         sock_release(new_clcsock);
616                 new_sk->sk_state = SMC_CLOSED;
617                 sock_set_flag(new_sk, SOCK_DEAD);
618                 new_sk->sk_prot->unhash(new_sk);
619                 sock_put(new_sk); /* final */
620                 *new_smc = NULL;
621                 goto out;
622         }
623
624         (*new_smc)->clcsock = new_clcsock;
625 out:
626         return rc;
627 }
628
629 /* add a just created sock to the accept queue of the listen sock as
630  * candidate for a following socket accept call from user space
631  */
632 static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
633 {
634         struct smc_sock *par = smc_sk(parent);
635
636         sock_hold(sk); /* sock_put in smc_accept_unlink () */
637         spin_lock(&par->accept_q_lock);
638         list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
639         spin_unlock(&par->accept_q_lock);
640         sk_acceptq_added(parent);
641 }
642
643 /* remove a socket from the accept queue of its parental listening socket */
644 static void smc_accept_unlink(struct sock *sk)
645 {
646         struct smc_sock *par = smc_sk(sk)->listen_smc;
647
648         spin_lock(&par->accept_q_lock);
649         list_del_init(&smc_sk(sk)->accept_q);
650         spin_unlock(&par->accept_q_lock);
651         sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
652         sock_put(sk); /* sock_hold in smc_accept_enqueue */
653 }
654
655 /* remove a sock from the accept queue to bind it to a new socket created
656  * for a socket accept call from user space
657  */
658 struct sock *smc_accept_dequeue(struct sock *parent,
659                                 struct socket *new_sock)
660 {
661         struct smc_sock *isk, *n;
662         struct sock *new_sk;
663
664         list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
665                 new_sk = (struct sock *)isk;
666
667                 smc_accept_unlink(new_sk);
668                 if (new_sk->sk_state == SMC_CLOSED) {
669                         if (isk->clcsock) {
670                                 sock_release(isk->clcsock);
671                                 isk->clcsock = NULL;
672                         }
673                         new_sk->sk_prot->unhash(new_sk);
674                         sock_put(new_sk); /* final */
675                         continue;
676                 }
677                 if (new_sock)
678                         sock_graft(new_sk, new_sock);
679                 return new_sk;
680         }
681         return NULL;
682 }
683
684 /* clean up for a created but never accepted sock */
685 void smc_close_non_accepted(struct sock *sk)
686 {
687         struct smc_sock *smc = smc_sk(sk);
688
689         lock_sock(sk);
690         if (!sk->sk_lingertime)
691                 /* wait for peer closing */
692                 sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
693         if (!smc->use_fallback) {
694                 smc_close_active(smc);
695                 sock_set_flag(sk, SOCK_DEAD);
696                 sk->sk_shutdown |= SHUTDOWN_MASK;
697         }
698         if (smc->clcsock) {
699                 struct socket *tcp;
700
701                 tcp = smc->clcsock;
702                 smc->clcsock = NULL;
703                 sock_release(tcp);
704         }
705         if (smc->use_fallback) {
706                 sock_put(sk); /* passive closing */
707                 sk->sk_state = SMC_CLOSED;
708         } else {
709                 if (sk->sk_state == SMC_CLOSED)
710                         smc_conn_free(&smc->conn);
711         }
712         release_sock(sk);
713         sk->sk_prot->unhash(sk);
714         sock_put(sk); /* final sock_put */
715 }
716
717 static int smc_serv_conf_first_link(struct smc_sock *smc)
718 {
719         struct smc_link_group *lgr = smc->conn.lgr;
720         struct smc_link *link;
721         int rest;
722         int rc;
723
724         link = &lgr->lnk[SMC_SINGLE_LINK];
725
726         if (smc_reg_rmb(link, smc->conn.rmb_desc))
727                 return SMC_CLC_DECL_INTERR;
728
729         /* send CONFIRM LINK request to client over the RoCE fabric */
730         rc = smc_llc_send_confirm_link(link,
731                                        link->smcibdev->mac[link->ibport - 1],
732                                        &link->smcibdev->gid[link->ibport - 1],
733                                        SMC_LLC_REQ);
734         if (rc < 0)
735                 return SMC_CLC_DECL_TCL;
736
737         /* receive CONFIRM LINK response from client over the RoCE fabric */
738         rest = wait_for_completion_interruptible_timeout(
739                 &link->llc_confirm_resp,
740                 SMC_LLC_WAIT_FIRST_TIME);
741         if (rest <= 0) {
742                 struct smc_clc_msg_decline dclc;
743
744                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
745                                       SMC_CLC_DECLINE);
746                 return rc;
747         }
748
749         if (link->llc_confirm_resp_rc)
750                 return SMC_CLC_DECL_RMBE_EC;
751
752         /* send ADD LINK request to client over the RoCE fabric */
753         rc = smc_llc_send_add_link(link,
754                                    link->smcibdev->mac[link->ibport - 1],
755                                    &link->smcibdev->gid[link->ibport - 1],
756                                    SMC_LLC_REQ);
757         if (rc < 0)
758                 return SMC_CLC_DECL_TCL;
759
760         /* receive ADD LINK response from client over the RoCE fabric */
761         rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
762                                                          SMC_LLC_WAIT_TIME);
763         if (rest <= 0) {
764                 struct smc_clc_msg_decline dclc;
765
766                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
767                                       SMC_CLC_DECLINE);
768                 return rc;
769         }
770
771         link->state = SMC_LNK_ACTIVE;
772
773         return 0;
774 }
775
776 /* setup for RDMA connection of server */
777 static void smc_listen_work(struct work_struct *work)
778 {
779         struct smc_sock *new_smc = container_of(work, struct smc_sock,
780                                                 smc_listen_work);
781         struct smc_clc_msg_proposal_prefix *pclc_prfx;
782         struct socket *newclcsock = new_smc->clcsock;
783         struct smc_sock *lsmc = new_smc->listen_smc;
784         struct smc_clc_msg_accept_confirm cclc;
785         int local_contact = SMC_REUSE_CONTACT;
786         struct sock *newsmcsk = &new_smc->sk;
787         struct smc_clc_msg_proposal *pclc;
788         struct smc_ib_device *smcibdev;
789         u8 buf[SMC_CLC_MAX_LEN];
790         struct smc_link *link;
791         int reason_code = 0;
792         int rc = 0;
793         u8 ibport;
794
795         /* check if peer is smc capable */
796         if (!tcp_sk(newclcsock->sk)->syn_smc) {
797                 new_smc->use_fallback = true;
798                 goto out_connected;
799         }
800
801         /* do inband token exchange -
802          *wait for and receive SMC Proposal CLC message
803          */
804         reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf),
805                                        SMC_CLC_PROPOSAL);
806         if (reason_code < 0)
807                 goto out_err;
808         if (reason_code > 0)
809                 goto decline_rdma;
810
811         /* IPSec connections opt out of SMC-R optimizations */
812         if (using_ipsec(new_smc)) {
813                 reason_code = SMC_CLC_DECL_IPSEC;
814                 goto decline_rdma;
815         }
816
817         /* PNET table look up: search active ib_device and port
818          * within same PNETID that also contains the ethernet device
819          * used for the internal TCP socket
820          */
821         smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
822         if (!smcibdev) {
823                 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
824                 goto decline_rdma;
825         }
826
827         pclc = (struct smc_clc_msg_proposal *)&buf;
828         pclc_prfx = smc_clc_proposal_get_prefix(pclc);
829
830         rc = smc_clc_prfx_match(newclcsock, pclc_prfx);
831         if (rc) {
832                 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
833                 goto decline_rdma;
834         }
835
836         /* allocate connection / link group */
837         mutex_lock(&smc_create_lgr_pending);
838         local_contact = smc_conn_create(new_smc, smcibdev, ibport, &pclc->lcl,
839                                         0);
840         if (local_contact < 0) {
841                 rc = local_contact;
842                 if (rc == -ENOMEM)
843                         reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
844                 goto decline_rdma_unlock;
845         }
846         link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
847
848         /* create send buffer and rmb */
849         rc = smc_buf_create(new_smc);
850         if (rc) {
851                 reason_code = SMC_CLC_DECL_MEM;
852                 goto decline_rdma_unlock;
853         }
854
855         smc_close_init(new_smc);
856         smc_rx_init(new_smc);
857
858         if (local_contact != SMC_FIRST_CONTACT) {
859                 if (!new_smc->conn.rmb_desc->reused) {
860                         if (smc_reg_rmb(link, new_smc->conn.rmb_desc)) {
861                                 reason_code = SMC_CLC_DECL_INTERR;
862                                 goto decline_rdma_unlock;
863                         }
864                 }
865         }
866         smc_rmb_sync_sg_for_device(&new_smc->conn);
867
868         rc = smc_clc_send_accept(new_smc, local_contact);
869         if (rc)
870                 goto out_err_unlock;
871
872         /* receive SMC Confirm CLC message */
873         reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
874                                        SMC_CLC_CONFIRM);
875         if (reason_code < 0)
876                 goto out_err_unlock;
877         if (reason_code > 0)
878                 goto decline_rdma_unlock;
879         smc_conn_save_peer_info(new_smc, &cclc);
880         if (local_contact == SMC_FIRST_CONTACT)
881                 smc_link_save_peer_info(link, &cclc);
882
883         rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
884         if (rc) {
885                 reason_code = SMC_CLC_DECL_INTERR;
886                 goto decline_rdma_unlock;
887         }
888
889         if (local_contact == SMC_FIRST_CONTACT) {
890                 rc = smc_ib_ready_link(link);
891                 if (rc) {
892                         reason_code = SMC_CLC_DECL_INTERR;
893                         goto decline_rdma_unlock;
894                 }
895                 /* QP confirmation over RoCE fabric */
896                 reason_code = smc_serv_conf_first_link(new_smc);
897                 if (reason_code < 0)
898                         /* peer is not aware of a problem */
899                         goto out_err_unlock;
900                 if (reason_code > 0)
901                         goto decline_rdma_unlock;
902         }
903
904         smc_tx_init(new_smc);
905         mutex_unlock(&smc_create_lgr_pending);
906
907 out_connected:
908         sk_refcnt_debug_inc(newsmcsk);
909         if (newsmcsk->sk_state == SMC_INIT)
910                 newsmcsk->sk_state = SMC_ACTIVE;
911 enqueue:
912         lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
913         if (lsmc->sk.sk_state == SMC_LISTEN) {
914                 smc_accept_enqueue(&lsmc->sk, newsmcsk);
915         } else { /* no longer listening */
916                 smc_close_non_accepted(newsmcsk);
917         }
918         release_sock(&lsmc->sk);
919
920         /* Wake up accept */
921         lsmc->sk.sk_data_ready(&lsmc->sk);
922         sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
923         return;
924
925 decline_rdma_unlock:
926         if (local_contact == SMC_FIRST_CONTACT)
927                 smc_lgr_forget(new_smc->conn.lgr);
928         mutex_unlock(&smc_create_lgr_pending);
929 decline_rdma:
930         /* RDMA setup failed, switch back to TCP */
931         smc_conn_free(&new_smc->conn);
932         new_smc->use_fallback = true;
933         if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
934                 if (smc_clc_send_decline(new_smc, reason_code) < 0)
935                         goto out_err;
936         }
937         goto out_connected;
938
939 out_err_unlock:
940         if (local_contact == SMC_FIRST_CONTACT)
941                 smc_lgr_forget(new_smc->conn.lgr);
942         mutex_unlock(&smc_create_lgr_pending);
943 out_err:
944         if (newsmcsk->sk_state == SMC_INIT)
945                 sock_put(&new_smc->sk); /* passive closing */
946         newsmcsk->sk_state = SMC_CLOSED;
947         smc_conn_free(&new_smc->conn);
948         goto enqueue; /* queue new sock with sk_err set */
949 }
950
951 static void smc_tcp_listen_work(struct work_struct *work)
952 {
953         struct smc_sock *lsmc = container_of(work, struct smc_sock,
954                                              tcp_listen_work);
955         struct sock *lsk = &lsmc->sk;
956         struct smc_sock *new_smc;
957         int rc = 0;
958
959         lock_sock(lsk);
960         while (lsk->sk_state == SMC_LISTEN) {
961                 rc = smc_clcsock_accept(lsmc, &new_smc);
962                 if (rc)
963                         goto out;
964                 if (!new_smc)
965                         continue;
966
967                 new_smc->listen_smc = lsmc;
968                 new_smc->use_fallback = false; /* assume rdma capability first*/
969                 sock_hold(lsk); /* sock_put in smc_listen_work */
970                 INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
971                 smc_copy_sock_settings_to_smc(new_smc);
972                 sock_hold(&new_smc->sk); /* sock_put in passive closing */
973                 if (!schedule_work(&new_smc->smc_listen_work))
974                         sock_put(&new_smc->sk);
975         }
976
977 out:
978         release_sock(lsk);
979         sock_put(&lsmc->sk); /* sock_hold in smc_listen */
980 }
981
982 static int smc_listen(struct socket *sock, int backlog)
983 {
984         struct sock *sk = sock->sk;
985         struct smc_sock *smc;
986         int rc;
987
988         smc = smc_sk(sk);
989         lock_sock(sk);
990
991         rc = -EINVAL;
992         if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
993                 goto out;
994
995         rc = 0;
996         if (sk->sk_state == SMC_LISTEN) {
997                 sk->sk_max_ack_backlog = backlog;
998                 goto out;
999         }
1000         /* some socket options are handled in core, so we could not apply
1001          * them to the clc socket -- copy smc socket options to clc socket
1002          */
1003         smc_copy_sock_settings_to_clc(smc);
1004         tcp_sk(smc->clcsock->sk)->syn_smc = 1;
1005
1006         rc = kernel_listen(smc->clcsock, backlog);
1007         if (rc)
1008                 goto out;
1009         sk->sk_max_ack_backlog = backlog;
1010         sk->sk_ack_backlog = 0;
1011         sk->sk_state = SMC_LISTEN;
1012         INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
1013         sock_hold(sk); /* sock_hold in tcp_listen_worker */
1014         if (!schedule_work(&smc->tcp_listen_work))
1015                 sock_put(sk);
1016
1017 out:
1018         release_sock(sk);
1019         return rc;
1020 }
1021
1022 static int smc_accept(struct socket *sock, struct socket *new_sock,
1023                       int flags, bool kern)
1024 {
1025         struct sock *sk = sock->sk, *nsk;
1026         DECLARE_WAITQUEUE(wait, current);
1027         struct smc_sock *lsmc;
1028         long timeo;
1029         int rc = 0;
1030
1031         lsmc = smc_sk(sk);
1032         sock_hold(sk); /* sock_put below */
1033         lock_sock(sk);
1034
1035         if (lsmc->sk.sk_state != SMC_LISTEN) {
1036                 rc = -EINVAL;
1037                 goto out;
1038         }
1039
1040         /* Wait for an incoming connection */
1041         timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1042         add_wait_queue_exclusive(sk_sleep(sk), &wait);
1043         while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
1044                 set_current_state(TASK_INTERRUPTIBLE);
1045                 if (!timeo) {
1046                         rc = -EAGAIN;
1047                         break;
1048                 }
1049                 release_sock(sk);
1050                 timeo = schedule_timeout(timeo);
1051                 /* wakeup by sk_data_ready in smc_listen_work() */
1052                 sched_annotate_sleep();
1053                 lock_sock(sk);
1054                 if (signal_pending(current)) {
1055                         rc = sock_intr_errno(timeo);
1056                         break;
1057                 }
1058         }
1059         set_current_state(TASK_RUNNING);
1060         remove_wait_queue(sk_sleep(sk), &wait);
1061
1062         if (!rc)
1063                 rc = sock_error(nsk);
1064
1065 out:
1066         release_sock(sk);
1067         sock_put(sk); /* sock_hold above */
1068         return rc;
1069 }
1070
1071 static int smc_getname(struct socket *sock, struct sockaddr *addr,
1072                        int peer)
1073 {
1074         struct smc_sock *smc;
1075
1076         if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
1077             (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1078                 return -ENOTCONN;
1079
1080         smc = smc_sk(sock->sk);
1081
1082         return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
1083 }
1084
1085 static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1086 {
1087         struct sock *sk = sock->sk;
1088         struct smc_sock *smc;
1089         int rc = -EPIPE;
1090
1091         smc = smc_sk(sk);
1092         lock_sock(sk);
1093         if ((sk->sk_state != SMC_ACTIVE) &&
1094             (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1095             (sk->sk_state != SMC_INIT))
1096                 goto out;
1097         if (smc->use_fallback)
1098                 rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
1099         else
1100                 rc = smc_tx_sendmsg(smc, msg, len);
1101 out:
1102         release_sock(sk);
1103         return rc;
1104 }
1105
1106 static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1107                        int flags)
1108 {
1109         struct sock *sk = sock->sk;
1110         struct smc_sock *smc;
1111         int rc = -ENOTCONN;
1112
1113         smc = smc_sk(sk);
1114         lock_sock(sk);
1115         if ((sk->sk_state == SMC_INIT) ||
1116             (sk->sk_state == SMC_LISTEN) ||
1117             (sk->sk_state == SMC_CLOSED))
1118                 goto out;
1119
1120         if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1121                 rc = 0;
1122                 goto out;
1123         }
1124
1125         if (smc->use_fallback)
1126                 rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
1127         else
1128                 rc = smc_rx_recvmsg(smc, msg, len, flags);
1129
1130 out:
1131         release_sock(sk);
1132         return rc;
1133 }
1134
1135 static __poll_t smc_accept_poll(struct sock *parent)
1136 {
1137         struct smc_sock *isk = smc_sk(parent);
1138         __poll_t mask = 0;
1139
1140         spin_lock(&isk->accept_q_lock);
1141         if (!list_empty(&isk->accept_q))
1142                 mask = EPOLLIN | EPOLLRDNORM;
1143         spin_unlock(&isk->accept_q_lock);
1144
1145         return mask;
1146 }
1147
1148 static __poll_t smc_poll(struct file *file, struct socket *sock,
1149                              poll_table *wait)
1150 {
1151         struct sock *sk = sock->sk;
1152         __poll_t mask = 0;
1153         struct smc_sock *smc;
1154         int rc;
1155
1156         if (!sk)
1157                 return EPOLLNVAL;
1158
1159         smc = smc_sk(sock->sk);
1160         sock_hold(sk);
1161         lock_sock(sk);
1162         if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
1163                 /* delegate to CLC child sock */
1164                 release_sock(sk);
1165                 mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
1166                 lock_sock(sk);
1167                 sk->sk_err = smc->clcsock->sk->sk_err;
1168                 if (sk->sk_err) {
1169                         mask |= EPOLLERR;
1170                 } else {
1171                         /* if non-blocking connect finished ... */
1172                         if (sk->sk_state == SMC_INIT &&
1173                             mask & EPOLLOUT &&
1174                             smc->clcsock->sk->sk_state != TCP_CLOSE) {
1175                                 rc = smc_connect_rdma(smc);
1176                                 if (rc < 0)
1177                                         mask |= EPOLLERR;
1178                                 /* success cases including fallback */
1179                                 mask |= EPOLLOUT | EPOLLWRNORM;
1180                         }
1181                 }
1182         } else {
1183                 if (sk->sk_state != SMC_CLOSED) {
1184                         release_sock(sk);
1185                         sock_poll_wait(file, sk_sleep(sk), wait);
1186                         lock_sock(sk);
1187                 }
1188                 if (sk->sk_err)
1189                         mask |= EPOLLERR;
1190                 if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
1191                     (sk->sk_state == SMC_CLOSED))
1192                         mask |= EPOLLHUP;
1193                 if (sk->sk_state == SMC_LISTEN) {
1194                         /* woken up by sk_data_ready in smc_listen_work() */
1195                         mask = smc_accept_poll(sk);
1196                 } else {
1197                         if (atomic_read(&smc->conn.sndbuf_space) ||
1198                             sk->sk_shutdown & SEND_SHUTDOWN) {
1199                                 mask |= EPOLLOUT | EPOLLWRNORM;
1200                         } else {
1201                                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1202                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1203                         }
1204                         if (atomic_read(&smc->conn.bytes_to_rcv))
1205                                 mask |= EPOLLIN | EPOLLRDNORM;
1206                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1207                                 mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
1208                         if (sk->sk_state == SMC_APPCLOSEWAIT1)
1209                                 mask |= EPOLLIN;
1210                 }
1211
1212         }
1213         release_sock(sk);
1214         sock_put(sk);
1215
1216         return mask;
1217 }
1218
1219 static int smc_shutdown(struct socket *sock, int how)
1220 {
1221         struct sock *sk = sock->sk;
1222         struct smc_sock *smc;
1223         int rc = -EINVAL;
1224         int rc1 = 0;
1225
1226         smc = smc_sk(sk);
1227
1228         if ((how < SHUT_RD) || (how > SHUT_RDWR))
1229                 return rc;
1230
1231         lock_sock(sk);
1232
1233         rc = -ENOTCONN;
1234         if ((sk->sk_state != SMC_LISTEN) &&
1235             (sk->sk_state != SMC_ACTIVE) &&
1236             (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
1237             (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
1238             (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1239             (sk->sk_state != SMC_APPCLOSEWAIT2) &&
1240             (sk->sk_state != SMC_APPFINCLOSEWAIT))
1241                 goto out;
1242         if (smc->use_fallback) {
1243                 rc = kernel_sock_shutdown(smc->clcsock, how);
1244                 sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
1245                 if (sk->sk_shutdown == SHUTDOWN_MASK)
1246                         sk->sk_state = SMC_CLOSED;
1247                 goto out;
1248         }
1249         switch (how) {
1250         case SHUT_RDWR:         /* shutdown in both directions */
1251                 rc = smc_close_active(smc);
1252                 break;
1253         case SHUT_WR:
1254                 rc = smc_close_shutdown_write(smc);
1255                 break;
1256         case SHUT_RD:
1257                 rc = 0;
1258                 /* nothing more to do because peer is not involved */
1259                 break;
1260         }
1261         if (smc->clcsock)
1262                 rc1 = kernel_sock_shutdown(smc->clcsock, how);
1263         /* map sock_shutdown_cmd constants to sk_shutdown value range */
1264         sk->sk_shutdown |= how + 1;
1265
1266 out:
1267         release_sock(sk);
1268         return rc ? rc : rc1;
1269 }
1270
1271 static int smc_setsockopt(struct socket *sock, int level, int optname,
1272                           char __user *optval, unsigned int optlen)
1273 {
1274         struct sock *sk = sock->sk;
1275         struct smc_sock *smc;
1276
1277         smc = smc_sk(sk);
1278
1279         /* generic setsockopts reaching us here always apply to the
1280          * CLC socket
1281          */
1282         return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
1283                                              optval, optlen);
1284 }
1285
1286 static int smc_getsockopt(struct socket *sock, int level, int optname,
1287                           char __user *optval, int __user *optlen)
1288 {
1289         struct smc_sock *smc;
1290
1291         smc = smc_sk(sock->sk);
1292         /* socket options apply to the CLC socket */
1293         return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1294                                              optval, optlen);
1295 }
1296
1297 static int smc_ioctl(struct socket *sock, unsigned int cmd,
1298                      unsigned long arg)
1299 {
1300         struct smc_sock *smc;
1301
1302         smc = smc_sk(sock->sk);
1303         if (smc->use_fallback)
1304                 return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1305         else
1306                 return sock_no_ioctl(sock, cmd, arg);
1307 }
1308
1309 static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1310                             int offset, size_t size, int flags)
1311 {
1312         struct sock *sk = sock->sk;
1313         struct smc_sock *smc;
1314         int rc = -EPIPE;
1315
1316         smc = smc_sk(sk);
1317         lock_sock(sk);
1318         if (sk->sk_state != SMC_ACTIVE)
1319                 goto out;
1320         if (smc->use_fallback)
1321                 rc = kernel_sendpage(smc->clcsock, page, offset,
1322                                      size, flags);
1323         else
1324                 rc = sock_no_sendpage(sock, page, offset, size, flags);
1325
1326 out:
1327         release_sock(sk);
1328         return rc;
1329 }
1330
1331 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1332                                struct pipe_inode_info *pipe, size_t len,
1333                                     unsigned int flags)
1334 {
1335         struct sock *sk = sock->sk;
1336         struct smc_sock *smc;
1337         int rc = -ENOTCONN;
1338
1339         smc = smc_sk(sk);
1340         lock_sock(sk);
1341         if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
1342                 goto out;
1343         if (smc->use_fallback) {
1344                 rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1345                                                     pipe, len, flags);
1346         } else {
1347                 rc = -EOPNOTSUPP;
1348         }
1349 out:
1350         release_sock(sk);
1351         return rc;
1352 }
1353
1354 /* must look like tcp */
1355 static const struct proto_ops smc_sock_ops = {
1356         .family         = PF_SMC,
1357         .owner          = THIS_MODULE,
1358         .release        = smc_release,
1359         .bind           = smc_bind,
1360         .connect        = smc_connect,
1361         .socketpair     = sock_no_socketpair,
1362         .accept         = smc_accept,
1363         .getname        = smc_getname,
1364         .poll           = smc_poll,
1365         .ioctl          = smc_ioctl,
1366         .listen         = smc_listen,
1367         .shutdown       = smc_shutdown,
1368         .setsockopt     = smc_setsockopt,
1369         .getsockopt     = smc_getsockopt,
1370         .sendmsg        = smc_sendmsg,
1371         .recvmsg        = smc_recvmsg,
1372         .mmap           = sock_no_mmap,
1373         .sendpage       = smc_sendpage,
1374         .splice_read    = smc_splice_read,
1375 };
1376
1377 static int smc_create(struct net *net, struct socket *sock, int protocol,
1378                       int kern)
1379 {
1380         int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
1381         struct smc_sock *smc;
1382         struct sock *sk;
1383         int rc;
1384
1385         rc = -ESOCKTNOSUPPORT;
1386         if (sock->type != SOCK_STREAM)
1387                 goto out;
1388
1389         rc = -EPROTONOSUPPORT;
1390         if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
1391                 goto out;
1392
1393         rc = -ENOBUFS;
1394         sock->ops = &smc_sock_ops;
1395         sk = smc_sock_alloc(net, sock, protocol);
1396         if (!sk)
1397                 goto out;
1398
1399         /* create internal TCP socket for CLC handshake and fallback */
1400         smc = smc_sk(sk);
1401         smc->use_fallback = false; /* assume rdma capability first */
1402         rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
1403                               &smc->clcsock);
1404         if (rc) {
1405                 sk_common_release(sk);
1406                 goto out;
1407         }
1408         smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
1409         smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
1410
1411 out:
1412         return rc;
1413 }
1414
1415 static const struct net_proto_family smc_sock_family_ops = {
1416         .family = PF_SMC,
1417         .owner  = THIS_MODULE,
1418         .create = smc_create,
1419 };
1420
1421 static int __init smc_init(void)
1422 {
1423         int rc;
1424
1425         rc = smc_pnet_init();
1426         if (rc)
1427                 return rc;
1428
1429         rc = smc_llc_init();
1430         if (rc) {
1431                 pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
1432                 goto out_pnet;
1433         }
1434
1435         rc = smc_cdc_init();
1436         if (rc) {
1437                 pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
1438                 goto out_pnet;
1439         }
1440
1441         rc = proto_register(&smc_proto, 1);
1442         if (rc) {
1443                 pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
1444                 goto out_pnet;
1445         }
1446
1447         rc = proto_register(&smc_proto6, 1);
1448         if (rc) {
1449                 pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
1450                 goto out_proto;
1451         }
1452
1453         rc = sock_register(&smc_sock_family_ops);
1454         if (rc) {
1455                 pr_err("%s: sock_register fails with %d\n", __func__, rc);
1456                 goto out_proto6;
1457         }
1458         INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
1459         INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);
1460
1461         rc = smc_ib_register_client();
1462         if (rc) {
1463                 pr_err("%s: ib_register fails with %d\n", __func__, rc);
1464                 goto out_sock;
1465         }
1466
1467         static_branch_enable(&tcp_have_smc);
1468         return 0;
1469
1470 out_sock:
1471         sock_unregister(PF_SMC);
1472 out_proto6:
1473         proto_unregister(&smc_proto6);
1474 out_proto:
1475         proto_unregister(&smc_proto);
1476 out_pnet:
1477         smc_pnet_exit();
1478         return rc;
1479 }
1480
1481 static void __exit smc_exit(void)
1482 {
1483         struct smc_link_group *lgr, *lg;
1484         LIST_HEAD(lgr_freeing_list);
1485
1486         spin_lock_bh(&smc_lgr_list.lock);
1487         if (!list_empty(&smc_lgr_list.list))
1488                 list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
1489         spin_unlock_bh(&smc_lgr_list.lock);
1490         list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
1491                 list_del_init(&lgr->list);
1492                 cancel_delayed_work_sync(&lgr->free_work);
1493                 smc_lgr_free(lgr); /* free link group */
1494         }
1495         static_branch_disable(&tcp_have_smc);
1496         smc_ib_unregister_client();
1497         sock_unregister(PF_SMC);
1498         proto_unregister(&smc_proto6);
1499         proto_unregister(&smc_proto);
1500         smc_pnet_exit();
1501 }
1502
1503 module_init(smc_init);
1504 module_exit(smc_exit);
1505
1506 MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
1507 MODULE_DESCRIPTION("smc socket address family");
1508 MODULE_LICENSE("GPL");
1509 MODULE_ALIAS_NETPROTO(PF_SMC);